diff options
Diffstat (limited to 'net')
742 files changed, 39004 insertions, 20893 deletions
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig index e4a02ef55102..7fa0f382e7d1 100644 --- a/net/6lowpan/Kconfig +++ b/net/6lowpan/Kconfig @@ -1,6 +1,61 @@ -config 6LOWPAN +menuconfig 6LOWPAN tristate "6LoWPAN Support" depends on IPV6 ---help--- This enables IPv6 over Low power Wireless Personal Area Network - "6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks. + +menuconfig 6LOWPAN_NHC + tristate "Next Header Compression Support" + depends on 6LOWPAN + default y + ---help--- + Support for next header compression. + +if 6LOWPAN_NHC + +config 6LOWPAN_NHC_DEST + tristate "Destination Options Header Support" + default y + ---help--- + 6LoWPAN IPv6 Destination Options Header compression according to + RFC6282. + +config 6LOWPAN_NHC_FRAGMENT + tristate "Fragment Header Support" + default y + ---help--- + 6LoWPAN IPv6 Fragment Header compression according to RFC6282. + +config 6LOWPAN_NHC_HOP + tristate "Hop-by-Hop Options Header Support" + default y + ---help--- + 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to + RFC6282. + +config 6LOWPAN_NHC_IPV6 + tristate "IPv6 Header Support" + default y + ---help--- + 6LoWPAN IPv6 Header compression according to RFC6282. + +config 6LOWPAN_NHC_MOBILITY + tristate "Mobility Header Support" + default y + ---help--- + 6LoWPAN IPv6 Mobility Header compression according to RFC6282. + +config 6LOWPAN_NHC_ROUTING + tristate "Routing Header Support" + default y + ---help--- + 6LoWPAN IPv6 Routing Header compression according to RFC6282. + +config 6LOWPAN_NHC_UDP + tristate "UDP Header Support" + default y + ---help--- + 6LoWPAN IPv6 UDP Header compression according to RFC6282. 
+ +endif diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index 415886bb456a..eb8baa72adc8 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,3 +1,12 @@ -obj-$(CONFIG_6LOWPAN) := 6lowpan.o +obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := iphc.o +6lowpan-y := iphc.o nhc.o + +#rfc6282 nhcs +obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o +obj-$(CONFIG_6LOWPAN_NHC_FRAGMENT) += nhc_fragment.o +obj-$(CONFIG_6LOWPAN_NHC_HOP) += nhc_hop.o +obj-$(CONFIG_6LOWPAN_NHC_IPV6) += nhc_ipv6.o +obj-$(CONFIG_6LOWPAN_NHC_MOBILITY) += nhc_mobility.o +obj-$(CONFIG_6LOWPAN_NHC_ROUTING) += nhc_routing.o +obj-$(CONFIG_6LOWPAN_NHC_UDP) += nhc_udp.o diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 32ffec6ef164..94a375c04f21 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -54,6 +54,8 @@ #include <net/ipv6.h> #include <net/af_ieee802154.h> +#include "nhc.h" + /* Uncompress address function for source and * destination address(non-multicast). * @@ -224,77 +226,6 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, return 0; } -static int uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) -{ - bool fail; - u8 tmp = 0, val = 0; - - fail = lowpan_fetch_skb(skb, &tmp, sizeof(tmp)); - - if ((tmp & LOWPAN_NHC_UDP_MASK) == LOWPAN_NHC_UDP_ID) { - pr_debug("UDP header uncompression\n"); - switch (tmp & LOWPAN_NHC_UDP_CS_P_11) { - case LOWPAN_NHC_UDP_CS_P_00: - fail |= lowpan_fetch_skb(skb, &uh->source, - sizeof(uh->source)); - fail |= lowpan_fetch_skb(skb, &uh->dest, - sizeof(uh->dest)); - break; - case LOWPAN_NHC_UDP_CS_P_01: - fail |= lowpan_fetch_skb(skb, &uh->source, - sizeof(uh->source)); - fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); - uh->dest = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); - break; - case LOWPAN_NHC_UDP_CS_P_10: - fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); - uh->source = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); - fail |= lowpan_fetch_skb(skb, &uh->dest, - sizeof(uh->dest)); - break; - case 
LOWPAN_NHC_UDP_CS_P_11: - fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); - uh->source = htons(LOWPAN_NHC_UDP_4BIT_PORT + - (val >> 4)); - uh->dest = htons(LOWPAN_NHC_UDP_4BIT_PORT + - (val & 0x0f)); - break; - default: - pr_debug("ERROR: unknown UDP format\n"); - goto err; - } - - pr_debug("uncompressed UDP ports: src = %d, dst = %d\n", - ntohs(uh->source), ntohs(uh->dest)); - - /* checksum */ - if (tmp & LOWPAN_NHC_UDP_CS_C) { - pr_debug_ratelimited("checksum elided currently not supported\n"); - goto err; - } else { - fail |= lowpan_fetch_skb(skb, &uh->check, - sizeof(uh->check)); - } - - /* UDP length needs to be infered from the lower layers - * here, we obtain the hint from the remaining size of the - * frame - */ - uh->len = htons(skb->len + sizeof(struct udphdr)); - pr_debug("uncompressed UDP length: src = %d", ntohs(uh->len)); - } else { - pr_debug("ERROR: unsupported NH format\n"); - goto err; - } - - if (fail) - goto err; - - return 0; -err: - return -EINVAL; -} - /* TTL uncompression values */ static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 }; @@ -425,29 +356,11 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, return -EINVAL; } - /* UDP data uncompression */ + /* Next header data uncompression */ if (iphc0 & LOWPAN_IPHC_NH_C) { - struct udphdr uh; - const int needed = sizeof(struct udphdr) + sizeof(hdr); - - if (uncompress_udp_header(skb, &uh)) - return -EINVAL; - - /* replace the compressed UDP head by the uncompressed UDP - * header - */ - err = skb_cow(skb, needed); - if (unlikely(err)) + err = lowpan_nhc_do_uncompression(skb, dev, &hdr); + if (err < 0) return err; - - skb_push(skb, sizeof(struct udphdr)); - skb_reset_transport_header(skb); - skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); - - raw_dump_table(__func__, "raw UDP header dump", - (u8 *)&uh, sizeof(uh)); - - hdr.nexthdr = UIP_PROTO_UDP; } else { err = skb_cow(skb, sizeof(hdr)); if (unlikely(err)) @@ -500,71 +413,6 @@ static u8 
lowpan_compress_addr_64(u8 **hc_ptr, u8 shift, return rol8(val, shift); } -static void compress_udp_header(u8 **hc_ptr, struct sk_buff *skb) -{ - struct udphdr *uh; - u8 tmp; - - /* In the case of RAW sockets the transport header is not set by - * the ip6 stack so we must set it ourselves - */ - if (skb->transport_header == skb->network_header) - skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - - uh = udp_hdr(skb); - - if (((ntohs(uh->source) & LOWPAN_NHC_UDP_4BIT_MASK) == - LOWPAN_NHC_UDP_4BIT_PORT) && - ((ntohs(uh->dest) & LOWPAN_NHC_UDP_4BIT_MASK) == - LOWPAN_NHC_UDP_4BIT_PORT)) { - pr_debug("UDP header: both ports compression to 4 bits\n"); - /* compression value */ - tmp = LOWPAN_NHC_UDP_CS_P_11; - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - /* source and destination port */ - tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_4BIT_PORT + - ((ntohs(uh->source) - LOWPAN_NHC_UDP_4BIT_PORT) << 4); - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - } else if ((ntohs(uh->dest) & LOWPAN_NHC_UDP_8BIT_MASK) == - LOWPAN_NHC_UDP_8BIT_PORT) { - pr_debug("UDP header: remove 8 bits of dest\n"); - /* compression value */ - tmp = LOWPAN_NHC_UDP_CS_P_01; - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - /* source port */ - lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source)); - /* destination port */ - tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_8BIT_PORT; - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - } else if ((ntohs(uh->source) & LOWPAN_NHC_UDP_8BIT_MASK) == - LOWPAN_NHC_UDP_8BIT_PORT) { - pr_debug("UDP header: remove 8 bits of source\n"); - /* compression value */ - tmp = LOWPAN_NHC_UDP_CS_P_10; - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - /* source port */ - tmp = ntohs(uh->source) - LOWPAN_NHC_UDP_8BIT_PORT; - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - /* destination port */ - lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest)); - } else { - pr_debug("UDP header: can't compress\n"); - /* compression value */ - tmp = 
LOWPAN_NHC_UDP_CS_P_00; - lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); - /* source port */ - lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source)); - /* destination port */ - lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest)); - } - - /* checksum is always inline */ - lowpan_push_hc_data(hc_ptr, &uh->check, sizeof(uh->check)); - - /* skip the UDP header */ - skb_pull(skb, sizeof(struct udphdr)); -} - int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) @@ -572,7 +420,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, u8 tmp, iphc0, iphc1, *hc_ptr; struct ipv6hdr *hdr; u8 head[100] = {}; - int addr_type; + int ret, addr_type; if (type != ETH_P_IPV6) return -EINVAL; @@ -649,13 +497,12 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, /* NOTE: payload length is always compressed */ - /* Next Header is compress if UDP */ - if (hdr->nexthdr == UIP_PROTO_UDP) - iphc0 |= LOWPAN_IPHC_NH_C; - - if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) - lowpan_push_hc_data(&hc_ptr, &hdr->nexthdr, - sizeof(hdr->nexthdr)); + /* Check if we provide the nhc format for nexthdr and compression + * functionality. If not nexthdr is handled inline and not compressed. 
+ */ + ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr, &iphc0); + if (ret < 0) + return ret; /* Hop limit * if 1: compress, encoding is 01 @@ -741,9 +588,12 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, } } - /* UDP header compression */ - if (hdr->nexthdr == UIP_PROTO_UDP) - compress_udp_header(&hc_ptr, skb); + /* next header compression */ + if (iphc0 & LOWPAN_IPHC_NH_C) { + ret = lowpan_nhc_do_compression(skb, hdr, &hc_ptr); + if (ret < 0) + return ret; + } head[0] = iphc0; head[1] = iphc1; @@ -761,4 +611,18 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, } EXPORT_SYMBOL_GPL(lowpan_header_compress); +static int __init lowpan_module_init(void) +{ + request_module_nowait("nhc_dest"); + request_module_nowait("nhc_fragment"); + request_module_nowait("nhc_hop"); + request_module_nowait("nhc_ipv6"); + request_module_nowait("nhc_mobility"); + request_module_nowait("nhc_routing"); + request_module_nowait("nhc_udp"); + + return 0; +} +module_init(lowpan_module_init); + MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c new file mode 100644 index 000000000000..fd20fc51a7c4 --- /dev/null +++ b/net/6lowpan/nhc.c @@ -0,0 +1,241 @@ +/* + * 6LoWPAN next header compression + * + * + * Authors: + * Alexander Aring <aar@pengutronix.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/netdevice.h> + +#include <net/ipv6.h> + +#include "nhc.h" + +static struct rb_root rb_root = RB_ROOT; +static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX]; +static DEFINE_SPINLOCK(lowpan_nhc_lock); + +static int lowpan_nhc_insert(struct lowpan_nhc *nhc) +{ + struct rb_node **new = &rb_root.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc, + node); + int result, len_dif, len; + + len_dif = nhc->idlen - this->idlen; + + if (nhc->idlen < this->idlen) + len = nhc->idlen; + else + len = this->idlen; + + result = memcmp(nhc->id, this->id, len); + if (!result) + result = len_dif; + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&nhc->node, parent, new); + rb_insert_color(&nhc->node, &rb_root); + + return 0; +} + +static void lowpan_nhc_remove(struct lowpan_nhc *nhc) +{ + rb_erase(&nhc->node, &rb_root); +} + +static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb) +{ + struct rb_node *node = rb_root.rb_node; + const u8 *nhcid_skb_ptr = skb->data; + + while (node) { + struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc, + node); + u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN]; + int result, i; + + if (nhcid_skb_ptr + nhc->idlen > skb->data + skb->len) + return NULL; + + /* copy and mask afterwards the nhid value from skb */ + memcpy(nhcid_skb_ptr_masked, nhcid_skb_ptr, nhc->idlen); + for (i = 0; i < nhc->idlen; i++) + nhcid_skb_ptr_masked[i] &= nhc->idmask[i]; + + result = memcmp(nhcid_skb_ptr_masked, nhc->id, nhc->idlen); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return nhc; + } + + return NULL; +} + +int lowpan_nhc_check_compression(struct sk_buff *skb, + const struct ipv6hdr *hdr, u8 **hc_ptr, + u8 *iphc0) 
+{ + struct lowpan_nhc *nhc; + + spin_lock_bh(&lowpan_nhc_lock); + + nhc = lowpan_nexthdr_nhcs[hdr->nexthdr]; + if (nhc && nhc->compress) + *iphc0 |= LOWPAN_IPHC_NH_C; + else + lowpan_push_hc_data(hc_ptr, &hdr->nexthdr, + sizeof(hdr->nexthdr)); + + spin_unlock_bh(&lowpan_nhc_lock); + + return 0; +} + +int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr, + u8 **hc_ptr) +{ + int ret; + struct lowpan_nhc *nhc; + + spin_lock_bh(&lowpan_nhc_lock); + + nhc = lowpan_nexthdr_nhcs[hdr->nexthdr]; + /* check if the nhc module was removed in unlocked part. + * TODO: this is a workaround we should prevent unloading + * of nhc modules while unlocked part, this will always drop + * the lowpan packet but it's very unlikely. + * + * Solution isn't easy because we need to decide at + * lowpan_nhc_check_compression if we do a compression or not. + * Because the inline data which is added to skb, we can't move this + * handling. + */ + if (unlikely(!nhc || !nhc->compress)) { + ret = -EINVAL; + goto out; + } + + /* In the case of RAW sockets the transport header is not set by + * the ip6 stack so we must set it ourselves + */ + if (skb->transport_header == skb->network_header) + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + ret = nhc->compress(skb, hc_ptr); + if (ret < 0) + goto out; + + /* skip the transport header */ + skb_pull(skb, nhc->nexthdrlen); + +out: + spin_unlock_bh(&lowpan_nhc_lock); + + return ret; +} + +int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev, + struct ipv6hdr *hdr) +{ + struct lowpan_nhc *nhc; + int ret; + + spin_lock_bh(&lowpan_nhc_lock); + + nhc = lowpan_nhc_by_nhcid(skb); + if (nhc) { + if (nhc->uncompress) { + ret = nhc->uncompress(skb, sizeof(struct ipv6hdr) + + nhc->nexthdrlen); + if (ret < 0) { + spin_unlock_bh(&lowpan_nhc_lock); + return ret; + } + } else { + spin_unlock_bh(&lowpan_nhc_lock); + netdev_warn(dev, "received nhc id for %s which is not implemented.\n", + nhc->name); + return 
-ENOTSUPP; + } + } else { + spin_unlock_bh(&lowpan_nhc_lock); + netdev_warn(dev, "received unknown nhc id which was not found.\n"); + return -ENOENT; + } + + hdr->nexthdr = nhc->nexthdr; + skb_reset_transport_header(skb); + raw_dump_table(__func__, "raw transport header dump", + skb_transport_header(skb), nhc->nexthdrlen); + + spin_unlock_bh(&lowpan_nhc_lock); + + return 0; +} + +int lowpan_nhc_add(struct lowpan_nhc *nhc) +{ + int ret; + + if (!nhc->idlen || !nhc->idsetup) + return -EINVAL; + + WARN_ONCE(nhc->idlen > LOWPAN_NHC_MAX_ID_LEN, + "LOWPAN_NHC_MAX_ID_LEN should be updated to %zd.\n", + nhc->idlen); + + nhc->idsetup(nhc); + + spin_lock_bh(&lowpan_nhc_lock); + + if (lowpan_nexthdr_nhcs[nhc->nexthdr]) { + ret = -EEXIST; + goto out; + } + + ret = lowpan_nhc_insert(nhc); + if (ret < 0) + goto out; + + lowpan_nexthdr_nhcs[nhc->nexthdr] = nhc; +out: + spin_unlock_bh(&lowpan_nhc_lock); + return ret; +} +EXPORT_SYMBOL(lowpan_nhc_add); + +void lowpan_nhc_del(struct lowpan_nhc *nhc) +{ + spin_lock_bh(&lowpan_nhc_lock); + + lowpan_nhc_remove(nhc); + lowpan_nexthdr_nhcs[nhc->nexthdr] = NULL; + + spin_unlock_bh(&lowpan_nhc_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(lowpan_nhc_del); diff --git a/net/6lowpan/nhc.h b/net/6lowpan/nhc.h new file mode 100644 index 000000000000..ed44938eb5de --- /dev/null +++ b/net/6lowpan/nhc.h @@ -0,0 +1,146 @@ +#ifndef __6LOWPAN_NHC_H +#define __6LOWPAN_NHC_H + +#include <linux/skbuff.h> +#include <linux/rbtree.h> +#include <linux/module.h> + +#include <net/6lowpan.h> +#include <net/ipv6.h> + +#define LOWPAN_NHC_MAX_ID_LEN 1 + +/** + * LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct + * + * @__nhc: variable name of the lowpan_nhc struct. + * @_name: const char * of common header compression name. + * @_nexthdr: ipv6 nexthdr field for the header compression. + * @_nexthdrlen: ipv6 nexthdr len for the reserved space. + * @_idsetup: callback to setup id and mask values. 
+ * @_idlen: len for the next header id and mask, should always be the same. + * @_uncompress: callback for uncompression call. + * @_compress: callback for compression call. + */ +#define LOWPAN_NHC(__nhc, _name, _nexthdr, \ + _hdrlen, _idsetup, _idlen, \ + _uncompress, _compress) \ +static u8 __nhc##_val[_idlen]; \ +static u8 __nhc##_mask[_idlen]; \ +static struct lowpan_nhc __nhc = { \ + .name = _name, \ + .nexthdr = _nexthdr, \ + .nexthdrlen = _hdrlen, \ + .id = __nhc##_val, \ + .idmask = __nhc##_mask, \ + .idlen = _idlen, \ + .idsetup = _idsetup, \ + .uncompress = _uncompress, \ + .compress = _compress, \ +} + +#define module_lowpan_nhc(__nhc) \ +static int __init __nhc##_init(void) \ +{ \ + return lowpan_nhc_add(&(__nhc)); \ +} \ +module_init(__nhc##_init); \ +static void __exit __nhc##_exit(void) \ +{ \ + lowpan_nhc_del(&(__nhc)); \ +} \ +module_exit(__nhc##_exit); + +/** + * struct lowpan_nhc - hold 6lowpan next hdr compression information + * + * @node: holder for the rbtree. + * @name: name of the specific next header compression + * @nexthdr: next header value of the protocol which should be compressed. + * @nexthdrlen: ipv6 nexthdr len for the reserved space. + * @id: array for nhc id. Note this needs to be in network byteorder. + * @idmask: array for nhc id mask. Note this needs to be in network byteorder. + * @idlen: the length of the next header id and mask. + * @idsetup: callback to fill the next header id value and mask. + * @compress: callback to do the header compression. + * @uncompress: callback to do the header uncompression. + */ +struct lowpan_nhc { + struct rb_node node; + const char *name; + const u8 nexthdr; + const size_t nexthdrlen; + u8 *id; + u8 *idmask; + const size_t idlen; + + void (*idsetup)(struct lowpan_nhc *nhc); + int (*uncompress)(struct sk_buff *skb, size_t needed); + int (*compress)(struct sk_buff *skb, u8 **hc_ptr); +}; + +/** + * lowpan_nhc_by_nexthdr - return the 6lowpan nhc by ipv6 nexthdr. 
+ * + * @nexthdr: ipv6 nexthdr value. + */ +struct lowpan_nhc *lowpan_nhc_by_nexthdr(u8 nexthdr); + +/** + * lowpan_nhc_check_compression - checks if we support compression format. If + * we support the nhc by nexthdr field, the 6LoWPAN iphc NHC bit will be + * set. If we don't support it, nexthdr will be added as inline data to the + * 6LoWPAN header. + * + * @skb: skb of 6LoWPAN header to read nhc and replace header. + * @hdr: ipv6hdr to check the nexthdr value + * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of + * replaced header. + * @iphc0: iphc0 pointer to set the 6LoWPAN NHC bit + */ +int lowpan_nhc_check_compression(struct sk_buff *skb, + const struct ipv6hdr *hdr, u8 **hc_ptr, + u8 *iphc0); + +/** + * lowpan_nhc_do_compression - calling compress callback for nhc + * + * @skb: skb of 6LoWPAN header to read nhc and replace header. + * @hdr: ipv6hdr to read the nexthdr value from + * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of + * replaced header. + */ +int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr, + u8 **hc_ptr); + +/** + * lowpan_nhc_do_uncompression - calling uncompress callback for nhc + * + * @nhc: 6LoWPAN nhc context, get by lowpan_nhc_by_ functions. + * @skb: skb of 6LoWPAN header, skb->data should be pointed to nhc id value. + * @dev: netdevice for print logging information. + * @hdr: ipv6hdr for setting nexthdr value. + */ +int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev, + struct ipv6hdr *hdr); + +/** + * lowpan_nhc_add - register a next header compression to framework + * + * @nhc: nhc which should be added. + */ +int lowpan_nhc_add(struct lowpan_nhc *nhc); + +/** + * lowpan_nhc_del - delete a next header compression from framework + * + * @nhc: nhc which should be deleted. 
+ */ +void lowpan_nhc_del(struct lowpan_nhc *nhc); + +/** + * lowpan_nhc_init - adding all default nhcs + */ +void lowpan_nhc_init(void); + +#endif /* __6LOWPAN_NHC_H */ diff --git a/net/6lowpan/nhc_dest.c b/net/6lowpan/nhc_dest.c new file mode 100644 index 000000000000..0b292c9646eb --- /dev/null +++ b/net/6lowpan/nhc_dest.c @@ -0,0 +1,28 @@ +/* + * 6LoWPAN IPv6 Destination Options Header compression according to + * RFC6282 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_NHC_DEST_IDLEN 1 +#define LOWPAN_NHC_DEST_ID_0 0xe6 +#define LOWPAN_NHC_DEST_MASK_0 0xfe + +static void dest_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_DEST_ID_0; + nhc->idmask[0] = LOWPAN_NHC_DEST_MASK_0; +} + +LOWPAN_NHC(nhc_dest, "RFC6282 Destination Options", NEXTHDR_DEST, 0, + dest_nhid_setup, LOWPAN_NHC_DEST_IDLEN, NULL, NULL); + +module_lowpan_nhc(nhc_dest); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Destination Options compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_fragment.c b/net/6lowpan/nhc_fragment.c new file mode 100644 index 000000000000..473dbc58ef84 --- /dev/null +++ b/net/6lowpan/nhc_fragment.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN IPv6 Fragment Header compression according to RFC6282 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "nhc.h" + +#define LOWPAN_NHC_FRAGMENT_IDLEN 1 +#define LOWPAN_NHC_FRAGMENT_ID_0 0xe4 +#define LOWPAN_NHC_FRAGMENT_MASK_0 0xfe + +static void fragment_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_FRAGMENT_ID_0; + nhc->idmask[0] = LOWPAN_NHC_FRAGMENT_MASK_0; +} + +LOWPAN_NHC(nhc_fragment, "RFC6282 Fragment", NEXTHDR_FRAGMENT, 0, + fragment_nhid_setup, LOWPAN_NHC_FRAGMENT_IDLEN, NULL, NULL); + +module_lowpan_nhc(nhc_fragment); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Fragment compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_hop.c b/net/6lowpan/nhc_hop.c new file mode 100644 index 000000000000..1eb66be16f19 --- /dev/null +++ b/net/6lowpan/nhc_hop.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to RFC6282 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "nhc.h" + +#define LOWPAN_NHC_HOP_IDLEN 1 +#define LOWPAN_NHC_HOP_ID_0 0xe0 +#define LOWPAN_NHC_HOP_MASK_0 0xfe + +static void hop_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_HOP_ID_0; + nhc->idmask[0] = LOWPAN_NHC_HOP_MASK_0; +} + +LOWPAN_NHC(nhc_hop, "RFC6282 Hop-by-Hop Options", NEXTHDR_HOP, 0, + hop_nhid_setup, LOWPAN_NHC_HOP_IDLEN, NULL, NULL); + +module_lowpan_nhc(nhc_hop); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Hop-by-Hop Options compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_ipv6.c b/net/6lowpan/nhc_ipv6.c new file mode 100644 index 000000000000..2313d1600af3 --- /dev/null +++ b/net/6lowpan/nhc_ipv6.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN IPv6 Header compression according to RFC6282 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "nhc.h" + +#define LOWPAN_NHC_IPV6_IDLEN 1 +#define LOWPAN_NHC_IPV6_ID_0 0xee +#define LOWPAN_NHC_IPV6_MASK_0 0xfe + +static void ipv6_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_IPV6_ID_0; + nhc->idmask[0] = LOWPAN_NHC_IPV6_MASK_0; +} + +LOWPAN_NHC(nhc_ipv6, "RFC6282 IPv6", NEXTHDR_IPV6, 0, ipv6_nhid_setup, + LOWPAN_NHC_IPV6_IDLEN, NULL, NULL); + +module_lowpan_nhc(nhc_ipv6); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 IPv6 compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_mobility.c b/net/6lowpan/nhc_mobility.c new file mode 100644 index 000000000000..60d3f3886c98 --- /dev/null +++ b/net/6lowpan/nhc_mobility.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN IPv6 Mobility Header compression according to RFC6282 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "nhc.h" + +#define LOWPAN_NHC_MOBILITY_IDLEN 1 +#define LOWPAN_NHC_MOBILITY_ID_0 0xe8 +#define LOWPAN_NHC_MOBILITY_MASK_0 0xfe + +static void mobility_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_MOBILITY_ID_0; + nhc->idmask[0] = LOWPAN_NHC_MOBILITY_MASK_0; +} + +LOWPAN_NHC(nhc_mobility, "RFC6282 Mobility", NEXTHDR_MOBILITY, 0, + mobility_nhid_setup, LOWPAN_NHC_MOBILITY_IDLEN, NULL, NULL); + +module_lowpan_nhc(nhc_mobility); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Mobility compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_routing.c b/net/6lowpan/nhc_routing.c new file mode 100644 index 000000000000..c393280f11c4 --- /dev/null +++ b/net/6lowpan/nhc_routing.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN IPv6 Routing Header compression according to RFC6282 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "nhc.h" + +#define LOWPAN_NHC_ROUTING_IDLEN 1 +#define LOWPAN_NHC_ROUTING_ID_0 0xe2 +#define LOWPAN_NHC_ROUTING_MASK_0 0xfe + +static void routing_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_ROUTING_ID_0; + nhc->idmask[0] = LOWPAN_NHC_ROUTING_MASK_0; +} + +LOWPAN_NHC(nhc_routing, "RFC6282 Routing", NEXTHDR_ROUTING, 0, + routing_nhid_setup, LOWPAN_NHC_ROUTING_IDLEN, NULL, NULL); + +module_lowpan_nhc(nhc_routing); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Routing compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c new file mode 100644 index 000000000000..c6bcaeb428ae --- /dev/null +++ b/net/6lowpan/nhc_udp.c @@ -0,0 +1,157 @@ +/* + * 6LoWPAN IPv6 UDP compression according to RFC6282 + * + * + * Authors: + * Alexander Aring <aar@pengutronix.de> + * + * Originally written by: + * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> + * Jon Smirl <jonsmirl@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include "nhc.h" + +#define LOWPAN_NHC_UDP_IDLEN 1 + +static int udp_uncompress(struct sk_buff *skb, size_t needed) +{ + u8 tmp = 0, val = 0; + struct udphdr uh; + bool fail; + int err; + + fail = lowpan_fetch_skb(skb, &tmp, sizeof(tmp)); + + pr_debug("UDP header uncompression\n"); + switch (tmp & LOWPAN_NHC_UDP_CS_P_11) { + case LOWPAN_NHC_UDP_CS_P_00: + fail |= lowpan_fetch_skb(skb, &uh.source, sizeof(uh.source)); + fail |= lowpan_fetch_skb(skb, &uh.dest, sizeof(uh.dest)); + break; + case LOWPAN_NHC_UDP_CS_P_01: + fail |= lowpan_fetch_skb(skb, &uh.source, sizeof(uh.source)); + fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); + uh.dest = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); + break; + case LOWPAN_NHC_UDP_CS_P_10: + fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); + uh.source = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); + fail |= lowpan_fetch_skb(skb, &uh.dest, sizeof(uh.dest)); + break; + case LOWPAN_NHC_UDP_CS_P_11: + fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); + uh.source = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val >> 4)); + uh.dest = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val & 0x0f)); + break; + default: + BUG(); + } + + pr_debug("uncompressed UDP ports: src = %d, dst = %d\n", + ntohs(uh.source), ntohs(uh.dest)); + + /* checksum */ + if (tmp & LOWPAN_NHC_UDP_CS_C) { + pr_debug_ratelimited("checksum elided currently not supported\n"); + fail = true; + } else { + fail |= lowpan_fetch_skb(skb, &uh.check, sizeof(uh.check)); + } + + if (fail) + return -EINVAL; + + /* UDP length needs to be infered from the lower layers + * here, we obtain the hint from the remaining size of the + * frame + */ + uh.len = htons(skb->len + sizeof(struct udphdr)); + pr_debug("uncompressed UDP length: src = %d", ntohs(uh.len)); + + /* replace the compressed UDP head by the uncompressed UDP + * header + */ + err = skb_cow(skb, needed); + if (unlikely(err)) + return err; + + skb_push(skb, sizeof(struct udphdr)); + skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); + + return 
0; +} + +static int udp_compress(struct sk_buff *skb, u8 **hc_ptr) +{ + const struct udphdr *uh = udp_hdr(skb); + u8 tmp; + + if (((ntohs(uh->source) & LOWPAN_NHC_UDP_4BIT_MASK) == + LOWPAN_NHC_UDP_4BIT_PORT) && + ((ntohs(uh->dest) & LOWPAN_NHC_UDP_4BIT_MASK) == + LOWPAN_NHC_UDP_4BIT_PORT)) { + pr_debug("UDP header: both ports compression to 4 bits\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_11; + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + /* source and destination port */ + tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_4BIT_PORT + + ((ntohs(uh->source) - LOWPAN_NHC_UDP_4BIT_PORT) << 4); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + } else if ((ntohs(uh->dest) & LOWPAN_NHC_UDP_8BIT_MASK) == + LOWPAN_NHC_UDP_8BIT_PORT) { + pr_debug("UDP header: remove 8 bits of dest\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_01; + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + /* source port */ + lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source)); + /* destination port */ + tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_8BIT_PORT; + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + } else if ((ntohs(uh->source) & LOWPAN_NHC_UDP_8BIT_MASK) == + LOWPAN_NHC_UDP_8BIT_PORT) { + pr_debug("UDP header: remove 8 bits of source\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_10; + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + /* source port */ + tmp = ntohs(uh->source) - LOWPAN_NHC_UDP_8BIT_PORT; + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + /* destination port */ + lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest)); + } else { + pr_debug("UDP header: can't compress\n"); + /* compression value */ + tmp = LOWPAN_NHC_UDP_CS_P_00; + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); + /* source port */ + lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source)); + /* destination port */ + lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest)); + } + + /* checksum is always inline */ + lowpan_push_hc_data(hc_ptr, &uh->check, 
sizeof(uh->check)); + + return 0; +} + +static void udp_nhid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_NHC_UDP_ID; + nhc->idmask[0] = LOWPAN_NHC_UDP_MASK; +} + +LOWPAN_NHC(nhc_udp, "RFC6282 UDP", NEXTHDR_UDP, sizeof(struct udphdr), + udp_nhid_setup, LOWPAN_NHC_UDP_IDLEN, udp_uncompress, udp_compress); + +module_lowpan_nhc(nhc_udp); +MODULE_DESCRIPTION("6LoWPAN next header RFC6282 UDP compression"); +MODULE_LICENSE("GPL"); diff --git a/net/802/fc.c b/net/802/fc.c index 7c174b6750cd..7b9219022418 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -75,29 +75,8 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev, return -hdr_len; } -/* - * A neighbour discovery of some species (eg arp) has completed. We - * can now send the packet. - */ - -static int fc_rebuild_header(struct sk_buff *skb) -{ -#ifdef CONFIG_INET - struct fch_hdr *fch=(struct fch_hdr *)skb->data; - struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); - if(fcllc->ethertype != htons(ETH_P_IP)) { - printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype)); - return 0; - } - return arp_find(fch->daddr, skb); -#else - return 0; -#endif -} - static const struct header_ops fc_header_ops = { .create = fc_header, - .rebuild = fc_rebuild_header, }; static void fc_setup(struct net_device *dev) diff --git a/net/802/fddi.c b/net/802/fddi.c index 59e7346f1193..7d3a0af954e8 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -87,31 +87,6 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev, return -hl; } - -/* - * Rebuild the FDDI MAC header. This is called after an ARP - * (or in future other address resolution) has completed on - * this sk_buff. We now let ARP fill in the other fields. 
- */ - -static int fddi_rebuild_header(struct sk_buff *skb) -{ - struct fddihdr *fddi = (struct fddihdr *)skb->data; - -#ifdef CONFIG_INET - if (fddi->hdr.llc_snap.ethertype == htons(ETH_P_IP)) - /* Try to get ARP to resolve the header and fill destination address */ - return arp_find(fddi->daddr, skb); - else -#endif - { - printk("%s: Don't know how to resolve type %04X addresses.\n", - skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype)); - return 0; - } -} - - /* * Determine the packet's protocol ID and fill in skb fields. * This routine is called before an incoming packet is passed @@ -177,7 +152,6 @@ EXPORT_SYMBOL(fddi_change_mtu); static const struct header_ops fddi_header_ops = { .create = fddi_header, - .rebuild = fddi_rebuild_header, }; diff --git a/net/802/hippi.c b/net/802/hippi.c index 2e03f8259dd5..ade1a52cdcff 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -91,33 +91,6 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, /* - * Rebuild the HIPPI MAC header. This is called after an ARP has - * completed on this sk_buff. We now let ARP fill in the other fields. - */ - -static int hippi_rebuild_header(struct sk_buff *skb) -{ - struct hippi_hdr *hip = (struct hippi_hdr *)skb->data; - - /* - * Only IP is currently supported - */ - - if(hip->snap.ethertype != htons(ETH_P_IP)) - { - printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",skb->dev->name,ntohs(hip->snap.ethertype)); - return 0; - } - - /* - * We don't support dynamic ARP on HIPPI, but we use the ARP - * static ARP tables to hold the I-FIELDs. - */ - return arp_find(hip->le.daddr, skb); -} - - -/* * Determine the packet's protocol ID. 
*/ @@ -186,7 +159,6 @@ EXPORT_SYMBOL(hippi_neigh_setup_dev); static const struct header_ops hippi_header_ops = { .create = hippi_header, - .rebuild = hippi_rebuild_header, }; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 64c6bed4a3d3..d2cd9de4b724 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -413,7 +413,10 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan_transfer_features(dev, vlandev); break; - case NETDEV_DOWN: + case NETDEV_DOWN: { + struct net_device *tmp; + LIST_HEAD(close_list); + if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); @@ -425,15 +428,22 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) - dev_change_flags(vlandev, flgs & ~IFF_UP); + list_add(&vlandev->close_list, &close_list); + } + + dev_close_many(&close_list, false); + + list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { netif_stacked_transfer_operstate(dev, vlandev); + list_del_init(&vlandev->close_list); } + list_del(&close_list); break; - + } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. 
*/ vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; + flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; @@ -608,6 +618,92 @@ out: return err; } +static struct sk_buff **vlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff *p, **pp = NULL; + struct vlan_hdr *vhdr; + unsigned int hlen, off_vlan; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_vlan = skb_gro_offset(skb); + hlen = off_vlan + sizeof(*vhdr); + vhdr = skb_gro_header_fast(skb, off_vlan); + if (skb_gro_header_hard(skb, hlen)) { + vhdr = skb_gro_header_slow(skb, hlen, off_vlan); + if (unlikely(!vhdr)) + goto out; + } + + type = vhdr->h_vlan_encapsulated_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out_unlock; + + flush = 0; + + for (p = *head; p; p = p->next) { + struct vlan_hdr *vhdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); + if (compare_vlan_header(vhdr, vhdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, sizeof(*vhdr)); + skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int vlan_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); + __be16 type = vhdr->h_vlan_encapsulated_proto; + struct packet_offload *ptype; + int err = -ENOENT; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + + rcu_read_unlock(); + return err; +} + +static struct packet_offload vlan_packet_offloads[] __read_mostly = { + { + .type = cpu_to_be16(ETH_P_8021Q), + .priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, + { + .type = cpu_to_be16(ETH_P_8021AD), + 
.priority = 10, + .callbacks = { + .gro_receive = vlan_gro_receive, + .gro_complete = vlan_gro_complete, + }, + }, +}; + static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); @@ -635,6 +731,7 @@ static struct pernet_operations vlan_net_ops = { static int __init vlan_proto_init(void) { int err; + unsigned int i; pr_info("%s v%s\n", vlan_fullname, vlan_version); @@ -658,6 +755,9 @@ static int __init vlan_proto_init(void) if (err < 0) goto err5; + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_add_offload(&vlan_packet_offloads[i]); + vlan_ioctl_set(vlan_ioctl_handler); return 0; @@ -675,7 +775,13 @@ err0: static void __exit vlan_cleanup_module(void) { + unsigned int i; + vlan_ioctl_set(NULL); + + for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) + dev_remove_offload(&vlan_packet_offloads[i]); + vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 118956448cf6..01d7ba840df8 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -37,39 +37,6 @@ #include <linux/netpoll.h> /* - * Rebuild the Ethernet MAC header. This is called after an ARP - * (or in future other address resolution) has completed on this - * sk_buff. We now let ARP fill in the other fields. - * - * This routine CANNOT use cached dst->neigh! - * Really, it is used only when dst->neigh is wrong. - * - * TODO: This needs a checkup, I'm ignorant here. --BLG - */ -static int vlan_dev_rebuild_header(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); - - switch (veth->h_vlan_encapsulated_proto) { -#ifdef CONFIG_INET - case htons(ETH_P_IP): - - /* TODO: Confirm this will work with VLAN headers... 
*/ - return arp_find(veth->h_dest, skb); -#endif - default: - pr_debug("%s: unable to resolve type %X addresses\n", - dev->name, ntohs(veth->h_vlan_encapsulated_proto)); - - ether_addr_copy(veth->h_source, dev->dev_addr); - break; - } - - return 0; -} - -/* * Create the VLAN header for an arbitrary protocol layer * * saddr=NULL means use device source address @@ -534,7 +501,6 @@ static int vlan_dev_get_lock_subclass(struct net_device *dev) static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, - .rebuild = vlan_dev_rebuild_header, .parse = eth_header_parse, }; @@ -554,7 +520,6 @@ static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev static const struct header_ops vlan_passthru_header_ops = { .create = vlan_passthru_hard_header, - .rebuild = dev_rebuild_header, .parse = eth_header_parse, }; @@ -573,7 +538,6 @@ static int vlan_dev_init(struct net_device *dev) /* IFF_BROADCAST|IFF_MULTICAST; ??? */ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_MASTER | IFF_SLAVE); - dev->iflink = real_dev->ifindex; dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); @@ -589,6 +553,7 @@ static int vlan_dev_init(struct net_device *dev) if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. 
Q-in-Q configurations may not work correctly.\n"); + dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; @@ -767,6 +732,13 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) } #endif /* CONFIG_NET_POLL_CONTROLLER */ +static int vlan_dev_get_iflink(const struct net_device *dev) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + return real_dev->ifindex; +} + static const struct ethtool_ops vlan_ethtool_ops = { .get_settings = vlan_ethtool_get_settings, .get_drvinfo = vlan_ethtool_get_drvinfo, @@ -803,6 +775,7 @@ static const struct net_device_ops vlan_netdev_ops = { #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, + .ndo_get_iflink = vlan_dev_get_iflink, }; static void vlan_dev_free(struct net_device *dev) @@ -827,5 +800,5 @@ void vlan_setup(struct net_device *dev) dev->destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; - memset(dev->broadcast, 0, ETH_ALEN); + eth_zero_addr(dev->broadcast); } diff --git a/net/9p/client.c b/net/9p/client.c index e86a9bea1d16..498454b3c06c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -34,6 +34,7 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/uaccess.h> +#include <linux/uio.h> #include <net/9p/9p.h> #include <linux/parser.h> #include <net/9p/client.h> @@ -555,7 +556,7 @@ out_err: */ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, - char *uidata, int in_hdrlen, int kern_buf) + struct iov_iter *uidata, int in_hdrlen) { int err; int ecode; @@ -591,16 +592,11 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, ename = &req->rc->sdata[req->rc->offset]; if (len > inline_len) { /* We have error in external buffer */ - if (kern_buf) { - memcpy(ename + inline_len, uidata, - len - inline_len); - } else { - err = copy_from_user(ename + inline_len, - uidata, len - inline_len); - if (err) { 
- err = -EFAULT; - goto out_err; - } + err = copy_from_iter(ename + inline_len, + len - inline_len, uidata); + if (err != len - inline_len) { + err = -EFAULT; + goto out_err; } } ename = NULL; @@ -806,8 +802,8 @@ reterr: * p9_client_zc_rpc - issue a request and wait for a response * @c: client session * @type: type of request - * @uidata: user bffer that should be ued for zero copy read - * @uodata: user buffer that shoud be user for zero copy write + * @uidata: destination for zero copy read + * @uodata: source for zero copy write * @inlen: read buffer size * @olen: write buffer size * @hdrlen: reader header size, This is the size of response protocol data @@ -816,9 +812,10 @@ reterr: * Returns request structure (which client must free using p9_free_req) */ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, - char *uidata, char *uodata, + struct iov_iter *uidata, + struct iov_iter *uodata, int inlen, int olen, int in_hdrlen, - int kern_buf, const char *fmt, ...) + const char *fmt, ...) 
{ va_list ap; int sigpending, err; @@ -841,16 +838,13 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, } else sigpending = 0; - /* If we are called with KERNEL_DS force kern_buf */ - if (segment_eq(get_fs(), KERNEL_DS)) - kern_buf = 1; - err = c->trans_mod->zc_request(c, req, uidata, uodata, - inlen, olen, in_hdrlen, kern_buf); + inlen, olen, in_hdrlen); if (err < 0) { if (err == -EIO) c->status = Disconnected; - goto reterr; + if (err != -ERESTARTSYS) + goto reterr; } if (req->status == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); @@ -876,7 +870,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (err < 0) goto reterr; - err = p9_check_zc_errors(c, req, uidata, in_hdrlen, kern_buf); + err = p9_check_zc_errors(c, req, uidata, in_hdrlen); trace_9p_client_res(c, type, req->rc->tag, err); if (!err) return req; @@ -1123,6 +1117,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, fid = NULL; goto error; } + fid->uid = n_uname; req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid, afid ? 
afid->fid : P9_NOFID, uname, aname, n_uname); @@ -1541,142 +1536,137 @@ error: EXPORT_SYMBOL(p9_client_unlinkat); int -p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, - u32 count) +p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) { - char *dataptr; - int kernel_buf = 0; + struct p9_client *clnt = fid->clnt; struct p9_req_t *req; - struct p9_client *clnt; - int err, rsize, non_zc = 0; - + int total = 0; p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", - fid->fid, (unsigned long long) offset, count); - err = 0; - clnt = fid->clnt; - - rsize = fid->iounit; - if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) - rsize = clnt->msize - P9_IOHDRSZ; - - if (count < rsize) - rsize = count; - - /* Don't bother zerocopy for small IO (< 1024) */ - if (clnt->trans_mod->zc_request && rsize > 1024) { - char *indata; - if (data) { - kernel_buf = 1; - indata = data; - } else - indata = (__force char *)udata; - /* - * response header len is 11 - * PDU Header(7) + IO Size (4) - */ - req = p9_client_zc_rpc(clnt, P9_TREAD, indata, NULL, rsize, 0, - 11, kernel_buf, "dqd", fid->fid, - offset, rsize); - } else { - non_zc = 1; - req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, - rsize); - } - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto error; - } + fid->fid, (unsigned long long) offset, (int)iov_iter_count(to)); + + while (iov_iter_count(to)) { + int count = iov_iter_count(to); + int rsize, non_zc = 0; + char *dataptr; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + /* + * response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, + 0, 11, "dqd", fid->fid, + offset, rsize); + } else { + non_zc = 1; + req = p9_client_rpc(clnt, P9_TREAD, "dqd", 
fid->fid, offset, + rsize); + } + if (IS_ERR(req)) { + *err = PTR_ERR(req); + break; + } - err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); - if (err) { - trace_9p_protocol_dump(clnt, req->rc); - goto free_and_error; - } + *err = p9pdu_readf(req->rc, clnt->proto_version, + "D", &count, &dataptr); + if (*err) { + trace_9p_protocol_dump(clnt, req->rc); + p9_free_req(clnt, req); + break; + } + if (rsize < count) { + pr_err("bogus RREAD count (%d > %d)\n", count, rsize); + count = rsize; + } - p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + if (!count) { + p9_free_req(clnt, req); + break; + } - if (non_zc) { - if (data) { - memmove(data, dataptr, count); - } else { - err = copy_to_user(udata, dataptr, count); - if (err) { - err = -EFAULT; - goto free_and_error; + if (non_zc) { + int n = copy_to_iter(dataptr, count, to); + total += n; + offset += n; + if (n != count) { + *err = -EFAULT; + p9_free_req(clnt, req); + break; } + } else { + iov_iter_advance(to, count); + total += count; + offset += count; } + p9_free_req(clnt, req); } - p9_free_req(clnt, req); - return count; - -free_and_error: - p9_free_req(clnt, req); -error: - return err; + return total; } EXPORT_SYMBOL(p9_client_read); int -p9_client_write(struct p9_fid *fid, char *data, const char __user *udata, - u64 offset, u32 count) +p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) { - int err, rsize; - int kernel_buf = 0; - struct p9_client *clnt; + struct p9_client *clnt = fid->clnt; struct p9_req_t *req; + int total = 0; + + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", + fid->fid, (unsigned long long) offset, + iov_iter_count(from)); + + while (iov_iter_count(from)) { + int count = iov_iter_count(from); + int rsize = fid->iounit; + if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother 
zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, + rsize, P9_ZC_HDR_SZ, "dqd", + fid->fid, offset, rsize); + } else { + req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, + offset, rsize, from); + } + if (IS_ERR(req)) { + *err = PTR_ERR(req); + break; + } - p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d\n", - fid->fid, (unsigned long long) offset, count); - err = 0; - clnt = fid->clnt; - - rsize = fid->iounit; - if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) - rsize = clnt->msize - P9_IOHDRSZ; + *err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); + if (*err) { + trace_9p_protocol_dump(clnt, req->rc); + p9_free_req(clnt, req); + break; + } + if (rsize < count) { + pr_err("bogus RWRITE count (%d > %d)\n", count, rsize); + count = rsize; + } - if (count < rsize) - rsize = count; + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); - /* Don't bother zerocopy for small IO (< 1024) */ - if (clnt->trans_mod->zc_request && rsize > 1024) { - char *odata; - if (data) { - kernel_buf = 1; - odata = data; - } else - odata = (char *)udata; - req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, odata, 0, rsize, - P9_ZC_HDR_SZ, kernel_buf, "dqd", - fid->fid, offset, rsize); - } else { - if (data) - req = p9_client_rpc(clnt, P9_TWRITE, "dqD", fid->fid, - offset, rsize, data); - else - req = p9_client_rpc(clnt, P9_TWRITE, "dqU", fid->fid, - offset, rsize, udata); - } - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto error; - } - - err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); - if (err) { - trace_9p_protocol_dump(clnt, req->rc); - goto free_and_error; + p9_free_req(clnt, req); + iov_iter_advance(from, count); + total += count; + offset += count; } - - p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); - - p9_free_req(clnt, req); - return count; - -free_and_error: - p9_free_req(clnt, req); -error: - return err; + return total; } 
EXPORT_SYMBOL(p9_client_write); @@ -2068,6 +2058,10 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) struct p9_client *clnt; struct p9_req_t *req; char *dataptr; + struct kvec kv = {.iov_base = data, .iov_len = count}; + struct iov_iter to; + + iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, (unsigned long long) offset, count); @@ -2088,8 +2082,8 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) * response header len is 11 * PDU Header(7) + IO Size (4) */ - req = p9_client_zc_rpc(clnt, P9_TREADDIR, data, NULL, rsize, 0, - 11, 1, "dqd", fid->fid, offset, rsize); + req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0, + 11, "dqd", fid->fid, offset, rsize); } else { non_zc = 1; req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, diff --git a/net/9p/protocol.c b/net/9p/protocol.c index ab9127ec5b7a..16d287565987 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -33,6 +33,7 @@ #include <linux/sched.h> #include <linux/stddef.h> #include <linux/types.h> +#include <linux/uio.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "protocol.h" @@ -69,10 +70,11 @@ static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size) } static size_t -pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size) +pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size) { size_t len = min(pdu->capacity - pdu->size, size); - if (copy_from_user(&pdu->sdata[pdu->size], udata, len)) + struct iov_iter i = *from; + if (copy_from_iter(&pdu->sdata[pdu->size], len, &i) != len) len = 0; pdu->size += len; @@ -273,7 +275,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, } break; case 'R':{ - int16_t *nwqid = va_arg(ap, int16_t *); + uint16_t *nwqid = va_arg(ap, uint16_t *); struct p9_qid **wqids = va_arg(ap, struct p9_qid **); @@ -437,23 +439,13 @@ 
p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, stbuf->extension, stbuf->n_uid, stbuf->n_gid, stbuf->n_muid); } break; - case 'D':{ + case 'V':{ uint32_t count = va_arg(ap, uint32_t); - const void *data = va_arg(ap, const void *); - - errcode = p9pdu_writef(pdu, proto_version, "d", - count); - if (!errcode && pdu_write(pdu, data, count)) - errcode = -EFAULT; - } - break; - case 'U':{ - int32_t count = va_arg(ap, int32_t); - const char __user *udata = - va_arg(ap, const void __user *); + struct iov_iter *from = + va_arg(ap, struct iov_iter *); errcode = p9pdu_writef(pdu, proto_version, "d", count); - if (!errcode && pdu_write_u(pdu, udata, count)) + if (!errcode && pdu_write_u(pdu, from, count)) errcode = -EFAULT; } break; @@ -479,7 +471,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, } break; case 'R':{ - int16_t nwqid = va_arg(ap, int); + uint16_t nwqid = va_arg(ap, int); struct p9_qid *wqids = va_arg(ap, struct p9_qid *); diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index 2ee3879161b1..38aa6345bdfa 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -12,12 +12,8 @@ * */ -#include <linux/slab.h> +#include <linux/mm.h> #include <linux/module.h> -#include <net/9p/9p.h> -#include <net/9p/client.h> -#include <linux/scatterlist.h> -#include "trans_common.h" /** * p9_release_req_pages - Release pages after the transaction. @@ -31,39 +27,3 @@ void p9_release_pages(struct page **pages, int nr_pages) put_page(pages[i]); } EXPORT_SYMBOL(p9_release_pages); - -/** - * p9_nr_pages - Return number of pages needed to accommodate the payload. 
- */ -int p9_nr_pages(char *data, int len) -{ - unsigned long start_page, end_page; - start_page = (unsigned long)data >> PAGE_SHIFT; - end_page = ((unsigned long)data + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - return end_page - start_page; -} -EXPORT_SYMBOL(p9_nr_pages); - -/** - * payload_gup - Translates user buffer into kernel pages and - * pins them either for read/write through get_user_pages_fast(). - * @req: Request to be sent to server. - * @pdata_off: data offset into the first page after translation (gup). - * @pdata_len: Total length of the IO. gup may not return requested # of pages. - * @nr_pages: number of pages to accommodate the payload - * @rw: Indicates if the pages are for read or write. - */ - -int p9_payload_gup(char *data, int *nr_pages, struct page **pages, int write) -{ - int nr_mapped_pages; - - nr_mapped_pages = get_user_pages_fast((unsigned long)data, - *nr_pages, write, pages); - if (nr_mapped_pages <= 0) - return nr_mapped_pages; - - *nr_pages = nr_mapped_pages; - return 0; -} -EXPORT_SYMBOL(p9_payload_gup); diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h index 173bb550a9eb..c43babb3f635 100644 --- a/net/9p/trans_common.h +++ b/net/9p/trans_common.h @@ -13,5 +13,3 @@ */ void p9_release_pages(struct page **, int); -int p9_payload_gup(char *, int *, struct page **, int); -int p9_nr_pages(char *, int); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 80d08f6664cb..bced8c074c12 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -734,6 +734,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) opts->port = P9_PORT; opts->rfd = ~0; opts->wfd = ~0; + opts->privport = 0; if (!params) return 0; @@ -940,7 +941,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) sin_server.sin_family = AF_INET; sin_server.sin_addr.s_addr = in_aton(addr); sin_server.sin_port = htons(opts.port); - err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_INET, + err = __sock_create(current->nsproxy->net_ns, 
PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket, 1); if (err) { pr_err("%s (%d): problem creating socket\n", @@ -988,7 +989,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) sun_server.sun_family = PF_UNIX; strcpy(sun_server.sun_path, addr); - err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_UNIX, + err = __sock_create(current->nsproxy->net_ns, PF_UNIX, SOCK_STREAM, 0, &csocket, 1); if (err < 0) { pr_err("%s (%d): problem creating socket\n", @@ -1013,7 +1014,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) { int err; struct p9_fd_opts opts; - struct p9_trans_fd *p; parse_opts(args, &opts); @@ -1026,7 +1026,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) if (err < 0) return err; - p = (struct p9_trans_fd *) client->trans; p9_conn_create(client); return 0; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 14ad43b5cf89..37a78d20c0f6 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -139,6 +139,7 @@ struct p9_rdma_opts { int sq_depth; int rq_depth; long timeout; + int privport; }; /* @@ -146,7 +147,10 @@ struct p9_rdma_opts { */ enum { /* Options that take integer arguments */ - Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err, + Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, + /* Options that take no argument */ + Opt_privport, + Opt_err, }; static match_table_t tokens = { @@ -154,6 +158,7 @@ static match_table_t tokens = { {Opt_sq_depth, "sq=%u"}, {Opt_rq_depth, "rq=%u"}, {Opt_timeout, "timeout=%u"}, + {Opt_privport, "privport"}, {Opt_err, NULL}, }; @@ -175,6 +180,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) opts->sq_depth = P9_RDMA_SQ_DEPTH; opts->rq_depth = P9_RDMA_RQ_DEPTH; opts->timeout = P9_RDMA_TIMEOUT; + opts->privport = 0; if (!params) return 0; @@ -193,13 +199,13 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) if (!*p) continue; token = match_token(p, tokens, args); - if (token == Opt_err) - continue; 
- r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - continue; + if ((token != Opt_err) && (token != Opt_privport)) { + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + continue; + } } switch (token) { case Opt_port: @@ -214,6 +220,9 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) case Opt_timeout: opts->timeout = option; break; + case Opt_privport: + opts->privport = 1; + break; default: continue; } @@ -607,6 +616,23 @@ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) return 0; } +static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) +{ + struct sockaddr_in cl = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + int port, err = -EINVAL; + + for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { + cl.sin_port = htons((ushort)port); + err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); + if (err != -EADDRINUSE) + break; + } + return err; +} + /** * trans_create_rdma - Transport method for creating atransport instance * @client: client instance @@ -622,6 +648,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; + struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); @@ -642,6 +669,16 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) /* Associate the client with the transport */ client->trans = rdma; + /* Bind to a privileged port if we need to */ + if (opts.privport) { + err = p9_rdma_bind_privport(rdma); + if (err < 0) { + pr_err("%s (%d): problem binding to privport: %d\n", + __func__, task_pid_nr(current), -err); + goto error; + } + } + /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; 
rdma->addr.sin_addr.s_addr = in_aton(addr); @@ -669,9 +706,10 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) goto error; /* Create the Completion Queue */ + cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, cq_event_handler, client, - opts.sq_depth + opts.rq_depth + 1, 0); + &cq_attr); if (IS_ERR(rdma->cq)) goto error; ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 36a1a739ad68..6e70ddb158b4 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -217,15 +217,15 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) * @start: which segment of the sg_list to start at * @pdata: a list of pages to add into sg. * @nr_pages: number of pages to pack into the scatter/gather list - * @data: data to pack into scatter/gather list + * @offs: amount of data in the beginning of first page _not_ to pack * @count: amount of data to pack into the scatter/gather list */ static int pack_sg_list_p(struct scatterlist *sg, int start, int limit, - struct page **pdata, int nr_pages, char *data, int count) + struct page **pdata, int nr_pages, size_t offs, int count) { int i = 0, s; - int data_off; + int data_off = offs; int index = start; BUG_ON(nr_pages > (limit - start)); @@ -233,16 +233,14 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, * if the first page doesn't start at * page boundary find the offset */ - data_off = offset_in_page(data); while (nr_pages) { - s = rest_of_page(data); + s = PAGE_SIZE - data_off; if (s > count) s = count; /* Make sure we don't terminate early. 
*/ sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); data_off = 0; - data += s; count -= s; nr_pages--; } @@ -314,11 +312,20 @@ req_retry: } static int p9_get_mapped_pages(struct virtio_chan *chan, - struct page **pages, char *data, - int nr_pages, int write, int kern_buf) + struct page ***pages, + struct iov_iter *data, + int count, + size_t *offs, + int *need_drop) { + int nr_pages; int err; - if (!kern_buf) { + + if (!iov_iter_count(data)) + return 0; + + if (!(data->type & ITER_KVEC)) { + int n; /* * We allow only p9_max_pages pinned. We wait for the * Other zc request to finish here @@ -329,26 +336,49 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, if (err == -ERESTARTSYS) return err; } - err = p9_payload_gup(data, &nr_pages, pages, write); - if (err < 0) - return err; + n = iov_iter_get_pages_alloc(data, pages, count, offs); + if (n < 0) + return n; + *need_drop = 1; + nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); atomic_add(nr_pages, &vp_pinned); + return n; } else { /* kernel buffer, no need to pin pages */ - int s, index = 0; - int count = nr_pages; - while (nr_pages) { - s = rest_of_page(data); - if (is_vmalloc_addr(data)) - pages[index++] = vmalloc_to_page(data); + int index; + size_t len; + void *p; + + /* we'd already checked that it's non-empty */ + while (1) { + len = iov_iter_single_seg_count(data); + if (likely(len)) { + p = data->kvec->iov_base + data->iov_offset; + break; + } + iov_iter_advance(data, 0); + } + if (len > count) + len = count; + + nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - + (unsigned long)p / PAGE_SIZE; + + *pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!*pages) + return -ENOMEM; + + *need_drop = 0; + p -= (*offs = (unsigned long)p % PAGE_SIZE); + for (index = 0; index < nr_pages; index++) { + if (is_vmalloc_addr(p)) + (*pages)[index] = vmalloc_to_page(p); else - pages[index++] = kmap_to_page(data); - data += s; - nr_pages--; + (*pages)[index] = 
kmap_to_page(p); + p += PAGE_SIZE; } - nr_pages = count; + return len; } - return nr_pages; } /** @@ -364,8 +394,8 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, */ static int p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, - char *uidata, char *uodata, int inlen, - int outlen, int in_hdr_len, int kern_buf) + struct iov_iter *uidata, struct iov_iter *uodata, + int inlen, int outlen, int in_hdr_len) { int in, out, err, out_sgs, in_sgs; unsigned long flags; @@ -373,41 +403,32 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; struct scatterlist *sgs[4]; + size_t offs; + int need_drop = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); if (uodata) { - out_nr_pages = p9_nr_pages(uodata, outlen); - out_pages = kmalloc(sizeof(struct page *) * out_nr_pages, - GFP_NOFS); - if (!out_pages) { - err = -ENOMEM; - goto err_out; + int n = p9_get_mapped_pages(chan, &out_pages, uodata, + outlen, &offs, &need_drop); + if (n < 0) + return n; + out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); + if (n != outlen) { + __le32 v = cpu_to_le32(n); + memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + outlen = n; } - out_nr_pages = p9_get_mapped_pages(chan, out_pages, uodata, - out_nr_pages, 0, kern_buf); - if (out_nr_pages < 0) { - err = out_nr_pages; - kfree(out_pages); - out_pages = NULL; - goto err_out; - } - } - if (uidata) { - in_nr_pages = p9_nr_pages(uidata, inlen); - in_pages = kmalloc(sizeof(struct page *) * in_nr_pages, - GFP_NOFS); - if (!in_pages) { - err = -ENOMEM; - goto err_out; - } - in_nr_pages = p9_get_mapped_pages(chan, in_pages, uidata, - in_nr_pages, 1, kern_buf); - if (in_nr_pages < 0) { - err = in_nr_pages; - kfree(in_pages); - in_pages = NULL; - goto err_out; + } else if (uidata) { + int n = p9_get_mapped_pages(chan, &in_pages, uidata, + inlen, &offs, &need_drop); + if (n < 0) + return n; + in_nr_pages = DIV_ROUND_UP(n 
+ offs, PAGE_SIZE); + if (n != inlen) { + __le32 v = cpu_to_le32(n); + memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + inlen = n; } } req->status = REQ_STATUS_SENT; @@ -426,7 +447,7 @@ req_retry_pinned: if (out_pages) { sgs[out_sgs++] = chan->sg + out; out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, - out_pages, out_nr_pages, uodata, outlen); + out_pages, out_nr_pages, offs, outlen); } /* @@ -444,7 +465,7 @@ req_retry_pinned: if (in_pages) { sgs[out_sgs + in_sgs++] = chan->sg + out + in; in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, - in_pages, in_nr_pages, uidata, inlen); + in_pages, in_nr_pages, offs, inlen); } BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); @@ -478,7 +499,7 @@ req_retry_pinned: * Non kernel buffers are pinned, unpin them */ err_out: - if (!kern_buf) { + if (need_drop) { if (in_pages) { p9_release_pages(in_pages, in_nr_pages); atomic_sub(in_nr_pages, &vp_pinned); @@ -504,7 +525,10 @@ static ssize_t p9_mount_tag_show(struct device *dev, vdev = dev_to_virtio(dev); chan = vdev->priv; - return snprintf(buf, chan->tag_len + 1, "%s", chan->tag); + memcpy(buf, chan->tag, chan->tag_len); + buf[chan->tag_len] = 0; + + return chan->tag_len + 1; } static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); @@ -680,6 +704,7 @@ static void p9_virtio_remove(struct virtio_device *vdev) mutex_unlock(&virtio_9p_lock); + vdev->config->reset(vdev); vdev->config->del_vqs(vdev); sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); diff --git a/net/Kconfig b/net/Kconfig index 44dd5786ee91..57a7c5af3175 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES Newly written code should NEVER need this option but do compat-independent messages instead! 
+config NET_INGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/Makefile b/net/Makefile index 38704bdf941a..3995613e5510 100644 --- a/net/Makefile +++ b/net/Makefile @@ -69,7 +69,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ -obj-$(CONFIG_NET_MPLS_GSO) += mpls/ +obj-$(CONFIG_MPLS) += mpls/ obj-$(CONFIG_HSR) += hsr/ ifneq ($(CONFIG_NET_SWITCHDEV),) obj-y += switchdev/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index d1c55d8dd0a2..8ad3ec2610b6 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -141,7 +141,7 @@ static void __aarp_send_query(struct aarp_entry *a) eah->pa_src_net = sat->s_net; eah->pa_src_node = sat->s_node; - memset(eah->hw_dst, '\0', ETH_ALEN); + eth_zero_addr(eah->hw_dst); eah->pa_dst_zero = 0; eah->pa_dst_net = a->target_addr.s_net; @@ -189,7 +189,7 @@ static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us, eah->pa_src_node = us->s_node; if (!sha) - memset(eah->hw_dst, '\0', ETH_ALEN); + eth_zero_addr(eah->hw_dst); else ether_addr_copy(eah->hw_dst, sha); @@ -239,7 +239,7 @@ static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us) eah->pa_src_net = us->s_net; eah->pa_src_node = us->s_node; - memset(eah->hw_dst, '\0', ETH_ALEN); + eth_zero_addr(eah->hw_dst); eah->pa_dst_zero = 0; eah->pa_dst_net = us->s_net; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 0d0766ea5ab1..d5871ac493eb 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto); + sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) goto out; rc = 0; @@ -1559,8 +1559,7 @@ freeit: return 0; } -static int atalk_sendmsg(struct 
kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t len) +static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct atalk_sock *at = at_sk(sk); @@ -1728,8 +1727,8 @@ out: return err ? : len; } -static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, int flags) +static int atalk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct ddpehdr *ddp; diff --git a/net/atm/common.c b/net/atm/common.c index b84057e41bd6..49a872db7e42 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -141,7 +141,7 @@ static struct proto vcc_proto = { .release_cb = vcc_release_cb, }; -int vcc_create(struct net *net, struct socket *sock, int protocol, int family) +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern) { struct sock *sk; struct atm_vcc *vcc; @@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family) sock->sk = NULL; if (sock->type == SOCK_STREAM) return -EINVAL; - sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto); + sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); @@ -523,8 +523,8 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci) return 0; } -int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, int flags) +int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct atm_vcc *vcc; @@ -569,8 +569,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, return copied; } -int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t size) +int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) { struct sock *sk = sock->sk; DEFINE_WAIT(wait); diff --git a/net/atm/common.h b/net/atm/common.h index 
cc3c2dae4d79..959436b87182 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -10,13 +10,12 @@ #include <linux/poll.h> /* for poll_table */ -int vcc_create(struct net *net, struct socket *sock, int protocol, int family); +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern); int vcc_release(struct socket *sock); int vcc_connect(struct socket *sock, int itf, short vpi, int vci); -int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, int flags); -int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t total_len); +int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags); +int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/net/atm/lec.c b/net/atm/lec.c index 4b98f897044a..cd3b37989057 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -2001,7 +2001,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, if (entry == NULL) goto out; memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); - memset(entry->mac_addr, 0, ETH_ALEN); + eth_zero_addr(entry->mac_addr); entry->recv_vcc = vcc; entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; @@ -2086,7 +2086,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->vcc = vcc; entry->old_push = old_push; memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); - memset(entry->mac_addr, 0, ETH_ALEN); + eth_zero_addr(entry->mac_addr); entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 5bdd300db0f7..2df34eb5d65f 100644 --- 
a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -272,7 +272,7 @@ static int parse_qos(const char *buff) qos.rxtp.max_pcr = rx_pcr; qos.rxtp.max_sdu = rx_sdu; qos.aal = ATM_AAL5; - dprintk("parse_qos(): setting qos paramameters to tx=%d,%d rx=%d,%d\n", + dprintk("parse_qos(): setting qos parameters to tx=%d,%d rx=%d,%d\n", qos.txtp.max_pcr, qos.txtp.max_sdu, qos.rxtp.max_pcr, qos.rxtp.max_sdu); diff --git a/net/atm/pvc.c b/net/atm/pvc.c index ae0324021407..040207ec399f 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &pvc_proto_ops; - return vcc_create(net, sock, protocol, PF_ATMPVC); + return vcc_create(net, sock, protocol, PF_ATMPVC, kern); } static const struct net_proto_family pvc_family_ops = { diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 523bce72f698..4fd6af47383a 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -19,36 +19,15 @@ #include "resources.h" #include "signaling.h" -#undef WAIT_FOR_DEMON /* #define this if system calls on SVC sockets - should block until the demon runs. - Danger: may cause nasty hangs if the demon - crashes. 
*/ - struct atm_vcc *sigd = NULL; -#ifdef WAIT_FOR_DEMON -static DECLARE_WAIT_QUEUE_HEAD(sigd_sleep); -#endif static void sigd_put_skb(struct sk_buff *skb) { -#ifdef WAIT_FOR_DEMON - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&sigd_sleep, &wait); - while (!sigd) { - set_current_state(TASK_UNINTERRUPTIBLE); - pr_debug("atmsvc: waiting for signaling daemon...\n"); - schedule(); - } - current->state = TASK_RUNNING; - remove_wait_queue(&sigd_sleep, &wait); -#else if (!sigd) { pr_debug("atmsvc: no signaling daemon\n"); kfree_skb(skb); return; } -#endif atm_force_charge(sigd, skb->truesize); skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb); sk_atm(sigd)->sk_data_ready(sk_atm(sigd)); @@ -261,8 +240,5 @@ int sigd_attach(struct atm_vcc *vcc) vcc_insert_socket(sk_atm(vcc)); set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); -#ifdef WAIT_FOR_DEMON - wake_up(&sigd_sleep); -#endif return 0; } diff --git a/net/atm/svc.c b/net/atm/svc.c index 1ba23f5018e7..3fa0a9ee98d1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; - error = vcc_create(net, sock, protocol, AF_ATMSVC); + error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ca049a7c9287..ae3a47f9d1d5 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -40,7 +40,6 @@ #include <linux/notifier.h> #include <linux/proc_fs.h> #include <linux/stat.h> -#include <linux/netfilter.h> #include <linux/sysctl.h> #include <linux/init.h> #include <linux/spinlock.h> @@ -58,7 +57,7 @@ static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { - ax25_cb_put(ax25_sk(sk)); + ax25_cb_put(sk_to_ax25(sk)); } /* @@ -307,7 +306,7 @@ void ax25_destroy_socket(ax25_cb *ax25) while ((skb = 
skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) { if (skb->sk != ax25->sk) { /* A pending connection */ - ax25_cb *sax25 = ax25_sk(skb->sk); + ax25_cb *sax25 = sk_to_ax25(skb->sk); /* Queue the unaccepted socket for death */ sock_orphan(skb->sk); @@ -552,7 +551,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); switch (optname) { case AX25_WINDOW: @@ -698,7 +697,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname, length = min_t(unsigned int, maxlen, sizeof(int)); lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); switch (optname) { case AX25_WINDOW: @@ -797,7 +796,7 @@ out: static struct proto ax25_proto = { .name = "AX25", .owner = THIS_MODULE, - .obj_size = sizeof(struct sock), + .obj_size = sizeof(struct ax25_sock), }; static int ax25_create(struct net *net, struct socket *sock, int protocol, @@ -855,11 +854,11 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto); + sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern); if (sk == NULL) return -ENOMEM; - ax25 = sk->sk_protinfo = ax25_create_cb(); + ax25 = ax25_sk(sk)->cb = ax25_create_cb(); if (!ax25) { sk_free(sk); return -ENOMEM; @@ -881,7 +880,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25, *oax25; - sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; @@ -911,7 +910,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); - oax25 = ax25_sk(osk); + oax25 = sk_to_ax25(osk); ax25->modulus = oax25->modulus; ax25->backoff = oax25->backoff; @@ -939,7 +938,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev 
*ax25_dev) } } - sk->sk_protinfo = ax25; + ax25_sk(sk)->cb = ax25; sk->sk_destruct = ax25_free_sock; ax25->sk = sk; @@ -957,7 +956,7 @@ static int ax25_release(struct socket *sock) sock_hold(sk); sock_orphan(sk); lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { @@ -1067,7 +1066,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (!sock_flag(sk, SOCK_ZAPPED)) { err = -EINVAL; goto out; @@ -1114,7 +1113,7 @@ static int __must_check ax25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; - ax25_cb *ax25 = ax25_sk(sk), *ax25t; + ax25_cb *ax25 = sk_to_ax25(sk), *ax25t; struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; ax25_digi *digi = NULL; int ct = 0, err = 0; @@ -1395,7 +1394,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, memset(fsa, 0, sizeof(*fsa)); lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) { @@ -1432,8 +1431,7 @@ out: return err; } -static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name); struct sock *sk = sock->sk; @@ -1448,7 +1446,7 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, return -EINVAL; lock_sock(sk); - ax25 = ax25_sk(sk); + ax25 = sk_to_ax25(sk); if (sock_flag(sk, SOCK_ZAPPED)) { err = -EADDRNOTAVAIL; @@ -1599,8 +1597,8 @@ out: return err; } -static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; @@ -1623,7 +1621,7 
@@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb == NULL) goto out; - if (!ax25_sk(sk)->pidincl) + if (!sk_to_ax25(sk)->pidincl) skb_pull(skb, 1); /* Remove PID */ skb_reset_transport_header(skb); @@ -1764,7 +1762,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCAX25GETINFO: case SIOCAX25GETINFOOLD: { - ax25_cb *ax25 = ax25_sk(sk); + ax25_cb *ax25 = sk_to_ax25(sk); struct ax25_info_struct ax25_info; ax25_info.t1 = ax25->t1 / HZ; diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 7ed8ab724819..bb5a0e4e98d9 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -23,7 +23,6 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/netfilter.h> #include <net/sock.h> #include <net/tcp_states.h> #include <asm/uaccess.h> @@ -354,7 +353,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } - ax25 = ax25_sk(make); + ax25 = sk_to_ax25(make); skb_set_owner_r(skb, make); skb_queue_head(&sk->sk_receive_queue, skb); diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 67de6b33f2c3..b563a3f5f2a8 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -31,7 +31,6 @@ #include <linux/notifier.h> #include <linux/proc_fs.h> #include <linux/stat.h> -#include <linux/netfilter.h> #include <linux/sysctl.h> #include <net/ip.h> #include <net/arp.h> @@ -46,9 +45,9 @@ #ifdef CONFIG_INET -int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned int len) +static int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { unsigned char *buff; @@ -100,7 +99,7 @@ int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, return -AX25_HEADER_LEN; /* Unfinished header */ } -int ax25_rebuild_header(struct sk_buff *skb) +netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) 
{ struct sk_buff *ourskb; unsigned char *bp = skb->data; @@ -115,9 +114,6 @@ int ax25_rebuild_header(struct sk_buff *skb) dst = (ax25_address *)(bp + 1); src = (ax25_address *)(bp + 8); - if (arp_find(bp + 1, skb)) - return 1; - route = ax25_get_route(dst, NULL); if (route) { digipeat = route->digipeat; @@ -129,6 +125,7 @@ int ax25_rebuild_header(struct sk_buff *skb) dev = skb->dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { + kfree_skb(skb); goto put; } @@ -212,31 +209,29 @@ put: if (route) ax25_put_route(route); - return 1; + return NETDEV_TX_OK; } #else /* INET */ -int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned int len) +static int ax25_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { return -AX25_HEADER_LEN; } -int ax25_rebuild_header(struct sk_buff *skb) +netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) { - return 1; + kfree_skb(skb); + return NETDEV_TX_OK; } - #endif const struct header_ops ax25_header_ops = { .create = ax25_hard_header, - .rebuild = ax25_rebuild_header, }; -EXPORT_SYMBOL(ax25_hard_header); -EXPORT_SYMBOL(ax25_rebuild_header); EXPORT_SYMBOL(ax25_header_ops); +EXPORT_SYMBOL(ax25_ip_xmit); diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index be2acab9be9d..8ddd41baa81c 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -24,7 +24,6 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> -#include <linux/netfilter.h> #include <net/sock.h> #include <asm/uaccess.h> #include <linux/fcntl.h> diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 1997538a5d23..3b78e8473a01 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -264,6 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); + ax25_stop_heartbeat(ax25); ax25_stop_t1timer(ax25); ax25_stop_t2timer(ax25); 
ax25_stop_t3timer(ax25); diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index 71c4badbc807..4ad2fb7bcd35 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -34,7 +34,6 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> -#include <linux/netfilter.h> #include <linux/sysctl.h> #include <linux/export.h> #include <net/ip.h> diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index eb7d8c0388e4..21434ab79d2c 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # -# Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # @@ -20,7 +20,7 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv.o batman-adv-y += bat_iv_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o -batman-adv-y += debugfs.o +batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o batman-adv-y += fragmentation.o batman-adv-y += gateway_client.o @@ -29,6 +29,7 @@ batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-y += icmp_socket.o batman-adv-y += main.o +batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o @@ -36,4 +37,3 @@ batman-adv-y += send.o batman-adv-y += soft-interface.o batman-adv-y += sysfs.o batman-adv-y += translation-table.o -batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 4e49666f8c65..4e59cf3eb079 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 00e00e09b000..753383c2215c 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,20 +15,50 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ +#include "bat_algo.h" #include "main.h" -#include "translation-table.h" + +#include <linux/atomic.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/bug.h> +#include <linux/byteorder/generic.h> +#include <linux/cache.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "bitarray.h" +#include "hard-interface.h" +#include "hash.h" +#include "network-coding.h" #include "originator.h" +#include "packet.h" #include "routing.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "hard-interface.h" #include "send.h" -#include "bat_algo.h" -#include "network-coding.h" +#include "translation-table.h" /** * enum batadv_dup_status - duplicate status - * @BATADV_NO_DUP: the packet is a duplicate + * @BATADV_NO_DUP: the packet is no duplicate * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the * neighbor) * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor @@ -55,7 +85,7 @@ static void batadv_ring_buffer_set(uint8_t 
lq_recv[], uint8_t *lq_index, } /** - * batadv_ring_buffer_set - compute the average of all non-zero values stored + * batadv_ring_buffer_avg - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * @@ -64,7 +94,9 @@ static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) { const uint8_t *ptr; - uint16_t count = 0, i = 0, sum = 0; + uint16_t count = 0; + uint16_t i = 0; + uint16_t sum = 0; ptr = lq_recv; @@ -308,7 +340,6 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; uint32_t random_seqno; - int res = -ENOMEM; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); @@ -317,7 +348,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) - goto out; + return -ENOMEM; hard_iface->bat_iv.ogm_buff = ogm_buff; @@ -329,10 +360,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; - res = 0; - -out: - return res; + return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) @@ -396,8 +424,8 @@ static uint8_t batadv_hop_penalty(uint8_t tq, } /* is there another aggregated packet here? 
*/ -static int batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, - __be16 tvlv_len) +static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, + __be16 tvlv_len) { int next_buff_pos = 0; @@ -413,7 +441,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - char *fwd_str; + const char *fwd_str; uint8_t packet_num; int16_t buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; @@ -451,7 +479,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, - (batadv_ogm_packet->flags & BATADV_DIRECTLINK ? + ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ? "on" : "off"), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); @@ -548,58 +576,62 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet wont be bigger than * MAX_AGGREGATION_BYTES + * otherwise aggregation is not possible */ - if (time_before(send_time, forw_packet->send_time) && - time_after_eq(aggregation_end_time, forw_packet->send_time) && - (aggregated_bytes <= BATADV_MAX_AGGREGATION_BYTES)) { - /* check aggregation compatibility - * -> direct link packets are broadcasted on - * their interface only - * -> aggregate packet if the current packet is - * a "global" packet as well as the base - * packet - */ - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; - - /* packet is not leaving on the same interface. 
*/ - if (forw_packet->if_outgoing != if_outgoing) - goto out; + if (!time_before(send_time, forw_packet->send_time) || + !time_after_eq(aggregation_end_time, forw_packet->send_time)) + return false; + + if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) + return false; + + /* packet is not leaving on the same interface. */ + if (forw_packet->if_outgoing != if_outgoing) + return false; + + /* check aggregation compatibility + * -> direct link packets are broadcasted on + * their interface only + * -> aggregate packet if the current packet is + * a "global" packet as well as the base + * packet + */ + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return false; - /* packets without direct link flag and high TTL - * are flooded through the net - */ - if ((!directlink) && - (!(batadv_ogm_packet->flags & BATADV_DIRECTLINK)) && - (batadv_ogm_packet->ttl != 1) && - - /* own packets originating non-primary - * interfaces leave only that interface - */ - ((!forw_packet->own) || - (forw_packet->if_incoming == primary_if))) { - res = true; - goto out; - } + /* packets without direct link flag and high TTL + * are flooded through the net + */ + if (!directlink && + !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && + batadv_ogm_packet->ttl != 1 && + + /* own packets originating non-primary + * interfaces leave only that interface + */ + (!forw_packet->own || + forw_packet->if_incoming == primary_if)) { + res = true; + goto out; + } - /* if the incoming packet is sent via this one - * interface only - we still can aggregate - */ - if ((directlink) && - (new_bat_ogm_packet->ttl == 1) && - (forw_packet->if_incoming == if_incoming) && - - /* packets from direct neighbors or - * own secondary interface packets - * (= secondary interface packets in general) - */ - (batadv_ogm_packet->flags & BATADV_DIRECTLINK || - (forw_packet->own && - forw_packet->if_incoming != primary_if))) { - res = true; - goto out; - } + /* if the incoming packet is sent via 
this one + * interface only - we still can aggregate + */ + if (directlink && + new_bat_ogm_packet->ttl == 1 && + forw_packet->if_incoming == if_incoming && + + /* packets from direct neighbors or + * own secondary interface packets + * (= secondary interface packets in general) + */ + (batadv_ogm_packet->flags & BATADV_DIRECTLINK || + (forw_packet->own && + forw_packet->if_incoming != primary_if))) { + res = true; + goto out; } out: @@ -642,19 +674,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "batman packet queue full\n"); - goto out; + goto out_free_outgoing; } } forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC); - if (!forw_packet_aggr) { - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); - goto out; - } + if (!forw_packet_aggr) + goto out_nomem; - if ((atomic_read(&bat_priv->aggregated_ogms)) && - (packet_len < BATADV_MAX_AGGREGATION_BYTES)) + if (atomic_read(&bat_priv->aggregated_ogms) && + packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; else skb_size = packet_len; @@ -662,12 +691,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb_size += ETH_HLEN; forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); - if (!forw_packet_aggr->skb) { - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); - kfree(forw_packet_aggr); - goto out; - } + if (!forw_packet_aggr->skb) + goto out_free_forw_packet; forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); @@ -699,7 +724,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, send_time - jiffies); return; -out: +out_free_forw_packet: + kfree(forw_packet_aggr); +out_nomem: + if (!own_packet) + atomic_inc(&bat_priv->batman_queue_left); +out_free_outgoing: batadv_hardif_free_ref(if_outgoing); out_free_incoming: 
batadv_hardif_free_ref(if_incoming); @@ -752,13 +782,13 @@ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; - direct_link = batadv_ogm_packet->flags & BATADV_DIRECTLINK ? 1 : 0; + direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ - if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) { + if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, @@ -1034,9 +1064,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, batadv_orig_node_free_ref(orig_tmp); if (!neigh_node) goto unlock; - } else + } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); + } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); @@ -1081,7 +1112,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, * won't consider it either */ if (router_ifinfo && - (neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg)) { + neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { orig_node_tmp = router->orig_node; spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock); if_num = router->if_incoming->if_num; @@ -1356,8 +1387,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_free_ref(orig_node); - if (orig_ifinfo) - batadv_orig_ifinfo_free_ref(orig_ifinfo); + batadv_orig_ifinfo_free_ref(orig_ifinfo); return ret; } diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index e3da07a64026..cf68c328345e 100644 --- a/net/batman-adv/bitarray.c 
+++ b/net/batman-adv/bitarray.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -15,10 +15,10 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include "main.h" #include "bitarray.h" +#include "main.h" -#include <linux/bitops.h> +#include <linux/bitmap.h> /* shift the packet array by n places. */ static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n) diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 2acaafe60188..0c2456225fae 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ #define _NET_BATMAN_ADV_BITARRAY_H_ +#include "main.h" + +#include <linux/bitops.h> +#include <linux/compiler.h> +#include <linux/types.h> + /* Returns 1 if the corresponding bit in the given seq_bits indicates true * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ac4b96eccade..ba0609292ae7 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -15,19 +15,41 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" -#include "hash.h" -#include "hard-interface.h" -#include "originator.h" #include "bridge_loop_avoidance.h" -#include "translation-table.h" -#include "send.h" +#include "main.h" -#include <linux/etherdevice.h> +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/compiler.h> #include <linux/crc16.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> #include <linux/if_arp.h> -#include <net/arp.h> +#include <linux/if_ether.h> #include <linux/if_vlan.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/workqueue.h> +#include <net/arp.h> + +#include "hard-interface.h" +#include "hash.h" +#include "originator.h" +#include "packet.h" +#include "translation-table.h" static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; @@ -42,12 +64,8 @@ static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; uint32_t hash = 0; - hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); - hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&claim->addr, sizeof(claim->addr), hash); + hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } @@ -59,12 +77,8 @@ static inline uint32_t batadv_choose_backbone_gw(const void *data, const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; uint32_t hash = 0; - hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr)); - hash = batadv_hash_bytes(hash, &claim->vid, 
sizeof(claim->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&claim->addr, sizeof(claim->addr), hash); + hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 43c985d92c3e..0282690389ac 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich * @@ -18,6 +18,16 @@ #ifndef _NET_BATMAN_ADV_BLA_H_ #define _NET_BATMAN_ADV_BLA_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_hard_iface; +struct batadv_orig_node; +struct batadv_priv; +struct seq_file; +struct sk_buff; + #ifdef CONFIG_BATMAN_ADV_BLA int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, bool is_bcast); diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index a4972874c056..c4c1e8030ba0 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,21 +15,42 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ +#include "debugfs.h" #include "main.h" +#include <linux/compiler.h> #include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/fcntl.h> +#include <linux/fs.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/printk.h> +#include <linux/sched.h> /* for linux/wait.h */ +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stat.h> +#include <linux/stddef.h> +#include <linux/stringify.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/wait.h> +#include <stdarg.h> -#include "debugfs.h" -#include "translation-table.h" -#include "originator.h" -#include "hard-interface.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "soft-interface.h" -#include "icmp_socket.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" +#include "gateway_client.h" +#include "icmp_socket.h" #include "network-coding.h" +#include "originator.h" +#include "translation-table.h" static struct dentry *batadv_debugfs; @@ -482,11 +503,7 @@ rem_attr: debugfs_remove_recursive(hard_iface->debug_dir); hard_iface->debug_dir = NULL; out: -#ifdef CONFIG_DEBUG_FS return -ENOMEM; -#else - return 0; -#endif /* CONFIG_DEBUG_FS */ } /** @@ -541,11 +558,7 @@ rem_attr: debugfs_remove_recursive(bat_priv->debug_dir); bat_priv->debug_dir = NULL; out: -#ifdef CONFIG_DEBUG_FS return -ENOMEM; -#else - return 0; -#endif /* CONFIG_DEBUG_FS */ } void batadv_debugfs_del_meshif(struct net_device *dev) diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 37c4d6ddd04d..187acdc85dfa 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -18,8 +18,17 @@ #ifndef _NET_BATMAN_ADV_DEBUGFS_H_ #define _NET_BATMAN_ADV_DEBUGFS_H_ +#include "main.h" + +#include <linux/kconfig.h> + +struct batadv_hard_iface; +struct net_device; + #define BATADV_DEBUGFS_SUBDIR "batman_adv" +#if IS_ENABLED(CONFIG_DEBUG_FS) + void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); @@ -27,4 +36,36 @@ void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); +#else + +static inline void batadv_debugfs_init(void) +{ +} + +static inline void batadv_debugfs_destroy(void) +{ +} + +static inline int batadv_debugfs_add_meshif(struct net_device *dev) +{ + return 0; +} + +static inline void batadv_debugfs_del_meshif(struct net_device *dev) +{ +} + +static inline +int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) +{ + return 0; +} + +static inline +void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) +{ +} + +#endif + #endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index aad022dd15df..fb54e6aed096 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors: * * Antonio Quartulli * @@ -15,18 +15,36 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include <linux/if_ether.h> +#include "distributed-arp-table.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> #include <linux/if_arp.h> +#include <linux/if_ether.h> #include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/workqueue.h> #include <net/arp.h> -#include "main.h" -#include "hash.h" -#include "distributed-arp-table.h" #include "hard-interface.h" +#include "hash.h" #include "originator.h" #include "send.h" -#include "types.h" #include "translation-table.h" static void batadv_dat_purge(struct work_struct *work); @@ -206,9 +224,22 @@ static uint32_t batadv_hash_dat(const void *data, uint32_t size) { uint32_t hash = 0; const struct batadv_dat_entry *dat = data; + const unsigned char *key; + uint32_t i; - hash = batadv_hash_bytes(hash, &dat->ip, sizeof(dat->ip)); - hash = batadv_hash_bytes(hash, &dat->vid, sizeof(dat->vid)); + key = (const unsigned char *)&dat->ip; + for (i = 0; i < sizeof(dat->ip); i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + key = (const unsigned char *)&dat->vid; + for (i = 0; i < sizeof(dat->vid); i++) { + hash += key[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } hash += (hash << 3); hash ^= (hash >> 11); diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2fe0764c64be..3181507ebc14 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2015 B.A.T.M.A.N. 
contributors: * * Antonio Quartulli * @@ -18,12 +18,19 @@ #ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ #define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ -#ifdef CONFIG_BATMAN_ADV_DAT +#include "main.h" + +#include <linux/compiler.h> +#include <linux/netdevice.h> +#include <linux/types.h> -#include "types.h" #include "originator.h" +#include "packet.h" -#include <linux/if_arp.h> +struct seq_file; +struct sk_buff; + +#ifdef CONFIG_BATMAN_ADV_DAT /* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */ #define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 3d1dcaa3e8b5..c0f0d01ab244 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -15,12 +15,28 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" #include "fragmentation.h" -#include "send.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> + +#include "hard-interface.h" #include "originator.h" +#include "packet.h" #include "routing.h" -#include "hard-interface.h" +#include "send.h" #include "soft-interface.h" /** @@ -161,6 +177,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, hlist_add_head(&frag_entry_new->list, &chain->head); chain->size = skb->len - hdr_size; chain->timestamp = jiffies; + chain->total_size = ntohs(frag_packet->total_size); ret = true; goto out; } @@ -195,9 +212,11 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, out: if (chain->size > batadv_frag_size_limit() || - ntohs(frag_packet->total_size) > batadv_frag_size_limit()) { + chain->total_size != ntohs(frag_packet->total_size) || + chain->total_size > batadv_frag_size_limit()) { /* Clear chain if total size of either the list or the packet - * exceeds the maximum size of one merged packet. + * exceeds the maximum size of one merged packet. Don't allow + * packets to have different total_size. */ batadv_frag_clear_chain(&chain->head); chain->size = 0; @@ -228,19 +247,13 @@ err: * Returns the merged skb or NULL on error. */ static struct sk_buff * -batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) +batadv_frag_merge_packets(struct hlist_head *chain) { struct batadv_frag_packet *packet; struct batadv_frag_list_entry *entry; struct sk_buff *skb_out = NULL; int size, hdr_size = sizeof(struct batadv_frag_packet); - /* Make sure incoming skb has non-bogus data. 
*/ - packet = (struct batadv_frag_packet *)skb->data; - size = ntohs(packet->total_size); - if (size > batadv_frag_size_limit()) - goto free; - /* Remove first entry, as this is the destination for the rest of the * fragments. */ @@ -249,6 +262,9 @@ batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb) skb_out = entry->skb; kfree(entry); + packet = (struct batadv_frag_packet *)skb_out->data; + size = ntohs(packet->total_size); + /* Make room for the rest of the fragments. */ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) { kfree_skb(skb_out); @@ -304,7 +320,7 @@ bool batadv_frag_skb_buffer(struct sk_buff **skb, if (hlist_empty(&head)) goto out; - skb_out = batadv_frag_merge_packets(&head, *skb); + skb_out = batadv_frag_merge_packets(&head); if (!skb_out) goto out_err; diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index d848cf6676a2..8b9877e70b95 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll <martin@hundeboll.net> * @@ -18,6 +18,15 @@ #ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_ #define _NET_BATMAN_ADV_FRAGMENTATION_H_ +#include "main.h" + +#include <linux/compiler.h> +#include <linux/list.h> +#include <linux/stddef.h> +#include <linux/types.h> + +struct sk_buff; + void batadv_frag_purge_orig(struct batadv_orig_node *orig, bool (*check_cb)(struct batadv_frag_table_entry *)); bool batadv_frag_skb_fwd(struct sk_buff *skb, diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 27649e85f3f6..bb0158620628 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -15,18 +15,38 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include "main.h" -#include "sysfs.h" #include "gateway_client.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/udp.h> + #include "gateway_common.h" #include "hard-interface.h" #include "originator.h" -#include "translation-table.h" +#include "packet.h" #include "routing.h" -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/udp.h> -#include <linux/if_vlan.h> +#include "sysfs.h" +#include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp * packet starting at the beginning of the dhcp header @@ -592,15 +612,16 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv, curr_gw = batadv_gw_get_selected_gw_node(bat_priv); - ret = seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", - (curr_gw == gw_node ? "=>" : " "), - gw_node->orig_node->orig, - router_ifinfo->bat_iv.tq_avg, router->addr, - router->if_incoming->net_dev->name, - gw_node->bandwidth_down / 10, - gw_node->bandwidth_down % 10, - gw_node->bandwidth_up / 10, - gw_node->bandwidth_up % 10); + seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", + (curr_gw == gw_node ? 
"=>" : " "), + gw_node->orig_node->orig, + router_ifinfo->bat_iv.tq_avg, router->addr, + router->if_incoming->net_dev->name, + gw_node->bandwidth_down / 10, + gw_node->bandwidth_down % 10, + gw_node->bandwidth_up / 10, + gw_node->bandwidth_up % 10); + ret = seq_has_overflowed(seq) ? -1 : 0; if (curr_gw) batadv_gw_node_free_ref(curr_gw); @@ -732,11 +753,6 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return BATADV_DHCP_NO; - /* skb->data might have been reallocated by pskb_may_pull() */ - ethhdr = eth_hdr(skb); - if (ntohs(ethhdr->h_proto) == ETH_P_8021Q) - ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN); - udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 7ee53bb7d50f..89565b451c18 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -18,6 +18,14 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ #define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_tvlv_gateway_data; +struct seq_file; +struct sk_buff; + void batadv_gw_check_client_stop(struct batadv_priv *bat_priv); void batadv_gw_reselect(struct batadv_priv *bat_priv); void batadv_gw_election(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 88a1bc3804d1..39cf44ccebd4 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,9 +15,18 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" #include "gateway_common.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/stddef.h> +#include <linux/string.h> + #include "gateway_client.h" +#include "packet.h" /** * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index aa5116561947..bd5c812cebf4 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_ #define _NET_BATMAN_ADV_GATEWAY_COMMON_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_priv; +struct net_device; + enum batadv_gw_modes { BATADV_GW_MODE_OFF, BATADV_GW_MODE_CLIENT, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index fbda6b54baff..f4a15d2e5eaf 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,22 +15,36 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" -#include "distributed-arp-table.h" #include "hard-interface.h" -#include "soft-interface.h" -#include "send.h" -#include "translation-table.h" -#include "routing.h" -#include "sysfs.h" -#include "debugfs.h" -#include "originator.h" -#include "hash.h" -#include "bridge_loop_avoidance.h" -#include "gateway_client.h" +#include "main.h" +#include <linux/bug.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/fs.h> #include <linux/if_arp.h> #include <linux/if_ether.h> +#include <linux/if.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <net/net_namespace.h> + +#include "bridge_loop_avoidance.h" +#include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "soft-interface.h" +#include "sysfs.h" +#include "translation-table.h" void batadv_hardif_free_rcu(struct rcu_head *rcu) { @@ -83,11 +97,12 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return true; /* no more parents..stop recursion */ - if (net_dev->iflink == 0 || net_dev->iflink == net_dev->ifindex) + if (dev_get_iflink(net_dev) == 0 || + dev_get_iflink(net_dev) == net_dev->ifindex) return false; /* recurse over the parent device */ - parent_dev = __dev_get_by_index(&init_net, net_dev->iflink); + parent_dev = __dev_get_by_index(&init_net, dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. */ if (WARN(!parent_dev, "Cannot find parent device")) return false; diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 1918cd50b62e..5a31420513e1 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. 
contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,17 @@ #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ #define _NET_BATMAN_ADV_HARD_INTERFACE_H_ +#include "main.h" + +#include <linux/atomic.h> +#include <linux/compiler.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/stddef.h> +#include <linux/types.h> + +struct net_device; + enum batadv_hard_if_state { BATADV_IF_NOT_IN_USE, BATADV_IF_TO_BE_REMOVED, diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 7c1c63080e20..e89f3146b092 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -15,8 +15,12 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include "main.h" #include "hash.h" +#include "main.h" + +#include <linux/fs.h> +#include <linux/lockdep.h> +#include <linux/slab.h> /* clears the hash */ static void batadv_hash_init(struct batadv_hashtable *hash) diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 539fc1266793..5065f50c9c3c 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * @@ -18,7 +18,16 @@ #ifndef _NET_BATMAN_ADV_HASH_H_ #define _NET_BATMAN_ADV_HASH_H_ +#include "main.h" + +#include <linux/compiler.h> #include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/types.h> + +struct lock_class_key; /* callback to a compare function. 
should compare 2 element datas for their * keys, return 0 if same and not 0 if not same @@ -80,28 +89,6 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, } /** - * batadv_hash_bytes - hash some bytes and add them to the previous hash - * @hash: previous hash value - * @data: data to be hashed - * @size: number of bytes to be hashed - * - * Returns the new hash value. - */ -static inline uint32_t batadv_hash_bytes(uint32_t hash, const void *data, - uint32_t size) -{ - const unsigned char *key = data; - int i; - - for (i = 0; i < size; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - return hash; -} - -/** * batadv_hash_add - adds data to the hashtable * @hash: storage hash table * @compare: callback to determine if 2 hash elements are identical diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 161ef8f17d2e..07061bcbaa04 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,14 +15,39 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ +#include "icmp_socket.h" #include "main.h" + +#include <linux/atomic.h> +#include <linux/compiler.h> #include <linux/debugfs.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/export.h> +#include <linux/fcntl.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <linux/poll.h> +#include <linux/printk.h> +#include <linux/sched.h> /* for linux/wait.h */ +#include <linux/skbuff.h> #include <linux/slab.h> -#include "icmp_socket.h" -#include "send.h" -#include "hash.h" -#include "originator.h" +#include <linux/spinlock.h> +#include <linux/stat.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/wait.h> + #include "hard-interface.h" +#include "originator.h" +#include "packet.h" +#include "send.h" static struct batadv_socket_client *batadv_socket_client_hash[256]; diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 0c33950aa4aa..7de7fce4b48c 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -18,6 +18,13 @@ #ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_ #define _NET_BATMAN_ADV_ICMP_SOCKET_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_icmp_header; +struct batadv_priv; + #define BATADV_ICMP_SOCKET "socket" void batadv_socket_init(void); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 12fc77bef23f..8457097f1643 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,31 +15,53 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ +#include "main.h" + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/byteorder/generic.h> #include <linux/crc32c.h> -#include <linux/highmem.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/if_ether.h> #include <linux/if_vlan.h> -#include <net/ip.h> -#include <net/ipv6.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/workqueue.h> #include <net/dsfield.h> -#include "main.h" -#include "sysfs.h" +#include <net/rtnetlink.h> + +#include "bat_algo.h" +#include "bridge_loop_avoidance.h" #include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "icmp_socket.h" +#include "multicast.h" +#include "network-coding.h" +#include "originator.h" +#include "packet.h" #include "routing.h" #include "send.h" -#include "originator.h" #include "soft-interface.h" -#include "icmp_socket.h" #include "translation-table.h" -#include "hard-interface.h" -#include "gateway_client.h" -#include "bridge_loop_avoidance.h" -#include "distributed-arp-table.h" -#include "multicast.h" -#include "gateway_common.h" -#include "hash.h" -#include "bat_algo.h" -#include "network-coding.h" -#include "fragmentation.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, * list traversals just rcu-locked @@ -209,10 +231,13 @@ void batadv_mesh_free(struct net_device *soft_iface) * interfaces in the 
current mesh * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check + * + * Returns 'true' if the mac address was found, false otherwise. */ -int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) { const struct batadv_hard_iface *hard_iface; + bool is_my_mac = false; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -223,12 +248,12 @@ int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { - rcu_read_unlock(); - return 1; + is_my_mac = true; + break; } } rcu_read_unlock(); - return 0; + return is_my_mac; } /** @@ -510,14 +535,12 @@ static struct batadv_algo_ops *batadv_algo_get(char *name) int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; - int ret; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); - ret = -EEXIST; - goto out; + return -EEXIST; } /* all algorithms must implement all ops (for now) */ @@ -531,32 +554,26 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->bat_neigh_is_equiv_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); - ret = -EINVAL; - goto out; + return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); - ret = 0; -out: - return ret; + return 0; } int batadv_algo_select(struct batadv_priv *bat_priv, char *name) { struct batadv_algo_ops *bat_algo_ops; - int ret = -EINVAL; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) - goto out; + return -EINVAL; bat_priv->bat_algo_ops = bat_algo_ops; - ret = 0; -out: - return ret; + return 0; } int batadv_algo_seq_print_text(struct seq_file 
*seq, void *offset) @@ -819,15 +836,15 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); /* keep old buffer if kmalloc should fail */ - if (new_buff) { - memcpy(new_buff, *packet_buff, min_packet_len); - kfree(*packet_buff); - *packet_buff = new_buff; - *packet_buff_len = min_packet_len + additional_packet_len; - return true; - } + if (!new_buff) + return false; + + memcpy(new_buff, *packet_buff, min_packet_len); + kfree(*packet_buff); + *packet_buff = new_buff; + *packet_buff_len = min_packet_len + additional_packet_len; - return false; + return true; } /** diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 4d2318829a34..41d27c7872b9 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2015.0" +#define BATADV_SOURCE_VERSION "2015.1" #endif /* B.A.T.M.A.N. 
parameters */ @@ -44,7 +44,7 @@ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ -#define BATADV_DAT_ENTRY_TIMEOUT (5*60000) /* 5 mins in milliseconds */ +#define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) */ @@ -163,28 +163,26 @@ enum batadv_uev_type { /* Kernel headers */ -#include <linux/mutex.h> /* mutex */ -#include <linux/module.h> /* needed by all modules */ -#include <linux/netdevice.h> /* netdevice */ -#include <linux/etherdevice.h> /* ethernet address classification */ -#include <linux/if_ether.h> /* ethernet header */ -#include <linux/poll.h> /* poll_table */ -#include <linux/kthread.h> /* kernel threads */ -#include <linux/pkt_sched.h> /* schedule types */ -#include <linux/workqueue.h> /* workqueue */ +#include <linux/atomic.h> +#include <linux/bitops.h> /* for packet.h */ +#include <linux/compiler.h> +#include <linux/cpumask.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> /* for packet.h */ +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/types.h> #include <linux/percpu.h> -#include <linux/slab.h> -#include <net/sock.h> /* struct sock */ -#include <net/addrconf.h> /* ipv6 address stuff */ -#include <linux/ip.h> -#include <net/rtnetlink.h> #include <linux/jiffies.h> -#include <linux/seq_file.h> #include <linux/if_vlan.h> #include "types.h" -#define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? \ +struct batadv_ogm_packet; +struct seq_file; +struct sk_buff; + +#define BATADV_PRINT_VID(vid) ((vid & BATADV_VLAN_HAS_TAG) ? 
\ (int)(vid & VLAN_VID_MASK) : -1) extern char batadv_routing_algo[]; @@ -195,7 +193,7 @@ extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); -int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq); int batadv_max_header_len(void); @@ -279,7 +277,7 @@ static inline void _batadv_dbg(int type __always_unused, * * note: can't use ether_addr_equal() as it requires aligned memory */ -static inline int batadv_compare_eth(const void *data1, const void *data2) +static inline bool batadv_compare_eth(const void *data1, const void *data2) { return ether_addr_equal_unaligned(data1, data2); } diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index b24e4bb64fb5..7aa480b7edd0 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -15,10 +15,33 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" #include "multicast.h" -#include "originator.h" -#include "hard-interface.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <net/addrconf.h> +#include <net/ipv6.h> + +#include "packet.h" #include "translation-table.h" /** diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3a44ebdb43cb..beb6e56c624a 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors: * * Linus Lüssing * @@ -18,6 +18,12 @@ #ifndef _NET_BATMAN_ADV_MULTICAST_H_ #define _NET_BATMAN_ADV_MULTICAST_H_ +#include "main.h" + +struct batadv_orig_node; +struct batadv_priv; +struct sk_buff; + /** * batadv_forw_mode - the way a packet should be forwarded as * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 127cc4d7380a..f0a50f31d822 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -15,15 +15,44 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ +#include "network-coding.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/compiler.h> #include <linux/debugfs.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/init.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stat.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/workqueue.h> -#include "main.h" +#include "hard-interface.h" #include "hash.h" -#include "network-coding.h" -#include "send.h" #include "originator.h" -#include "hard-interface.h" +#include "packet.h" #include "routing.h" +#include "send.h" static struct lock_class_key batadv_nc_coding_hash_lock_class_key; static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; @@ -155,7 +184,7 @@ err: */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { - atomic_set(&bat_priv->network_coding, 1); + atomic_set(&bat_priv->network_coding, 0); bat_priv->nc.min_tq = 200; bat_priv->nc.max_fwd_delay = 10; bat_priv->nc.max_buffer_time = 200; @@ -275,7 +304,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, * max_buffer time */ return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_buffer_time*10); + bat_priv->nc.max_buffer_time * 10); } /** @@ -453,14 +482,8 @@ static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size) const struct batadv_nc_path *nc_path = data; uint32_t hash = 0; - hash = batadv_hash_bytes(hash, &nc_path->prev_hop, - sizeof(nc_path->prev_hop)); - hash = batadv_hash_bytes(hash, 
&nc_path->next_hop, - sizeof(nc_path->next_hop)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); + hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); return hash % size; } diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 358c0d686ab0..5b79aa8c64c1 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * @@ -18,6 +18,19 @@ #ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ #define _NET_BATMAN_ADV_NETWORK_CODING_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_nc_node; +struct batadv_neigh_node; +struct batadv_ogm_packet; +struct batadv_orig_node; +struct batadv_priv; +struct net_device; +struct seq_file; +struct sk_buff; + #ifdef CONFIG_BATMAN_ADV_NC void batadv_nc_status_update(struct net_device *net_dev); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 90e805aba379..018b7495ad84 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,19 +15,31 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ +#include "originator.h" #include "main.h" + +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> + #include "distributed-arp-table.h" -#include "originator.h" -#include "hash.h" -#include "translation-table.h" -#include "routing.h" +#include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" -#include "soft-interface.h" -#include "bridge_loop_avoidance.h" -#include "network-coding.h" -#include "fragmentation.h" +#include "hash.h" #include "multicast.h" +#include "network-coding.h" +#include "routing.h" +#include "translation-table.h" /* hash class keys */ static struct lock_class_key batadv_orig_hash_lock_class_key; @@ -197,13 +209,19 @@ static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; + struct batadv_algo_ops *bao; neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); + bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); } + + if (bao->bat_neigh_free) + bao->bat_neigh_free(neigh_node); + batadv_hardif_free_ref_now(neigh_node->if_incoming); kfree(neigh_node); diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index aa4a43696295..79734d302010 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,8 +18,21 @@ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ +#include "main.h" + +#include <linux/atomic.h> +#include <linux/compiler.h> +#include <linux/if_ether.h> +#include <linux/jhash.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/stddef.h> +#include <linux/types.h> + #include "hash.h" +struct seq_file; + int batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); @@ -75,20 +88,9 @@ void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); */ static inline uint32_t batadv_choose_orig(const void *data, uint32_t size) { - const unsigned char *key = data; uint32_t hash = 0; - size_t i; - - for (i = 0; i < 6; i++) { - hash += key[i]; - hash += (hash << 10); - hash ^= (hash >> 6); - } - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(data, ETH_ALEN, hash); return hash % size; } diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index b81fbbf21a63..9e747c08d0bc 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,9 @@ #ifndef _NET_BATMAN_ADV_PACKET_H_ #define _NET_BATMAN_ADV_PACKET_H_ +#include <asm/byteorder.h> +#include <linux/types.h> + /** * enum batadv_packettype - types for batman-adv encapsulated packets * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index da83982bf974..c360c0cd19c2 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,20 +15,36 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include "main.h" #include "routing.h" -#include "send.h" -#include "soft-interface.h" -#include "hard-interface.h" -#include "icmp_socket.h" -#include "translation-table.h" -#include "originator.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/jiffies.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> + +#include "bitarray.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" -#include "network-coding.h" #include "fragmentation.h" - -#include <linux/if_vlan.h> +#include "hard-interface.h" +#include "icmp_socket.h" +#include "network-coding.h" +#include "originator.h" +#include "packet.h" +#include "send.h" +#include "soft-interface.h" +#include "translation-table.h" static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 557d3d12a9ab..6bc29d33abc1 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,16 @@ #ifndef _NET_BATMAN_ADV_ROUTING_H_ #define _NET_BATMAN_ADV_ROUTING_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_hard_iface; +struct batadv_neigh_node; +struct batadv_orig_node; +struct batadv_priv; +struct sk_buff; + bool batadv_check_management_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, int header_len); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3d64ed20c393..0a01992e65ab 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,19 +15,37 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ +#include "send.h" #include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/if.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/workqueue.h> + #include "distributed-arp-table.h" -#include "send.h" -#include "routing.h" -#include "translation-table.h" -#include "soft-interface.h" -#include "hard-interface.h" -#include "gateway_common.h" +#include "fragmentation.h" #include "gateway_client.h" -#include "originator.h" +#include "hard-interface.h" #include "network-coding.h" -#include "fragmentation.h" -#include "multicast.h" +#include "originator.h" +#include "routing.h" +#include "soft-interface.h" +#include "translation-table.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); @@ -255,8 +273,8 @@ int 
batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, unsigned short vid) { - struct ethhdr *ethhdr; struct batadv_unicast_packet *unicast_packet; + struct ethhdr *ethhdr; int ret = NET_XMIT_DROP; if (!orig_node) diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 38d0ec1833ae..0536835fe503 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,6 +18,19 @@ #ifndef _NET_BATMAN_ADV_SEND_H_ #define _NET_BATMAN_ADV_SEND_H_ +#include "main.h" + +#include <linux/compiler.h> +#include <linux/types.h> + +#include "packet.h" + +struct batadv_hard_iface; +struct batadv_orig_node; +struct batadv_priv; +struct sk_buff; +struct work_struct; + int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const uint8_t *dst_addr); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5ec31d7de24f..c002961da75d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -15,26 +15,50 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" #include "soft-interface.h" -#include "hard-interface.h" -#include "distributed-arp-table.h" -#include "routing.h" -#include "send.h" -#include "debugfs.h" -#include "translation-table.h" -#include "hash.h" -#include "gateway_common.h" -#include "gateway_client.h" -#include "sysfs.h" -#include "originator.h" -#include <linux/slab.h> -#include <linux/ethtool.h> +#include "main.h" + +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> +#include <linux/cache.h> +#include <linux/compiler.h> +#include <linux/errno.h> #include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/fs.h> +#include <linux/if_ether.h> #include <linux/if_vlan.h> -#include "multicast.h" +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/percpu.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/socket.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> + #include "bridge_loop_avoidance.h" +#include "debugfs.h" +#include "distributed-arp-table.h" +#include "gateway_client.h" +#include "gateway_common.h" +#include "hard-interface.h" +#include "multicast.h" #include "network-coding.h" +#include "packet.h" +#include "send.h" +#include "sysfs.h" +#include "translation-table.h" static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd); static void batadv_get_drvinfo(struct net_device *dev, @@ -105,6 +129,7 @@ static struct net_device_stats *batadv_interface_stats(struct net_device *dev) static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; uint8_t old_addr[ETH_ALEN]; @@ -115,12 
+140,17 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) ether_addr_copy(dev->dev_addr, addr->sa_data); /* only modify transtable if it has been initialized before */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) { - batadv_tt_local_remove(bat_priv, old_addr, BATADV_NO_FLAGS, + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + return 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); - batadv_tt_local_add(dev, addr->sa_data, BATADV_NO_FLAGS, + batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } + rcu_read_unlock(); return 0; } @@ -732,7 +762,7 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA - atomic_set(&bat_priv->bridge_loop_avoidance, 0); + atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); @@ -818,7 +848,7 @@ static int batadv_softif_slave_add(struct net_device *dev, int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); - if (!hard_iface || hard_iface->soft_iface != NULL) + if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev->name); diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index dbab22fd89a5..578e8a663c30 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner * @@ -18,6 +18,17 @@ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ +#include "main.h" + +#include <net/rtnetlink.h> + +struct batadv_hard_iface; +struct batadv_orig_node; +struct batadv_priv; +struct batadv_softif_vlan; +struct net_device; +struct sk_buff; + int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, struct batadv_hard_iface *recv_if, diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index a75dc12f96f8..d6a312a82c03 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -15,16 +15,35 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#include "main.h" #include "sysfs.h" -#include "translation-table.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/compiler.h> +#include <linux/device.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/stringify.h> + #include "distributed-arp-table.h" -#include "network-coding.h" -#include "originator.h" +#include "gateway_client.h" +#include "gateway_common.h" #include "hard-interface.h" +#include "network-coding.h" +#include "packet.h" #include "soft-interface.h" -#include "gateway_common.h" -#include "gateway_client.h" static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) { @@ -151,7 +170,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ 
batadv_store_##_name) -#define BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func) \ +#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \ ssize_t batadv_store_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff, \ size_t count) \ @@ -161,24 +180,24 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &bat_priv->_name, net_dev); \ + &bat_priv->_var, net_dev); \ } -#define BATADV_ATTR_SIF_SHOW_UINT(_name) \ +#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ ssize_t batadv_show_##_name(struct kobject *kobj, \ struct attribute *attr, char *buff) \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ \ - return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \ + return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \ } \ /* Use this, if you are going to set [name] in the soft-interface * (bat_priv) to an unsigned integer value */ -#define BATADV_ATTR_SIF_UINT(_name, _mode, _min, _max, _post_func) \ - static BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func)\ - static BATADV_ATTR_SIF_SHOW_UINT(_name) \ +#define BATADV_ATTR_SIF_UINT(_name, _var, _mode, _min, _max, _post_func)\ + static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\ + static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \ batadv_store_##_name) @@ -540,19 +559,20 @@ BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu); static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL); static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode, batadv_store_gw_mode); -BATADV_ATTR_SIF_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, - INT_MAX, NULL); -BATADV_ATTR_SIF_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, - NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE, - batadv_post_gw_reselect); 
+BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, + 2 * BATADV_JITTER, INT_MAX, NULL); +BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, + BATADV_TQ_MAX_VALUE, NULL); +BATADV_ATTR_SIF_UINT(gw_sel_class, gw_sel_class, S_IRUGO | S_IWUSR, 1, + BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); #ifdef CONFIG_BATMAN_ADV_MCAST BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL); #endif #ifdef CONFIG_BATMAN_ADV_DEBUG -BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL); +BATADV_ATTR_SIF_UINT(log_level, log_level, S_IRUGO | S_IWUSR, 0, + BATADV_DBG_ALL, NULL); #endif #ifdef CONFIG_BATMAN_ADV_NC BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR, diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index b715b60db7cd..2294583f7cf9 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors: * * Marek Lindner * @@ -18,6 +18,16 @@ #ifndef _NET_BATMAN_ADV_SYSFS_H_ #define _NET_BATMAN_ADV_SYSFS_H_ +#include "main.h" + +#include <linux/sysfs.h> +#include <linux/types.h> + +struct batadv_priv; +struct batadv_softif_vlan; +struct kobject; +struct net_device; + #define BATADV_SYSFS_IF_MESH_SUBDIR "mesh" #define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv" /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 07b263a437d1..b4824951010b 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -15,18 +15,41 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "main.h" #include "translation-table.h" -#include "soft-interface.h" +#include "main.h" + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/byteorder/generic.h> +#include <linux/compiler.h> +#include <linux/crc32c.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/fs.h> +#include <linux/if_ether.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/workqueue.h> +#include <net/net_namespace.h> + +#include "bridge_loop_avoidance.h" #include "hard-interface.h" -#include "send.h" #include "hash.h" -#include "originator.h" -#include "routing.h" -#include "bridge_loop_avoidance.h" #include "multicast.h" - -#include <linux/crc32c.h> +#include "originator.h" +#include "packet.h" +#include "soft-interface.h" /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; @@ -67,12 +90,8 @@ static inline uint32_t batadv_choose_tt(const void *data, uint32_t size) uint32_t hash = 0; tt = (struct batadv_tt_common_entry *)data; - hash = batadv_hash_bytes(hash, &tt->addr, ETH_ALEN); - hash = batadv_hash_bytes(hash, &tt->vid, sizeof(tt->vid)); - - hash += (hash << 3); - hash ^= (hash >> 11); - hash += (hash << 15); + hash = jhash(&tt->addr, ETH_ALEN, hash); + hash = jhash(&tt->vid, sizeof(tt->vid), hash); return hash % size; } @@ -954,17 +973,17 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, BATADV_PRINT_VID(tt_common_entry->vid), - (tt_common_entry->flags & - BATADV_TT_CLIENT_ROAM ? 'R' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), no_purge ? 
'P' : '.', - (tt_common_entry->flags & - BATADV_TT_CLIENT_NEW ? 'N' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_PENDING ? 'X' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (tt_common_entry->flags & - BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_NEW) ? 'N' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_PENDING) ? 'X' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((tt_common_entry->flags & + BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 0 : last_seen_msecs, vlan->tt.crc); @@ -1528,10 +1547,10 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, BATADV_PRINT_VID(tt_global_entry->common.vid), best_entry->ttvn, best_entry->orig_node->orig, last_ttvn, vlan->tt.crc, - (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), - (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); + ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), + ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), + ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); } @@ -1560,10 +1579,10 @@ print_list: BATADV_PRINT_VID(tt_global_entry->common.vid), orig_entry->ttvn, orig_entry->orig_node->orig, last_ttvn, vlan->tt.crc, - (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'), - (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'), - (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'), - (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.')); + ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), + ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'), + ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), + ((flags & BATADV_TT_CLIENT_TEMP) ? 
'T' : '.')); batadv_orig_node_vlan_free_ref(vlan); } @@ -2529,7 +2548,7 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", req_src, tt_data->ttvn, req_dst, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); /* Let's get the orig node of the REAL destination */ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst); @@ -2660,7 +2679,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", req_src, tt_data->ttvn, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); spin_lock_bh(&bat_priv->tt.commit_lock); @@ -2899,7 +2918,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", resp_src, tt_data->ttvn, num_entries, - (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.')); + ((tt_data->flags & BATADV_TT_FULL_TABLE) ? 'F' : '.')); orig_node = batadv_orig_hash_find(bat_priv, resp_src); if (!orig_node) diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index ad84d7b89e39..6acc25d3a925 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. 
contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * @@ -18,6 +18,15 @@ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ +#include "main.h" + +#include <linux/types.h> + +struct batadv_orig_node; +struct batadv_priv; +struct net_device; +struct seq_file; + int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, unsigned short vid, int ifindex, uint32_t mark); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 9398c3fb4174..67d63483618e 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * @@ -18,9 +18,23 @@ #ifndef _NET_BATMAN_ADV_TYPES_H_ #define _NET_BATMAN_ADV_TYPES_H_ +#ifndef _NET_BATMAN_ADV_MAIN_H_ +#error only "main.h" can be included directly +#endif + +#include <linux/bitops.h> +#include <linux/compiler.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/sched.h> /* for linux/wait.h */ +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/wait.h> +#include <linux/workqueue.h> + #include "packet.h" -#include "bitarray.h" -#include <linux/kernel.h> + +struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT @@ -132,6 +146,7 @@ struct batadv_orig_ifinfo { * @timestamp: time (jiffie) of last received fragment * @seqno: sequence number of the fragments in the list * @size: accumulated size of packets in list + * @total_size: expected size of the assembled packet */ struct batadv_frag_table_entry { struct hlist_head head; @@ -139,6 +154,7 @@ struct batadv_frag_table_entry { unsigned long timestamp; uint16_t seqno; uint16_t size; + uint16_t total_size; }; /** @@ -181,9 +197,10 @@ struct batadv_orig_node_vlan { /** * struct batadv_orig_bat_iv - B.A.T.M.A.N. 
IV private orig_node members - * @bcast_own: bitfield containing the number of our OGMs this orig_node - * rebroadcasted "back" to us (relative to last_real_seqno) - * @bcast_own_sum: counted result of bcast_own + * @bcast_own: set of bitfields (one per hard interface) where each one counts + * the number of our OGMs this orig_node rebroadcasted "back" to us (relative + * to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long. + * @bcast_own_sum: sum of bcast_own * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum, * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ @@ -1118,6 +1135,8 @@ struct batadv_forw_packet { * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better * than neigh2 for their respective outgoing interface from the metric * prospective + * @bat_neigh_free: free the resources allocated by the routing algorithm for a + * neigh_node object * @bat_orig_print: print the originator table (optional) * @bat_orig_free: free the resources allocated by the routing algorithm for an * orig_node object @@ -1135,6 +1154,7 @@ struct batadv_algo_ops { void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); + /* neigh_node handling API */ int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, @@ -1144,6 +1164,7 @@ struct batadv_algo_ops { struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); + void (*bat_neigh_free)(struct batadv_neigh_node *neigh); /* orig_node handling API */ void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 1742b849fcff..2fb7b3064904 100644 --- a/net/bluetooth/6lowpan.c +++ 
b/net/bluetooth/6lowpan.c @@ -192,7 +192,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, if (ipv6_addr_any(nexthop)) return NULL; } else { - nexthop = rt6_nexthop(rt); + nexthop = rt6_nexthop(rt, daddr); /* We need to remember the address because it is needed * by bt_xmit() when sending the packet. In bt_xmit(), the @@ -856,7 +856,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) set_dev_addr(netdev, &chan->src, chan->src_type); netdev->netdev_ops = &netdev_ops; - SET_NETDEV_DEV(netdev, &chan->conn->hcon->dev); + SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); err = register_netdev(netdev); @@ -928,7 +928,7 @@ static void delete_netdev(struct work_struct *work) unregister_netdev(entry->netdev); - /* The entry pointer is deleted in device_event() */ + /* The entry pointer is deleted by the netdev destructor. */ } static void chan_close_cb(struct l2cap_chan *chan) @@ -937,7 +937,7 @@ static void chan_close_cb(struct l2cap_chan *chan) struct lowpan_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; - bool last = false, removed = true; + bool last = false, remove = true; BT_DBG("chan %p conn %p", chan, chan->conn); @@ -948,7 +948,7 @@ static void chan_close_cb(struct l2cap_chan *chan) /* If conn is set, then the netdev is also there and we should * not remove it. 
*/ - removed = false; + remove = false; } spin_lock(&devices_lock); @@ -977,7 +977,7 @@ static void chan_close_cb(struct l2cap_chan *chan) ifdown(dev->netdev); - if (!removed) { + if (remove) { INIT_WORK(&entry->delete_netdev, delete_netdev); schedule_work(&entry->delete_netdev); } @@ -1208,8 +1208,6 @@ static void disconnect_all_peers(void) list_del_rcu(&peer->list); kfree_rcu(peer, rcu); - - module_put(THIS_MODULE); } spin_unlock(&devices_lock); } @@ -1418,7 +1416,6 @@ static int device_event(struct notifier_block *unused, BT_DBG("Unregistered netdev %s %p", netdev->name, netdev); list_del(&entry->list); - kfree(entry); break; } } diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 7de74635a110..b8c794b87523 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -91,4 +91,12 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. +config BT_DEBUGFS + bool "Export Bluetooth internals in debugfs" + depends on BT && DEBUG_FS + default y + help + Provide extensive information about internal Bluetooth states + in debugfs. 
+ source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 8e96e3072266..29c12ae72a66 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -12,9 +12,11 @@ obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ - hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \ - a2mp.o amp.o ecc.o hci_request.o hci_debugfs.o + hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ + a2mp.o amp.o ecc.o hci_request.o mgmt_util.o +bluetooth-$(CONFIG_BT_BREDR) += sco.o +bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o subdir-ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index cedfbda15dad..5a04eb1a7e57 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -19,9 +19,11 @@ #include "a2mp.h" #include "amp.h" +#define A2MP_FEAT_EXT 0x8000 + /* Global AMP Manager list */ -LIST_HEAD(amp_mgr_list); -DEFINE_MUTEX(amp_mgr_list_lock); +static LIST_HEAD(amp_mgr_list); +static DEFINE_MUTEX(amp_mgr_list_lock); /* A2MP build & send command helper functions */ static struct a2mp_cmd *__a2mp_build(u8 code, u8 ident, u16 len, void *data) @@ -43,7 +45,7 @@ static struct a2mp_cmd *__a2mp_build(u8 code, u8 ident, u16 len, void *data) return cmd; } -void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data) +static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data) { struct l2cap_chan *chan = mgr->a2mp_chan; struct a2mp_cmd *cmd; @@ -67,7 +69,7 @@ void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data) kfree(cmd); } -u8 __next_ident(struct amp_mgr *mgr) +static u8 __next_ident(struct amp_mgr *mgr) { if (++mgr->ident == 0) mgr->ident = 1; @@ -75,6 +77,23 @@ u8 __next_ident(struct amp_mgr *mgr) return mgr->ident; } +static struct amp_mgr *amp_mgr_lookup_by_state(u8 
state) +{ + struct amp_mgr *mgr; + + mutex_lock(&amp_mgr_list_lock); + list_for_each_entry(mgr, &amp_mgr_list, list) { + if (test_and_clear_bit(state, &mgr->state)) { + amp_mgr_get(mgr); + mutex_unlock(&amp_mgr_list_lock); + return mgr; + } + } + mutex_unlock(&amp_mgr_list_lock); + + return NULL; +} + /* hci_dev_list shall be locked */ static void __a2mp_add_cl(struct amp_mgr *mgr, struct a2mp_cl *cl) { @@ -860,23 +879,6 @@ struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, return mgr->a2mp_chan; } -struct amp_mgr *amp_mgr_lookup_by_state(u8 state) -{ - struct amp_mgr *mgr; - - mutex_lock(&amp_mgr_list_lock); - list_for_each_entry(mgr, &amp_mgr_list, list) { - if (test_and_clear_bit(state, &mgr->state)) { - amp_mgr_get(mgr); - mutex_unlock(&amp_mgr_list_lock); - return mgr; - } - } - mutex_unlock(&amp_mgr_list_lock); - - return NULL; -} - void a2mp_send_getinfo_rsp(struct hci_dev *hdev) { struct amp_mgr *mgr; diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h index 487b54c1308f..296f665adb09 100644 --- a/net/bluetooth/a2mp.h +++ b/net/bluetooth/a2mp.h @@ -17,8 +17,6 @@ #include <net/bluetooth/l2cap.h> -#define A2MP_FEAT_EXT 0x8000 - enum amp_mgr_state { READ_LOC_AMP_INFO, READ_LOC_AMP_ASSOC, @@ -131,16 +129,10 @@ struct a2mp_physlink_rsp { #define A2MP_STATUS_PHYS_LINK_EXISTS 0x05 #define A2MP_STATUS_SECURITY_VIOLATION 0x06 -extern struct list_head amp_mgr_list; -extern struct mutex amp_mgr_list_lock; - struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr); int amp_mgr_put(struct amp_mgr *mgr); -u8 __next_ident(struct amp_mgr *mgr); struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, struct sk_buff *skb); -struct amp_mgr *amp_mgr_lookup_by_state(u8 state); -void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data); void a2mp_discover_amp(struct l2cap_chan *chan); void a2mp_send_getinfo_rsp(struct hci_dev *hdev); void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status); diff --git a/net/bluetooth/af_bluetooth.c
b/net/bluetooth/af_bluetooth.c index ce22e0cfa923..70f9d945faf7 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -210,8 +210,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) } EXPORT_SYMBOL(bt_accept_dequeue); -int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -283,8 +283,8 @@ static long bt_sock_data_wait(struct sock *sk, long timeo) return timeo; } -int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; int err = 0; @@ -711,10 +711,9 @@ EXPORT_SYMBOL_GPL(bt_debugfs); static int __init bt_init(void) { - struct sk_buff *skb; int err; - BUILD_BUG_ON(sizeof(struct bt_skb_cb) > sizeof(skb->cb)); + sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); BT_INFO("Core ver %s", VERSION); @@ -750,6 +749,13 @@ static int __init bt_init(void) goto sock_err; } + err = mgmt_init(); + if (err < 0) { + sco_exit(); + l2cap_exit(); + goto sock_err; + } + return 0; sock_err: @@ -764,6 +770,8 @@ error: static void __exit bt_exit(void) { + mgmt_exit(); + sco_exit(); l2cap_exit(); diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h index 5a5b16f365e9..40854c99bc1e 100644 --- a/net/bluetooth/bnep/bnep.h +++ b/net/bluetooth/bnep/bnep.h @@ -111,6 +111,10 @@ struct bnep_ext_hdr { #define BNEPCONNDEL _IOW('B', 201, int) #define BNEPGETCONNLIST _IOR('B', 210, int) #define BNEPGETCONNINFO _IOR('B', 211, int) +#define BNEPGETSUPPFEAT _IOR('B', 212, int) + +#define BNEP_SETUP_RESPONSE 0 +#define BNEP_SETUP_RSP_SENT 10 struct bnep_connadd_req { int sock; /* Connected socket */ diff --git a/net/bluetooth/bnep/core.c 
b/net/bluetooth/bnep/core.c index 05f57e491ccb..1641367e54ca 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -231,7 +231,14 @@ static int bnep_rx_control(struct bnep_session *s, void *data, int len) break; case BNEP_SETUP_CONN_REQ: - err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, BNEP_CONN_NOT_ALLOWED); + /* Successful response should be sent only once */ + if (test_bit(BNEP_SETUP_RESPONSE, &s->flags) && + !test_and_set_bit(BNEP_SETUP_RSP_SENT, &s->flags)) + err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, + BNEP_SUCCESS); + else + err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, + BNEP_CONN_NOT_ALLOWED); break; default: { @@ -239,7 +246,7 @@ static int bnep_rx_control(struct bnep_session *s, void *data, int len) pkt[0] = BNEP_CONTROL; pkt[1] = BNEP_CMD_NOT_UNDERSTOOD; pkt[2] = cmd; - bnep_send(s, pkt, sizeof(pkt)); + err = bnep_send(s, pkt, sizeof(pkt)); } break; } @@ -292,29 +299,55 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) { struct net_device *dev = s->dev; struct sk_buff *nskb; - u8 type; + u8 type, ctrl_type; dev->stats.rx_bytes += skb->len; type = *(u8 *) skb->data; skb_pull(skb, 1); + ctrl_type = *(u8 *)skb->data; if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen)) goto badframe; if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) { - bnep_rx_control(s, skb->data, skb->len); - kfree_skb(skb); - return 0; - } + if (bnep_rx_control(s, skb->data, skb->len) < 0) { + dev->stats.tx_errors++; + kfree_skb(skb); + return 0; + } - skb_reset_mac_header(skb); + if (!(type & BNEP_EXT_HEADER)) { + kfree_skb(skb); + return 0; + } - /* Verify and pull out header */ - if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK])) - goto badframe; + /* Verify and pull ctrl message since it's already processed */ + switch (ctrl_type) { + case BNEP_SETUP_CONN_REQ: + /* Pull: ctrl type (1 b), len (1 b), data (len bytes) */ + if (!skb_pull(skb, 2 + *(u8 *)(skb->data + 1) * 2)) + goto badframe; + break; + case BNEP_FILTER_MULTI_ADDR_SET: + case 
BNEP_FILTER_NET_TYPE_SET: + /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */ + if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2)) + goto badframe; + break; + default: + kfree_skb(skb); + return 0; + } + } else { + skb_reset_mac_header(skb); - s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2)); + /* Verify and pull out header */ + if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK])) + goto badframe; + + s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2)); + } if (type & BNEP_EXT_HEADER) { if (bnep_rx_extension(s, skb) < 0) @@ -525,6 +558,7 @@ static struct device_type bnep_type = { int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) { + u32 valid_flags = BIT(BNEP_SETUP_RESPONSE); struct net_device *dev; struct bnep_session *s, *ss; u8 dst[ETH_ALEN], src[ETH_ALEN]; @@ -535,6 +569,9 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) if (!l2cap_is_socket(sock)) return -EBADFD; + if (req->flags & ~valid_flags) + return -EINVAL; + baswap((void *) dst, &l2cap_pi(sock->sk)->chan->dst); baswap((void *) src, &l2cap_pi(sock->sk)->chan->src); @@ -566,6 +603,7 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) s->sock = sock; s->role = req->role; s->state = BT_CONNECTED; + s->flags = req->flags; s->msg.msg_flags = MSG_NOSIGNAL; @@ -611,11 +649,15 @@ failed: int bnep_del_connection(struct bnep_conndel_req *req) { + u32 valid_flags = 0; struct bnep_session *s; int err = 0; BT_DBG(""); + if (req->flags & ~valid_flags) + return -EINVAL; + down_read(&bnep_session_sem); s = __bnep_get_session(req->dst); @@ -631,10 +673,12 @@ int bnep_del_connection(struct bnep_conndel_req *req) static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s) { + u32 valid_flags = BIT(BNEP_SETUP_RESPONSE); + memset(ci, 0, sizeof(*ci)); memcpy(ci->dst, s->eh.h_source, ETH_ALEN); strcpy(ci->device, s->dev->name); - ci->flags = s->flags; + ci->flags = s->flags & valid_flags; ci->state = 
s->state; ci->role = s->role; } diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 4b488ec26105..6ceb5d36a32b 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -218,7 +218,7 @@ static const struct net_device_ops bnep_netdev_ops = { void bnep_net_setup(struct net_device *dev) { - memset(dev->broadcast, 0xff, ETH_ALEN); + eth_broadcast_addr(dev->broadcast); dev->addr_len = ETH_ALEN; ether_setup(dev); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 5f051290daba..b5116fa9835e 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -57,6 +57,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long struct bnep_conninfo ci; struct socket *nsock; void __user *argp = (void __user *)arg; + __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE); int err; BT_DBG("cmd %x arg %lx", cmd, arg); @@ -120,6 +121,12 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return err; + case BNEPGETSUPPFEAT: + if (copy_to_user(argp, &supp_feat, sizeof(supp_feat))) + return -EFAULT; + + return 0; + default: return -EINVAL; } @@ -195,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 75bd2c42e3e7..b0c6c6af76ef 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -333,7 +333,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) return; } - if (session->flags & (1 << CMTP_LOOPBACK)) { + if (session->flags & BIT(CMTP_LOOPBACK)) { kfree_skb(skb); return; } diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 278a194e6af4..298ed37010e6 100644 --- a/net/bluetooth/cmtp/core.c +++ 
b/net/bluetooth/cmtp/core.c @@ -75,10 +75,11 @@ static void __cmtp_unlink_session(struct cmtp_session *session) static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_conninfo *ci) { + u32 valid_flags = BIT(CMTP_LOOPBACK); memset(ci, 0, sizeof(*ci)); bacpy(&ci->bdaddr, &session->bdaddr); - ci->flags = session->flags; + ci->flags = session->flags & valid_flags; ci->state = session->state; ci->num = session->num; @@ -313,7 +314,7 @@ static int cmtp_session(void *arg) down_write(&cmtp_session_sem); - if (!(session->flags & (1 << CMTP_LOOPBACK))) + if (!(session->flags & BIT(CMTP_LOOPBACK))) cmtp_detach_device(session); fput(session->sock->file); @@ -329,6 +330,7 @@ static int cmtp_session(void *arg) int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) { + u32 valid_flags = BIT(CMTP_LOOPBACK); struct cmtp_session *session, *s; int i, err; @@ -337,6 +339,9 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) if (!l2cap_is_socket(sock)) return -EBADFD; + if (req->flags & ~valid_flags) + return -EINVAL; + session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); if (!session) return -ENOMEM; @@ -385,7 +390,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) goto unlink; } - if (!(session->flags & (1 << CMTP_LOOPBACK))) { + if (!(session->flags & BIT(CMTP_LOOPBACK))) { err = cmtp_attach_device(session); if (err < 0) { atomic_inc(&session->terminate); @@ -409,11 +414,15 @@ failed: int cmtp_del_connection(struct cmtp_conndel_req *req) { + u32 valid_flags = 0; struct cmtp_session *session; int err = 0; BT_DBG(""); + if (req->flags & ~valid_flags) + return -EINVAL; + down_read(&cmtp_session_sem); session = __cmtp_get_session(&req->bdaddr); diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index d82787d417bd..ce86a7bae844 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct 
socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c9b8fa544785..2c48bf0b5afb 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -276,7 +276,7 @@ u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, - __u8 ltk[16]) + __u8 ltk[16], __u8 key_size) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_start_enc cp; @@ -288,7 +288,7 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, cp.handle = cpu_to_le16(conn->handle); cp.rand = rand; cp.ediv = ediv; - memcpy(cp.ltk, ltk, sizeof(cp.ltk)); + memcpy(cp.ltk, ltk, key_size); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } @@ -309,7 +309,7 @@ void hci_sco_setup(struct hci_conn *conn, __u8 status) else hci_add_sco(sco, conn->handle); } else { - hci_proto_connect_cfm(sco, status); + hci_connect_cfm(sco, status); hci_conn_del(sco); } } @@ -571,7 +571,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || - test_bit(HCI_USER_CHANNEL, &d->dev_flags) || + hci_dev_test_flag(d, HCI_USER_CHANNEL) || d->dev_type != HCI_BREDR) continue; @@ -618,7 +618,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); - hci_proto_connect_cfm(conn, status); + hci_connect_cfm(conn, status); hci_conn_del(conn); @@ -700,7 +700,7 @@ static void hci_req_directed_advertising(struct hci_request *req, * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. 
*/ - clear_bit(HCI_LE_ADV, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to false so that the remote device has a * chance of identifying us. @@ -733,6 +733,14 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, struct hci_request req; int err; + /* Let's make sure that le is enabled.*/ + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + if (lmp_le_capable(hdev)) + return ERR_PTR(-ECONNREFUSED); + + return ERR_PTR(-EOPNOTSUPP); + } + /* Some devices send ATT messages as soon as the physical link is * established. To be able to handle these ATT messages, the user- * space first establishes the connection and then starts the pairing @@ -791,7 +799,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * anyway have to disable it in order to start directed * advertising. */ - if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { u8 enable = 0x00; hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); @@ -802,7 +810,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, /* If we're active scanning most controllers are unable * to initiate advertising. Simply reject the attempt. */ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE) { skb_queue_purge(&req.cmd_q); hci_conn_del(conn); @@ -832,9 +840,9 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * handler for scan disabling knows to set the correct discovery * state. 
*/ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_req_add_le_scan_disable(&req); - set_bit(HCI_LE_SCAN_INTERRUPTED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } hci_req_add_le_create_conn(&req, conn); @@ -856,8 +864,12 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, { struct hci_conn *acl; - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { + if (lmp_bredr_capable(hdev)) + return ERR_PTR(-ECONNREFUSED); + return ERR_PTR(-EOPNOTSUPP); + } acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { @@ -930,7 +942,7 @@ int hci_conn_check_link_mode(struct hci_conn *conn) * Connections is used and the link is encrypted with AES-CCM * using a P-256 authenticated combination key. */ - if (test_bit(HCI_SC_ONLY, &conn->hdev->flags)) { + if (hci_dev_test_flag(conn->hdev, HCI_SC_ONLY)) { if (!hci_conn_sc_enabled(conn) || !test_bit(HCI_CONN_AES_CCM, &conn->flags) || conn->key_type != HCI_LK_AUTH_COMBINATION_P256) @@ -1139,7 +1151,7 @@ void hci_conn_hash_flush(struct hci_dev *hdev) list_for_each_entry_safe(c, n, &h->list, list) { c->state = BT_CLOSED; - hci_proto_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM); + hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM); hci_conn_del(c); } } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 3322d3f4c85a..2f8fb33067e1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -51,7 +51,7 @@ DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); -DEFINE_RWLOCK(hci_cb_list_lock); +DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); @@ -80,7 +80,7 @@ static ssize_t dut_mode_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_DUT_MODE, &hdev->dbg_flags) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 
'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -94,7 +94,6 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size = min(count, (sizeof(buf)-1)); bool enable; - int err; if (!test_bit(HCI_UP, &hdev->flags)) return -ENETDOWN; @@ -106,7 +105,7 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (strtobool(buf, &enable)) return -EINVAL; - if (enable == test_bit(HCI_DUT_MODE, &hdev->dbg_flags)) + if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) return -EALREADY; hci_req_lock(hdev); @@ -121,13 +120,9 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (IS_ERR(skb)) return PTR_ERR(skb); - err = -bt_to_errno(skb->data[0]); kfree_skb(skb); - if (err < 0) - return err; - - change_bit(HCI_DUT_MODE, &hdev->dbg_flags); + hci_dev_change_flag(hdev, HCI_DUT_MODE); return count; } @@ -141,13 +136,16 @@ static const struct file_operations dut_mode_fops = { /* ---- HCI requests ---- */ -static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode) +static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, + struct sk_buff *skb) { BT_DBG("%s result 0x%2.2x", hdev->name, result); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; + if (skb) + hdev->req_skb = skb_get(skb); wake_up_interruptible(&hdev->req_wait_q); } } @@ -163,66 +161,12 @@ static void hci_req_cancel(struct hci_dev *hdev, int err) } } -static struct sk_buff *hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, - u8 event) -{ - struct hci_ev_cmd_complete *ev; - struct hci_event_hdr *hdr; - struct sk_buff *skb; - - hci_dev_lock(hdev); - - skb = hdev->recv_evt; - hdev->recv_evt = NULL; - - hci_dev_unlock(hdev); - - if (!skb) - return ERR_PTR(-ENODATA); - - if (skb->len < sizeof(*hdr)) { - BT_ERR("Too short HCI event"); - goto failed; - } - - hdr = (void *) 
skb->data; - skb_pull(skb, HCI_EVENT_HDR_SIZE); - - if (event) { - if (hdr->evt != event) - goto failed; - return skb; - } - - if (hdr->evt != HCI_EV_CMD_COMPLETE) { - BT_DBG("Last event is not cmd complete (0x%2.2x)", hdr->evt); - goto failed; - } - - if (skb->len < sizeof(*ev)) { - BT_ERR("Too short cmd_complete event"); - goto failed; - } - - ev = (void *) skb->data; - skb_pull(skb, sizeof(*ev)); - - if (opcode == __le16_to_cpu(ev->opcode)) - return skb; - - BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode, - __le16_to_cpu(ev->opcode)); - -failed: - kfree_skb(skb); - return ERR_PTR(-ENODATA); -} - struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout) { DECLARE_WAITQUEUE(wait, current); struct hci_request req; + struct sk_buff *skb; int err = 0; BT_DBG("%s", hdev->name); @@ -236,7 +180,7 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, add_wait_queue(&hdev->req_wait_q, &wait); set_current_state(TASK_INTERRUPTIBLE); - err = hci_req_run(&req, hci_req_sync_complete); + err = hci_req_run_skb(&req, hci_req_sync_complete); if (err < 0) { remove_wait_queue(&hdev->req_wait_q, &wait); set_current_state(TASK_RUNNING); @@ -265,13 +209,20 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, } hdev->req_status = hdev->req_result = 0; + skb = hdev->req_skb; + hdev->req_skb = NULL; BT_DBG("%s end: err %d", hdev->name, err); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return ERR_PTR(err); + } - return hci_get_cmd_complete(hdev, opcode, event); + if (!skb) + return ERR_PTR(-ENODATA); + + return skb; } EXPORT_SYMBOL(__hci_cmd_sync_ev); @@ -303,7 +254,7 @@ static int __hci_req_sync(struct hci_dev *hdev, add_wait_queue(&hdev->req_wait_q, &wait); set_current_state(TASK_INTERRUPTIBLE); - err = hci_req_run(&req, hci_req_sync_complete); + err = hci_req_run_skb(&req, hci_req_sync_complete); if (err < 0) { hdev->req_status = 0; @@ -390,7 +341,7 @@ 
static void bredr_init(struct hci_request *req) hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL); } -static void amp_init(struct hci_request *req) +static void amp_init1(struct hci_request *req) { req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED; @@ -400,9 +351,6 @@ static void amp_init(struct hci_request *req) /* Read Local Supported Commands */ hci_req_add(req, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL); - /* Read Local Supported Features */ - hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); - /* Read Local AMP Info */ hci_req_add(req, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); @@ -416,6 +364,16 @@ static void amp_init(struct hci_request *req) hci_req_add(req, HCI_OP_READ_LOCATION_DATA, 0, NULL); } +static void amp_init2(struct hci_request *req) +{ + /* Read Local Supported Features. Not all AMP controllers + * support this so it's placed conditionally in the second + * stage init. + */ + if (req->hdev->commands[14] & 0x20) + hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); +} + static void hci_init1_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; @@ -432,7 +390,7 @@ static void hci_init1_req(struct hci_request *req, unsigned long opt) break; case HCI_AMP: - amp_init(req); + amp_init1(req); break; default: @@ -494,7 +452,7 @@ static void le_setup(struct hci_request *req) /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) - set_bit(HCI_LE_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_LE_ENABLED); } static void hci_setup_event_mask(struct hci_request *req) @@ -578,10 +536,13 @@ static void hci_init2_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; + if (hdev->dev_type == HCI_AMP) + return amp_init2(req); + if (lmp_bredr_capable(hdev)) bredr_setup(req); else - clear_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); if (lmp_le_capable(hdev)) le_setup(req); @@ -607,7 +568,7 @@ static void hci_init2_req(struct 
hci_request *req, unsigned long opt) */ hdev->max_page = 0x01; - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { u8 mode = 0x01; hci_req_add(req, HCI_OP_WRITE_SSP_MODE, @@ -646,7 +607,7 @@ static void hci_init2_req(struct hci_request *req, unsigned long opt) sizeof(cp), &cp); } - if (test_bit(HCI_LINK_SECURITY, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) { u8 enable = 1; hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable), &enable); @@ -683,7 +644,7 @@ static void hci_set_le_support(struct hci_request *req) memset(&cp, 0, sizeof(cp)); - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { cp.le = 0x01; cp.simul = 0x00; } @@ -871,7 +832,7 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL); /* Enable Secure Connections if supported and configured */ - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) && bredr_sc_enabled(hdev)) { u8 support = 0x01; @@ -891,22 +852,22 @@ static int __hci_init(struct hci_dev *hdev) /* The Device Under Test (DUT) mode is special and available for * all controller types. So just create it early on. */ - if (test_bit(HCI_SETUP, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_SETUP)) { debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, &dut_mode_fops); } + err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); + if (err < 0) + return err; + /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode * BR/EDR/LE type controllers. AMP controllers only need the - * first stage init. + * first two stages of init. 
*/ if (hdev->dev_type != HCI_BREDR) return 0; - err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); - if (err < 0) - return err; - err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT); if (err < 0) return err; @@ -927,8 +888,8 @@ static int __hci_init(struct hci_dev *hdev) * So only when in setup phase or config phase, create the debugfs * entries and register the SMP channels. */ - if (!test_bit(HCI_SETUP, &hdev->dev_flags) && - !test_bit(HCI_CONFIG, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_SETUP) && + !hci_dev_test_flag(hdev, HCI_CONFIG)) return 0; hci_debugfs_create_common(hdev); @@ -1290,12 +1251,12 @@ int hci_inquiry(void __user *arg) if (!hdev) return -ENODEV; - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } @@ -1305,7 +1266,7 @@ int hci_inquiry(void __user *arg) goto done; } - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } @@ -1377,17 +1338,17 @@ static int hci_dev_do_open(struct hci_dev *hdev) hci_req_lock(hdev); - if (test_bit(HCI_UNREGISTER, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { ret = -ENODEV; goto done; } - if (!test_bit(HCI_SETUP, &hdev->dev_flags) && - !test_bit(HCI_CONFIG, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_SETUP) && + !hci_dev_test_flag(hdev, HCI_CONFIG)) { /* Check for rfkill but allow the HCI setup stage to * proceed (which in itself doesn't cause any RF activity). 
*/ - if (test_bit(HCI_RFKILLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_RFKILLED)) { ret = -ERFKILL; goto done; } @@ -1404,7 +1365,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) * This check is only valid for BR/EDR controllers * since AMP controllers do not have an address. */ - if (!test_bit(HCI_USER_CHANNEL, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hdev->dev_type == HCI_BREDR && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { @@ -1426,7 +1387,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); - if (test_bit(HCI_SETUP, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_SETUP)) { if (hdev->setup) ret = hdev->setup(hdev); @@ -1438,7 +1399,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) */ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks)) - set_bit(HCI_UNCONFIGURED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* For an unconfigured controller it is required to * read at least the version information provided by @@ -1448,11 +1409,11 @@ static int hci_dev_do_open(struct hci_dev *hdev) * also the original Bluetooth public device address * will be read using the Read BD Address command. */ - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) ret = __hci_unconf_init(hdev); } - if (test_bit(HCI_CONFIG, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_CONFIG)) { /* If public address change is configured, ensure that * the address gets programmed. 
If the driver does not * support changing the public address, fail the power @@ -1466,8 +1427,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) } if (!ret) { - if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && - !test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) ret = __hci_init(hdev); } @@ -1475,13 +1436,13 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (!ret) { hci_dev_hold(hdev); - set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); set_bit(HCI_UP, &hdev->flags); hci_notify(hdev, HCI_DEV_UP); - if (!test_bit(HCI_SETUP, &hdev->dev_flags) && - !test_bit(HCI_CONFIG, &hdev->dev_flags) && - !test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && - !test_bit(HCI_USER_CHANNEL, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_SETUP) && + !hci_dev_test_flag(hdev, HCI_CONFIG) && + !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hdev->dev_type == HCI_BREDR) { hci_dev_lock(hdev); mgmt_powered(hdev, 1); @@ -1533,8 +1494,8 @@ int hci_dev_open(__u16 dev) * HCI_USER_CHANNEL will be set first before attempting to * open the device. */ - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && - !test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EOPNOTSUPP; goto done; } @@ -1544,7 +1505,7 @@ int hci_dev_open(__u16 dev) * particularly important if the setup procedure has not yet * completed. */ - if (test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); /* After this call it is guaranteed that the setup procedure @@ -1559,9 +1520,9 @@ int hci_dev_open(__u16 dev) * is in use this bit will be cleared again and userspace has * to explicitly enable it. 
*/ - if (!test_bit(HCI_USER_CHANNEL, &hdev->dev_flags) && - !test_bit(HCI_MGMT, &hdev->dev_flags)) - set_bit(HCI_BONDABLE, &hdev->dev_flags); + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + !hci_dev_test_flag(hdev, HCI_MGMT)) + hci_dev_set_flag(hdev, HCI_BONDABLE); err = hci_dev_do_open(hdev); @@ -1591,6 +1552,14 @@ static int hci_dev_do_close(struct hci_dev *hdev) { BT_DBG("%s %p", hdev->name, hdev); + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + hdev->shutdown(hdev); + } + cancel_delayed_work(&hdev->power_off); hci_req_cancel(hdev, ENODEV); @@ -1609,19 +1578,24 @@ static int hci_dev_do_close(struct hci_dev *hdev) if (hdev->discov_timeout > 0) { cancel_delayed_work(&hdev->discov_off); hdev->discov_timeout = 0; - clear_bit(HCI_DISCOVERABLE, &hdev->dev_flags); - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); } - if (test_and_clear_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) + if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); cancel_delayed_work_sync(&hdev->le_scan_disable); cancel_delayed_work_sync(&hdev->le_scan_restart); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) cancel_delayed_work_sync(&hdev->rpa_expired); + if (hdev->adv_instance_timeout) { + cancel_delayed_work_sync(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. 
*/ @@ -1631,7 +1605,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - if (!test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) { + if (!hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { if (hdev->dev_type == HCI_BREDR) mgmt_powered(hdev, 0); } @@ -1651,8 +1625,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!test_bit(HCI_AUTO_OFF, &hdev->dev_flags) && - !test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && + !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { set_bit(HCI_INIT, &hdev->flags); __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); @@ -1674,16 +1648,13 @@ static int hci_dev_do_close(struct hci_dev *hdev) hdev->sent_cmd = NULL; } - kfree_skb(hdev->recv_evt); - hdev->recv_evt = NULL; - /* After this point our queues are empty * and no tasks are scheduled. 
*/ hdev->close(hdev); /* Clear flags */ hdev->flags &= BIT(HCI_RAW); - hdev->dev_flags &= ~HCI_PERSISTENT_MASK; + hci_dev_clear_volatile_flags(hdev); /* Controller radio is available but is currently powered down */ hdev->amp_status = AMP_STATUS_POWERED_DOWN; @@ -1707,12 +1678,12 @@ int hci_dev_close(__u16 dev) if (!hdev) return -ENODEV; - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } - if (test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); err = hci_dev_do_close(hdev); @@ -1770,12 +1741,12 @@ int hci_dev_reset(__u16 dev) goto done; } - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } @@ -1796,12 +1767,12 @@ int hci_dev_reset_stat(__u16 dev) if (!hdev) return -ENODEV; - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = -EBUSY; goto done; } - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { ret = -EOPNOTSUPP; goto done; } @@ -1820,29 +1791,29 @@ static void hci_update_scan_state(struct hci_dev *hdev, u8 scan) BT_DBG("%s scan 0x%02x", hdev->name, scan); if ((scan & SCAN_PAGE)) - conn_changed = !test_and_set_bit(HCI_CONNECTABLE, - &hdev->dev_flags); + conn_changed = !hci_dev_test_and_set_flag(hdev, + HCI_CONNECTABLE); else - conn_changed = test_and_clear_bit(HCI_CONNECTABLE, - &hdev->dev_flags); + conn_changed = hci_dev_test_and_clear_flag(hdev, + HCI_CONNECTABLE); if ((scan & SCAN_INQUIRY)) { - discov_changed = !test_and_set_bit(HCI_DISCOVERABLE, - &hdev->dev_flags); + discov_changed = !hci_dev_test_and_set_flag(hdev, + HCI_DISCOVERABLE); } else { - 
clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); - discov_changed = test_and_clear_bit(HCI_DISCOVERABLE, - &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); + discov_changed = hci_dev_test_and_clear_flag(hdev, + HCI_DISCOVERABLE); } - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; if (conn_changed || discov_changed) { /* In case this was disabled through mgmt */ - set_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) mgmt_update_adv_data(hdev); mgmt_new_settings(hdev); @@ -1862,12 +1833,12 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) if (!hdev) return -ENODEV; - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } @@ -1877,7 +1848,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) goto done; } - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } @@ -1981,7 +1952,7 @@ int hci_get_dev_list(void __user *arg) * is running, but in that case still indicate that the * device is actually down. */ - if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags &= ~BIT(HCI_UP); (dr + n)->dev_id = hdev->id; @@ -2019,7 +1990,7 @@ int hci_get_dev_info(void __user *arg) * is running, but in that case still indicate that the * device is actually down. 
*/ - if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags = hdev->flags & ~BIT(HCI_UP); else flags = hdev->flags; @@ -2062,16 +2033,16 @@ static int hci_rfkill_set_block(void *data, bool blocked) BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (blocked) { - set_bit(HCI_RFKILLED, &hdev->dev_flags); - if (!test_bit(HCI_SETUP, &hdev->dev_flags) && - !test_bit(HCI_CONFIG, &hdev->dev_flags)) + hci_dev_set_flag(hdev, HCI_RFKILLED); + if (!hci_dev_test_flag(hdev, HCI_SETUP) && + !hci_dev_test_flag(hdev, HCI_CONFIG)) hci_dev_do_close(hdev); } else { - clear_bit(HCI_RFKILLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_RFKILLED); } return 0; @@ -2100,23 +2071,23 @@ static void hci_power_on(struct work_struct *work) * ignored and they need to be checked now. If they are still * valid, it is important to turn the device back off. */ - if (test_bit(HCI_RFKILLED, &hdev->dev_flags) || - test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) || + if (hci_dev_test_flag(hdev, HCI_RFKILLED) || + hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (hdev->dev_type == HCI_BREDR && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { - clear_bit(HCI_AUTO_OFF, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); - } else if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) { + } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } - if (test_and_clear_bit(HCI_SETUP, &hdev->dev_flags)) { + if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. 
*/ - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send @@ -2127,11 +2098,11 @@ static void hci_power_on(struct work_struct *work) * and no event will be send. */ mgmt_index_added(hdev); - } else if (test_and_clear_bit(HCI_CONFIG, &hdev->dev_flags)) { + } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ - if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only @@ -2181,6 +2152,17 @@ static void hci_discov_off(struct work_struct *work) mgmt_discoverable_timeout(hdev); } +static void hci_adv_timeout_expire(struct work_struct *work) +{ + struct hci_dev *hdev; + + hdev = container_of(work, struct hci_dev, adv_instance_expire.work); + + BT_DBG("%s", hdev->name); + + mgmt_adv_timeout_expired(hdev); +} + void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; @@ -2500,6 +2482,42 @@ void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) } } +bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +{ + struct smp_ltk *k; + struct smp_irk *irk; + u8 addr_type; + + if (type == BDADDR_BREDR) { + if (hci_find_link_key(hdev, bdaddr)) + return true; + return false; + } + + /* Convert to HCI addr type which struct smp_ltk uses */ + if (type == BDADDR_LE_PUBLIC) + addr_type = ADDR_LE_DEV_PUBLIC; + else + addr_type = ADDR_LE_DEV_RANDOM; + + irk = hci_get_irk(hdev, bdaddr, addr_type); + if (irk) { + bdaddr = &irk->bdaddr; + addr_type = irk->addr_type; + } + + rcu_read_lock(); + list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { + if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + 
return false; +} + /* HCI command timer function */ static void hci_cmd_timeout(struct work_struct *work) { @@ -2608,6 +2626,130 @@ int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + if (adv_instance->instance == instance) + return adv_instance; + } + + return NULL; +} + +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { + struct adv_info *cur_instance; + + cur_instance = hci_find_adv_instance(hdev, instance); + if (!cur_instance) + return NULL; + + if (cur_instance == list_last_entry(&hdev->adv_instances, + struct adv_info, list)) + return list_first_entry(&hdev->adv_instances, + struct adv_info, list); + else + return list_next_entry(cur_instance, list); +} + +/* This function requires the caller holds hdev->lock */ +int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + BT_DBG("%s removing %dMR", hdev->name, instance); + + if (hdev->cur_adv_instance == instance && hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + + list_del(&adv_instance->list); + kfree(adv_instance); + + hdev->adv_instance_cnt--; + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +void hci_adv_instances_clear(struct hci_dev *hdev) +{ + struct adv_info *adv_instance, *n; + + if (hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + list_del(&adv_instance->list); + 
kfree(adv_instance); + } + + hdev->adv_instance_cnt = 0; +} + +/* This function requires the caller holds hdev->lock */ +int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, + u16 adv_data_len, u8 *adv_data, + u16 scan_rsp_len, u8 *scan_rsp_data, + u16 timeout, u16 duration) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (adv_instance) { + memset(adv_instance->adv_data, 0, + sizeof(adv_instance->adv_data)); + memset(adv_instance->scan_rsp_data, 0, + sizeof(adv_instance->scan_rsp_data)); + } else { + if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES || + instance < 1 || instance > HCI_MAX_ADV_INSTANCES) + return -EOVERFLOW; + + adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL); + if (!adv_instance) + return -ENOMEM; + + adv_instance->pending = true; + adv_instance->instance = instance; + list_add(&adv_instance->list, &hdev->adv_instances); + hdev->adv_instance_cnt++; + } + + adv_instance->flags = flags; + adv_instance->adv_data_len = adv_data_len; + adv_instance->scan_rsp_len = scan_rsp_len; + + if (adv_data_len) + memcpy(adv_instance->adv_data, adv_data, adv_data_len); + + if (scan_rsp_len) + memcpy(adv_instance->scan_rsp_data, + scan_rsp_data, scan_rsp_len); + + adv_instance->timeout = timeout; + adv_instance->remaining_time = timeout; + + if (duration == 0) + adv_instance->duration = HCI_DEFAULT_ADV_DURATION; + else + adv_instance->duration = duration; + + BT_DBG("%s for %dMR", hdev->name, instance); + + return 0; +} + struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { @@ -2822,7 +2964,6 @@ static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status, { /* General inquiry access code (GIAC) */ u8 lap[3] = { 0x33, 0x8b, 0x9e }; - struct hci_request req; struct hci_cp_inquiry cp; int err; @@ -2841,21 +2982,39 @@ static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status, break; case DISCOV_TYPE_INTERLEAVED: 
- hci_req_init(&req, hdev); + hci_dev_lock(hdev); - memset(&cp, 0, sizeof(cp)); - memcpy(&cp.lap, lap, sizeof(cp.lap)); - cp.length = DISCOV_INTERLEAVED_INQUIRY_LEN; - hci_req_add(&req, HCI_OP_INQUIRY, sizeof(cp), &cp); + if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, + &hdev->quirks)) { + /* If we were running LE only scan, change discovery + * state. If we were running both LE and BR/EDR inquiry + * simultaneously, and BR/EDR inquiry is already + * finished, stop discovery, otherwise BR/EDR inquiry + * will stop discovery when finished. If we will resolve + * remote device name, do not change discovery state. + */ + if (!test_bit(HCI_INQUIRY, &hdev->flags) && + hdev->discovery.state != DISCOVERY_RESOLVING) + hci_discovery_set_state(hdev, + DISCOVERY_STOPPED); + } else { + struct hci_request req; - hci_dev_lock(hdev); + hci_inquiry_cache_flush(hdev); - hci_inquiry_cache_flush(hdev); + hci_req_init(&req, hdev); - err = hci_req_run(&req, inquiry_complete); - if (err) { - BT_ERR("Inquiry request failed: err %d", err); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + memset(&cp, 0, sizeof(cp)); + memcpy(&cp.lap, lap, sizeof(cp.lap)); + cp.length = DISCOV_INTERLEAVED_INQUIRY_LEN; + hci_req_add(&req, HCI_OP_INQUIRY, sizeof(cp), &cp); + + err = hci_req_run(&req, inquiry_complete); + if (err) { + BT_ERR("Inquiry request failed: err %d", err); + hci_discovery_set_state(hdev, + DISCOVERY_STOPPED); + } } hci_dev_unlock(hdev); @@ -2934,7 +3093,7 @@ static void le_scan_restart_work(struct work_struct *work) BT_DBG("%s", hdev->name); /* If controller is not scanning we are done. 
*/ - if (!test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return; hci_req_init(&req, hdev); @@ -2967,9 +3126,9 @@ static void le_scan_restart_work(struct work_struct *work) void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { - if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) || + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || - (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags) && + (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; @@ -2996,6 +3155,9 @@ struct hci_dev *hci_alloc_dev(void) hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; + hdev->adv_instance_cnt = 0; + hdev->cur_adv_instance = 0x00; + hdev->adv_instance_timeout = 0; hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; @@ -3037,6 +3199,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); + INIT_LIST_HEAD(&hdev->adv_instances); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); @@ -3048,6 +3211,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); + INIT_DELAYED_WORK(&hdev->adv_instance_expire, hci_adv_timeout_expire); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); @@ -3137,16 +3301,16 @@ int hci_register_dev(struct hci_dev *hdev) } if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) - set_bit(HCI_RFKILLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RFKILLED); - set_bit(HCI_SETUP, &hdev->dev_flags); - set_bit(HCI_AUTO_OFF, &hdev->dev_flags); + 
hci_dev_set_flag(hdev, HCI_SETUP); + hci_dev_set_flag(hdev, HCI_AUTO_OFF); if (hdev->dev_type == HCI_BREDR) { /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ - set_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); } write_lock(&hci_dev_list_lock); @@ -3157,7 +3321,7 @@ int hci_register_dev(struct hci_dev *hdev) * and should not be included in normal operation. */ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) - set_bit(HCI_UNCONFIGURED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_UNCONFIGURED); hci_notify(hdev, HCI_DEV_REG); hci_dev_hold(hdev); @@ -3179,11 +3343,11 @@ EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { - int i, id; + int id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); - set_bit(HCI_UNREGISTER, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_UNREGISTER); id = hdev->id; @@ -3193,14 +3357,11 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_dev_do_close(hdev); - for (i = 0; i < NUM_REASSEMBLY; i++) - kfree_skb(hdev->reassembly[i]); - cancel_work_sync(&hdev->power_on); if (!test_bit(HCI_INIT, &hdev->flags) && - !test_bit(HCI_SETUP, &hdev->dev_flags) && - !test_bit(HCI_CONFIG, &hdev->dev_flags)) { + !hci_dev_test_flag(hdev, HCI_SETUP) && + !hci_dev_test_flag(hdev, HCI_CONFIG)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); @@ -3232,6 +3393,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); + hci_adv_instances_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); @@ -3299,158 +3461,15 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) } EXPORT_SYMBOL(hci_recv_frame); -static int hci_reassembly(struct hci_dev *hdev, int type, void *data, - int count, __u8 index) -{ - int len = 0; - int 
hlen = 0; - int remain = count; - struct sk_buff *skb; - struct bt_skb_cb *scb; - - if ((type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) || - index >= NUM_REASSEMBLY) - return -EILSEQ; - - skb = hdev->reassembly[index]; - - if (!skb) { - switch (type) { - case HCI_ACLDATA_PKT: - len = HCI_MAX_FRAME_SIZE; - hlen = HCI_ACL_HDR_SIZE; - break; - case HCI_EVENT_PKT: - len = HCI_MAX_EVENT_SIZE; - hlen = HCI_EVENT_HDR_SIZE; - break; - case HCI_SCODATA_PKT: - len = HCI_MAX_SCO_SIZE; - hlen = HCI_SCO_HDR_SIZE; - break; - } - - skb = bt_skb_alloc(len, GFP_ATOMIC); - if (!skb) - return -ENOMEM; - - scb = (void *) skb->cb; - scb->expect = hlen; - scb->pkt_type = type; - - hdev->reassembly[index] = skb; - } - - while (count) { - scb = (void *) skb->cb; - len = min_t(uint, scb->expect, count); - - memcpy(skb_put(skb, len), data, len); - - count -= len; - data += len; - scb->expect -= len; - remain = count; - - switch (type) { - case HCI_EVENT_PKT: - if (skb->len == HCI_EVENT_HDR_SIZE) { - struct hci_event_hdr *h = hci_event_hdr(skb); - scb->expect = h->plen; - - if (skb_tailroom(skb) < scb->expect) { - kfree_skb(skb); - hdev->reassembly[index] = NULL; - return -ENOMEM; - } - } - break; - - case HCI_ACLDATA_PKT: - if (skb->len == HCI_ACL_HDR_SIZE) { - struct hci_acl_hdr *h = hci_acl_hdr(skb); - scb->expect = __le16_to_cpu(h->dlen); - - if (skb_tailroom(skb) < scb->expect) { - kfree_skb(skb); - hdev->reassembly[index] = NULL; - return -ENOMEM; - } - } - break; - - case HCI_SCODATA_PKT: - if (skb->len == HCI_SCO_HDR_SIZE) { - struct hci_sco_hdr *h = hci_sco_hdr(skb); - scb->expect = h->dlen; - - if (skb_tailroom(skb) < scb->expect) { - kfree_skb(skb); - hdev->reassembly[index] = NULL; - return -ENOMEM; - } - } - break; - } - - if (scb->expect == 0) { - /* Complete frame */ - - bt_cb(skb)->pkt_type = type; - hci_recv_frame(hdev, skb); - - hdev->reassembly[index] = NULL; - return remain; - } - } - - return remain; -} - -#define STREAM_REASSEMBLY 0 - -int 
hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count) -{ - int type; - int rem = 0; - - while (count) { - struct sk_buff *skb = hdev->reassembly[STREAM_REASSEMBLY]; - - if (!skb) { - struct { char type; } *pkt; - - /* Start of the frame */ - pkt = data; - type = pkt->type; - - data++; - count--; - } else - type = bt_cb(skb)->pkt_type; - - rem = hci_reassembly(hdev, type, data, count, - STREAM_REASSEMBLY); - if (rem < 0) - return rem; - - data += (count - rem); - count = rem; - } - - return rem; -} -EXPORT_SYMBOL(hci_recv_stream_fragment); - /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - write_lock(&hci_cb_list_lock); - list_add(&cb->list, &hci_cb_list); - write_unlock(&hci_cb_list_lock); + mutex_lock(&hci_cb_list_lock); + list_add_tail(&cb->list, &hci_cb_list); + mutex_unlock(&hci_cb_list_lock); return 0; } @@ -3460,9 +3479,9 @@ int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - write_lock(&hci_cb_list_lock); + mutex_lock(&hci_cb_list_lock); list_del(&cb->list); - write_unlock(&hci_cb_list_lock); + mutex_unlock(&hci_cb_list_lock); return 0; } @@ -3495,11 +3514,6 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) } } -bool hci_req_pending(struct hci_dev *hdev) -{ - return (hdev->req_status == HCI_REQ_PEND); -} - /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) @@ -3874,7 +3888,7 @@ static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) { - if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { /* ACL tx timeout must be longer than maximum * link supervision timeout (40.9 seconds) */ if (!cnt && time_after(jiffies, hdev->acl_last_tx + @@ -4057,7 +4071,7 @@ static void hci_sched_le(struct hci_dev *hdev) if (!hci_conn_num(hdev, 
LE_LINK)) return; - if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { /* LE tx timeout must be longer than maximum * link supervision timeout (40.9 seconds) */ if (!hdev->le_cnt && hdev->le_pkts && @@ -4105,7 +4119,7 @@ static void hci_tx_work(struct work_struct *work) BT_DBG("%s acl %d sco %d le %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt, hdev->le_cnt); - if (!test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ hci_sched_acl(hdev); hci_sched_sco(hdev); @@ -4220,9 +4234,10 @@ static void hci_resend_last(struct hci_dev *hdev) queue_work(hdev->workqueue, &hdev->cmd_work); } -void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status) +void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, + hci_req_complete_t *req_complete, + hci_req_complete_skb_t *req_complete_skb) { - hci_req_complete_t req_complete = NULL; struct sk_buff *skb; unsigned long flags; @@ -4254,18 +4269,14 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status) * callback would be found in hdev->sent_cmd instead of the * command queue (hdev->cmd_q). */ - if (hdev->sent_cmd) { - req_complete = bt_cb(hdev->sent_cmd)->req.complete; - - if (req_complete) { - /* We must set the complete callback to NULL to - * avoid calling the callback more than once if - * this function gets called again. 
- */ - bt_cb(hdev->sent_cmd)->req.complete = NULL; + if (bt_cb(hdev->sent_cmd)->req.complete) { + *req_complete = bt_cb(hdev->sent_cmd)->req.complete; + return; + } - goto call_complete; - } + if (bt_cb(hdev->sent_cmd)->req.complete_skb) { + *req_complete_skb = bt_cb(hdev->sent_cmd)->req.complete_skb; + return; } /* Remove all pending commands belonging to this request */ @@ -4276,14 +4287,11 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status) break; } - req_complete = bt_cb(skb)->req.complete; + *req_complete = bt_cb(skb)->req.complete; + *req_complete_skb = bt_cb(skb)->req.complete_skb; kfree_skb(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); - -call_complete: - if (req_complete) - req_complete(hdev, status, status ? opcode : HCI_OP_NOP); } static void hci_rx_work(struct work_struct *work) @@ -4302,7 +4310,7 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { kfree_skb(skb); continue; } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 65261e5d4b84..7db4220941cc 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -28,6 +28,54 @@ #include "hci_debugfs.h" +#define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \ +static ssize_t __name ## _read(struct file *file, \ + char __user *user_buf, \ + size_t count, loff_t *ppos) \ +{ \ + struct hci_dev *hdev = file->private_data; \ + char buf[3]; \ + \ + buf[0] = test_bit(__quirk, &hdev->quirks) ? 
'Y' : 'N'; \ + buf[1] = '\n'; \ + buf[2] = '\0'; \ + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); \ +} \ + \ +static ssize_t __name ## _write(struct file *file, \ + const char __user *user_buf, \ + size_t count, loff_t *ppos) \ +{ \ + struct hci_dev *hdev = file->private_data; \ + char buf[32]; \ + size_t buf_size = min(count, (sizeof(buf) - 1)); \ + bool enable; \ + \ + if (test_bit(HCI_UP, &hdev->flags)) \ + return -EBUSY; \ + \ + if (copy_from_user(buf, user_buf, buf_size)) \ + return -EFAULT; \ + \ + buf[buf_size] = '\0'; \ + if (strtobool(buf, &enable)) \ + return -EINVAL; \ + \ + if (enable == test_bit(__quirk, &hdev->quirks)) \ + return -EALREADY; \ + \ + change_bit(__quirk, &hdev->quirks); \ + \ + return count; \ +} \ + \ +static const struct file_operations __name ## _fops = { \ + .open = simple_open, \ + .read = __name ## _read, \ + .write = __name ## _write, \ + .llseek = default_llseek, \ +} \ + static int features_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -66,6 +114,30 @@ static const struct file_operations features_fops = { .release = single_release, }; +static int device_id_show(struct seq_file *f, void *ptr) +{ + struct hci_dev *hdev = f->private; + + hci_dev_lock(hdev); + seq_printf(f, "%4.4x:%4.4x:%4.4x:%4.4x\n", hdev->devid_source, + hdev->devid_vendor, hdev->devid_product, hdev->devid_version); + hci_dev_unlock(hdev); + + return 0; +} + +static int device_id_open(struct inode *inode, struct file *file) +{ + return single_open(file, device_id_show, inode->i_private); +} + +static const struct file_operations device_id_fops = { + .open = device_id_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int device_list_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -166,7 +238,7 @@ static int remote_oob_show(struct seq_file *f, void *ptr) seq_printf(f, "%pMR (type %u) %u %*phN %*phN %*phN %*phN\n", &data->bdaddr, data->bdaddr_type, 
data->present, 16, data->hash192, 16, data->rand192, - 16, data->hash256, 19, data->rand256); + 16, data->hash256, 16, data->rand256); } hci_dev_unlock(hdev); @@ -247,7 +319,7 @@ static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_USE_DEBUG_KEYS, &hdev->dev_flags) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -265,7 +337,7 @@ static ssize_t sc_only_mode_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_SC_ONLY, &hdev->dev_flags) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -287,6 +359,8 @@ void hci_debugfs_create_common(struct hci_dev *hdev) debugfs_create_u16("hci_revision", 0444, hdev->debugfs, &hdev->hci_rev); debugfs_create_u8("hardware_error", 0444, hdev->debugfs, &hdev->hw_error_code); + debugfs_create_file("device_id", 0444, hdev->debugfs, hdev, + &device_id_fops); debugfs_create_file("device_list", 0444, hdev->debugfs, hdev, &device_list_fops); @@ -679,7 +753,7 @@ static ssize_t force_static_address_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 
'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -704,10 +778,10 @@ static ssize_t force_static_address_write(struct file *file, if (strtobool(buf, &enable)) return -EINVAL; - if (enable == test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags)) + if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR)) return -EALREADY; - change_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags); + hci_dev_change_flag(hdev, HCI_FORCE_STATIC_ADDR); return count; } @@ -997,6 +1071,11 @@ static int adv_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, adv_max_interval_set, "%llu\n"); +DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, + HCI_QUIRK_STRICT_DUPLICATE_FILTER); +DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, + HCI_QUIRK_SIMULTANEOUS_DISCOVERY); + void hci_debugfs_create_le(struct hci_dev *hdev) { debugfs_create_file("identity", 0400, hdev->debugfs, hdev, @@ -1041,6 +1120,13 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &adv_max_interval_fops); debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + + debugfs_create_file("quirk_strict_duplicate_filter", 0644, + hdev->debugfs, hdev, + &quirk_strict_duplicate_filter_fops); + debugfs_create_file("quirk_simultaneous_discovery", 0644, + hdev->debugfs, hdev, + &quirk_simultaneous_discovery_fops); } void hci_debugfs_create_conn(struct hci_conn *conn) diff --git a/net/bluetooth/hci_debugfs.h b/net/bluetooth/hci_debugfs.h index fb68efe083c5..4444dc8cedc2 100644 --- a/net/bluetooth/hci_debugfs.h +++ b/net/bluetooth/hci_debugfs.h @@ -20,7 +20,29 @@ SOFTWARE IS DISCLAIMED. 
*/ +#if IS_ENABLED(CONFIG_BT_DEBUGFS) + void hci_debugfs_create_common(struct hci_dev *hdev); void hci_debugfs_create_bredr(struct hci_dev *hdev); void hci_debugfs_create_le(struct hci_dev *hdev); void hci_debugfs_create_conn(struct hci_conn *conn); + +#else + +static inline void hci_debugfs_create_common(struct hci_dev *hdev) +{ +} + +static inline void hci_debugfs_create_bredr(struct hci_dev *hdev) +{ +} + +static inline void hci_debugfs_create_le(struct hci_dev *hdev) +{ +} + +static inline void hci_debugfs_create_conn(struct hci_conn *conn) +{ +} + +#endif diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a3fb094822b6..32363c2b7f83 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -70,7 +70,7 @@ static void hci_cc_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb) if (status) return; - set_bit(HCI_PERIODIC_INQ, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_PERIODIC_INQ); } static void hci_cc_exit_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb) @@ -82,7 +82,7 @@ static void hci_cc_exit_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb) if (status) return; - clear_bit(HCI_PERIODIC_INQ, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); hci_conn_check_pending(hdev); } @@ -198,7 +198,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) return; /* Reset all non-persistent flags */ - hdev->dev_flags &= ~HCI_PERSISTENT_MASK; + hci_dev_clear_volatile_flags(hdev); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -265,7 +265,7 @@ static void hci_cc_write_local_name(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_set_local_name_complete(hdev, sent, status); else if (!status) memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH); @@ -282,8 +282,8 @@ static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb) if (rp->status) return; - if 
(test_bit(HCI_SETUP, &hdev->dev_flags) || - test_bit(HCI_CONFIG, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) memcpy(hdev->dev_name, rp->name, HCI_MAX_NAME_LENGTH); } @@ -309,7 +309,7 @@ static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb) clear_bit(HCI_AUTH, &hdev->flags); } - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_auth_enable_complete(hdev, status); hci_dev_unlock(hdev); @@ -404,7 +404,7 @@ static void hci_cc_write_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb) if (status == 0) memcpy(hdev->dev_class, sent, 3); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_set_class_of_dev_complete(hdev, sent, status); hci_dev_unlock(hdev); @@ -497,13 +497,13 @@ static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb) hdev->features[1][0] &= ~LMP_HOST_SSP; } - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_ssp_enable_complete(hdev, sent->mode, status); else if (!status) { if (sent->mode) - set_bit(HCI_SSP_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_SSP_ENABLED); else - clear_bit(HCI_SSP_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); } hci_dev_unlock(hdev); @@ -529,11 +529,11 @@ static void hci_cc_write_sc_support(struct hci_dev *hdev, struct sk_buff *skb) hdev->features[1][0] &= ~LMP_HOST_SC; } - if (!test_bit(HCI_MGMT, &hdev->dev_flags) && !status) { + if (!hci_dev_test_flag(hdev, HCI_MGMT) && !status) { if (sent->support) - set_bit(HCI_SC_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_SC_ENABLED); else - clear_bit(HCI_SC_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_SC_ENABLED); } hci_dev_unlock(hdev); @@ -548,8 +548,8 @@ static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb) if (rp->status) return; - if (test_bit(HCI_SETUP, &hdev->dev_flags) || - 
test_bit(HCI_CONFIG, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) { hdev->hci_ver = rp->hci_ver; hdev->hci_rev = __le16_to_cpu(rp->hci_rev); hdev->lmp_ver = rp->lmp_ver; @@ -568,8 +568,8 @@ static void hci_cc_read_local_commands(struct hci_dev *hdev, if (rp->status) return; - if (test_bit(HCI_SETUP, &hdev->dev_flags) || - test_bit(HCI_CONFIG, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) memcpy(hdev->commands, rp->commands, sizeof(hdev->commands)); } @@ -691,7 +691,7 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) if (test_bit(HCI_INIT, &hdev->flags)) bacpy(&hdev->bdaddr, &rp->bdaddr); - if (test_bit(HCI_SETUP, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SETUP)) bacpy(&hdev->setup_addr, &rp->bdaddr); } @@ -900,7 +900,7 @@ static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_pin_code_reply_complete(hdev, &rp->bdaddr, rp->status); if (rp->status) @@ -926,7 +926,7 @@ static void hci_cc_pin_code_neg_reply(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_pin_code_neg_reply_complete(hdev, &rp->bdaddr, rp->status); @@ -985,7 +985,7 @@ static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_confirm_reply_complete(hdev, &rp->bdaddr, ACL_LINK, 0, rp->status); @@ -1001,7 +1001,7 @@ static void hci_cc_user_confirm_neg_reply(struct hci_dev *hdev, hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_confirm_neg_reply_complete(hdev, &rp->bdaddr, ACL_LINK, 0, rp->status); @@ -1016,7 +1016,7 @@ 
static void hci_cc_user_passkey_reply(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_passkey_reply_complete(hdev, &rp->bdaddr, ACL_LINK, 0, rp->status); @@ -1032,7 +1032,7 @@ static void hci_cc_user_passkey_neg_reply(struct hci_dev *hdev, hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_passkey_neg_reply_complete(hdev, &rp->bdaddr, ACL_LINK, 0, rp->status); @@ -1045,11 +1045,6 @@ static void hci_cc_read_local_oob_data(struct hci_dev *hdev, struct hci_rp_read_local_oob_data *rp = (void *) skb->data; BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - - hci_dev_lock(hdev); - mgmt_read_local_oob_data_complete(hdev, rp->hash, rp->rand, NULL, NULL, - rp->status); - hci_dev_unlock(hdev); } static void hci_cc_read_local_oob_ext_data(struct hci_dev *hdev, @@ -1058,15 +1053,8 @@ static void hci_cc_read_local_oob_ext_data(struct hci_dev *hdev, struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data; BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - - hci_dev_lock(hdev); - mgmt_read_local_oob_data_complete(hdev, rp->hash192, rp->rand192, - rp->hash256, rp->rand256, - rp->status); - hci_dev_unlock(hdev); } - static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); @@ -1109,7 +1097,7 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) if (*sent) { struct hci_conn *conn; - set_bit(HCI_LE_ADV, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_LE_ADV); conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); if (conn) @@ -1117,7 +1105,7 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) &conn->le_conn_timeout, conn->conn_timeout); } else { - clear_bit(HCI_LE_ADV, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LE_ADV); } hci_dev_unlock(hdev); @@ -1192,7 +1180,7 @@ 
static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, switch (cp->enable) { case LE_SCAN_ENABLE: - set_bit(HCI_LE_SCAN, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_LE_SCAN); if (hdev->le_scan_type == LE_SCAN_ACTIVE) clear_pending_adv_report(hdev); break; @@ -1217,7 +1205,7 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, */ cancel_delayed_work(&hdev->le_scan_disable); - clear_bit(HCI_LE_SCAN, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LE_SCAN); /* The HCI_LE_SCAN_INTERRUPTED flag indicates that we * interrupted scanning due to a connect request. Mark @@ -1226,10 +1214,9 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, * been disabled because of active scanning, so * re-enable it again if necessary. */ - if (test_and_clear_bit(HCI_LE_SCAN_INTERRUPTED, - &hdev->dev_flags)) + if (hci_dev_test_and_clear_flag(hdev, HCI_LE_SCAN_INTERRUPTED)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - else if (!test_bit(HCI_LE_ADV, &hdev->dev_flags) && + else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) && hdev->discovery.state == DISCOVERY_FINDING) mgmt_reenable_advertising(hdev); @@ -1388,11 +1375,11 @@ static void hci_cc_write_le_host_supported(struct hci_dev *hdev, if (sent->le) { hdev->features[1][0] |= LMP_HOST_LE; - set_bit(HCI_LE_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_LE_ENABLED); } else { hdev->features[1][0] &= ~LMP_HOST_LE; - clear_bit(HCI_LE_ENABLED, &hdev->dev_flags); - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LE_ENABLED); + hci_dev_clear_flag(hdev, HCI_ADVERTISING); } if (sent->simul) @@ -1537,7 +1524,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) if (conn && conn->state == BT_CONNECT) { if (status != 0x0c || conn->attempt > 2) { conn->state = BT_CLOSED; - hci_proto_connect_cfm(conn, status); + hci_connect_cfm(conn, status); hci_conn_del(conn); } else conn->state = BT_CONNECT2; @@ -1581,7 +1568,7 @@ static void hci_cs_add_sco(struct hci_dev *hdev, 
__u8 status) if (sco) { sco->state = BT_CLOSED; - hci_proto_connect_cfm(sco, status); + hci_connect_cfm(sco, status); hci_conn_del(sco); } } @@ -1608,7 +1595,7 @@ static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status) conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); if (conn) { if (conn->state == BT_CONFIG) { - hci_proto_connect_cfm(conn, status); + hci_connect_cfm(conn, status); hci_conn_drop(conn); } } @@ -1635,7 +1622,7 @@ static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status) conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); if (conn) { if (conn->state == BT_CONFIG) { - hci_proto_connect_cfm(conn, status); + hci_connect_cfm(conn, status); hci_conn_drop(conn); } } @@ -1769,7 +1756,7 @@ static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) hci_check_pending_name(hdev, conn, &cp->bdaddr, NULL, 0); if (!conn) @@ -1811,7 +1798,7 @@ static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status) conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); if (conn) { if (conn->state == BT_CONFIG) { - hci_proto_connect_cfm(conn, status); + hci_connect_cfm(conn, status); hci_conn_drop(conn); } } @@ -1838,7 +1825,7 @@ static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status) conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); if (conn) { if (conn->state == BT_CONFIG) { - hci_proto_connect_cfm(conn, status); + hci_connect_cfm(conn, status); hci_conn_drop(conn); } } @@ -1873,7 +1860,7 @@ static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status) if (sco) { sco->state = BT_CLOSED; - hci_proto_connect_cfm(sco, status); + hci_connect_cfm(sco, status); hci_conn_del(sco); } } @@ -2049,6 +2036,33 @@ unlock: hci_dev_unlock(hdev); } +static void hci_cs_le_read_remote_features(struct 
hci_dev *hdev, u8 status) +{ + struct hci_cp_le_read_remote_features *cp; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_REMOTE_FEATURES); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + if (conn->state == BT_CONFIG) { + hci_connect_cfm(conn, status); + hci_conn_drop(conn); + } + } + + hci_dev_unlock(hdev); +} + static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status) { struct hci_cp_le_start_enc *cp; @@ -2118,7 +2132,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; hci_dev_lock(hdev); @@ -2127,7 +2141,16 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; if (list_empty(&discov->resolve)) { - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + /* When BR/EDR inquiry is active and no LE scanning is in + * progress, then change discovery state to indicate completion. + * + * When running LE scanning and BR/EDR inquiry simultaneously + * and the LE scan already finished, then change the discovery + * state to indicate completion. + */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || + !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); goto unlock; } @@ -2136,7 +2159,16 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) e->name_state = NAME_PENDING; hci_discovery_set_state(hdev, DISCOVERY_RESOLVING); } else { - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + /* When BR/EDR inquiry is active and no LE scanning is in + * progress, then change discovery state to indicate completion. 
+ * + * When running LE scanning and BR/EDR inquiry simultaneously + * and the LE scan already finished, then change the discovery + * state to indicate completion. + */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || + !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } unlock: @@ -2154,7 +2186,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!num_rsp) return; - if (test_bit(HCI_PERIODIC_INQ, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) return; hci_dev_lock(hdev); @@ -2255,10 +2287,10 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_sco_setup(conn, ev->status); if (ev->status) { - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); hci_conn_del(conn); } else if (ev->link_type != ACL_LINK) - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); @@ -2304,8 +2336,8 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) * connection. These features are only touched through mgmt so * only do the checks if HCI_MGMT is set. 
*/ - if (test_bit(HCI_MGMT, &hdev->dev_flags) && - !test_bit(HCI_CONNECTABLE, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_MGMT) && + !hci_dev_test_flag(hdev, HCI_CONNECTABLE) && !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr, BDADDR_BREDR)) { hci_reject_conn(hdev, &ev->bdaddr); @@ -2366,7 +2398,7 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) &cp); } else { conn->state = BT_CONNECT2; - hci_proto_connect_cfm(conn, 0); + hci_connect_cfm(conn, 0); } } @@ -2444,7 +2476,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) type = conn->type; - hci_proto_disconn_cfm(conn, ev->reason); + hci_disconn_cfm(conn, ev->reason); hci_conn_del(conn); /* Re-enable advertising if necessary, since it might @@ -2501,7 +2533,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) &cp); } else { conn->state = BT_CONNECTED; - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } } else { @@ -2542,7 +2574,7 @@ static void hci_remote_name_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) goto check_auth; if (ev->status == 0) @@ -2571,6 +2603,63 @@ unlock: hci_dev_unlock(hdev); } +static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + const struct hci_rp_read_enc_key_size *rp; + struct hci_conn *conn; + u16 handle; + + BT_DBG("%s status 0x%02x", hdev->name, status); + + if (!skb || skb->len < sizeof(*rp)) { + BT_ERR("%s invalid HCI Read Encryption Key Size response", + hdev->name); + return; + } + + rp = (void *)skb->data; + handle = le16_to_cpu(rp->handle); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + goto unlock; + + /* If we fail to read the encryption key size, assume maximum + * (which is 
the same we do also when this HCI command isn't + * supported. + */ + if (rp->status) { + BT_ERR("%s failed to read key size for handle %u", hdev->name, + handle); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + conn->enc_key_size = rp->key_size; + } + + if (conn->state == BT_CONFIG) { + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, 0); + hci_conn_drop(conn); + } else { + u8 encrypt; + + if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + encrypt = 0x00; + else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) + encrypt = 0x02; + else + encrypt = 0x01; + + hci_encrypt_cfm(conn, 0, encrypt); + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_encrypt_change *ev = (void *) skb->data; @@ -2608,7 +2697,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) * whenever the encryption procedure fails. */ if (ev->status && conn->type == LE_LINK) - set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); @@ -2618,23 +2707,52 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; + /* In Secure Connections Only mode, do not allow any connections + * that are not encrypted with AES-CCM using a P-256 authenticated + * combination key. 
+ */ + if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && + (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || + conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { + hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); + hci_conn_drop(conn); + goto unlock; + } + + /* Try reading the encryption key size for encrypted ACL links */ + if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { + struct hci_cp_read_enc_key_size cp; + struct hci_request req; - /* In Secure Connections Only mode, do not allow any - * connections that are not encrypted with AES-CCM - * using a P-256 authenticated combination key. + /* Only send HCI_Read_Encryption_Key_Size if the + * controller really supports it. If it doesn't, assume + * the default size (16). */ - if (test_bit(HCI_SC_ONLY, &hdev->dev_flags) && - (!test_bit(HCI_CONN_AES_CCM, &conn->flags) || - conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) { - hci_proto_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE); - hci_conn_drop(conn); - goto unlock; + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + goto notify; + } + + hci_req_init(&req, hdev); + + cp.handle = cpu_to_le16(conn->handle); + hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); + + if (hci_req_run_skb(&req, read_enc_key_size_complete)) { + BT_ERR("Sending HCI Read Encryption Key Size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + goto notify; } - hci_proto_connect_cfm(conn, ev->status); + goto unlock; + } + +notify: + if (conn->state == BT_CONFIG) { + if (!ev->status) + conn->state = BT_CONNECTED; + + hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } else hci_encrypt_cfm(conn, ev->status, ev->encrypt); @@ -2707,7 +2825,7 @@ static void hci_remote_features_evt(struct hci_dev *hdev, if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } @@ -2715,17 +2833,19 @@ unlock: hci_dev_unlock(hdev); } -static void 
hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, + u16 *opcode, u8 *status, + hci_req_complete_t *req_complete, + hci_req_complete_skb_t *req_complete_skb) { struct hci_ev_cmd_complete *ev = (void *) skb->data; - u8 status = skb->data[sizeof(*ev)]; - __u16 opcode; - skb_pull(skb, sizeof(*ev)); + *opcode = __le16_to_cpu(ev->opcode); + *status = skb->data[sizeof(*ev)]; - opcode = __le16_to_cpu(ev->opcode); + skb_pull(skb, sizeof(*ev)); - switch (opcode) { + switch (*opcode) { case HCI_OP_INQUIRY_CANCEL: hci_cc_inquiry_cancel(hdev, skb); break; @@ -3003,32 +3123,36 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) break; default: - BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); + BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; } - if (opcode != HCI_OP_NOP) + if (*opcode != HCI_OP_NOP) cancel_delayed_work(&hdev->cmd_timer); - hci_req_cmd_complete(hdev, opcode, status); - - if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) atomic_set(&hdev->cmd_cnt, 1); - if (!skb_queue_empty(&hdev->cmd_q)) - queue_work(hdev->workqueue, &hdev->cmd_work); - } + + hci_req_cmd_complete(hdev, *opcode, *status, req_complete, + req_complete_skb); + + if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q)) + queue_work(hdev->workqueue, &hdev->cmd_work); } -static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, + u16 *opcode, u8 *status, + hci_req_complete_t *req_complete, + hci_req_complete_skb_t *req_complete_skb) { struct hci_ev_cmd_status *ev = (void *) skb->data; - __u16 opcode; skb_pull(skb, sizeof(*ev)); - opcode = __le16_to_cpu(ev->opcode); + *opcode = __le16_to_cpu(ev->opcode); + *status = ev->status; - switch (opcode) { + switch (*opcode) { case HCI_OP_INQUIRY: hci_cs_inquiry(hdev, ev->status); 
break; @@ -3093,27 +3217,38 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cs_le_create_conn(hdev, ev->status); break; + case HCI_OP_LE_READ_REMOTE_FEATURES: + hci_cs_le_read_remote_features(hdev, ev->status); + break; + case HCI_OP_LE_START_ENC: hci_cs_le_start_enc(hdev, ev->status); break; default: - BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); + BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; } - if (opcode != HCI_OP_NOP) + if (*opcode != HCI_OP_NOP) cancel_delayed_work(&hdev->cmd_timer); + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) + atomic_set(&hdev->cmd_cnt, 1); + + /* Indicate request completion if the command failed. Also, if + * we're not waiting for a special event and we get a success + * command status we should try to flag the request as completed + * (since for this kind of commands there will not be a command + * complete event). + */ if (ev->status || (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->req.event)) - hci_req_cmd_complete(hdev, opcode, ev->status); + hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete, + req_complete_skb); - if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { - atomic_set(&hdev->cmd_cnt, 1); - if (!skb_queue_empty(&hdev->cmd_q)) - queue_work(hdev->workqueue, &hdev->cmd_work); - } + if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q)) + queue_work(hdev->workqueue, &hdev->cmd_work); } static void hci_hardware_error_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -3331,11 +3466,11 @@ static void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_conn_drop(conn); } - if (!test_bit(HCI_BONDABLE, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && !test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags)) { hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, sizeof(ev->bdaddr), &ev->bdaddr); - } else if (test_bit(HCI_MGMT, &hdev->dev_flags)) { + } else if (hci_dev_test_flag(hdev, HCI_MGMT)) { u8 secure; if 
(conn->pending_sec_level == BT_SECURITY_HIGH) @@ -3391,7 +3526,7 @@ static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s", hdev->name); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; hci_dev_lock(hdev); @@ -3465,7 +3600,7 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) set_bit(HCI_CONN_NEW_LINK_KEY, &conn->flags); conn_set_key(conn, ev->key_type, conn->pin_length); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) goto unlock; key = hci_add_link_key(hdev, conn, &ev->bdaddr, ev->link_key, @@ -3487,7 +3622,7 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) * store_hint being 0). */ if (key->type == HCI_LK_DEBUG_COMBINATION && - !test_bit(HCI_KEEP_DEBUG_KEYS, &hdev->dev_flags)) { + !hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS)) { list_del_rcu(&key->list); kfree_rcu(key, rcu); goto unlock; @@ -3570,7 +3705,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, if (!num_rsp) return; - if (test_bit(HCI_PERIODIC_INQ, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) return; hci_dev_lock(hdev); @@ -3679,7 +3814,7 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev, if (!hci_outgoing_auth_needed(hdev, conn)) { conn->state = BT_CONNECTED; - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } @@ -3738,7 +3873,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, break; } - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); @@ -3776,7 +3911,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, if (!num_rsp) return; - if (test_bit(HCI_PERIODIC_INQ, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) return; hci_dev_lock(hdev); @@ -3794,7 +3929,7 @@ static void 
hci_extended_inquiry_result_evt(struct hci_dev *hdev, data.rssi = info->rssi; data.ssp_mode = 0x01; - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) name_known = eir_has_data_type(info->data, sizeof(info->data), EIR_NAME_COMPLETE); @@ -3849,7 +3984,7 @@ static void hci_key_refresh_complete_evt(struct hci_dev *hdev, if (!ev->status) conn->state = BT_CONNECTED; - hci_proto_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, ev->status); hci_conn_drop(conn); } else { hci_auth_cfm(conn, ev->status); @@ -3890,41 +4025,37 @@ static u8 bredr_oob_data_present(struct hci_conn *conn) if (!data) return 0x00; - if (conn->out || test_bit(HCI_CONN_REMOTE_OOB, &conn->flags)) { - if (bredr_sc_enabled(hdev)) { - /* When Secure Connections is enabled, then just - * return the present value stored with the OOB - * data. The stored value contains the right present - * information. However it can only be trusted when - * not in Secure Connection Only mode. - */ - if (!test_bit(HCI_SC_ONLY, &hdev->dev_flags)) - return data->present; - - /* When Secure Connections Only mode is enabled, then - * the P-256 values are required. If they are not - * available, then do not declare that OOB data is - * present. - */ - if (!memcmp(data->rand256, ZERO_KEY, 16) || - !memcmp(data->hash256, ZERO_KEY, 16)) - return 0x00; - - return 0x02; - } + if (bredr_sc_enabled(hdev)) { + /* When Secure Connections is enabled, then just + * return the present value stored with the OOB + * data. The stored value contains the right present + * information. However it can only be trusted when + * not in Secure Connection Only mode. + */ + if (!hci_dev_test_flag(hdev, HCI_SC_ONLY)) + return data->present; - /* When Secure Connections is not enabled or actually - * not supported by the hardware, then check that if - * P-192 data values are present. + /* When Secure Connections Only mode is enabled, then + * the P-256 values are required. 
If they are not + * available, then do not declare that OOB data is + * present. */ - if (!memcmp(data->rand192, ZERO_KEY, 16) || - !memcmp(data->hash192, ZERO_KEY, 16)) + if (!memcmp(data->rand256, ZERO_KEY, 16) || + !memcmp(data->hash256, ZERO_KEY, 16)) return 0x00; - return 0x01; + return 0x02; } - return 0x00; + /* When Secure Connections is not enabled or actually + * not supported by the hardware, then check that if + * P-192 data values are present. + */ + if (!memcmp(data->rand192, ZERO_KEY, 16) || + !memcmp(data->hash192, ZERO_KEY, 16)) + return 0x00; + + return 0x01; } static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -3942,13 +4073,13 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_conn_hold(conn); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) goto unlock; /* Allow pairing if we're pairable, the initiators of the * pairing or if the remote is not requesting bonding. */ - if (test_bit(HCI_BONDABLE, &hdev->dev_flags) || + if (hci_dev_test_flag(hdev, HCI_BONDABLE) || test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags) || (conn->remote_auth & ~0x01) == HCI_AT_NO_BONDING) { struct hci_cp_io_capability_reply cp; @@ -3974,7 +4105,7 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) /* If we're not bondable, force one of the non-bondable * authentication requirement values. 
*/ - if (!test_bit(HCI_BONDABLE, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_BONDABLE)) conn->auth_type &= HCI_AT_NO_BONDING_MITM; cp.authentication = conn->auth_type; @@ -4011,8 +4142,6 @@ static void hci_io_capa_reply_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->remote_cap = ev->capability; conn->remote_auth = ev->authentication; - if (ev->oob_data) - set_bit(HCI_CONN_REMOTE_OOB, &conn->flags); unlock: hci_dev_unlock(hdev); @@ -4029,7 +4158,7 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, hci_dev_lock(hdev); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) goto unlock; conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); @@ -4100,7 +4229,7 @@ static void hci_user_passkey_request_evt(struct hci_dev *hdev, BT_DBG("%s", hdev->name); - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_passkey_request(hdev, &ev->bdaddr, ACL_LINK, 0); } @@ -4119,7 +4248,7 @@ static void hci_user_passkey_notify_evt(struct hci_dev *hdev, conn->passkey_notify = __le32_to_cpu(ev->passkey); conn->passkey_entered = 0; - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_passkey_notify(hdev, &conn->dst, conn->type, conn->dst_type, conn->passkey_notify, conn->passkey_entered); @@ -4157,7 +4286,7 @@ static void hci_keypress_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (test_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_user_passkey_notify(hdev, &conn->dst, conn->type, conn->dst_type, conn->passkey_notify, conn->passkey_entered); @@ -4226,7 +4355,7 @@ static void hci_remote_oob_data_request_evt(struct hci_dev *hdev, hci_dev_lock(hdev); - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) goto unlock; data = hci_find_remote_oob_data(hdev, &ev->bdaddr, BDADDR_BREDR); @@ -4243,7 +4372,7 @@ static void hci_remote_oob_data_request_evt(struct 
hci_dev *hdev, struct hci_cp_remote_oob_ext_data_reply cp; bacpy(&cp.bdaddr, &ev->bdaddr); - if (test_bit(HCI_SC_ONLY, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { memset(cp.hash192, 0, sizeof(cp.hash192)); memset(cp.rand192, 0, sizeof(cp.rand192)); } else { @@ -4409,7 +4538,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) /* All controllers implicitly stop advertising in the event of a * connection, so ensure that the state bit is cleared. */ - clear_bit(HCI_LE_ADV, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LE_ADV); conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); if (!conn) { @@ -4432,7 +4561,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn->out) { conn->resp_addr_type = ev->bdaddr_type; bacpy(&conn->resp_addr, &ev->bdaddr); - if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) { conn->init_addr_type = ADDR_LE_DEV_RANDOM; bacpy(&conn->init_addr, &hdev->rpa); } else { @@ -4503,7 +4632,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->sec_level = BT_SECURITY_LOW; conn->handle = __le16_to_cpu(ev->handle); - conn->state = BT_CONNECTED; + conn->state = BT_CONFIG; conn->le_conn_interval = le16_to_cpu(ev->interval); conn->le_conn_latency = le16_to_cpu(ev->latency); @@ -4512,7 +4641,33 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - hci_proto_connect_cfm(conn, ev->status); + if (!ev->status) { + /* The remote features procedure is defined for master + * role only. So only in case of an initiated connection + * request the remote features. + * + * If the local controller supports slave-initiated features + * exchange, then requesting the remote features in slave + * role is possible. Otherwise just transition into the + * connected state without requesting the remote features. 
+ */ + if (conn->out || + (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) { + struct hci_cp_le_read_remote_features cp; + + cp.handle = __cpu_to_le16(conn->handle); + + hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, + sizeof(cp), &cp); + + hci_conn_hold(conn); + } else { + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, ev->status); + } + } else { + hci_connect_cfm(conn, ev->status); + } params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, conn->dst_type); @@ -4658,7 +4813,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* If the controller is not using resolvable random * addresses, then this report can be ignored. */ - if (!test_bit(HCI_PRIVACY, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return; /* If the local IRK of the controller does not match @@ -4814,6 +4969,48 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_remote_feat_complete *ev = (void *)skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (conn) { + if (!ev->status) + memcpy(conn->features[0], ev->features, 8); + + if (conn->state == BT_CONFIG) { + __u8 status; + + /* If the local controller supports slave-initiated + * features exchange, but the remote controller does + * not, then it is possible that the error code 0x1a + * for unsupported remote feature gets returned. + * + * In this specific case, allow the connection to + * transition into connected state and mark it as + * successful. 
+ */ + if ((hdev->le_features[0] & HCI_LE_SLAVE_FEATURES) && + !conn->out && ev->status == 0x1a) + status = 0x00; + else + status = ev->status; + + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, status); + hci_conn_drop(conn); + } + } + + hci_dev_unlock(hdev); +} + static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_le_ltk_req *ev = (void *) skb->data; @@ -4844,7 +5041,8 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) goto not_found; } - memcpy(cp.ltk, ltk->val, sizeof(ltk->val)); + memcpy(cp.ltk, ltk->val, ltk->enc_size); + memset(cp.ltk + ltk->enc_size, 0, sizeof(cp.ltk) - ltk->enc_size); cp.handle = cpu_to_le16(conn->handle); conn->pending_sec_level = smp_ltk_sec_level(ltk); @@ -4987,6 +5185,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_REMOTE_FEAT_COMPLETE: + hci_le_remote_feat_complete_evt(hdev, skb); + break; + case HCI_EV_LE_LTK_REQ: hci_le_ltk_request_evt(hdev, skb); break; @@ -5020,32 +5222,79 @@ static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb) amp_read_loc_assoc_final_data(hdev, hcon); } -void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) +static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, + u8 event, struct sk_buff *skb) { - struct hci_event_hdr *hdr = (void *) skb->data; - __u8 event = hdr->evt; + struct hci_ev_cmd_complete *ev; + struct hci_event_hdr *hdr; - hci_dev_lock(hdev); + if (!skb) + return false; - /* Received events are (currently) only needed when a request is - * ongoing so avoid unnecessary memory allocation. 
- */ - if (hci_req_pending(hdev)) { - kfree_skb(hdev->recv_evt); - hdev->recv_evt = skb_clone(skb, GFP_KERNEL); + if (skb->len < sizeof(*hdr)) { + BT_ERR("Too short HCI event"); + return false; } - hci_dev_unlock(hdev); - + hdr = (void *) skb->data; skb_pull(skb, HCI_EVENT_HDR_SIZE); + if (event) { + if (hdr->evt != event) + return false; + return true; + } + + if (hdr->evt != HCI_EV_CMD_COMPLETE) { + BT_DBG("Last event is not cmd complete (0x%2.2x)", hdr->evt); + return false; + } + + if (skb->len < sizeof(*ev)) { + BT_ERR("Too short cmd_complete event"); + return false; + } + + ev = (void *) skb->data; + skb_pull(skb, sizeof(*ev)); + + if (opcode != __le16_to_cpu(ev->opcode)) { + BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode, + __le16_to_cpu(ev->opcode)); + return false; + } + + return true; +} + +void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_event_hdr *hdr = (void *) skb->data; + hci_req_complete_t req_complete = NULL; + hci_req_complete_skb_t req_complete_skb = NULL; + struct sk_buff *orig_skb = NULL; + u8 status = 0, event = hdr->evt, req_evt = 0; + u16 opcode = HCI_OP_NOP; + if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->req.event == event) { struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data; - u16 opcode = __le16_to_cpu(cmd_hdr->opcode); - - hci_req_cmd_complete(hdev, opcode, 0); + opcode = __le16_to_cpu(cmd_hdr->opcode); + hci_req_cmd_complete(hdev, opcode, status, &req_complete, + &req_complete_skb); + req_evt = event; } + /* If it looks like we might end up having to call + * req_complete_skb, store a pristine copy of the skb since the + * various handlers may modify the original one through + * skb_pull() calls, etc. 
+ */ + if (req_complete_skb || event == HCI_EV_CMD_STATUS || + event == HCI_EV_CMD_COMPLETE) + orig_skb = skb_clone(skb, GFP_KERNEL); + + skb_pull(skb, HCI_EVENT_HDR_SIZE); + switch (event) { case HCI_EV_INQUIRY_COMPLETE: hci_inquiry_complete_evt(hdev, skb); @@ -5088,11 +5337,13 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_EV_CMD_COMPLETE: - hci_cmd_complete_evt(hdev, skb); + hci_cmd_complete_evt(hdev, skb, &opcode, &status, + &req_complete, &req_complete_skb); break; case HCI_EV_CMD_STATUS: - hci_cmd_status_evt(hdev, skb); + hci_cmd_status_evt(hdev, skb, &opcode, &status, &req_complete, + &req_complete_skb); break; case HCI_EV_HARDWARE_ERROR: @@ -5224,6 +5475,17 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) break; } + if (req_complete) { + req_complete(hdev, status, opcode); + } else if (req_complete_skb) { + if (!hci_get_cmd_complete(hdev, opcode, req_evt, orig_skb)) { + kfree_skb(orig_skb); + orig_skb = NULL; + } + req_complete_skb(hdev, status, opcode, orig_skb); + } + + kfree_skb(orig_skb); kfree_skb(skb); hdev->stat.evt_rx++; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index b59f92c6df0c..d6025d6e6d59 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -34,7 +34,8 @@ void hci_req_init(struct hci_request *req, struct hci_dev *hdev) req->err = 0; } -int hci_req_run(struct hci_request *req, hci_req_complete_t complete) +static int req_run(struct hci_request *req, hci_req_complete_t complete, + hci_req_complete_skb_t complete_skb) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; @@ -56,6 +57,7 @@ int hci_req_run(struct hci_request *req, hci_req_complete_t complete) skb = skb_peek_tail(&req->cmd_q); bt_cb(skb)->req.complete = complete; + bt_cb(skb)->req.complete_skb = complete_skb; spin_lock_irqsave(&hdev->cmd_q.lock, flags); skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); @@ -66,6 +68,16 @@ int hci_req_run(struct hci_request *req, 
hci_req_complete_t complete) return 0; } +int hci_req_run(struct hci_request *req, hci_req_complete_t complete) +{ + return req_run(req, complete, NULL); +} + +int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete) +{ + return req_run(req, NULL, complete); +} + struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { @@ -270,7 +282,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req) * and 0x01 (whitelist enabled) use the new filter policies * 0x02 (no whitelist) and 0x03 (whitelist enabled). */ - if (test_bit(HCI_PRIVACY, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_PRIVACY) && (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) filter_policy |= 0x02; @@ -304,10 +316,10 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ - if (test_bit(HCI_LE_ADV, &hdev->dev_flags) || + if (hci_dev_test_flag(hdev, HCI_LE_ADV) || hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { BT_DBG("Deferring random address update"); - set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return; } @@ -324,12 +336,12 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, * current RPA has expired or there is something else than * the current RPA in use, then generate a new one. */ - if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) { int to; *own_addr_type = ADDR_LE_DEV_RANDOM; - if (!test_and_clear_bit(HCI_RPA_EXPIRED, &hdev->dev_flags) && + if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && !bacmp(&hdev->random_addr, &hdev->rpa)) return 0; @@ -383,9 +395,9 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, * and a static address has been configured, then use that * address instead of the public BR/EDR address. 
*/ - if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) || + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || - (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags) && + (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { *own_addr_type = ADDR_LE_DEV_RANDOM; if (bacmp(&hdev->static_addr, &hdev->random_addr)) @@ -425,7 +437,7 @@ void __hci_update_page_scan(struct hci_request *req) struct hci_dev *hdev = req->hdev; u8 scan; - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return; if (!hdev_is_powered(hdev)) @@ -434,7 +446,7 @@ void __hci_update_page_scan(struct hci_request *req) if (mgmt_powering_down(hdev)) return; - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || + if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || disconnected_whitelist_entries(hdev)) scan = SCAN_PAGE; else @@ -443,7 +455,7 @@ void __hci_update_page_scan(struct hci_request *req) if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE)) return; - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) scan |= SCAN_INQUIRY; hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); @@ -471,14 +483,14 @@ void __hci_update_background_scan(struct hci_request *req) if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || - test_bit(HCI_SETUP, &hdev->dev_flags) || - test_bit(HCI_CONFIG, &hdev->dev_flags) || - test_bit(HCI_AUTO_OFF, &hdev->dev_flags) || - test_bit(HCI_UNREGISTER, &hdev->dev_flags)) + hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG) || + hci_dev_test_flag(hdev, HCI_AUTO_OFF) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) return; /* No point in doing scanning if LE support hasn't been enabled */ - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; /* If discovery is active don't interfere with it */ @@ -502,7 +514,7 @@ 
void __hci_update_background_scan(struct hci_request *req) */ /* If controller is not scanning we are done. */ - if (!test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return; hci_req_add_le_scan_disable(req); @@ -524,7 +536,7 @@ void __hci_update_background_scan(struct hci_request *req) /* If controller is currently scanning, we stop it to ensure we * don't miss any advertising (due to duplicates filter). */ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) hci_req_add_le_scan_disable(req); hci_req_add_le_passive_scan(req); diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index adf074d33544..bf6df92f42db 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -32,11 +32,14 @@ struct hci_request { void hci_req_init(struct hci_request *req, struct hci_dev *hdev); int hci_req_run(struct hci_request *req, hci_req_complete_t complete); +int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete); void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, const void *param); void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, const void *param, u8 event); -void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status); +void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, + hci_req_complete_t *req_complete, + hci_req_complete_skb_t *req_complete_skb); struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d65c5be7c82..f2d30d1156c9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -30,6 +30,12 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> +#include <net/bluetooth/mgmt.h> + +#include "mgmt_util.h" + +static LIST_HEAD(mgmt_chan_list); +static DEFINE_MUTEX(mgmt_chan_list_lock); static atomic_t 
monitor_promisc = ATOMIC_INIT(0); @@ -44,11 +50,32 @@ struct hci_pinfo { struct hci_filter filter; __u32 cmsg_mask; unsigned short channel; + unsigned long flags; }; -static inline int hci_test_bit(int nr, void *addr) +void hci_sock_set_flag(struct sock *sk, int nr) +{ + set_bit(nr, &hci_pi(sk)->flags); +} + +void hci_sock_clear_flag(struct sock *sk, int nr) +{ + clear_bit(nr, &hci_pi(sk)->flags); +} + +int hci_sock_test_flag(struct sock *sk, int nr) +{ + return test_bit(nr, &hci_pi(sk)->flags); +} + +unsigned short hci_sock_get_channel(struct sock *sk) { - return *((__u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); + return hci_pi(sk)->channel; +} + +static inline int hci_test_bit(int nr, const void *addr) +{ + return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); } /* Security filter */ @@ -183,54 +210,31 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } -/* Send frame to control socket */ -void hci_send_to_control(struct sk_buff *skb, struct sock *skip_sk) +/* Send frame to sockets with specific channel */ +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb, + int flag, struct sock *skip_sk) { struct sock *sk; - BT_DBG("len %d", skb->len); + BT_DBG("channel %u len %d", channel, skb->len); read_lock(&hci_sk_list.lock); sk_for_each(sk, &hci_sk_list.head) { struct sk_buff *nskb; - /* Skip the original socket */ - if (sk == skip_sk) - continue; - - if (sk->sk_state != BT_BOUND) - continue; - - if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + /* Ignore socket without the flag set */ + if (!hci_sock_test_flag(sk, flag)) continue; - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) + /* Skip the original socket */ + if (sk == skip_sk) continue; - if (sock_queue_rcv_skb(sk, nskb)) - kfree_skb(nskb); - } - - read_unlock(&hci_sk_list.lock); -} - -static void queue_monitor_skb(struct sk_buff *skb) -{ - struct sock *sk; - - BT_DBG("len %d", skb->len); - - read_lock(&hci_sk_list.lock); - - 
sk_for_each(sk, &hci_sk_list.head) { - struct sk_buff *nskb; - if (sk->sk_state != BT_BOUND) continue; - if (hci_pi(sk)->channel != HCI_CHANNEL_MONITOR) + if (hci_pi(sk)->channel != channel) continue; nskb = skb_clone(skb, GFP_ATOMIC); @@ -290,7 +294,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len); - queue_monitor_skb(skb_copy); + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy, + HCI_SOCK_TRUSTED, NULL); kfree_skb(skb_copy); } @@ -397,7 +402,8 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) skb = create_monitor_event(hdev, event); if (skb) { - queue_monitor_skb(skb); + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); kfree_skb(skb); } } @@ -428,6 +434,56 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event) } } +static struct hci_mgmt_chan *__hci_mgmt_chan_find(unsigned short channel) +{ + struct hci_mgmt_chan *c; + + list_for_each_entry(c, &mgmt_chan_list, list) { + if (c->channel == channel) + return c; + } + + return NULL; +} + +static struct hci_mgmt_chan *hci_mgmt_chan_find(unsigned short channel) +{ + struct hci_mgmt_chan *c; + + mutex_lock(&mgmt_chan_list_lock); + c = __hci_mgmt_chan_find(channel); + mutex_unlock(&mgmt_chan_list_lock); + + return c; +} + +int hci_mgmt_chan_register(struct hci_mgmt_chan *c) +{ + if (c->channel < HCI_CHANNEL_CONTROL) + return -EINVAL; + + mutex_lock(&mgmt_chan_list_lock); + if (__hci_mgmt_chan_find(c->channel)) { + mutex_unlock(&mgmt_chan_list_lock); + return -EALREADY; + } + + list_add_tail(&c->list, &mgmt_chan_list); + + mutex_unlock(&mgmt_chan_list_lock); + + return 0; +} +EXPORT_SYMBOL(hci_mgmt_chan_register); + +void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c) +{ + mutex_lock(&mgmt_chan_list_lock); + list_del(&c->list); + mutex_unlock(&mgmt_chan_list_lock); +} +EXPORT_SYMBOL(hci_mgmt_chan_unregister); + static int hci_sock_release(struct socket *sock) { struct sock *sk = 
sock->sk; @@ -447,9 +503,9 @@ static int hci_sock_release(struct socket *sock) if (hdev) { if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { - mgmt_index_added(hdev); - clear_bit(HCI_USER_CHANNEL, &hdev->dev_flags); hci_dev_close(hdev->id); + hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); + mgmt_index_added(hdev); } atomic_dec(&hdev->promisc); @@ -508,10 +564,10 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, if (!hdev) return -EBADFD; - if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; if (hdev->dev_type != HCI_BREDR) @@ -685,16 +741,17 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } - if (test_bit(HCI_UP, &hdev->flags) || - test_bit(HCI_INIT, &hdev->flags) || - test_bit(HCI_SETUP, &hdev->dev_flags) || - test_bit(HCI_CONFIG, &hdev->dev_flags)) { + if (test_bit(HCI_INIT, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG) || + (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) && + test_bit(HCI_UP, &hdev->flags))) { err = -EBUSY; hci_dev_put(hdev); goto done; } - if (test_and_set_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (hci_dev_test_and_set_flag(hdev, HCI_USER_CHANNEL)) { err = -EUSERS; hci_dev_put(hdev); goto done; @@ -704,10 +761,21 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = hci_dev_open(hdev->id); if (err) { - clear_bit(HCI_USER_CHANNEL, &hdev->dev_flags); - mgmt_index_added(hdev); - hci_dev_put(hdev); - goto done; + if (err == -EALREADY) { + /* In case the transport is already up and + * running, clear the error here. + * + * This can happen when opening an user + * channel and HCI_AUTO_OFF grace period + * is still active. 
+ */ + err = 0; + } else { + hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); + mgmt_index_added(hdev); + hci_dev_put(hdev); + goto done; + } } atomic_inc(&hdev->promisc); @@ -715,38 +783,62 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, hci_pi(sk)->hdev = hdev; break; - case HCI_CHANNEL_CONTROL: + case HCI_CHANNEL_MONITOR: if (haddr.hci_dev != HCI_DEV_NONE) { err = -EINVAL; goto done; } - if (!capable(CAP_NET_ADMIN)) { + if (!capable(CAP_NET_RAW)) { err = -EPERM; goto done; } + /* The monitor interface is restricted to CAP_NET_RAW + * capabilities and with that implicitly trusted. + */ + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + send_monitor_replay(sk); + + atomic_inc(&monitor_promisc); break; - case HCI_CHANNEL_MONITOR: - if (haddr.hci_dev != HCI_DEV_NONE) { + default: + if (!hci_mgmt_chan_find(haddr.hci_channel)) { err = -EINVAL; goto done; } - if (!capable(CAP_NET_RAW)) { - err = -EPERM; + if (haddr.hci_dev != HCI_DEV_NONE) { + err = -EINVAL; goto done; } - send_monitor_replay(sk); - - atomic_inc(&monitor_promisc); + /* Users with CAP_NET_ADMIN capabilities are allowed + * access to all management commands and events. For + * untrusted users the interface is restricted and + * also only untrusted events are sent. + */ + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + /* At the moment the index and unconfigured index events + * are enabled unconditionally. Setting them on each + * socket when binding keeps this functionality. They + * however might be cleared later and then sending of these + * events will be disabled, but that is then intentional. + * + * This also enables generic events that are safe to be + * received by untrusted users. Example for such events + * are changes to settings, class of device, name etc. 
+ */ + if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); + } break; - - default: - err = -EINVAL; - goto done; } @@ -826,8 +918,8 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, } } -static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -860,10 +952,13 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, hci_sock_cmsg(sk, msg, skb); break; case HCI_CHANNEL_USER: - case HCI_CHANNEL_CONTROL: case HCI_CHANNEL_MONITOR: sock_recv_timestamp(msg, sk, skb); break; + default: + if (hci_mgmt_chan_find(hci_pi(sk)->channel)) + sock_recv_timestamp(msg, sk, skb); + break; } skb_free_datagram(sk, skb); @@ -871,10 +966,122 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, return err ? 
: copied; } -static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, + struct msghdr *msg, size_t msglen) +{ + void *buf; + u8 *cp; + struct mgmt_hdr *hdr; + u16 opcode, index, len; + struct hci_dev *hdev = NULL; + const struct hci_mgmt_handler *handler; + bool var_len, no_hdev; + int err; + + BT_DBG("got %zu bytes", msglen); + + if (msglen < sizeof(*hdr)) + return -EINVAL; + + buf = kmalloc(msglen, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (memcpy_from_msg(buf, msg, msglen)) { + err = -EFAULT; + goto done; + } + + hdr = buf; + opcode = __le16_to_cpu(hdr->opcode); + index = __le16_to_cpu(hdr->index); + len = __le16_to_cpu(hdr->len); + + if (len != msglen - sizeof(*hdr)) { + err = -EINVAL; + goto done; + } + + if (opcode >= chan->handler_count || + chan->handlers[opcode].func == NULL) { + BT_DBG("Unknown op %u", opcode); + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_UNKNOWN_COMMAND); + goto done; + } + + handler = &chan->handlers[opcode]; + + if (!hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) && + !(handler->flags & HCI_MGMT_UNTRUSTED)) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_PERMISSION_DENIED); + goto done; + } + + if (index != MGMT_INDEX_NONE) { + hdev = hci_dev_get(index); + if (!hdev) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } + + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG) || + hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } + + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && + !(handler->flags & HCI_MGMT_UNCONFIGURED)) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } + } + + no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); + if (no_hdev != !hdev) { + err = mgmt_cmd_status(sk, index, opcode, + 
MGMT_STATUS_INVALID_INDEX); + goto done; + } + + var_len = (handler->flags & HCI_MGMT_VAR_LEN); + if ((var_len && len < handler->data_len) || + (!var_len && len != handler->data_len)) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_PARAMS); + goto done; + } + + if (hdev && chan->hdev_init) + chan->hdev_init(sk, hdev); + + cp = buf + sizeof(*hdr); + + err = handler->func(sk, hdev, cp, len); + if (err < 0) + goto done; + + err = msglen; + +done: + if (hdev) + hci_dev_put(hdev); + + kfree(buf); + return err; +} + +static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; + struct hci_mgmt_chan *chan; struct hci_dev *hdev; struct sk_buff *skb; int err; @@ -896,14 +1103,18 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, case HCI_CHANNEL_RAW: case HCI_CHANNEL_USER: break; - case HCI_CHANNEL_CONTROL: - err = mgmt_control(sk, msg, len); - goto done; case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; goto done; default: - err = -EINVAL; + mutex_lock(&mgmt_chan_list_lock); + chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); + if (chan) + err = hci_mgmt_cmd(chan, sk, msg, len); + else + err = -EINVAL; + + mutex_unlock(&mgmt_chan_list_lock); goto done; } @@ -1178,7 +1389,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 07348e142f16..f1a117f8cad2 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -70,10 +70,11 @@ static void hidp_session_terminate(struct hidp_session *s); static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci) { + u32 valid_flags = 0; memset(ci, 0, sizeof(*ci)); bacpy(&ci->bdaddr, &session->bdaddr); - ci->flags = session->flags; + 
ci->flags = session->flags & valid_flags; ci->state = BT_CONNECTED; if (session->input) { @@ -907,13 +908,14 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, kref_init(&session->ref); atomic_set(&session->state, HIDP_SESSION_IDLING); init_waitqueue_head(&session->state_queue); - session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID); + session->flags = req->flags & BIT(HIDP_BLUETOOTH_VENDOR_ID); /* connection management */ bacpy(&session->bdaddr, bdaddr); session->conn = l2cap_conn_get(conn); session->user.probe = hidp_session_probe; session->user.remove = hidp_session_remove; + INIT_LIST_HEAD(&session->user.list); session->ctrl_sock = ctrl_sock; session->intr_sock = intr_sock; skb_queue_head_init(&session->ctrl_transmit); @@ -1312,6 +1314,8 @@ int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) { + u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) | + BIT(HIDP_BOOT_PROTOCOL_MODE); struct hidp_session *session; struct l2cap_conn *conn; struct l2cap_chan *chan; @@ -1321,6 +1325,9 @@ int hidp_connection_add(struct hidp_connadd_req *req, if (ret) return ret; + if (req->flags & ~valid_flags) + return -EINVAL; + chan = l2cap_pi(ctrl_sock->sk)->chan; conn = NULL; l2cap_chan_lock(chan); @@ -1351,13 +1358,17 @@ out_conn: int hidp_connection_del(struct hidp_conndel_req *req) { + u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG); struct hidp_session *session; + if (req->flags & ~valid_flags) + return -EINVAL; + session = hidp_session_find(&req->bdaddr); if (!session) return -ENOENT; - if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) + if (req->flags & BIT(HIDP_VIRTUAL_CABLE_UNPLUG)) hidp_send_ctrl_message(session, HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index cb3fdde1968a..008ba439bd62 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -235,7 +235,7 @@ static int 
hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 6ba33f9631e8..45fffa413642 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -292,7 +292,7 @@ static struct sk_buff *l2cap_ertm_seq_in_queue(struct sk_buff_head *head, struct sk_buff *skb; skb_queue_walk(head, skb) { - if (bt_cb(skb)->control.txseq == seq) + if (bt_cb(skb)->l2cap.txseq == seq) return skb; } @@ -954,11 +954,11 @@ static inline void __unpack_control(struct l2cap_chan *chan, { if (test_bit(FLAG_EXT_CTRL, &chan->flags)) { __unpack_extended_control(get_unaligned_le32(skb->data), - &bt_cb(skb)->control); + &bt_cb(skb)->l2cap); skb_pull(skb, L2CAP_EXT_CTRL_SIZE); } else { __unpack_enhanced_control(get_unaligned_le16(skb->data), - &bt_cb(skb)->control); + &bt_cb(skb)->l2cap); skb_pull(skb, L2CAP_ENH_CTRL_SIZE); } } @@ -1200,8 +1200,8 @@ static void l2cap_move_setup(struct l2cap_chan *chan) chan->retry_count = 0; skb_queue_walk(&chan->tx_q, skb) { - if (bt_cb(skb)->control.retries) - bt_cb(skb)->control.retries = 1; + if (bt_cb(skb)->l2cap.retries) + bt_cb(skb)->l2cap.retries = 1; else break; } @@ -1244,6 +1244,13 @@ static void l2cap_move_done(struct l2cap_chan *chan) static void l2cap_chan_ready(struct l2cap_chan *chan) { + /* The channel may have already been flagged as connected in + * case of receiving data before the L2CAP info req/rsp + * procedure is complete. 
+ */ + if (chan->state == BT_CONNECTED) + return; + /* This clears all conf flags, including CONF_NOT_COMPLETE */ chan->conf_state = 0; __clear_chan_timer(chan); @@ -1594,7 +1601,7 @@ int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user) hci_dev_lock(hdev); - if (user->list.next || user->list.prev) { + if (!list_empty(&user->list)) { ret = -EINVAL; goto out_unlock; } @@ -1624,12 +1631,10 @@ void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user) hci_dev_lock(hdev); - if (!user->list.next || !user->list.prev) + if (list_empty(&user->list)) goto out_unlock; - list_del(&user->list); - user->list.next = NULL; - user->list.prev = NULL; + list_del_init(&user->list); user->remove(conn, user); out_unlock: @@ -1643,9 +1648,7 @@ static void l2cap_unregister_all_users(struct l2cap_conn *conn) while (!list_empty(&conn->users)) { user = list_first_entry(&conn->users, struct l2cap_user, list); - list_del(&user->list); - user->list.next = NULL; - user->list.prev = NULL; + list_del_init(&user->list); user->remove(conn, user); } } @@ -1839,8 +1842,8 @@ static void l2cap_streaming_send(struct l2cap_chan *chan, skb = skb_dequeue(&chan->tx_q); - bt_cb(skb)->control.retries = 1; - control = &bt_cb(skb)->control; + bt_cb(skb)->l2cap.retries = 1; + control = &bt_cb(skb)->l2cap; control->reqseq = 0; control->txseq = chan->next_tx_seq; @@ -1884,8 +1887,8 @@ static int l2cap_ertm_send(struct l2cap_chan *chan) skb = chan->tx_send_head; - bt_cb(skb)->control.retries = 1; - control = &bt_cb(skb)->control; + bt_cb(skb)->l2cap.retries = 1; + control = &bt_cb(skb)->l2cap; if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state)) control->final = 1; @@ -1956,11 +1959,11 @@ static void l2cap_ertm_resend(struct l2cap_chan *chan) continue; } - bt_cb(skb)->control.retries++; - control = bt_cb(skb)->control; + bt_cb(skb)->l2cap.retries++; + control = bt_cb(skb)->l2cap; if (chan->max_tx != 0 && - bt_cb(skb)->control.retries > chan->max_tx) { + 
bt_cb(skb)->l2cap.retries > chan->max_tx) { BT_DBG("Retry limit exceeded (%d)", chan->max_tx); l2cap_send_disconn_req(chan, ECONNRESET); l2cap_seq_list_clear(&chan->retrans_list); @@ -2038,7 +2041,7 @@ static void l2cap_retransmit_all(struct l2cap_chan *chan, if (chan->unacked_frames) { skb_queue_walk(&chan->tx_q, skb) { - if (bt_cb(skb)->control.txseq == control->reqseq || + if (bt_cb(skb)->l2cap.txseq == control->reqseq || skb == chan->tx_send_head) break; } @@ -2048,7 +2051,7 @@ static void l2cap_retransmit_all(struct l2cap_chan *chan, break; l2cap_seq_list_append(&chan->retrans_list, - bt_cb(skb)->control.txseq); + bt_cb(skb)->l2cap.txseq); } l2cap_ertm_resend(chan); @@ -2260,8 +2263,8 @@ static struct sk_buff *l2cap_create_iframe_pdu(struct l2cap_chan *chan, return ERR_PTR(err); } - bt_cb(skb)->control.fcs = chan->fcs; - bt_cb(skb)->control.retries = 0; + bt_cb(skb)->l2cap.fcs = chan->fcs; + bt_cb(skb)->l2cap.retries = 0; return skb; } @@ -2314,7 +2317,7 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, return PTR_ERR(skb); } - bt_cb(skb)->control.sar = sar; + bt_cb(skb)->l2cap.sar = sar; __skb_queue_tail(seg_queue, skb); len -= pdu_len; @@ -2849,7 +2852,7 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) continue; /* Don't send frame to the channel it came from */ - if (bt_cb(skb)->chan == chan) + if (bt_cb(skb)->l2cap.chan == chan) continue; nskb = skb_clone(skb, GFP_KERNEL); @@ -3893,7 +3896,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, return -EPROTO; hci_dev_lock(hdev); - if (test_bit(HCI_MGMT, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_MGMT) && !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags)) mgmt_device_connected(hdev, hcon, 0, NULL, 0); hci_dev_unlock(hdev); @@ -5911,7 +5914,7 @@ static int l2cap_rx_queued_iframes(struct l2cap_chan *chan) skb_unlink(skb, &chan->srej_q); chan->buffer_seq = __next_seq(chan, chan->buffer_seq); - err = l2cap_reassemble_sdu(chan, skb, &bt_cb(skb)->control); 
+ err = l2cap_reassemble_sdu(chan, skb, &bt_cb(skb)->l2cap); if (err) break; } @@ -5945,7 +5948,7 @@ static void l2cap_handle_srej(struct l2cap_chan *chan, return; } - if (chan->max_tx != 0 && bt_cb(skb)->control.retries >= chan->max_tx) { + if (chan->max_tx != 0 && bt_cb(skb)->l2cap.retries >= chan->max_tx) { BT_DBG("Retry limit exceeded (%d)", chan->max_tx); l2cap_send_disconn_req(chan, ECONNRESET); return; @@ -5998,7 +6001,7 @@ static void l2cap_handle_rej(struct l2cap_chan *chan, skb = l2cap_ertm_seq_in_queue(&chan->tx_q, control->reqseq); if (chan->max_tx && skb && - bt_cb(skb)->control.retries >= chan->max_tx) { + bt_cb(skb)->l2cap.retries >= chan->max_tx) { BT_DBG("Retry limit exceeded (%d)", chan->max_tx); l2cap_send_disconn_req(chan, ECONNRESET); return; @@ -6558,7 +6561,7 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) { - struct l2cap_ctrl *control = &bt_cb(skb)->control; + struct l2cap_ctrl *control = &bt_cb(skb)->l2cap; u16 len; u8 event; @@ -6785,6 +6788,13 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, BT_DBG("chan %p, len %d", chan, skb->len); + /* If we receive data on a fixed channel before the info req/rsp + * procedure is done simply assume that the channel is supported + * and mark it as ready.
+ */ + if (chan->chan_type == L2CAP_CHAN_FIXED) + l2cap_chan_ready(chan); + if (chan->state != BT_CONNECTED) goto drop; @@ -6850,8 +6860,8 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, goto drop; /* Store remote BD_ADDR and PSM for msg_name */ - bacpy(&bt_cb(skb)->bdaddr, &hcon->dst); - bt_cb(skb)->psm = psm; + bacpy(&bt_cb(skb)->l2cap.bdaddr, &hcon->dst); + bt_cb(skb)->l2cap.psm = psm; if (!chan->ops->recv(chan, skb)) { l2cap_chan_put(chan); @@ -6973,12 +6983,12 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS; if (hcon->type == ACL_LINK && - test_bit(HCI_HS_ENABLED, &hcon->hdev->dev_flags)) + hci_dev_test_flag(hcon->hdev, HCI_HS_ENABLED)) conn->local_fixed_chan |= L2CAP_FC_A2MP; - if (test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags) && + if (hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED) && (bredr_sc_enabled(hcon->hdev) || - test_bit(HCI_FORCE_BREDR_SMP, &hcon->hdev->dbg_flags))) + hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP))) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; mutex_init(&conn->ident_lock); @@ -7098,7 +7108,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, else dst_type = ADDR_LE_DEV_RANDOM; - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) role = HCI_ROLE_SLAVE; else role = HCI_ROLE_MASTER; @@ -7238,13 +7248,16 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, return NULL; } -void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) +static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) { struct hci_dev *hdev = hcon->hdev; struct l2cap_conn *conn; struct l2cap_chan *pchan; u8 dst_type; + if (hcon->type != ACL_LINK && hcon->type != LE_LINK) + return; + BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); if (status) { @@ -7307,8 +7320,11 @@ int l2cap_disconn_ind(struct hci_conn *hcon) return conn->disc_reason; } -void 
l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) +static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { + if (hcon->type != ACL_LINK && hcon->type != LE_LINK) + return; + BT_DBG("hcon %p reason %d", hcon, reason); l2cap_conn_del(hcon, bt_to_errno(reason)); @@ -7331,13 +7347,13 @@ static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt) } } -int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) +static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_chan *chan; if (!conn) - return 0; + return; BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); @@ -7420,11 +7436,9 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } mutex_unlock(&conn->chan_lock); - - return 0; } -int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_hdr *hdr; @@ -7467,7 +7481,7 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return 0; + return; } BT_DBG("Start: total len %d, frag len %d", len, skb->len); @@ -7526,9 +7540,15 @@ int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); - return 0; } +static struct hci_cb l2cap_cb = { + .name = "L2CAP", + .connect_cfm = l2cap_connect_cfm, + .disconn_cfm = l2cap_disconn_cfm, + .security_cfm = l2cap_security_cfm, +}; + static int l2cap_debugfs_show(struct seq_file *f, void *p) { struct l2cap_chan *c; @@ -7570,6 +7590,8 @@ int __init l2cap_init(void) if (err < 0) return err; + hci_register_cb(&l2cap_cb); + if (IS_ERR_OR_NULL(bt_debugfs)) return 0; @@ -7587,6 +7609,7 @@ int __init l2cap_init(void) void l2cap_exit(void) { debugfs_remove(l2cap_debugfs); + 
hci_unregister_cb(&l2cap_cb); l2cap_cleanup_sockets(); } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 60694f0f4c73..244287706f91 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = { static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio); + int proto, gfp_t prio, int kern); bool l2cap_is_socket(struct socket *sock) { @@ -944,8 +944,8 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, return err; } -static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -976,8 +976,8 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, return err; } -static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { struct sock *sk = sock->sk; struct l2cap_pinfo *pi = l2cap_pi(sk); @@ -1004,9 +1004,9 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, release_sock(sk); if (sock->type == SOCK_STREAM) - err = bt_sock_stream_recvmsg(iocb, sock, msg, len, flags); + err = bt_sock_stream_recvmsg(sock, msg, len, flags); else - err = bt_sock_recvmsg(iocb, sock, msg, len, flags); + err = bt_sock_recvmsg(sock, msg, len, flags); if (pi->chan->mode != L2CAP_MODE_ERTM) return err; @@ -1193,7 +1193,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, - GFP_ATOMIC); + GFP_ATOMIC, 0); if (!sk) { 
release_sock(parent); return NULL; @@ -1330,7 +1330,7 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan, skb->priority = sk->sk_priority; - bt_cb(skb)->chan = chan; + bt_cb(skb)->l2cap.chan = chan; return skb; } @@ -1444,8 +1444,8 @@ static void l2cap_skb_msg_name(struct sk_buff *skb, void *msg_name, memset(la, 0, sizeof(struct sockaddr_l2)); la->l2_family = AF_BLUETOOTH; - la->l2_psm = bt_cb(skb)->psm; - bacpy(&la->l2_bdaddr, &bt_cb(skb)->bdaddr); + la->l2_psm = bt_cb(skb)->l2cap.psm; + bacpy(&la->l2_bdaddr, &bt_cb(skb)->l2cap.bdaddr); *msg_namelen = sizeof(struct sockaddr_l2); } @@ -1523,12 +1523,12 @@ static struct proto l2cap_proto = { }; static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio) + int proto, gfp_t prio, int kern) { struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); if (!sk) return NULL; @@ -1574,7 +1574,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &l2cap_sock_ops; - sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 9ec5390c85eb..7998fb279165 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -29,14 +29,16 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/hci_sock.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "hci_request.h" #include "smp.h" +#include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 8 +#define MGMT_REVISION 10 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -95,6 +97,11 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_OP_SET_PUBLIC_ADDRESS, MGMT_OP_START_SERVICE_DISCOVERY, + 
MGMT_OP_READ_LOCAL_OOB_EXT_DATA, + MGMT_OP_READ_EXT_INDEX_LIST, + MGMT_OP_READ_ADV_FEATURES, + MGMT_OP_ADD_ADVERTISING, + MGMT_OP_REMOVE_ADVERTISING, }; static const u16 mgmt_events[] = { @@ -127,6 +134,32 @@ static const u16 mgmt_events[] = { MGMT_EV_UNCONF_INDEX_ADDED, MGMT_EV_UNCONF_INDEX_REMOVED, MGMT_EV_NEW_CONFIG_OPTIONS, + MGMT_EV_EXT_INDEX_ADDED, + MGMT_EV_EXT_INDEX_REMOVED, + MGMT_EV_LOCAL_OOB_DATA_UPDATED, + MGMT_EV_ADVERTISING_ADDED, + MGMT_EV_ADVERTISING_REMOVED, +}; + +static const u16 mgmt_untrusted_commands[] = { + MGMT_OP_READ_INDEX_LIST, + MGMT_OP_READ_INFO, + MGMT_OP_READ_UNCONF_INDEX_LIST, + MGMT_OP_READ_CONFIG_INFO, + MGMT_OP_READ_EXT_INDEX_LIST, +}; + +static const u16 mgmt_untrusted_events[] = { + MGMT_EV_INDEX_ADDED, + MGMT_EV_INDEX_REMOVED, + MGMT_EV_NEW_SETTINGS, + MGMT_EV_CLASS_OF_DEV_CHANGED, + MGMT_EV_LOCAL_NAME_CHANGED, + MGMT_EV_UNCONF_INDEX_ADDED, + MGMT_EV_UNCONF_INDEX_REMOVED, + MGMT_EV_NEW_CONFIG_OPTIONS, + MGMT_EV_EXT_INDEX_ADDED, + MGMT_EV_EXT_INDEX_REMOVED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -134,17 +167,6 @@ static const u16 mgmt_events[] = { #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" -struct pending_cmd { - struct list_head list; - u16 opcode; - int index; - void *param; - size_t param_len; - struct sock *sk; - void *user_data; - int (*cmd_complete)(struct pending_cmd *cmd, u8 status); -}; - /* HCI to MGMT error code conversion table */ static u8 mgmt_status_table[] = { MGMT_STATUS_SUCCESS, @@ -218,98 +240,32 @@ static u8 mgmt_status(u8 hci_status) return MGMT_STATUS_FAILED; } -static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 data_len, - struct sock *skip_sk) +static int mgmt_index_event(u16 event, struct hci_dev *hdev, void *data, + u16 len, int flag) { - struct sk_buff *skb; - struct mgmt_hdr *hdr; - - skb = alloc_skb(sizeof(*hdr) + data_len, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = (void *) skb_put(skb, sizeof(*hdr)); - 
hdr->opcode = cpu_to_le16(event); - if (hdev) - hdr->index = cpu_to_le16(hdev->id); - else - hdr->index = cpu_to_le16(MGMT_INDEX_NONE); - hdr->len = cpu_to_le16(data_len); - - if (data) - memcpy(skb_put(skb, data_len), data, data_len); - - /* Time stamp */ - __net_timestamp(skb); - - hci_send_to_control(skb, skip_sk); - kfree_skb(skb); - - return 0; + return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, + flag, NULL); } -static int cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) +static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, + u16 len, int flag, struct sock *skip_sk) { - struct sk_buff *skb; - struct mgmt_hdr *hdr; - struct mgmt_ev_cmd_status *ev; - int err; - - BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status); - - skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = (void *) skb_put(skb, sizeof(*hdr)); - - hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); - hdr->index = cpu_to_le16(index); - hdr->len = cpu_to_le16(sizeof(*ev)); - - ev = (void *) skb_put(skb, sizeof(*ev)); - ev->status = status; - ev->opcode = cpu_to_le16(cmd); - - err = sock_queue_rcv_skb(sk, skb); - if (err < 0) - kfree_skb(skb); - - return err; + return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, + flag, skip_sk); } -static int cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, - void *rp, size_t rp_len) +static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data, + u16 len, struct sock *skip_sk) { - struct sk_buff *skb; - struct mgmt_hdr *hdr; - struct mgmt_ev_cmd_complete *ev; - int err; - - BT_DBG("sock %p", sk); - - skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = (void *) skb_put(skb, sizeof(*hdr)); - - hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); - hdr->index = cpu_to_le16(index); - hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); - - ev = (void *) skb_put(skb, sizeof(*ev) + 
rp_len); - ev->opcode = cpu_to_le16(cmd); - ev->status = status; - - if (rp) - memcpy(ev->data, rp, rp_len); - - err = sock_queue_rcv_skb(sk, skb); - if (err < 0) - kfree_skb(skb); + return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, + HCI_MGMT_GENERIC_EVENTS, skip_sk); +} - return err; +static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, + struct sock *skip_sk) +{ + return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, + HCI_SOCK_TRUSTED, skip_sk); } static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, @@ -322,22 +278,28 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, rp.version = MGMT_VERSION; rp.revision = cpu_to_le16(MGMT_REVISION); - return cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, - sizeof(rp)); + return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, + &rp, sizeof(rp)); } static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_rp_read_commands *rp; - const u16 num_commands = ARRAY_SIZE(mgmt_commands); - const u16 num_events = ARRAY_SIZE(mgmt_events); - __le16 *opcode; + u16 num_commands, num_events; size_t rp_size; int i, err; BT_DBG("sock %p", sk); + if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) { + num_commands = ARRAY_SIZE(mgmt_commands); + num_events = ARRAY_SIZE(mgmt_events); + } else { + num_commands = ARRAY_SIZE(mgmt_untrusted_commands); + num_events = ARRAY_SIZE(mgmt_untrusted_events); + } + rp_size = sizeof(*rp) + ((num_commands + num_events) * sizeof(u16)); rp = kmalloc(rp_size, GFP_KERNEL); @@ -347,14 +309,26 @@ static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data, rp->num_commands = cpu_to_le16(num_commands); rp->num_events = cpu_to_le16(num_events); - for (i = 0, opcode = rp->opcodes; i < num_commands; i++, opcode++) - put_unaligned_le16(mgmt_commands[i], opcode); + if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) { + __le16 *opcode = 
rp->opcodes; - for (i = 0; i < num_events; i++, opcode++) - put_unaligned_le16(mgmt_events[i], opcode); + for (i = 0; i < num_commands; i++, opcode++) + put_unaligned_le16(mgmt_commands[i], opcode); - err = cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_COMMANDS, 0, rp, - rp_size); + for (i = 0; i < num_events; i++, opcode++) + put_unaligned_le16(mgmt_events[i], opcode); + } else { + __le16 *opcode = rp->opcodes; + + for (i = 0; i < num_commands; i++, opcode++) + put_unaligned_le16(mgmt_untrusted_commands[i], opcode); + + for (i = 0; i < num_events; i++, opcode++) + put_unaligned_le16(mgmt_untrusted_events[i], opcode); + } + + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_COMMANDS, 0, + rp, rp_size); kfree(rp); return err; @@ -376,7 +350,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { if (d->dev_type == HCI_BREDR && - !test_bit(HCI_UNCONFIGURED, &d->dev_flags)) + !hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -389,9 +363,9 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (test_bit(HCI_SETUP, &d->dev_flags) || - test_bit(HCI_CONFIG, &d->dev_flags) || - test_bit(HCI_USER_CHANNEL, &d->dev_flags)) + if (hci_dev_test_flag(d, HCI_SETUP) || + hci_dev_test_flag(d, HCI_CONFIG) || + hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Devices marked as raw-only are neither configured @@ -401,7 +375,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, continue; if (d->dev_type == HCI_BREDR && - !test_bit(HCI_UNCONFIGURED, &d->dev_flags)) { + !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); } @@ -412,8 +386,8 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, read_unlock(&hci_dev_list_lock); - err = cmd_complete(sk, MGMT_INDEX_NONE, 
MGMT_OP_READ_INDEX_LIST, 0, rp, - rp_len); + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_INDEX_LIST, + 0, rp, rp_len); kfree(rp); @@ -436,7 +410,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { if (d->dev_type == HCI_BREDR && - test_bit(HCI_UNCONFIGURED, &d->dev_flags)) + hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -449,9 +423,9 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (test_bit(HCI_SETUP, &d->dev_flags) || - test_bit(HCI_CONFIG, &d->dev_flags) || - test_bit(HCI_USER_CHANNEL, &d->dev_flags)) + if (hci_dev_test_flag(d, HCI_SETUP) || + hci_dev_test_flag(d, HCI_CONFIG) || + hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Devices marked as raw-only are neither configured @@ -461,7 +435,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, continue; if (d->dev_type == HCI_BREDR && - test_bit(HCI_UNCONFIGURED, &d->dev_flags)) { + hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); } @@ -472,8 +446,84 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, read_unlock(&hci_dev_list_lock); - err = cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_UNCONF_INDEX_LIST, - 0, rp, rp_len); + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_READ_UNCONF_INDEX_LIST, 0, rp, rp_len); + + kfree(rp); + + return err; +} + +static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_ext_index_list *rp; + struct hci_dev *d; + size_t rp_len; + u16 count; + int err; + + BT_DBG("sock %p", sk); + + read_lock(&hci_dev_list_lock); + + count = 0; + list_for_each_entry(d, &hci_dev_list, list) { + if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP) + count++; + } + + rp_len = sizeof(*rp) + (sizeof(rp->entry[0]) * 
count); + rp = kmalloc(rp_len, GFP_ATOMIC); + if (!rp) { + read_unlock(&hci_dev_list_lock); + return -ENOMEM; + } + + count = 0; + list_for_each_entry(d, &hci_dev_list, list) { + if (hci_dev_test_flag(d, HCI_SETUP) || + hci_dev_test_flag(d, HCI_CONFIG) || + hci_dev_test_flag(d, HCI_USER_CHANNEL)) + continue; + + /* Devices marked as raw-only are neither configured + * nor unconfigured controllers. + */ + if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + continue; + + if (d->dev_type == HCI_BREDR) { + if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) + rp->entry[count].type = 0x01; + else + rp->entry[count].type = 0x00; + } else if (d->dev_type == HCI_AMP) { + rp->entry[count].type = 0x02; + } else { + continue; + } + + rp->entry[count].bus = d->bus; + rp->entry[count++].index = cpu_to_le16(d->id); + BT_DBG("Added hci%u", d->id); + } + + rp->num_controllers = cpu_to_le16(count); + rp_len = sizeof(*rp) + (sizeof(rp->entry[0]) * count); + + read_unlock(&hci_dev_list_lock); + + /* If this command is called at least once, then all the + * default index and unconfigured index events are disabled + * and from now on only extended index events are used. 
+ */ + hci_sock_set_flag(sk, HCI_MGMT_EXT_INDEX_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_INDEX_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); + + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_READ_EXT_INDEX_LIST, 0, rp, rp_len); kfree(rp); @@ -483,7 +533,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, static bool is_configured(struct hci_dev *hdev) { if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && - !test_bit(HCI_EXT_CONFIGURED, &hdev->dev_flags)) + !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) return false; if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) && @@ -498,7 +548,7 @@ static __le32 get_missing_options(struct hci_dev *hdev) u32 options = 0; if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && - !test_bit(HCI_EXT_CONFIGURED, &hdev->dev_flags)) + !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) options |= MGMT_OPTION_EXTERNAL_CONFIG; if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) && @@ -512,16 +562,16 @@ static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); - return mgmt_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, - sizeof(options), skip); + return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 options = get_missing_options(hdev); - return cmd_complete(sk, hdev->id, opcode, 0, &options, - sizeof(options)); + return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &options, + sizeof(options)); } static int read_config_info(struct sock *sk, struct hci_dev *hdev, @@ -548,8 +598,8 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, hci_dev_unlock(hdev); - return cmd_complete(sk, hdev->id, MGMT_OP_READ_CONFIG_INFO, 0, &rp, - sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONFIG_INFO, 0, + &rp, sizeof(rp)); } static u32 get_supported_settings(struct hci_dev 
*hdev) @@ -582,6 +632,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_ADVERTISING; settings |= MGMT_SETTING_SECURE_CONN; settings |= MGMT_SETTING_PRIVACY; + settings |= MGMT_SETTING_STATIC_ADDRESS; } if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || @@ -598,45 +649,64 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hdev_is_powered(hdev)) settings |= MGMT_SETTING_POWERED; - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_CONNECTABLE)) settings |= MGMT_SETTING_CONNECTABLE; - if (test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) settings |= MGMT_SETTING_FAST_CONNECTABLE; - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) settings |= MGMT_SETTING_DISCOVERABLE; - if (test_bit(HCI_BONDABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_BONDABLE)) settings |= MGMT_SETTING_BONDABLE; - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) settings |= MGMT_SETTING_BREDR; - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) settings |= MGMT_SETTING_LE; - if (test_bit(HCI_LINK_SECURITY, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) settings |= MGMT_SETTING_LINK_SECURITY; - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) settings |= MGMT_SETTING_SSP; - if (test_bit(HCI_HS_ENABLED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_HS_ENABLED)) settings |= MGMT_SETTING_HS; - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) settings |= MGMT_SETTING_ADVERTISING; - if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SC_ENABLED)) settings |= MGMT_SETTING_SECURE_CONN; - if (test_bit(HCI_KEEP_DEBUG_KEYS, &hdev->dev_flags)) + if 
(hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS)) settings |= MGMT_SETTING_DEBUG_KEYS; - if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) settings |= MGMT_SETTING_PRIVACY; + /* The current setting for static address has two purposes. The + * first is to indicate if the static address will be used and + * the second is to indicate if it is actually set. + * + * This means if the static address is not configured, this flag + * will never be set. If the address is configured, then if the + * address is actually used decides if the flag is set or not. + * + * For single mode LE only controllers and dual-mode controllers + * with BR/EDR disabled, the existence of the static address will + * be evaluated. + */ + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || + !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) || + !bacmp(&hdev->bdaddr, BDADDR_ANY)) { + if (bacmp(&hdev->static_addr, BDADDR_ANY)) + settings |= MGMT_SETTING_STATIC_ADDRESS; + } + return settings; } @@ -750,35 +820,33 @@ static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) return ptr; } -static struct pending_cmd *mgmt_pending_find(u16 opcode, struct hci_dev *hdev) +static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev) { - struct pending_cmd *cmd; - - list_for_each_entry(cmd, &hdev->mgmt_pending, list) { - if (cmd->opcode == opcode) - return cmd; - } - - return NULL; + return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev); } -static struct pending_cmd *mgmt_pending_find_data(u16 opcode, +static struct mgmt_pending_cmd *pending_find_data(u16 opcode, struct hci_dev *hdev, const void *data) { - struct pending_cmd *cmd; + return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data); +} - list_for_each_entry(cmd, &hdev->mgmt_pending, list) { - if (cmd->user_data != data) - continue; - if (cmd->opcode == opcode) - return cmd; - } +static u8 get_current_adv_instance(struct hci_dev *hdev) +{ + /* The "Set 
Advertising" setting supersedes the "Add Advertising" + * setting. Here we set the advertising data based on which + * setting was set. When neither apply, default to the global settings, + * represented by instance "0". + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + !hci_dev_test_flag(hdev, HCI_ADVERTISING)) + return hdev->cur_adv_instance; - return NULL; + return 0x00; } -static u8 create_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) { u8 ad_len = 0; size_t name_len; @@ -804,21 +872,42 @@ static u8 create_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } -static void update_scan_rsp_data(struct hci_request *req) +static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, + u8 *ptr) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + + /* TODO: Set the appropriate entries based on advertising instance flags + * here once flags other than 0 are supported. 
+ */ + memcpy(ptr, adv_instance->scan_rsp_data, + adv_instance->scan_rsp_len); + + return adv_instance->scan_rsp_len; +} + +static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_scan_rsp_data cp; u8 len; - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; memset(&cp, 0, sizeof(cp)); - len = create_scan_rsp_data(hdev, cp.data); + if (instance) + len = create_instance_scan_rsp_data(hdev, instance, cp.data); + else + len = create_default_scan_rsp_data(hdev, cp.data); if (hdev->scan_rsp_data_len == len && - memcmp(cp.data, hdev->scan_rsp_data, len) == 0) + !memcmp(cp.data, hdev->scan_rsp_data, len)) return; memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); @@ -829,14 +918,19 @@ static void update_scan_rsp_data(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); } +static void update_scan_rsp_data(struct hci_request *req) +{ + update_inst_scan_rsp_data(req, get_current_adv_instance(req->hdev)); +} + static u8 get_adv_discov_flags(struct hci_dev *hdev) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; /* If there's a pending mgmt command the flags will not yet have * their final values, so check for this first. 
*/ - cmd = mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, hdev); + cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev); if (cmd) { struct mgmt_mode *cp = cmd->param; if (cp->val == 0x01) @@ -844,39 +938,140 @@ static u8 get_adv_discov_flags(struct hci_dev *hdev) else if (cp->val == 0x02) return LE_AD_LIMITED; } else { - if (test_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) return LE_AD_LIMITED; - else if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) + else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) return LE_AD_GENERAL; } return 0; } -static u8 create_adv_data(struct hci_dev *hdev, u8 *ptr) +static bool get_connectable(struct hci_dev *hdev) { + struct mgmt_pending_cmd *cmd; + + /* If there's a pending mgmt command the flag will not yet have + * it's final value, so check for this first. + */ + cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev); + if (cmd) { + struct mgmt_mode *cp = cmd->param; + + return cp->val; + } + + return hci_dev_test_flag(hdev, HCI_CONNECTABLE); +} + +static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) +{ + u32 flags; + struct adv_info *adv_instance; + + if (instance == 0x00) { + /* Instance 0 always manages the "Tx Power" and "Flags" + * fields + */ + flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; + + /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting + * corresponds to the "connectable" instance flag. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) + flags |= MGMT_ADV_FLAG_CONNECTABLE; + + return flags; + } + + adv_instance = hci_find_adv_instance(hdev, instance); + + /* Return 0 when we got an invalid instance identifier. 
*/ + if (!adv_instance) + return 0; + + return adv_instance->flags; +} + +static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) +{ + u8 instance = get_current_adv_instance(hdev); + struct adv_info *adv_instance; + + /* Ignore instance 0 */ + if (instance == 0x00) + return 0; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + + /* TODO: Take into account the "appearance" and "local-name" flags here. + * These are currently being ignored as they are not supported. + */ + return adv_instance->scan_rsp_len; +} + +static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) +{ + struct adv_info *adv_instance = NULL; u8 ad_len = 0, flags = 0; + u32 instance_flags; + + /* Return 0 when the current instance identifier is invalid. */ + if (instance) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + } - flags |= get_adv_discov_flags(hdev); + instance_flags = get_adv_instance_flags(hdev, instance); - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - flags |= LE_AD_NO_BREDR; + /* The Add Advertising command allows userspace to set both the general + * and limited discoverable flags. + */ + if (instance_flags & MGMT_ADV_FLAG_DISCOV) + flags |= LE_AD_GENERAL; - if (flags) { - BT_DBG("adv flags 0x%02x", flags); + if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) + flags |= LE_AD_LIMITED; - ptr[0] = 2; - ptr[1] = EIR_FLAGS; - ptr[2] = flags; + if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { + /* If a discovery flag wasn't provided, simply use the global + * settings. + */ + if (!flags) + flags |= get_adv_discov_flags(hdev); - ad_len += 3; - ptr += 3; + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + flags |= LE_AD_NO_BREDR; + + /* If flags would still be empty, then there is no need to + * include the "Flags" AD field". 
+ */ + if (flags) { + ptr[0] = 0x02; + ptr[1] = EIR_FLAGS; + ptr[2] = flags; + + ad_len += 3; + ptr += 3; + } } - if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) { - ptr[0] = 2; + if (adv_instance) { + memcpy(ptr, adv_instance->adv_data, + adv_instance->adv_data_len); + ad_len += adv_instance->adv_data_len; + ptr += adv_instance->adv_data_len; + } + + /* Provide Tx Power only if we can provide a valid value for it */ + if (hdev->adv_tx_power != HCI_TX_POWER_INVALID && + (instance_flags & MGMT_ADV_FLAG_TX_POWER)) { + ptr[0] = 0x02; ptr[1] = EIR_TX_POWER; - ptr[2] = (u8) hdev->adv_tx_power; + ptr[2] = (u8)hdev->adv_tx_power; ad_len += 3; ptr += 3; @@ -885,19 +1080,20 @@ static u8 create_adv_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } -static void update_adv_data(struct hci_request *req) +static void update_inst_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; struct hci_cp_le_set_adv_data cp; u8 len; - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; memset(&cp, 0, sizeof(cp)); - len = create_adv_data(hdev, cp.data); + len = create_instance_adv_data(hdev, instance, cp.data); + /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && memcmp(cp.data, hdev->adv_data, len) == 0) return; @@ -910,6 +1106,11 @@ static void update_adv_data(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); } +static void update_adv_data(struct hci_request *req) +{ + update_inst_adv_data(req, get_current_adv_instance(req->hdev)); +} + int mgmt_update_adv_data(struct hci_dev *hdev) { struct hci_request req; @@ -979,10 +1180,10 @@ static void update_eir(struct hci_request *req) if (!lmp_ext_inq_capable(hdev)) return; - if (!test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return; - if (test_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return; 
memset(&cp, 0, sizeof(cp)); @@ -1018,17 +1219,17 @@ static void update_class(struct hci_request *req) if (!hdev_is_powered(hdev)) return; - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return; - if (test_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return; cod[0] = hdev->minor_class; cod[1] = hdev->major_class; cod[2] = get_service_classes(hdev); - if (test_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) cod[1] |= 0x20; if (memcmp(cod, hdev->dev_class, 3) == 0) @@ -1037,22 +1238,6 @@ static void update_class(struct hci_request *req) hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); } -static bool get_connectable(struct hci_dev *hdev) -{ - struct pending_cmd *cmd; - - /* If there's a pending mgmt command the flag will not yet have - * it's final value, so check for this first. - */ - cmd = mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev); - if (cmd) { - struct mgmt_mode *cp = cmd->param; - return cp->val; - } - - return test_bit(HCI_CONNECTABLE, &hdev->dev_flags); -} - static void disable_advertising(struct hci_request *req) { u8 enable = 0x00; @@ -1066,11 +1251,13 @@ static void enable_advertising(struct hci_request *req) struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; + u8 instance; + u32 flags; if (hci_conn_num(hdev, LE_LINK) > 0) return; - if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) disable_advertising(req); /* Clear the HCI_LE_ADV bit temporarily so that the @@ -1078,9 +1265,16 @@ static void enable_advertising(struct hci_request *req) * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. 
*/ - clear_bit(HCI_LE_ADV, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LE_ADV); - connectable = get_connectable(hdev); + instance = get_current_adv_instance(hdev); + flags = get_adv_instance_flags(hdev, instance); + + /* If the "connectable" instance flag was not set, then choose between + * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. + */ + connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || + get_connectable(hdev); /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a @@ -1092,7 +1286,14 @@ static void enable_advertising(struct hci_request *req) memset(&cp, 0, sizeof(cp)); cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval); cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval); - cp.type = connectable ? LE_ADV_IND : LE_ADV_NONCONN_IND; + + if (connectable) + cp.type = LE_ADV_IND; + else if (get_cur_adv_instance_scan_rsp_len(hdev)) + cp.type = LE_ADV_SCAN_IND; + else + cp.type = LE_ADV_NONCONN_IND; + cp.own_address_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; @@ -1107,7 +1308,7 @@ static void service_cache_off(struct work_struct *work) service_cache.work); struct hci_request req; - if (!test_and_clear_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) + if (!hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) return; hci_req_init(&req, hdev); @@ -1130,9 +1331,9 @@ static void rpa_expired(struct work_struct *work) BT_DBG(""); - set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); - if (!test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_ADVERTISING)) return; /* The generation of a new RPA and programming it into the @@ -1145,7 +1346,7 @@ static void rpa_expired(struct work_struct *work) static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) { - if (test_and_set_bit(HCI_MGMT, &hdev->dev_flags)) + if (hci_dev_test_and_set_flag(hdev, HCI_MGMT)) return; 
INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off); @@ -1156,7 +1357,7 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) * for mgmt we require user-space to explicitly enable * it */ - clear_bit(HCI_BONDABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_BONDABLE); } static int read_controller_info(struct sock *sk, struct hci_dev *hdev, @@ -1185,73 +1386,16 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, hci_dev_unlock(hdev); - return cmd_complete(sk, hdev->id, MGMT_OP_READ_INFO, 0, &rp, - sizeof(rp)); -} - -static void mgmt_pending_free(struct pending_cmd *cmd) -{ - sock_put(cmd->sk); - kfree(cmd->param); - kfree(cmd); -} - -static struct pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, - struct hci_dev *hdev, void *data, - u16 len) -{ - struct pending_cmd *cmd; - - cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); - if (!cmd) - return NULL; - - cmd->opcode = opcode; - cmd->index = hdev->id; - - cmd->param = kmemdup(data, len, GFP_KERNEL); - if (!cmd->param) { - kfree(cmd); - return NULL; - } - - cmd->param_len = len; - - cmd->sk = sk; - sock_hold(sk); - - list_add(&cmd->list, &hdev->mgmt_pending); - - return cmd; -} - -static void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, - void (*cb)(struct pending_cmd *cmd, - void *data), - void *data) -{ - struct pending_cmd *cmd, *tmp; - - list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { - if (opcode > 0 && cmd->opcode != opcode) - continue; - - cb(cmd, data); - } -} - -static void mgmt_pending_remove(struct pending_cmd *cmd) -{ - list_del(&cmd->list); - mgmt_pending_free(cmd); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_INFO, 0, &rp, + sizeof(rp)); } static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 settings = cpu_to_le32(get_current_settings(hdev)); - return cmd_complete(sk, hdev->id, opcode, 0, &settings, - sizeof(settings)); + return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &settings, + 
sizeof(settings)); } static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode) @@ -1272,9 +1416,10 @@ static bool hci_stop_discovery(struct hci_request *req) switch (hdev->discovery.state) { case DISCOVERY_FINDING: - if (test_bit(HCI_INQUIRY, &hdev->flags)) { + if (test_bit(HCI_INQUIRY, &hdev->flags)) hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL); - } else { + + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); hci_req_add_le_scan_disable(req); } @@ -1295,7 +1440,7 @@ static bool hci_stop_discovery(struct hci_request *req) default: /* Passive scanning */ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_req_add_le_scan_disable(req); return true; } @@ -1306,6 +1451,163 @@ static bool hci_stop_discovery(struct hci_request *req) return false; } +static void advertising_added(struct sock *sk, struct hci_dev *hdev, + u8 instance) +{ + struct mgmt_ev_advertising_added ev; + + ev.instance = instance; + + mgmt_event(MGMT_EV_ADVERTISING_ADDED, hdev, &ev, sizeof(ev), sk); +} + +static void advertising_removed(struct sock *sk, struct hci_dev *hdev, + u8 instance) +{ + struct mgmt_ev_advertising_removed ev; + + ev.instance = instance; + + mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk); +} + +static int schedule_adv_instance(struct hci_request *req, u8 instance, + bool force) { + struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance = NULL; + u16 timeout; + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) + return -EPERM; + + if (hdev->adv_instance_timeout) + return -EBUSY; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + /* A zero timeout means unlimited advertising. As long as there is + * only one instance, duration should be ignored. We still set a timeout + * in case further instances are being added later on. 
+ * + * If the remaining lifetime of the instance is more than the duration + * then the timeout corresponds to the duration, otherwise it will be + * reduced to the remaining instance lifetime. + */ + if (adv_instance->timeout == 0 || + adv_instance->duration <= adv_instance->remaining_time) + timeout = adv_instance->duration; + else + timeout = adv_instance->remaining_time; + + /* The remaining time is being reduced unless the instance is being + * advertised without time limit. + */ + if (adv_instance->timeout) + adv_instance->remaining_time = + adv_instance->remaining_time - timeout; + + hdev->adv_instance_timeout = timeout; + queue_delayed_work(hdev->workqueue, + &hdev->adv_instance_expire, + msecs_to_jiffies(timeout * 1000)); + + /* If we're just re-scheduling the same instance again then do not + * execute any HCI commands. This happens when a single instance is + * being advertised. + */ + if (!force && hdev->cur_adv_instance == instance && + hci_dev_test_flag(hdev, HCI_LE_ADV)) + return 0; + + hdev->cur_adv_instance = instance; + update_adv_data(req); + update_scan_rsp_data(req); + enable_advertising(req); + + return 0; +} + +static void cancel_adv_timeout(struct hci_dev *hdev) +{ + if (hdev->adv_instance_timeout) { + hdev->adv_instance_timeout = 0; + cancel_delayed_work(&hdev->adv_instance_expire); + } +} + +/* For a single instance: + * - force == true: The instance will be removed even when its remaining + * lifetime is not zero. + * - force == false: the instance will be deactivated but kept stored unless + * the remaining lifetime is zero. + * + * For instance == 0x00: + * - force == true: All instances will be removed regardless of their timeout + * setting. + * - force == false: Only instances that have a timeout will be removed. 
+ */ +static void clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, + u8 instance, bool force) +{ + struct adv_info *adv_instance, *n, *next_instance = NULL; + int err; + u8 rem_inst; + + /* Cancel any timeout concerning the removed instance(s). */ + if (!instance || hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); + + /* Get the next instance to advertise BEFORE we remove + * the current one. This can be the same instance again + * if there is only one instance. + */ + if (instance && hdev->cur_adv_instance == instance) + next_instance = hci_get_next_instance(hdev, instance); + + if (instance == 0x00) { + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, + list) { + if (!(force || adv_instance->timeout)) + continue; + + rem_inst = adv_instance->instance; + err = hci_remove_adv_instance(hdev, rem_inst); + if (!err) + advertising_removed(NULL, hdev, rem_inst); + } + hdev->cur_adv_instance = 0x00; + } else { + adv_instance = hci_find_adv_instance(hdev, instance); + + if (force || (adv_instance && adv_instance->timeout && + !adv_instance->remaining_time)) { + /* Don't advertise a removed instance. 
*/ + if (next_instance && + next_instance->instance == instance) + next_instance = NULL; + + err = hci_remove_adv_instance(hdev, instance); + if (!err) + advertising_removed(NULL, hdev, instance); + } + } + + if (list_empty(&hdev->adv_instances)) { + hdev->cur_adv_instance = 0x00; + hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + } + + if (!req || !hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_ADVERTISING)) + return; + + if (next_instance) + schedule_adv_instance(req, next_instance->instance, false); +} + static int clean_up_hci_state(struct hci_dev *hdev) { struct hci_request req; @@ -1321,7 +1623,9 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) + clear_adv_instance(hdev, NULL, 0x00, false); + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) disable_advertising(&req); discov_stopped = hci_stop_discovery(&req); @@ -1369,24 +1673,24 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; int err; BT_DBG("request for %s", hdev->name); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); - if (mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_POWERED, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, + MGMT_STATUS_BUSY); goto failed; } - if (test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) { + if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); if (cp->val) { @@ -1433,11 +1737,10 @@ failed: static int new_settings(struct hci_dev *hdev, struct sock *skip) { - __le32 ev; - - ev = 
cpu_to_le32(get_current_settings(hdev)); + __le32 ev = cpu_to_le32(get_current_settings(hdev)); - return mgmt_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, sizeof(ev), skip); + return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, + sizeof(ev), skip); } int mgmt_new_settings(struct hci_dev *hdev) @@ -1451,7 +1754,7 @@ struct cmd_lookup { u8 mgmt_status; }; -static void settings_rsp(struct pending_cmd *cmd, void *data) +static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct cmd_lookup *match = data; @@ -1467,15 +1770,15 @@ static void settings_rsp(struct pending_cmd *cmd, void *data) mgmt_pending_free(cmd); } -static void cmd_status_rsp(struct pending_cmd *cmd, void *data) +static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data) { u8 *status = data; - cmd_status(cmd->sk, cmd->index, cmd->opcode, *status); + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status); mgmt_pending_remove(cmd); } -static void cmd_complete_rsp(struct pending_cmd *cmd, void *data) +static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) { if (cmd->cmd_complete) { u8 *status = data; @@ -1489,23 +1792,23 @@ static void cmd_complete_rsp(struct pending_cmd *cmd, void *data) cmd_status_rsp(cmd, data); } -static int generic_cmd_complete(struct pending_cmd *cmd, u8 status) +static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { - return cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, - cmd->param, cmd->param_len); + return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + cmd->param, cmd->param_len); } -static int addr_cmd_complete(struct pending_cmd *cmd, u8 status) +static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { - return cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, cmd->param, - sizeof(struct mgmt_addr_info)); + return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + cmd->param, sizeof(struct mgmt_addr_info)); } static u8 mgmt_bredr_support(struct hci_dev 
*hdev) { if (!lmp_bredr_capable(hdev)) return MGMT_STATUS_NOT_SUPPORTED; - else if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + else if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return MGMT_STATUS_REJECTED; else return MGMT_STATUS_SUCCESS; @@ -1515,7 +1818,7 @@ static u8 mgmt_le_support(struct hci_dev *hdev) { if (!lmp_le_capable(hdev)) return MGMT_STATUS_NOT_SUPPORTED; - else if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + else if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return MGMT_STATUS_REJECTED; else return MGMT_STATUS_SUCCESS; @@ -1524,7 +1827,7 @@ static u8 mgmt_le_support(struct hci_dev *hdev) static void set_discoverable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; struct hci_request req; bool changed; @@ -1533,21 +1836,20 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status, hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, hdev); + cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev); if (!cmd) goto unlock; if (status) { u8 mgmt_err = mgmt_status(status); - cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); goto remove_cmd; } cp = cmd->param; if (cp->val) { - changed = !test_and_set_bit(HCI_DISCOVERABLE, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); if (hdev->discov_timeout > 0) { int to = msecs_to_jiffies(hdev->discov_timeout * 1000); @@ -1555,8 +1857,7 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status, to); } } else { - changed = test_and_clear_bit(HCI_DISCOVERABLE, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); } send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev); @@ -1585,7 +1886,7 @@ static int set_discoverable(struct sock 
*sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_discoverable *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; u16 timeout; u8 scan; @@ -1593,14 +1894,14 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("request for %s", hdev->name); - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags) && - !test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, - MGMT_STATUS_REJECTED); + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && + !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_REJECTED); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) - return cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_INVALID_PARAMS); timeout = __le16_to_cpu(cp->timeout); @@ -1609,27 +1910,27 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, */ if ((cp->val == 0x00 && timeout > 0) || (cp->val == 0x02 && timeout == 0)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (!hdev_is_powered(hdev) && timeout > 0) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, - MGMT_STATUS_NOT_POWERED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_NOT_POWERED); goto failed; } - if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) || - mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) || + pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + 
MGMT_STATUS_BUSY); goto failed; } - if (!test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, - MGMT_STATUS_REJECTED); + if (!hci_dev_test_flag(hdev, HCI_CONNECTABLE)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_REJECTED); goto failed; } @@ -1640,8 +1941,8 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, * not a valid operation since it requires a timeout * and so no need to check HCI_LIMITED_DISCOVERABLE. */ - if (!!cp->val != test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) { - change_bit(HCI_DISCOVERABLE, &hdev->dev_flags); + if (!!cp->val != hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) { + hci_dev_change_flag(hdev, HCI_DISCOVERABLE); changed = true; } @@ -1659,9 +1960,9 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, * value with the new value. And if only the timeout gets updated, * then no need for any HCI transactions. */ - if (!!cp->val == test_bit(HCI_DISCOVERABLE, &hdev->dev_flags) && - (cp->val == 0x02) == test_bit(HCI_LIMITED_DISCOVERABLE, - &hdev->dev_flags)) { + if (!!cp->val == hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && + (cp->val == 0x02) == hci_dev_test_flag(hdev, + HCI_LIMITED_DISCOVERABLE)) { cancel_delayed_work(&hdev->discov_off); hdev->discov_timeout = timeout; @@ -1690,16 +1991,16 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, /* Limited discoverable mode */ if (cp->val == 0x02) - set_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_LIMITED_DISCOVERABLE); else - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); hci_req_init(&req, hdev); /* The procedure for LE-only controllers is much simpler - just * update the advertising data. 
*/ - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) goto update_ad; scan = SCAN_PAGE; @@ -1729,7 +2030,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, scan |= SCAN_INQUIRY; } else { - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); } hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, sizeof(scan), &scan); @@ -1752,7 +2053,7 @@ static void write_fast_connectable(struct hci_request *req, bool enable) struct hci_cp_write_page_scan_activity acp; u8 type; - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return; if (hdev->hci_ver < BLUETOOTH_VER_1_2) @@ -1784,7 +2085,7 @@ static void write_fast_connectable(struct hci_request *req, bool enable) static void set_connectable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; bool conn_changed, discov_changed; @@ -1792,26 +2093,26 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status, hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev); + cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev); if (!cmd) goto unlock; if (status) { u8 mgmt_err = mgmt_status(status); - cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); goto remove_cmd; } cp = cmd->param; if (cp->val) { - conn_changed = !test_and_set_bit(HCI_CONNECTABLE, - &hdev->dev_flags); + conn_changed = !hci_dev_test_and_set_flag(hdev, + HCI_CONNECTABLE); discov_changed = false; } else { - conn_changed = test_and_clear_bit(HCI_CONNECTABLE, - &hdev->dev_flags); - discov_changed = test_and_clear_bit(HCI_DISCOVERABLE, - &hdev->dev_flags); + conn_changed = hci_dev_test_and_clear_flag(hdev, + HCI_CONNECTABLE); + discov_changed = hci_dev_test_and_clear_flag(hdev, + HCI_DISCOVERABLE); } 
send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev); @@ -1837,14 +2138,14 @@ static int set_connectable_update_settings(struct hci_dev *hdev, bool changed = false; int err; - if (!!val != test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) + if (!!val != hci_dev_test_flag(hdev, HCI_CONNECTABLE)) changed = true; if (val) { - set_bit(HCI_CONNECTABLE, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_CONNECTABLE); } else { - clear_bit(HCI_CONNECTABLE, &hdev->dev_flags); - clear_bit(HCI_DISCOVERABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_CONNECTABLE); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); } err = send_settings_rsp(sk, MGMT_OP_SET_CONNECTABLE, hdev); @@ -1864,21 +2165,21 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; u8 scan; int err; BT_DBG("request for %s", hdev->name); - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags) && - !test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, - MGMT_STATUS_REJECTED); + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && + !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, + MGMT_STATUS_REJECTED); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); @@ -1887,10 +2188,10 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) || - mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) || + pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) { + err = 
mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE, + MGMT_STATUS_BUSY); goto failed; } @@ -1906,10 +2207,10 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, * by-product of disabling connectable, we need to update the * advertising flags. */ - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { if (!cp->val) { - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); - clear_bit(HCI_DISCOVERABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); } update_adv_data(&req); } else if (cp->val != test_bit(HCI_PSCAN, &hdev->flags)) { @@ -1938,17 +2239,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, } no_scan_update: - /* If we're going from non-connectable to connectable or - * vice-versa when fast connectable is enabled ensure that fast - * connectable gets disabled. write_fast_connectable won't do - * anything if the page scan parameters are already what they - * should be. 
- */ - if (cp->val || test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) - write_fast_connectable(&req, false); - /* Update the advertising parameters if necessary */ - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) enable_advertising(&req); err = hci_req_run(&req, set_connectable_complete); @@ -1975,15 +2268,15 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("request for %s", hdev->name); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (cp->val) - changed = !test_and_set_bit(HCI_BONDABLE, &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_BONDABLE); else - changed = test_and_clear_bit(HCI_BONDABLE, &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_BONDABLE); err = send_settings_rsp(sk, MGMT_OP_SET_BONDABLE, hdev); if (err < 0) @@ -2001,7 +2294,7 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; u8 val, status; int err; @@ -2009,21 +2302,20 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, status = mgmt_bredr_support(hdev); if (status) - return cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, - status); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, + status); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { bool changed = false; - if (!!cp->val != test_bit(HCI_LINK_SECURITY, - 
&hdev->dev_flags)) { - change_bit(HCI_LINK_SECURITY, &hdev->dev_flags); + if (!!cp->val != hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) { + hci_dev_change_flag(hdev, HCI_LINK_SECURITY); changed = true; } @@ -2037,9 +2329,9 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - if (mgmt_pending_find(MGMT_OP_SET_LINK_SECURITY, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_LINK_SECURITY, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, + MGMT_STATUS_BUSY); goto failed; } @@ -2070,7 +2362,7 @@ failed: static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; u8 status; int err; @@ -2078,15 +2370,15 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) status = mgmt_bredr_support(hdev); if (status) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, status); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, status); if (!lmp_ssp_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, + MGMT_STATUS_NOT_SUPPORTED); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); @@ -2094,16 +2386,16 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) bool changed; if (cp->val) { - changed = !test_and_set_bit(HCI_SSP_ENABLED, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, + HCI_SSP_ENABLED); } else { - changed = test_and_clear_bit(HCI_SSP_ENABLED, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_SSP_ENABLED); if (!changed) - changed = 
test_and_clear_bit(HCI_HS_ENABLED, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_HS_ENABLED); else - clear_bit(HCI_HS_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_HS_ENABLED); } err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev); @@ -2116,14 +2408,13 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto failed; } - if (mgmt_pending_find(MGMT_OP_SET_SSP, hdev) || - mgmt_pending_find(MGMT_OP_SET_HS, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_SSP, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, + MGMT_STATUS_BUSY); goto failed; } - if (!!cp->val == test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + if (!!cp->val == hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev); goto failed; } @@ -2134,7 +2425,7 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto failed; } - if (!cp->val && test_bit(HCI_USE_DEBUG_KEYS, &hdev->dev_flags)) + if (!cp->val && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(cp->val), &cp->val); @@ -2160,32 +2451,38 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) status = mgmt_bredr_support(hdev); if (status) - return cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status); if (!lmp_ssp_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_NOT_SUPPORTED); - if (!test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_REJECTED); + if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_REJECTED); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, 
hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); + if (pending_find(MGMT_OP_SET_SSP, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_BUSY); + goto unlock; + } + if (cp->val) { - changed = !test_and_set_bit(HCI_HS_ENABLED, &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_HS_ENABLED); } else { if (hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_HS, - MGMT_STATUS_REJECTED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, + MGMT_STATUS_REJECTED); goto unlock; } - changed = test_and_clear_bit(HCI_HS_ENABLED, &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_HS_ENABLED); } err = send_settings_rsp(sk, MGMT_OP_SET_HS, hdev); @@ -2226,7 +2523,7 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) * has actually been enabled. During power on, the * update in powered_update_hci will take care of it. 
*/ - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { struct hci_request req; hci_req_init(&req, hdev); @@ -2244,7 +2541,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; struct hci_cp_write_le_host_supported hci_cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; u8 val, enabled; @@ -2252,33 +2549,48 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) BT_DBG("request for %s", hdev->name); if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_LE, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + MGMT_STATUS_NOT_SUPPORTED); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_LE, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + MGMT_STATUS_INVALID_PARAMS); + + /* Bluetooth single mode LE only controllers or dual-mode + * controllers configured as LE only devices, do not allow + * switching LE off. These have either LE enabled explicitly + * or BR/EDR has been previously switched off. + * + * When trying to enable an already enabled LE, then gracefully + * send a positive response. Trying to disable it however will + * result into rejection. 
+ */ + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { + if (cp->val == 0x01) + return send_settings_rsp(sk, MGMT_OP_SET_LE, hdev); - /* LE-only devices do not allow toggling LE on/off */ - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_LE, - MGMT_STATUS_REJECTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + MGMT_STATUS_REJECTED); + } hci_dev_lock(hdev); val = !!cp->val; enabled = lmp_host_le_capable(hdev); + if (!val) + clear_adv_instance(hdev, NULL, 0x00, true); + if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; - if (val != test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { - change_bit(HCI_LE_ENABLED, &hdev->dev_flags); + if (val != hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + hci_dev_change_flag(hdev, HCI_LE_ENABLED); changed = true; } - if (!val && test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + if (!val && hci_dev_test_flag(hdev, HCI_ADVERTISING)) { + hci_dev_clear_flag(hdev, HCI_ADVERTISING); changed = true; } @@ -2292,10 +2604,10 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto unlock; } - if (mgmt_pending_find(MGMT_OP_SET_LE, hdev) || - mgmt_pending_find(MGMT_OP_SET_ADVERTISING, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_LE, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_LE, hdev) || + pending_find(MGMT_OP_SET_ADVERTISING, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, + MGMT_STATUS_BUSY); goto unlock; } @@ -2313,7 +2625,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_cp.le = val; hci_cp.simul = 0x00; } else { - if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) disable_advertising(&req); } @@ -2337,7 +2649,7 @@ unlock: */ static bool pending_eir_or_class(struct hci_dev *hdev) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; list_for_each_entry(cmd, &hdev->mgmt_pending, 
list) { switch (cmd->opcode) { @@ -2373,16 +2685,16 @@ static u8 get_uuid_size(const u8 *uuid) static void mgmt_class_complete(struct hci_dev *hdev, u16 mgmt_op, u8 status) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; hci_dev_lock(hdev); - cmd = mgmt_pending_find(mgmt_op, hdev); + cmd = pending_find(mgmt_op, hdev); if (!cmd) goto unlock; - cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(status), - hdev->dev_class, 3); + mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), hdev->dev_class, 3); mgmt_pending_remove(cmd); @@ -2400,7 +2712,7 @@ static void add_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_add_uuid *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; struct bt_uuid *uuid; int err; @@ -2410,8 +2722,8 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_dev_lock(hdev); if (pending_eir_or_class(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_ADD_UUID, - MGMT_STATUS_BUSY); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_UUID, + MGMT_STATUS_BUSY); goto failed; } @@ -2437,8 +2749,8 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) if (err != -ENODATA) goto failed; - err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_UUID, 0, - hdev->dev_class, 3); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_UUID, 0, + hdev->dev_class, 3); goto failed; } @@ -2460,7 +2772,7 @@ static bool enable_service_cache(struct hci_dev *hdev) if (!hdev_is_powered(hdev)) return false; - if (!test_and_set_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) { + if (!hci_dev_test_and_set_flag(hdev, HCI_SERVICE_CACHE)) { queue_delayed_work(hdev->workqueue, &hdev->service_cache, CACHE_TIMEOUT); return true; @@ -2480,7 +2792,7 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct 
mgmt_cp_remove_uuid *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct bt_uuid *match, *tmp; u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; struct hci_request req; @@ -2491,8 +2803,8 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (pending_eir_or_class(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID, - MGMT_STATUS_BUSY); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID, + MGMT_STATUS_BUSY); goto unlock; } @@ -2500,8 +2812,9 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, hci_uuids_clear(hdev); if (enable_service_cache(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, - 0, hdev->dev_class, 3); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_UUID, + 0, hdev->dev_class, 3); goto unlock; } @@ -2520,8 +2833,8 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, } if (found == 0) { - err = cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID, - MGMT_STATUS_INVALID_PARAMS); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID, + MGMT_STATUS_INVALID_PARAMS); goto unlock; } @@ -2536,8 +2849,8 @@ update_class: if (err != -ENODATA) goto unlock; - err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, 0, - hdev->dev_class, 3); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, 0, + hdev->dev_class, 3); goto unlock; } @@ -2565,27 +2878,27 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_dev_class *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; BT_DBG("request for %s", hdev->name); if (!lmp_bredr_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, + MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); if (pending_eir_or_class(hdev)) { - err = 
cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, - MGMT_STATUS_BUSY); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, + MGMT_STATUS_BUSY); goto unlock; } if ((cp->minor & 0x03) != 0 || (cp->major & 0xe0) != 0) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, - MGMT_STATUS_INVALID_PARAMS); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, + MGMT_STATUS_INVALID_PARAMS); goto unlock; } @@ -2593,14 +2906,14 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, hdev->minor_class = cp->minor; if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0, - hdev->dev_class, 3); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0, + hdev->dev_class, 3); goto unlock; } hci_req_init(&req, hdev); - if (test_and_clear_bit(HCI_SERVICE_CACHE, &hdev->dev_flags)) { + if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) { hci_dev_unlock(hdev); cancel_delayed_work_sync(&hdev->service_cache); hci_dev_lock(hdev); @@ -2614,8 +2927,8 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, if (err != -ENODATA) goto unlock; - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0, - hdev->dev_class, 3); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0, + hdev->dev_class, 3); goto unlock; } @@ -2645,15 +2958,15 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("request for %s", hdev->name); if (!lmp_bredr_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_NOT_SUPPORTED); key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { BT_ERR("load_link_keys: too big key_count value %u", key_count); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + 
MGMT_STATUS_INVALID_PARAMS); } expected_len = sizeof(*cp) + key_count * @@ -2661,13 +2974,13 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, if (expected_len != len) { BT_ERR("load_link_keys: expected %u bytes, got %u bytes", expected_len, len); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_INVALID_PARAMS); } if (cp->debug_keys != 0x00 && cp->debug_keys != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_INVALID_PARAMS); BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys, key_count); @@ -2676,8 +2989,9 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_link_key_info *key = &cp->keys[i]; if (key->addr.type != BDADDR_BREDR || key->type > 0x08) - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_INVALID_PARAMS); } hci_dev_lock(hdev); @@ -2685,11 +2999,10 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, hci_link_keys_clear(hdev); if (cp->debug_keys) - changed = !test_and_set_bit(HCI_KEEP_DEBUG_KEYS, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS); else - changed = test_and_clear_bit(HCI_KEEP_DEBUG_KEYS, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_KEEP_DEBUG_KEYS); if (changed) new_settings(hdev, NULL); @@ -2707,7 +3020,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, key->type, key->pin_len, NULL); } - cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, 0, NULL, 0); + mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, 0, NULL, 0); hci_dev_unlock(hdev); @@ -2732,7 +3045,7 @@ static int 
unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_cp_unpair_device *cp = data; struct mgmt_rp_unpair_device rp; struct hci_cp_disconnect dc; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_conn *conn; int err; @@ -2741,20 +3054,21 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, rp.addr.type = cp->addr.type; if (!bdaddr_type_is_valid(cp->addr.type)) - return cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); if (cp->disconnect != 0x00 && cp->disconnect != 0x01) - return cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, - MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_NOT_POWERED, &rp, + sizeof(rp)); goto unlock; } @@ -2804,8 +3118,9 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, } if (err < 0) { - err = cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, - MGMT_STATUS_NOT_PAIRED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, + MGMT_STATUS_NOT_PAIRED, &rp, + sizeof(rp)); goto unlock; } @@ -2813,8 +3128,8 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, * link is requested. 
*/ if (!conn) { - err = cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, 0, - &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, 0, + &rp, sizeof(rp)); device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, sk); goto unlock; } @@ -2844,7 +3159,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_disconnect *cp = data; struct mgmt_rp_disconnect rp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_conn *conn; int err; @@ -2855,21 +3170,22 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, rp.addr.type = cp->addr.type; if (!bdaddr_type_is_valid(cp->addr.type)) - return cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); hci_dev_lock(hdev); if (!test_bit(HCI_UP, &hdev->flags)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_NOT_POWERED, &rp, + sizeof(rp)); goto failed; } - if (mgmt_pending_find(MGMT_OP_DISCONNECT, hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_BUSY, &rp, sizeof(rp)); + if (pending_find(MGMT_OP_DISCONNECT, hdev)) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_BUSY, &rp, sizeof(rp)); goto failed; } @@ -2880,8 +3196,9 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) { - err = cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_NOT_CONNECTED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, + MGMT_STATUS_NOT_CONNECTED, &rp, + sizeof(rp)); goto failed; } @@ -2935,8 +3252,8 @@ static int get_connections(struct 
sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, - MGMT_STATUS_NOT_POWERED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, + MGMT_STATUS_NOT_POWERED); goto unlock; } @@ -2969,8 +3286,8 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, /* Recalculate length in case of filtered SCO connections, etc */ rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info)); - err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp, - rp_len); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp, + rp_len); kfree(rp); @@ -2982,7 +3299,7 @@ unlock: static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_pin_code_neg_reply *cp) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; int err; cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_NEG_REPLY, hdev, cp, @@ -3004,7 +3321,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; struct mgmt_cp_pin_code_reply *cp = data; struct hci_cp_pin_code_reply reply; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; int err; BT_DBG(""); @@ -3012,15 +3329,15 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY, - MGMT_STATUS_NOT_POWERED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY, + MGMT_STATUS_NOT_POWERED); goto failed; } conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); if (!conn) { - err = cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY, - MGMT_STATUS_NOT_CONNECTED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY, + MGMT_STATUS_NOT_CONNECTED); goto failed; } @@ -3033,8 +3350,8 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, err = send_pin_code_neg_reply(sk, hdev, 
&ncp); if (err >= 0) - err = cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY, - MGMT_STATUS_INVALID_PARAMS); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY, + MGMT_STATUS_INVALID_PARAMS); goto failed; } @@ -3068,8 +3385,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) - return cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, - MGMT_STATUS_INVALID_PARAMS, NULL, 0); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, + MGMT_STATUS_INVALID_PARAMS, NULL, 0); hci_dev_lock(hdev); @@ -3080,14 +3397,14 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_unlock(hdev); - return cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, 0, NULL, - 0); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, 0, + NULL, 0); } -static struct pending_cmd *find_pairing(struct hci_conn *conn) +static struct mgmt_pending_cmd *find_pairing(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; list_for_each_entry(cmd, &hdev->mgmt_pending, list) { if (cmd->opcode != MGMT_OP_PAIR_DEVICE) @@ -3102,7 +3419,7 @@ static struct pending_cmd *find_pairing(struct hci_conn *conn) return NULL; } -static int pairing_complete(struct pending_cmd *cmd, u8 status) +static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status) { struct mgmt_rp_pair_device rp; struct hci_conn *conn = cmd->user_data; @@ -3111,8 +3428,8 @@ static int pairing_complete(struct pending_cmd *cmd, u8 status) bacpy(&rp.addr.bdaddr, &conn->dst); rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type); - err = cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE, status, - &rp, sizeof(rp)); + err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE, + status, &rp, sizeof(rp)); /* So we don't get further callbacks for this connection */ conn->connect_cfm_cb = NULL; @@ -3134,7 
+3451,7 @@ static int pairing_complete(struct pending_cmd *cmd, u8 status) void mgmt_smp_complete(struct hci_conn *conn, bool complete) { u8 status = complete ? MGMT_STATUS_SUCCESS : MGMT_STATUS_FAILED; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; cmd = find_pairing(conn); if (cmd) { @@ -3145,7 +3462,7 @@ void mgmt_smp_complete(struct hci_conn *conn, bool complete) static void pairing_complete_cb(struct hci_conn *conn, u8 status) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status %u", status); @@ -3161,7 +3478,7 @@ static void pairing_complete_cb(struct hci_conn *conn, u8 status) static void le_pairing_complete_cb(struct hci_conn *conn, u8 status) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status %u", status); @@ -3183,7 +3500,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_pair_device *cp = data; struct mgmt_rp_pair_device rp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; u8 sec_level, auth_type; struct hci_conn *conn; int err; @@ -3195,20 +3512,28 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, rp.addr.type = cp->addr.type; if (!bdaddr_type_is_valid(cp->addr.type)) - return cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); if (cp->io_cap > SMP_IO_KEYBOARD_DISPLAY) - return cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, - MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_NOT_POWERED, &rp, + sizeof(rp)); + goto unlock; + } + 
+ if (hci_bdaddr_is_paired(hdev, &cp->addr.bdaddr, cp->addr.type)) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_ALREADY_PAIRED, &rp, + sizeof(rp)); goto unlock; } @@ -3249,19 +3574,22 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (PTR_ERR(conn) == -EBUSY) status = MGMT_STATUS_BUSY; + else if (PTR_ERR(conn) == -EOPNOTSUPP) + status = MGMT_STATUS_NOT_SUPPORTED; + else if (PTR_ERR(conn) == -ECONNREFUSED) + status = MGMT_STATUS_REJECTED; else status = MGMT_STATUS_CONNECT_FAILED; - err = cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, - status, &rp, - sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + status, &rp, sizeof(rp)); goto unlock; } if (conn->connect_cfm_cb) { hci_conn_drop(conn); - err = cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, - MGMT_STATUS_BUSY, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_BUSY, &rp, sizeof(rp)); goto unlock; } @@ -3305,7 +3633,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_addr_info *addr = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_conn *conn; int err; @@ -3314,31 +3642,31 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, - MGMT_STATUS_NOT_POWERED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, + MGMT_STATUS_NOT_POWERED); goto unlock; } - cmd = mgmt_pending_find(MGMT_OP_PAIR_DEVICE, hdev); + cmd = pending_find(MGMT_OP_PAIR_DEVICE, hdev); if (!cmd) { - err = cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, - MGMT_STATUS_INVALID_PARAMS); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS); goto unlock; } conn = cmd->user_data; if (bacmp(&addr->bdaddr, &conn->dst) != 0) { - err = cmd_status(sk, 
hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, - MGMT_STATUS_INVALID_PARAMS); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS); goto unlock; } cmd->cmd_complete(cmd, MGMT_STATUS_CANCELLED); mgmt_pending_remove(cmd); - err = cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0, - addr, sizeof(*addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0, + addr, sizeof(*addr)); unlock: hci_dev_unlock(hdev); return err; @@ -3348,16 +3676,16 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, struct mgmt_addr_info *addr, u16 mgmt_op, u16 hci_op, __le32 passkey) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_conn *conn; int err; hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, mgmt_op, - MGMT_STATUS_NOT_POWERED, addr, - sizeof(*addr)); + err = mgmt_cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_NOT_POWERED, addr, + sizeof(*addr)); goto done; } @@ -3367,22 +3695,22 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &addr->bdaddr); if (!conn) { - err = cmd_complete(sk, hdev->id, mgmt_op, - MGMT_STATUS_NOT_CONNECTED, addr, - sizeof(*addr)); + err = mgmt_cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_NOT_CONNECTED, addr, + sizeof(*addr)); goto done; } if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) { err = smp_user_confirm_reply(conn, mgmt_op, passkey); if (!err) - err = cmd_complete(sk, hdev->id, mgmt_op, - MGMT_STATUS_SUCCESS, addr, - sizeof(*addr)); + err = mgmt_cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_SUCCESS, addr, + sizeof(*addr)); else - err = cmd_complete(sk, hdev->id, mgmt_op, - MGMT_STATUS_FAILED, addr, - sizeof(*addr)); + err = mgmt_cmd_complete(sk, hdev->id, mgmt_op, + MGMT_STATUS_FAILED, addr, + sizeof(*addr)); goto done; } @@ -3434,8 +3762,8 @@ static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void 
*data, BT_DBG(""); if (len != sizeof(*cp)) - return cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY, + MGMT_STATUS_INVALID_PARAMS); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_CONFIRM_REPLY, @@ -3491,24 +3819,24 @@ static void update_name(struct hci_request *req) static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_cp_set_local_name *cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status 0x%02x", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); + cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); if (!cmd) goto unlock; cp = cmd->param; if (status) - cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, - mgmt_status(status)); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, + mgmt_status(status)); else - cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, - cp, sizeof(*cp)); + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, + cp, sizeof(*cp)); mgmt_pending_remove(cmd); @@ -3520,7 +3848,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_local_name *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; @@ -3534,8 +3862,8 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (!memcmp(hdev->dev_name, cp->name, sizeof(hdev->dev_name)) && !memcmp(hdev->short_name, cp->short_name, sizeof(hdev->short_name))) { - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, - data, len); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, + data, len); goto failed; } @@ -3544,13 +3872,13 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (!hdev_is_powered(hdev)) { memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name)); - err = cmd_complete(sk, 
hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, - data, len); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, + data, len); if (err < 0) goto failed; - err = mgmt_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, len, - sk); + err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, + data, len, sk); goto failed; } @@ -3585,10 +3913,70 @@ failed: return err; } +static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct mgmt_rp_read_local_oob_data mgmt_rp; + size_t rp_size = sizeof(mgmt_rp); + struct mgmt_pending_cmd *cmd; + + BT_DBG("%s status %u", hdev->name, status); + + cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev); + if (!cmd) + return; + + if (status || !skb) { + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + status ? mgmt_status(status) : MGMT_STATUS_FAILED); + goto remove; + } + + memset(&mgmt_rp, 0, sizeof(mgmt_rp)); + + if (opcode == HCI_OP_READ_LOCAL_OOB_DATA) { + struct hci_rp_read_local_oob_data *rp = (void *) skb->data; + + if (skb->len < sizeof(*rp)) { + mgmt_cmd_status(cmd->sk, hdev->id, + MGMT_OP_READ_LOCAL_OOB_DATA, + MGMT_STATUS_FAILED); + goto remove; + } + + memcpy(mgmt_rp.hash192, rp->hash, sizeof(rp->hash)); + memcpy(mgmt_rp.rand192, rp->rand, sizeof(rp->rand)); + + rp_size -= sizeof(mgmt_rp.hash256) + sizeof(mgmt_rp.rand256); + } else { + struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data; + + if (skb->len < sizeof(*rp)) { + mgmt_cmd_status(cmd->sk, hdev->id, + MGMT_OP_READ_LOCAL_OOB_DATA, + MGMT_STATUS_FAILED); + goto remove; + } + + memcpy(mgmt_rp.hash192, rp->hash192, sizeof(rp->hash192)); + memcpy(mgmt_rp.rand192, rp->rand192, sizeof(rp->rand192)); + + memcpy(mgmt_rp.hash256, rp->hash256, sizeof(rp->hash256)); + memcpy(mgmt_rp.rand256, rp->rand256, sizeof(rp->rand256)); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + MGMT_STATUS_SUCCESS, &mgmt_rp, rp_size); + +remove: + mgmt_pending_remove(cmd); +} + 
static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; + struct hci_request req; int err; BT_DBG("%s", hdev->name); @@ -3596,20 +3984,20 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_NOT_POWERED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + MGMT_STATUS_NOT_POWERED); goto unlock; } if (!lmp_ssp_capable(hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_NOT_SUPPORTED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + MGMT_STATUS_NOT_SUPPORTED); goto unlock; } - if (mgmt_pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + MGMT_STATUS_BUSY); goto unlock; } @@ -3619,12 +4007,14 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, goto unlock; } + hci_req_init(&req, hdev); + if (bredr_sc_enabled(hdev)) - err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_EXT_DATA, - 0, NULL); + hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_EXT_DATA, 0, NULL); else - err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL); + hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL); + err = hci_req_run_skb(&req, read_local_oob_data_complete); if (err < 0) mgmt_pending_remove(cmd); @@ -3642,9 +4032,10 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, BT_DBG("%s ", hdev->name); if (!bdaddr_type_is_valid(addr->type)) - return cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, - MGMT_STATUS_INVALID_PARAMS, addr, - sizeof(*addr)); + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_REMOTE_OOB_DATA, + 
MGMT_STATUS_INVALID_PARAMS, + addr, sizeof(*addr)); hci_dev_lock(hdev); @@ -3653,10 +4044,10 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, u8 status; if (cp->addr.type != BDADDR_BREDR) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_ADD_REMOTE_OOB_DATA, - MGMT_STATUS_INVALID_PARAMS, - &cp->addr, sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_REMOTE_OOB_DATA, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } @@ -3668,8 +4059,9 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, else status = MGMT_STATUS_SUCCESS; - err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, - status, &cp->addr, sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_REMOTE_OOB_DATA, status, + &cp->addr, sizeof(cp->addr)); } else if (len == MGMT_ADD_REMOTE_OOB_EXT_DATA_SIZE) { struct mgmt_cp_add_remote_oob_ext_data *cp = data; u8 *rand192, *hash192, *rand256, *hash256; @@ -3681,10 +4073,10 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, */ if (memcmp(cp->rand192, ZERO_KEY, 16) || memcmp(cp->hash192, ZERO_KEY, 16)) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_ADD_REMOTE_OOB_DATA, - MGMT_STATUS_INVALID_PARAMS, - addr, sizeof(*addr)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_REMOTE_OOB_DATA, + MGMT_STATUS_INVALID_PARAMS, + addr, sizeof(*addr)); goto unlock; } @@ -3724,12 +4116,13 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, else status = MGMT_STATUS_SUCCESS; - err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, - status, &cp->addr, sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_REMOTE_OOB_DATA, + status, &cp->addr, sizeof(cp->addr)); } else { BT_ERR("add_remote_oob_data: invalid length of %u bytes", len); - err = cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, - MGMT_STATUS_INVALID_PARAMS); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA, + 
MGMT_STATUS_INVALID_PARAMS); } unlock: @@ -3747,9 +4140,10 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev, BT_DBG("%s", hdev->name); if (cp->addr.type != BDADDR_BREDR) - return cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA, - MGMT_STATUS_INVALID_PARAMS, - &cp->addr, sizeof(cp->addr)); + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_REMOTE_OOB_DATA, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); hci_dev_lock(hdev); @@ -3766,100 +4160,137 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev, status = MGMT_STATUS_SUCCESS; done: - err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA, - status, &cp->addr, sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA, + status, &cp->addr, sizeof(cp->addr)); hci_dev_unlock(hdev); return err; } -static bool trigger_discovery(struct hci_request *req, u8 *status) +static bool trigger_bredr_inquiry(struct hci_request *req, u8 *status) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_param param_cp; - struct hci_cp_le_set_scan_enable enable_cp; - struct hci_cp_inquiry inq_cp; + struct hci_cp_inquiry cp; /* General inquiry access code (GIAC) */ u8 lap[3] = { 0x33, 0x8b, 0x9e }; + + *status = mgmt_bredr_support(hdev); + if (*status) + return false; + + if (hci_dev_test_flag(hdev, HCI_INQUIRY)) { + *status = MGMT_STATUS_BUSY; + return false; + } + + hci_inquiry_cache_flush(hdev); + + memset(&cp, 0, sizeof(cp)); + memcpy(&cp.lap, lap, sizeof(cp.lap)); + cp.length = DISCOV_BREDR_INQUIRY_LEN; + + hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); + + return true; +} + +static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_scan_param param_cp; + struct hci_cp_le_set_scan_enable enable_cp; u8 own_addr_type; int err; - switch (hdev->discovery.type) { - case DISCOV_TYPE_BREDR: - *status = 
mgmt_bredr_support(hdev); - if (*status) - return false; + *status = mgmt_le_support(hdev); + if (*status) + return false; - if (test_bit(HCI_INQUIRY, &hdev->flags)) { - *status = MGMT_STATUS_BUSY; + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { + /* Don't let discovery abort an outgoing connection attempt + * that's using directed advertising. + */ + if (hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { + *status = MGMT_STATUS_REJECTED; return false; } - hci_inquiry_cache_flush(hdev); + cancel_adv_timeout(hdev); + disable_advertising(req); + } - memset(&inq_cp, 0, sizeof(inq_cp)); - memcpy(&inq_cp.lap, lap, sizeof(inq_cp.lap)); - inq_cp.length = DISCOV_BREDR_INQUIRY_LEN; - hci_req_add(req, HCI_OP_INQUIRY, sizeof(inq_cp), &inq_cp); - break; + /* If controller is scanning, it means the background scanning is + * running. Thus, we should temporarily stop it in order to set the + * discovery scanning parameters. + */ + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(req); - case DISCOV_TYPE_LE: - case DISCOV_TYPE_INTERLEAVED: - *status = mgmt_le_support(hdev); - if (*status) - return false; + /* All active scans will be done with either a resolvable private + * address (when privacy feature has been enabled) or non-resolvable + * private address. 
+ */ + err = hci_update_random_address(req, true, &own_addr_type); + if (err < 0) { + *status = MGMT_STATUS_FAILED; + return false; + } - if (hdev->discovery.type == DISCOV_TYPE_INTERLEAVED && - !test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { - *status = MGMT_STATUS_NOT_SUPPORTED; + memset(&param_cp, 0, sizeof(param_cp)); + param_cp.type = LE_SCAN_ACTIVE; + param_cp.interval = cpu_to_le16(interval); + param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN); + param_cp.own_address_type = own_addr_type; + + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), + &param_cp); + + memset(&enable_cp, 0, sizeof(enable_cp)); + enable_cp.enable = LE_SCAN_ENABLE; + enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), + &enable_cp); + + return true; +} + +static bool trigger_discovery(struct hci_request *req, u8 *status) +{ + struct hci_dev *hdev = req->hdev; + + switch (hdev->discovery.type) { + case DISCOV_TYPE_BREDR: + if (!trigger_bredr_inquiry(req, status)) return false; - } + break; - if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) { - /* Don't let discovery abort an outgoing - * connection attempt that's using directed - * advertising. + case DISCOV_TYPE_INTERLEAVED: + if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, + &hdev->quirks)) { + /* During simultaneous discovery, we double LE scan + * interval. We must leave some time for the controller + * to do BR/EDR inquiry. */ - if (hci_conn_hash_lookup_state(hdev, LE_LINK, - BT_CONNECT)) { - *status = MGMT_STATUS_REJECTED; + if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT * 2, + status)) return false; - } - disable_advertising(req); - } - - /* If controller is scanning, it means the background scanning - * is running. Thus, we should temporarily stop it in order to - * set the discovery scanning parameters. 
- */ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) - hci_req_add_le_scan_disable(req); + if (!trigger_bredr_inquiry(req, status)) + return false; - memset(&param_cp, 0, sizeof(param_cp)); + return true; + } - /* All active scans will be done with either a resolvable - * private address (when privacy feature has been enabled) - * or non-resolvable private address. - */ - err = hci_update_random_address(req, true, &own_addr_type); - if (err < 0) { - *status = MGMT_STATUS_FAILED; + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { + *status = MGMT_STATUS_NOT_SUPPORTED; return false; } + /* fall through */ - param_cp.type = LE_SCAN_ACTIVE; - param_cp.interval = cpu_to_le16(DISCOV_LE_SCAN_INT); - param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN); - param_cp.own_address_type = own_addr_type; - hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), - &param_cp); - - memset(&enable_cp, 0, sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), - &enable_cp); + case DISCOV_TYPE_LE: + if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT, status)) + return false; break; default: @@ -3873,16 +4304,16 @@ static bool trigger_discovery(struct hci_request *req, u8 *status) static void start_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; unsigned long timeout; BT_DBG("status %d", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_START_DISCOVERY, hdev); + cmd = pending_find(MGMT_OP_START_DISCOVERY, hdev); if (!cmd) - cmd = mgmt_pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev); + cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev); if (cmd) { cmd->cmd_complete(cmd, mgmt_status(status)); @@ -3904,7 +4335,18 @@ static void start_discovery_complete(struct hci_dev *hdev, u8 status, timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); break; case DISCOV_TYPE_INTERLEAVED: - timeout = 
msecs_to_jiffies(hdev->discov_interleaved_timeout); + /* When running simultaneous discovery, the LE scanning time + * should occupy the whole discovery time since BR/EDR inquiry + * and LE scanning are scheduled by the controller. + * + * For interleaving discovery in comparison, BR/EDR inquiry + * and LE scanning are done sequentially with separate + * timeouts. + */ + if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); + else + timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); break; case DISCOV_TYPE_BREDR: timeout = 0; @@ -3923,8 +4365,7 @@ static void start_discovery_complete(struct hci_dev *hdev, u8 status, */ if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && - (hdev->discovery.uuid_count > 0 || - hdev->discovery.rssi != HCI_RSSI_INVALID)) { + hdev->discovery.result_filtering) { hdev->discovery.scan_start = jiffies; hdev->discovery.scan_duration = timeout; } @@ -3941,7 +4382,7 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_start_discovery *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; u8 status; int err; @@ -3951,17 +4392,17 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, - MGMT_STATUS_NOT_POWERED, - &cp->type, sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, + MGMT_STATUS_NOT_POWERED, + &cp->type, sizeof(cp->type)); goto failed; } if (hdev->discovery.state != DISCOVERY_STOPPED || - test_bit(HCI_PERIODIC_INQ, &hdev->dev_flags)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, - MGMT_STATUS_BUSY, &cp->type, - sizeof(cp->type)); + hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, + MGMT_STATUS_BUSY, &cp->type, + sizeof(cp->type)); goto 
failed; } @@ -3984,8 +4425,8 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); if (!trigger_discovery(&req, &status)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, - status, &cp->type, sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, + status, &cp->type, sizeof(cp->type)); mgmt_pending_remove(cmd); goto failed; } @@ -4003,17 +4444,18 @@ failed: return err; } -static int service_discovery_cmd_complete(struct pending_cmd *cmd, u8 status) +static int service_discovery_cmd_complete(struct mgmt_pending_cmd *cmd, + u8 status) { - return cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, - cmd->param, 1); + return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + cmd->param, 1); } static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_start_service_discovery *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; const u16 max_uuid_count = ((U16_MAX - sizeof(*cp)) / 16); u16 uuid_count, expected_len; @@ -4025,19 +4467,19 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - MGMT_STATUS_NOT_POWERED, - &cp->type, sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + MGMT_STATUS_NOT_POWERED, + &cp->type, sizeof(cp->type)); goto failed; } if (hdev->discovery.state != DISCOVERY_STOPPED || - test_bit(HCI_PERIODIC_INQ, &hdev->dev_flags)) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - MGMT_STATUS_BUSY, &cp->type, - sizeof(cp->type)); + hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) { + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + MGMT_STATUS_BUSY, &cp->type, + sizeof(cp->type)); goto failed; } @@ -4045,10 +4487,10 @@ static int 
start_service_discovery(struct sock *sk, struct hci_dev *hdev, if (uuid_count > max_uuid_count) { BT_ERR("service_discovery: too big uuid_count value %u", uuid_count); - err = cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - MGMT_STATUS_INVALID_PARAMS, &cp->type, - sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + MGMT_STATUS_INVALID_PARAMS, &cp->type, + sizeof(cp->type)); goto failed; } @@ -4056,10 +4498,10 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, if (expected_len != len) { BT_ERR("service_discovery: expected %u bytes, got %u bytes", expected_len, len); - err = cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - MGMT_STATUS_INVALID_PARAMS, &cp->type, - sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + MGMT_STATUS_INVALID_PARAMS, &cp->type, + sizeof(cp->type)); goto failed; } @@ -4077,6 +4519,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, */ hci_discovery_filter_clear(hdev); + hdev->discovery.result_filtering = true; hdev->discovery.type = cp->type; hdev->discovery.rssi = cp->rssi; hdev->discovery.uuid_count = uuid_count; @@ -4085,10 +4528,10 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, hdev->discovery.uuids = kmemdup(cp->uuids, uuid_count * 16, GFP_KERNEL); if (!hdev->discovery.uuids) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - MGMT_STATUS_FAILED, - &cp->type, sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + MGMT_STATUS_FAILED, + &cp->type, sizeof(cp->type)); mgmt_pending_remove(cmd); goto failed; } @@ -4097,9 +4540,9 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); if (!trigger_discovery(&req, &status)) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - status, &cp->type, sizeof(cp->type)); + err 
= mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + status, &cp->type, sizeof(cp->type)); mgmt_pending_remove(cmd); goto failed; } @@ -4119,13 +4562,13 @@ failed: static void stop_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status %d", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_STOP_DISCOVERY, hdev); + cmd = pending_find(MGMT_OP_STOP_DISCOVERY, hdev); if (cmd) { cmd->cmd_complete(cmd, mgmt_status(status)); mgmt_pending_remove(cmd); @@ -4141,7 +4584,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_stop_discovery *mgmt_cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; @@ -4150,16 +4593,16 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (!hci_discovery_active(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, - MGMT_STATUS_REJECTED, &mgmt_cp->type, - sizeof(mgmt_cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, + MGMT_STATUS_REJECTED, &mgmt_cp->type, + sizeof(mgmt_cp->type)); goto unlock; } if (hdev->discovery.type != mgmt_cp->type) { - err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, - MGMT_STATUS_INVALID_PARAMS, &mgmt_cp->type, - sizeof(mgmt_cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, + MGMT_STATUS_INVALID_PARAMS, + &mgmt_cp->type, sizeof(mgmt_cp->type)); goto unlock; } @@ -4185,8 +4628,8 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, /* If no HCI commands were sent we're done */ if (err == -ENODATA) { - err = cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0, - &mgmt_cp->type, sizeof(mgmt_cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0, + &mgmt_cp->type, sizeof(mgmt_cp->type)); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } 
@@ -4207,17 +4650,17 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); if (!hci_discovery_active(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, - MGMT_STATUS_FAILED, &cp->addr, - sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); goto failed; } e = hci_inquiry_cache_lookup_unknown(hdev, &cp->addr.bdaddr); if (!e) { - err = cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, - MGMT_STATUS_INVALID_PARAMS, &cp->addr, - sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, + MGMT_STATUS_INVALID_PARAMS, &cp->addr, + sizeof(cp->addr)); goto failed; } @@ -4229,8 +4672,8 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, hci_inquiry_cache_update_resolve(hdev, e); } - err = cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, 0, &cp->addr, - sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, 0, + &cp->addr, sizeof(cp->addr)); failed: hci_dev_unlock(hdev); @@ -4247,9 +4690,9 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("%s", hdev->name); if (!bdaddr_type_is_valid(cp->addr.type)) - return cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &cp->addr, sizeof(cp->addr)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); hci_dev_lock(hdev); @@ -4265,8 +4708,8 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, status = MGMT_STATUS_SUCCESS; done: - err = cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status, - &cp->addr, sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status, + &cp->addr, sizeof(cp->addr)); hci_dev_unlock(hdev); @@ -4283,9 +4726,9 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("%s", hdev->name); if 
(!bdaddr_type_is_valid(cp->addr.type)) - return cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &cp->addr, sizeof(cp->addr)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); hci_dev_lock(hdev); @@ -4301,8 +4744,8 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, status = MGMT_STATUS_SUCCESS; done: - err = cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status, - &cp->addr, sizeof(cp->addr)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status, + &cp->addr, sizeof(cp->addr)); hci_dev_unlock(hdev); @@ -4322,8 +4765,8 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, source = __le16_to_cpu(cp->source); if (source > 0x0002) - return cmd_status(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); @@ -4332,7 +4775,8 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, hdev->devid_product = __le16_to_cpu(cp->product); hdev->devid_version = __le16_to_cpu(cp->version); - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0, NULL, 0); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0, + NULL, 0); hci_req_init(&req, hdev); update_eir(&req); @@ -4343,10 +4787,20 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, return err; } +static void enable_advertising_instance(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + BT_DBG("status %d", status); +} + static void set_advertising_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct cmd_lookup match = { NULL, hdev }; + struct hci_request req; + u8 instance; + struct adv_info *adv_instance; + int err; hci_dev_lock(hdev); @@ -4358,10 +4812,10 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, goto unlock; } - 
if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) - set_bit(HCI_ADVERTISING, &hdev->dev_flags); + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + hci_dev_set_flag(hdev, HCI_ADVERTISING); else - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_ADVERTISING); mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp, &match); @@ -4371,6 +4825,34 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, if (match.sk) sock_put(match.sk); + /* If "Set Advertising" was just disabled and instance advertising was + * set up earlier, then re-enable multi-instance advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) || + list_empty(&hdev->adv_instances)) + goto unlock; + + instance = hdev->cur_adv_instance; + if (!instance) { + adv_instance = list_first_entry_or_null(&hdev->adv_instances, + struct adv_info, list); + if (!adv_instance) + goto unlock; + + instance = adv_instance->instance; + } + + hci_req_init(&req, hdev); + + err = schedule_adv_instance(&req, instance, true); + + if (!err) + err = hci_req_run(&req, enable_advertising_instance); + + if (err) + BT_ERR("Failed to re-configure advertising"); + unlock: hci_dev_unlock(hdev); } @@ -4379,41 +4861,48 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; - u8 val, enabled, status; + u8 val, status; int err; BT_DBG("request for %s", hdev->name); status = mgmt_le_support(hdev); if (status) - return cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, - status); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + status); - if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, - MGMT_STATUS_INVALID_PARAMS); + if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) + return mgmt_cmd_status(sk, hdev->id, 
MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); val = !!cp->val; - enabled = test_bit(HCI_ADVERTISING, &hdev->dev_flags); /* The following conditions are ones which mean that we should * not do any HCI communication but directly send a mgmt * response to user space (after toggling the flag if * necessary). */ - if (!hdev_is_powered(hdev) || val == enabled || + if (!hdev_is_powered(hdev) || + (val == hci_dev_test_flag(hdev, HCI_ADVERTISING) && + (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) || hci_conn_num(hdev, LE_LINK) > 0 || - (test_bit(HCI_LE_SCAN, &hdev->dev_flags) && + (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE)) { - bool changed = false; + bool changed; - if (val != test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { - change_bit(HCI_ADVERTISING, &hdev->dev_flags); - changed = true; + if (cp->val) { + changed = !hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING); + if (cp->val == 0x02) + hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); + else + hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); + } else { + changed = hci_dev_test_and_clear_flag(hdev, HCI_ADVERTISING); + hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); } err = send_settings_rsp(sk, MGMT_OP_SET_ADVERTISING, hdev); @@ -4426,10 +4915,10 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - if (mgmt_pending_find(MGMT_OP_SET_ADVERTISING, hdev) || - mgmt_pending_find(MGMT_OP_SET_LE, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_ADVERTISING, hdev) || + pending_find(MGMT_OP_SET_LE, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_BUSY); goto unlock; } @@ -4441,10 +4930,24 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, hci_req_init(&req, hdev); - if (val) - enable_advertising(&req); + if (cp->val == 0x02) + 
hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); else + hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); + + cancel_adv_timeout(hdev); + + if (val) { + /* Switch to instance "0" for the Set Advertising setting. + * We cannot use update_[adv|scan_rsp]_data() here as the + * HCI_ADVERTISING flag is not yet set. + */ + update_inst_adv_data(&req, 0x00); + update_inst_scan_rsp_data(&req, 0x00); + enable_advertising(&req); + } else { disable_advertising(&req); + } err = hci_req_run(&req, set_advertising_complete); if (err < 0) @@ -4464,34 +4967,38 @@ static int set_static_address(struct sock *sk, struct hci_dev *hdev, BT_DBG("%s", hdev->name); if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, + MGMT_STATUS_NOT_SUPPORTED); if (hdev_is_powered(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, - MGMT_STATUS_REJECTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, + MGMT_STATUS_REJECTED); if (bacmp(&cp->bdaddr, BDADDR_ANY)) { if (!bacmp(&cp->bdaddr, BDADDR_NONE)) - return cmd_status(sk, hdev->id, - MGMT_OP_SET_STATIC_ADDRESS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_STATIC_ADDRESS, + MGMT_STATUS_INVALID_PARAMS); /* Two most significant bits shall be set */ if ((cp->bdaddr.b[5] & 0xc0) != 0xc0) - return cmd_status(sk, hdev->id, - MGMT_OP_SET_STATIC_ADDRESS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_STATIC_ADDRESS, + MGMT_STATUS_INVALID_PARAMS); } hci_dev_lock(hdev); bacpy(&hdev->static_addr, &cp->bdaddr); - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, 0, NULL, 0); + err = send_settings_rsp(sk, MGMT_OP_SET_STATIC_ADDRESS, hdev); + if (err < 0) + goto unlock; - hci_dev_unlock(hdev); + err = new_settings(hdev, sk); +unlock: + hci_dev_unlock(hdev); return err; } @@ -4505,36 +5012,37 @@ static int 
set_scan_params(struct sock *sk, struct hci_dev *hdev, BT_DBG("%s", hdev->name); if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, + MGMT_STATUS_NOT_SUPPORTED); interval = __le16_to_cpu(cp->interval); if (interval < 0x0004 || interval > 0x4000) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, + MGMT_STATUS_INVALID_PARAMS); window = __le16_to_cpu(cp->window); if (window < 0x0004 || window > 0x4000) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, + MGMT_STATUS_INVALID_PARAMS); if (window > interval) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); hdev->le_scan_interval = interval; hdev->le_scan_window = window; - err = cmd_complete(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, 0, NULL, 0); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, 0, + NULL, 0); /* If background scan is running, restart it so new parameters are * loaded. 
*/ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->discovery.state == DISCOVERY_STOPPED) { struct hci_request req; @@ -4554,26 +5062,26 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, static void fast_connectable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status 0x%02x", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev); + cmd = pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev); if (!cmd) goto unlock; if (status) { - cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - mgmt_status(status)); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + mgmt_status(status)); } else { struct mgmt_mode *cp = cmd->param; if (cp->val) - set_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_FAST_CONNECTABLE); else - clear_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE); send_settings_rsp(cmd->sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev); new_settings(hdev, cmd->sk); @@ -4589,43 +5097,43 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; BT_DBG("%s", hdev->name); - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags) || + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) || hdev->hci_ver < BLUETOOTH_VER_1_2) - return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + MGMT_STATUS_NOT_SUPPORTED); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_STATUS_INVALID_PARAMS); - - if (!hdev_is_powered(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - 
MGMT_STATUS_NOT_POWERED); - - if (!test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_STATUS_REJECTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); - if (mgmt_pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + MGMT_STATUS_BUSY); goto unlock; } - if (!!cp->val == test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) { + if (!!cp->val == hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) { err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev); goto unlock; } + if (!hdev_is_powered(hdev)) { + hci_dev_change_flag(hdev, HCI_FAST_CONNECTABLE); + err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, + hdev); + new_settings(hdev, sk); + goto unlock; + } + cmd = mgmt_pending_add(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev, data, len); if (!cmd) { @@ -4639,8 +5147,8 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, err = hci_req_run(&req, fast_connectable_complete); if (err < 0) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_STATUS_FAILED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE, + MGMT_STATUS_FAILED); mgmt_pending_remove(cmd); } @@ -4652,13 +5160,13 @@ unlock: static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status 0x%02x", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_SET_BREDR, hdev); + cmd = pending_find(MGMT_OP_SET_BREDR, hdev); if (!cmd) goto unlock; @@ -4668,9 +5176,9 @@ static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode) /* We need to restore the flag if related HCI commands * failed. 
*/ - clear_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); - cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); } else { send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev); new_settings(hdev, cmd->sk); @@ -4685,41 +5193,41 @@ unlock: static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; BT_DBG("request for %s", hdev->name); if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, + MGMT_STATUS_NOT_SUPPORTED); - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, - MGMT_STATUS_REJECTED); + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, + MGMT_STATUS_REJECTED); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); - if (cp->val == test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { + if (cp->val == hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev); goto unlock; } if (!hdev_is_powered(hdev)) { if (!cp->val) { - clear_bit(HCI_DISCOVERABLE, &hdev->dev_flags); - clear_bit(HCI_SSP_ENABLED, &hdev->dev_flags); - clear_bit(HCI_LINK_SECURITY, &hdev->dev_flags); - clear_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags); - clear_bit(HCI_HS_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); + hci_dev_clear_flag(hdev, HCI_LINK_SECURITY); + hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE); + 
hci_dev_clear_flag(hdev, HCI_HS_ENABLED); } - change_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + hci_dev_change_flag(hdev, HCI_BREDR_ENABLED); err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev); if (err < 0) @@ -4731,8 +5239,8 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) /* Reject disabling when powered on */ if (!cp->val) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, - MGMT_STATUS_REJECTED); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, + MGMT_STATUS_REJECTED); goto unlock; } else { /* When configuring a dual-mode controller to operate @@ -4749,18 +5257,18 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) * switching BR/EDR back on when secure connections has been * enabled is not a supported transaction. */ - if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && (bacmp(&hdev->static_addr, BDADDR_ANY) || - test_bit(HCI_SC_ENABLED, &hdev->dev_flags))) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, - MGMT_STATUS_REJECTED); + hci_dev_test_flag(hdev, HCI_SC_ENABLED))) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, + MGMT_STATUS_REJECTED); goto unlock; } } - if (mgmt_pending_find(MGMT_OP_SET_BREDR, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_BREDR, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, + MGMT_STATUS_BUSY); goto unlock; } @@ -4773,7 +5281,7 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) /* We need to flip the bit already here so that update_adv_data * generates the correct flags. 
*/ - set_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); hci_req_init(&req, hdev); @@ -4796,20 +5304,20 @@ unlock: static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; BT_DBG("%s status %u", hdev->name, status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_SET_SECURE_CONN, hdev); + cmd = pending_find(MGMT_OP_SET_SECURE_CONN, hdev); if (!cmd) goto unlock; if (status) { - cmd_status(cmd->sk, cmd->index, cmd->opcode, - mgmt_status(status)); + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status)); goto remove; } @@ -4817,16 +5325,16 @@ static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) switch (cp->val) { case 0x00: - clear_bit(HCI_SC_ENABLED, &hdev->dev_flags); - clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_SC_ENABLED); + hci_dev_clear_flag(hdev, HCI_SC_ONLY); break; case 0x01: - set_bit(HCI_SC_ENABLED, &hdev->dev_flags); - clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_SC_ENABLED); + hci_dev_clear_flag(hdev, HCI_SC_ONLY); break; case 0x02: - set_bit(HCI_SC_ENABLED, &hdev->dev_flags); - set_bit(HCI_SC_ONLY, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_SC_ENABLED); + hci_dev_set_flag(hdev, HCI_SC_ONLY); break; } @@ -4843,7 +5351,7 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; u8 val; int err; @@ -4851,37 +5359,37 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, BT_DBG("request for %s", hdev->name); if (!lmp_sc_capable(hdev) && - !test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, - MGMT_STATUS_NOT_SUPPORTED); + !hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, 
MGMT_OP_SET_SECURE_CONN, + MGMT_STATUS_NOT_SUPPORTED); - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && lmp_sc_capable(hdev) && - !test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, - MGMT_STATUS_REJECTED); + !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + MGMT_STATUS_REJECTED); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) - return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (!hdev_is_powered(hdev) || !lmp_sc_capable(hdev) || - !test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { + !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { bool changed; if (cp->val) { - changed = !test_and_set_bit(HCI_SC_ENABLED, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, + HCI_SC_ENABLED); if (cp->val == 0x02) - set_bit(HCI_SC_ONLY, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_SC_ONLY); else - clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_SC_ONLY); } else { - changed = test_and_clear_bit(HCI_SC_ENABLED, - &hdev->dev_flags); - clear_bit(HCI_SC_ONLY, &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_SC_ENABLED); + hci_dev_clear_flag(hdev, HCI_SC_ONLY); } err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev); @@ -4894,16 +5402,16 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, goto failed; } - if (mgmt_pending_find(MGMT_OP_SET_SECURE_CONN, hdev)) { - err = cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, - MGMT_STATUS_BUSY); + if (pending_find(MGMT_OP_SET_SECURE_CONN, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, + MGMT_STATUS_BUSY); goto failed; } val = !!cp->val; - if (val == test_bit(HCI_SC_ENABLED, &hdev->dev_flags) && - (cp->val == 0x02) == test_bit(HCI_SC_ONLY, 
&hdev->dev_flags)) { + if (val == hci_dev_test_flag(hdev, HCI_SC_ENABLED) && + (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_SC_ONLY)) { err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev); goto failed; } @@ -4937,27 +5445,26 @@ static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, BT_DBG("request for %s", hdev->name); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) - return cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (cp->val) - changed = !test_and_set_bit(HCI_KEEP_DEBUG_KEYS, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS); else - changed = test_and_clear_bit(HCI_KEEP_DEBUG_KEYS, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_KEEP_DEBUG_KEYS); if (cp->val == 0x02) - use_changed = !test_and_set_bit(HCI_USE_DEBUG_KEYS, - &hdev->dev_flags); + use_changed = !hci_dev_test_and_set_flag(hdev, + HCI_USE_DEBUG_KEYS); else - use_changed = test_and_clear_bit(HCI_USE_DEBUG_KEYS, - &hdev->dev_flags); + use_changed = hci_dev_test_and_clear_flag(hdev, + HCI_USE_DEBUG_KEYS); if (hdev_is_powered(hdev) && use_changed && - test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { u8 mode = (cp->val == 0x02) ? 
0x01 : 0x00; hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(mode), &mode); @@ -4985,32 +5492,32 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, BT_DBG("request for %s", hdev->name); if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, + MGMT_STATUS_NOT_SUPPORTED); if (cp->privacy != 0x00 && cp->privacy != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, + MGMT_STATUS_INVALID_PARAMS); if (hdev_is_powered(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, - MGMT_STATUS_REJECTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, + MGMT_STATUS_REJECTED); hci_dev_lock(hdev); /* If user space supports this command it is also expected to * handle IRKs. Therefore, set the HCI_RPA_RESOLVING flag. */ - set_bit(HCI_RPA_RESOLVING, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_RESOLVING); if (cp->privacy) { - changed = !test_and_set_bit(HCI_PRIVACY, &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY); memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); - set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); } else { - changed = test_and_clear_bit(HCI_PRIVACY, &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY); memset(hdev->irk, 0, sizeof(hdev->irk)); - clear_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); } err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev); @@ -5053,22 +5560,22 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, BT_DBG("request for %s", hdev->name); if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, + 
MGMT_STATUS_NOT_SUPPORTED); irk_count = __le16_to_cpu(cp->irk_count); if (irk_count > max_irk_count) { BT_ERR("load_irks: too big irk_count value %u", irk_count); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, + MGMT_STATUS_INVALID_PARAMS); } expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); if (expected_len != len) { BT_ERR("load_irks: expected %u bytes, got %u bytes", expected_len, len); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, + MGMT_STATUS_INVALID_PARAMS); } BT_DBG("%s irk_count %u", hdev->name, irk_count); @@ -5077,9 +5584,9 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, struct mgmt_irk_info *key = &cp->irks[i]; if (!irk_is_valid(key)) - return cmd_status(sk, hdev->id, - MGMT_OP_LOAD_IRKS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_LOAD_IRKS, + MGMT_STATUS_INVALID_PARAMS); } hci_dev_lock(hdev); @@ -5099,9 +5606,9 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, BDADDR_ANY); } - set_bit(HCI_RPA_RESOLVING, &hdev->dev_flags); + hci_dev_set_flag(hdev, HCI_RPA_RESOLVING); - err = cmd_complete(sk, hdev->id, MGMT_OP_LOAD_IRKS, 0, NULL, 0); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_IRKS, 0, NULL, 0); hci_dev_unlock(hdev); @@ -5139,14 +5646,14 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, BT_DBG("request for %s", hdev->name); if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, + MGMT_STATUS_NOT_SUPPORTED); key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { BT_ERR("load_ltks: too big key_count value %u", key_count); - return cmd_status(sk, hdev->id, 
MGMT_OP_LOAD_LONG_TERM_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, + MGMT_STATUS_INVALID_PARAMS); } expected_len = sizeof(*cp) + key_count * @@ -5154,8 +5661,8 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, if (expected_len != len) { BT_ERR("load_keys: expected %u bytes, got %u bytes", expected_len, len); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, + MGMT_STATUS_INVALID_PARAMS); } BT_DBG("%s key_count %u", hdev->name, key_count); @@ -5164,9 +5671,9 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, struct mgmt_ltk_info *key = &cp->keys[i]; if (!ltk_is_valid(key)) - return cmd_status(sk, hdev->id, - MGMT_OP_LOAD_LONG_TERM_KEYS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_LOAD_LONG_TERM_KEYS, + MGMT_STATUS_INVALID_PARAMS); } hci_dev_lock(hdev); @@ -5211,7 +5718,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, key->rand); } - err = cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0, + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0, NULL, 0); hci_dev_unlock(hdev); @@ -5219,7 +5726,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, return err; } -static int conn_info_cmd_complete(struct pending_cmd *cmd, u8 status) +static int conn_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { struct hci_conn *conn = cmd->user_data; struct mgmt_rp_get_conn_info rp; @@ -5237,8 +5744,8 @@ static int conn_info_cmd_complete(struct pending_cmd *cmd, u8 status) rp.max_tx_power = HCI_TX_POWER_INVALID; } - err = cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, status, - &rp, sizeof(rp)); + err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, + status, &rp, sizeof(rp)); hci_conn_drop(conn); hci_conn_put(conn); @@ -5250,7 
+5757,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, u16 opcode) { struct hci_cp_read_rssi *cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_conn *conn; u16 handle; u8 status; @@ -5288,7 +5795,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, goto unlock; } - cmd = mgmt_pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn); + cmd = pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn); if (!cmd) goto unlock; @@ -5315,15 +5822,16 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, rp.addr.type = cp->addr.type; if (!bdaddr_type_is_valid(cp->addr.type)) - return cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, - MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_NOT_POWERED, &rp, + sizeof(rp)); goto unlock; } @@ -5334,14 +5842,15 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); if (!conn || conn->state != BT_CONNECTED) { - err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, - MGMT_STATUS_NOT_CONNECTED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_NOT_CONNECTED, &rp, + sizeof(rp)); goto unlock; } - if (mgmt_pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, - MGMT_STATUS_BUSY, &rp, sizeof(rp)); + if (pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn)) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_BUSY, &rp, sizeof(rp)); goto unlock; } @@ -5361,7 +5870,7 @@ static int 
get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; struct hci_cp_read_tx_power req_txp_cp; struct hci_cp_read_rssi req_rssi_cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; hci_req_init(&req, hdev); req_rssi_cp.handle = cpu_to_le16(conn->handle); @@ -5409,8 +5918,8 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, rp.tx_power = conn->tx_power; rp.max_tx_power = conn->max_tx_power; - err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, - MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); } unlock: @@ -5418,7 +5927,7 @@ unlock: return err; } -static int clock_info_cmd_complete(struct pending_cmd *cmd, u8 status) +static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { struct hci_conn *conn = cmd->user_data; struct mgmt_rp_get_clock_info rp; @@ -5443,8 +5952,8 @@ static int clock_info_cmd_complete(struct pending_cmd *cmd, u8 status) } complete: - err = cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp, - sizeof(rp)); + err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp, + sizeof(rp)); if (conn) { hci_conn_drop(conn); @@ -5457,7 +5966,7 @@ complete: static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct hci_cp_read_clock *hci_cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_conn *conn; BT_DBG("%s status %u", hdev->name, status); @@ -5475,7 +5984,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode) conn = NULL; } - cmd = mgmt_pending_find_data(MGMT_OP_GET_CLOCK_INFO, hdev, conn); + cmd = pending_find_data(MGMT_OP_GET_CLOCK_INFO, hdev, conn); if (!cmd) goto unlock; @@ -5492,7 +6001,7 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_cp_get_clock_info *cp = data; struct mgmt_rp_get_clock_info rp; struct 
hci_cp_read_clock hci_cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; struct hci_conn *conn; int err; @@ -5504,15 +6013,16 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, rp.addr.type = cp->addr.type; if (cp->addr.type != BDADDR_BREDR) - return cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, - MGMT_STATUS_INVALID_PARAMS, - &rp, sizeof(rp)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, - MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, + MGMT_STATUS_NOT_POWERED, &rp, + sizeof(rp)); goto unlock; } @@ -5520,10 +6030,10 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); if (!conn || conn->state != BT_CONNECTED) { - err = cmd_complete(sk, hdev->id, - MGMT_OP_GET_CLOCK_INFO, - MGMT_STATUS_NOT_CONNECTED, - &rp, sizeof(rp)); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_GET_CLOCK_INFO, + MGMT_STATUS_NOT_CONNECTED, + &rp, sizeof(rp)); goto unlock; } } else { @@ -5634,13 +6144,13 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, static void add_device_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status 0x%02x", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_ADD_DEVICE, hdev); + cmd = pending_find(MGMT_OP_ADD_DEVICE, hdev); if (!cmd) goto unlock; @@ -5655,7 +6165,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_add_device *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; u8 auto_conn, addr_type; int err; @@ -5664,14 +6174,14 @@ static int add_device(struct sock *sk, 
struct hci_dev *hdev, if (!bdaddr_type_is_valid(cp->addr.type) || !bacmp(&cp->addr.bdaddr, BDADDR_ANY)) - return cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &cp->addr, sizeof(cp->addr)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); if (cp->action != 0x00 && cp->action != 0x01 && cp->action != 0x02) - return cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, - MGMT_STATUS_INVALID_PARAMS, - &cp->addr, sizeof(cp->addr)); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); hci_req_init(&req, hdev); @@ -5757,13 +6267,13 @@ static void device_removed(struct sock *sk, struct hci_dev *hdev, static void remove_device_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; BT_DBG("status 0x%02x", status); hci_dev_lock(hdev); - cmd = mgmt_pending_find(MGMT_OP_REMOVE_DEVICE, hdev); + cmd = pending_find(MGMT_OP_REMOVE_DEVICE, hdev); if (!cmd) goto unlock; @@ -5778,7 +6288,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_remove_device *cp = data; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct hci_request req; int err; @@ -5911,15 +6421,15 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, int i; if (!lmp_le_capable(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, + MGMT_STATUS_NOT_SUPPORTED); param_count = __le16_to_cpu(cp->param_count); if (param_count > max_param_count) { BT_ERR("load_conn_param: too big param_count value %u", param_count); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, + MGMT_STATUS_INVALID_PARAMS); } 
expected_len = sizeof(*cp) + param_count * @@ -5927,8 +6437,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, if (expected_len != len) { BT_ERR("load_conn_param: expected %u bytes, got %u bytes", expected_len, len); - return cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, + MGMT_STATUS_INVALID_PARAMS); } BT_DBG("%s param_count %u", hdev->name, param_count); @@ -5983,7 +6493,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_unlock(hdev); - return cmd_complete(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, 0, NULL, 0); + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, 0, + NULL, 0); } static int set_external_config(struct sock *sk, struct hci_dev *hdev, @@ -5996,25 +6507,23 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, BT_DBG("%s", hdev->name); if (hdev_is_powered(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, - MGMT_STATUS_REJECTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, + MGMT_STATUS_REJECTED); if (cp->config != 0x00 && cp->config != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, + MGMT_STATUS_INVALID_PARAMS); if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, + MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); if (cp->config) - changed = !test_and_set_bit(HCI_EXT_CONFIGURED, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_EXT_CONFIGURED); else - changed = test_and_clear_bit(HCI_EXT_CONFIGURED, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_EXT_CONFIGURED); err = send_options_rsp(sk, 
MGMT_OP_SET_EXTERNAL_CONFIG, hdev); if (err < 0) @@ -6025,12 +6534,12 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, err = new_options(hdev, sk); - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) == is_configured(hdev)) { + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) == is_configured(hdev)) { mgmt_index_removed(hdev); - if (test_and_change_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { - set_bit(HCI_CONFIG, &hdev->dev_flags); - set_bit(HCI_AUTO_OFF, &hdev->dev_flags); + if (hci_dev_test_and_change_flag(hdev, HCI_UNCONFIGURED)) { + hci_dev_set_flag(hdev, HCI_CONFIG); + hci_dev_set_flag(hdev, HCI_AUTO_OFF); queue_work(hdev->req_workqueue, &hdev->power_on); } else { @@ -6054,16 +6563,16 @@ static int set_public_address(struct sock *sk, struct hci_dev *hdev, BT_DBG("%s", hdev->name); if (hdev_is_powered(hdev)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, - MGMT_STATUS_REJECTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, + MGMT_STATUS_REJECTED); if (!bacmp(&cp->bdaddr, BDADDR_ANY)) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, - MGMT_STATUS_INVALID_PARAMS); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, + MGMT_STATUS_INVALID_PARAMS); if (!hdev->set_bdaddr) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, - MGMT_STATUS_NOT_SUPPORTED); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, + MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); @@ -6077,16 +6586,16 @@ static int set_public_address(struct sock *sk, struct hci_dev *hdev, if (!changed) goto unlock; - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) err = new_options(hdev, sk); if (is_configured(hdev)) { mgmt_index_removed(hdev); - clear_bit(HCI_UNCONFIGURED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_UNCONFIGURED); - set_bit(HCI_CONFIG, &hdev->dev_flags); - set_bit(HCI_AUTO_OFF, &hdev->dev_flags); + hci_dev_set_flag(hdev, 
HCI_CONFIG); + hci_dev_set_flag(hdev, HCI_AUTO_OFF); queue_work(hdev->req_workqueue, &hdev->power_on); } @@ -6096,213 +6605,902 @@ unlock: return err; } -static const struct mgmt_handler { - int (*func) (struct sock *sk, struct hci_dev *hdev, void *data, - u16 data_len); - bool var_len; - size_t data_len; -} mgmt_handlers[] = { - { NULL }, /* 0x0000 (no command) */ - { read_version, false, MGMT_READ_VERSION_SIZE }, - { read_commands, false, MGMT_READ_COMMANDS_SIZE }, - { read_index_list, false, MGMT_READ_INDEX_LIST_SIZE }, - { read_controller_info, false, MGMT_READ_INFO_SIZE }, - { set_powered, false, MGMT_SETTING_SIZE }, - { set_discoverable, false, MGMT_SET_DISCOVERABLE_SIZE }, - { set_connectable, false, MGMT_SETTING_SIZE }, - { set_fast_connectable, false, MGMT_SETTING_SIZE }, - { set_bondable, false, MGMT_SETTING_SIZE }, - { set_link_security, false, MGMT_SETTING_SIZE }, - { set_ssp, false, MGMT_SETTING_SIZE }, - { set_hs, false, MGMT_SETTING_SIZE }, - { set_le, false, MGMT_SETTING_SIZE }, - { set_dev_class, false, MGMT_SET_DEV_CLASS_SIZE }, - { set_local_name, false, MGMT_SET_LOCAL_NAME_SIZE }, - { add_uuid, false, MGMT_ADD_UUID_SIZE }, - { remove_uuid, false, MGMT_REMOVE_UUID_SIZE }, - { load_link_keys, true, MGMT_LOAD_LINK_KEYS_SIZE }, - { load_long_term_keys, true, MGMT_LOAD_LONG_TERM_KEYS_SIZE }, - { disconnect, false, MGMT_DISCONNECT_SIZE }, - { get_connections, false, MGMT_GET_CONNECTIONS_SIZE }, - { pin_code_reply, false, MGMT_PIN_CODE_REPLY_SIZE }, - { pin_code_neg_reply, false, MGMT_PIN_CODE_NEG_REPLY_SIZE }, - { set_io_capability, false, MGMT_SET_IO_CAPABILITY_SIZE }, - { pair_device, false, MGMT_PAIR_DEVICE_SIZE }, - { cancel_pair_device, false, MGMT_CANCEL_PAIR_DEVICE_SIZE }, - { unpair_device, false, MGMT_UNPAIR_DEVICE_SIZE }, - { user_confirm_reply, false, MGMT_USER_CONFIRM_REPLY_SIZE }, - { user_confirm_neg_reply, false, MGMT_USER_CONFIRM_NEG_REPLY_SIZE }, - { user_passkey_reply, false, MGMT_USER_PASSKEY_REPLY_SIZE }, - { 
user_passkey_neg_reply, false, MGMT_USER_PASSKEY_NEG_REPLY_SIZE }, - { read_local_oob_data, false, MGMT_READ_LOCAL_OOB_DATA_SIZE }, - { add_remote_oob_data, true, MGMT_ADD_REMOTE_OOB_DATA_SIZE }, - { remove_remote_oob_data, false, MGMT_REMOVE_REMOTE_OOB_DATA_SIZE }, - { start_discovery, false, MGMT_START_DISCOVERY_SIZE }, - { stop_discovery, false, MGMT_STOP_DISCOVERY_SIZE }, - { confirm_name, false, MGMT_CONFIRM_NAME_SIZE }, - { block_device, false, MGMT_BLOCK_DEVICE_SIZE }, - { unblock_device, false, MGMT_UNBLOCK_DEVICE_SIZE }, - { set_device_id, false, MGMT_SET_DEVICE_ID_SIZE }, - { set_advertising, false, MGMT_SETTING_SIZE }, - { set_bredr, false, MGMT_SETTING_SIZE }, - { set_static_address, false, MGMT_SET_STATIC_ADDRESS_SIZE }, - { set_scan_params, false, MGMT_SET_SCAN_PARAMS_SIZE }, - { set_secure_conn, false, MGMT_SETTING_SIZE }, - { set_debug_keys, false, MGMT_SETTING_SIZE }, - { set_privacy, false, MGMT_SET_PRIVACY_SIZE }, - { load_irks, true, MGMT_LOAD_IRKS_SIZE }, - { get_conn_info, false, MGMT_GET_CONN_INFO_SIZE }, - { get_clock_info, false, MGMT_GET_CLOCK_INFO_SIZE }, - { add_device, false, MGMT_ADD_DEVICE_SIZE }, - { remove_device, false, MGMT_REMOVE_DEVICE_SIZE }, - { load_conn_param, true, MGMT_LOAD_CONN_PARAM_SIZE }, - { read_unconf_index_list, false, MGMT_READ_UNCONF_INDEX_LIST_SIZE }, - { read_config_info, false, MGMT_READ_CONFIG_INFO_SIZE }, - { set_external_config, false, MGMT_SET_EXTERNAL_CONFIG_SIZE }, - { set_public_address, false, MGMT_SET_PUBLIC_ADDRESS_SIZE }, - { start_service_discovery,true, MGMT_START_SERVICE_DISCOVERY_SIZE }, -}; +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, + u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; -int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen) + return eir_len; +} + +static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, + u16 
opcode, struct sk_buff *skb) { - void *buf; - u8 *cp; - struct mgmt_hdr *hdr; - u16 opcode, index, len; - struct hci_dev *hdev = NULL; - const struct mgmt_handler *handler; + const struct mgmt_cp_read_local_oob_ext_data *mgmt_cp; + struct mgmt_rp_read_local_oob_ext_data *mgmt_rp; + u8 *h192, *r192, *h256, *r256; + struct mgmt_pending_cmd *cmd; + u16 eir_len; int err; - BT_DBG("got %zu bytes", msglen); + BT_DBG("%s status %u", hdev->name, status); + + cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev); + if (!cmd) + return; - if (msglen < sizeof(*hdr)) - return -EINVAL; + mgmt_cp = cmd->param; - buf = kmalloc(msglen, GFP_KERNEL); - if (!buf) - return -ENOMEM; + if (status) { + status = mgmt_status(status); + eir_len = 0; + + h192 = NULL; + r192 = NULL; + h256 = NULL; + r256 = NULL; + } else if (opcode == HCI_OP_READ_LOCAL_OOB_DATA) { + struct hci_rp_read_local_oob_data *rp; + + if (skb->len != sizeof(*rp)) { + status = MGMT_STATUS_FAILED; + eir_len = 0; + } else { + status = MGMT_STATUS_SUCCESS; + rp = (void *)skb->data; + + eir_len = 5 + 18 + 18; + h192 = rp->hash; + r192 = rp->rand; + h256 = NULL; + r256 = NULL; + } + } else { + struct hci_rp_read_local_oob_ext_data *rp; + + if (skb->len != sizeof(*rp)) { + status = MGMT_STATUS_FAILED; + eir_len = 0; + } else { + status = MGMT_STATUS_SUCCESS; + rp = (void *)skb->data; + + if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { + eir_len = 5 + 18 + 18; + h192 = NULL; + r192 = NULL; + } else { + eir_len = 5 + 18 + 18 + 18 + 18; + h192 = rp->hash192; + r192 = rp->rand192; + } - if (memcpy_from_msg(buf, msg, msglen)) { - err = -EFAULT; + h256 = rp->hash256; + r256 = rp->rand256; + } + } + + mgmt_rp = kmalloc(sizeof(*mgmt_rp) + eir_len, GFP_KERNEL); + if (!mgmt_rp) goto done; + + if (status) + goto send_rsp; + + eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV, + hdev->dev_class, 3); + + if (h192 && r192) { + eir_len = eir_append_data(mgmt_rp->eir, eir_len, + EIR_SSP_HASH_C192, h192, 16); + eir_len = 
eir_append_data(mgmt_rp->eir, eir_len, + EIR_SSP_RAND_R192, r192, 16); } - hdr = buf; - opcode = __le16_to_cpu(hdr->opcode); - index = __le16_to_cpu(hdr->index); - len = __le16_to_cpu(hdr->len); + if (h256 && r256) { + eir_len = eir_append_data(mgmt_rp->eir, eir_len, + EIR_SSP_HASH_C256, h256, 16); + eir_len = eir_append_data(mgmt_rp->eir, eir_len, + EIR_SSP_RAND_R256, r256, 16); + } - if (len != msglen - sizeof(*hdr)) { - err = -EINVAL; +send_rsp: + mgmt_rp->type = mgmt_cp->type; + mgmt_rp->eir_len = cpu_to_le16(eir_len); + + err = mgmt_cmd_complete(cmd->sk, hdev->id, + MGMT_OP_READ_LOCAL_OOB_EXT_DATA, status, + mgmt_rp, sizeof(*mgmt_rp) + eir_len); + if (err < 0 || status) goto done; + + hci_sock_set_flag(cmd->sk, HCI_MGMT_OOB_DATA_EVENTS); + + err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev, + mgmt_rp, sizeof(*mgmt_rp) + eir_len, + HCI_MGMT_OOB_DATA_EVENTS, cmd->sk); +done: + kfree(mgmt_rp); + mgmt_pending_remove(cmd); +} + +static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, + struct mgmt_cp_read_local_oob_ext_data *cp) +{ + struct mgmt_pending_cmd *cmd; + struct hci_request req; + int err; + + cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, + cp, sizeof(*cp)); + if (!cmd) + return -ENOMEM; + + hci_req_init(&req, hdev); + + if (bredr_sc_enabled(hdev)) + hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_EXT_DATA, 0, NULL); + else + hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL); + + err = hci_req_run_skb(&req, read_local_oob_ext_data_complete); + if (err < 0) { + mgmt_pending_remove(cmd); + return err; } - if (index != MGMT_INDEX_NONE) { - hdev = hci_dev_get(index); - if (!hdev) { - err = cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + return 0; +} + +static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_read_local_oob_ext_data *cp = data; + struct mgmt_rp_read_local_oob_ext_data *rp; + size_t rp_len; + u16 eir_len; + 
u8 status, flags, role, addr[7], hash[16], rand[16]; + int err; + + BT_DBG("%s", hdev->name); + + if (hdev_is_powered(hdev)) { + switch (cp->type) { + case BIT(BDADDR_BREDR): + status = mgmt_bredr_support(hdev); + if (status) + eir_len = 0; + else + eir_len = 5; + break; + case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)): + status = mgmt_le_support(hdev); + if (status) + eir_len = 0; + else + eir_len = 9 + 3 + 18 + 18 + 3; + break; + default: + status = MGMT_STATUS_INVALID_PARAMS; + eir_len = 0; + break; } + } else { + status = MGMT_STATUS_NOT_POWERED; + eir_len = 0; + } - if (test_bit(HCI_SETUP, &hdev->dev_flags) || - test_bit(HCI_CONFIG, &hdev->dev_flags) || - test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { - err = cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + rp_len = sizeof(*rp) + eir_len; + rp = kmalloc(rp_len, GFP_ATOMIC); + if (!rp) + return -ENOMEM; + + if (status) + goto complete; + + hci_dev_lock(hdev); + + eir_len = 0; + switch (cp->type) { + case BIT(BDADDR_BREDR): + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { + err = read_local_ssp_oob_req(hdev, sk, cp); + hci_dev_unlock(hdev); + if (!err) + goto done; + + status = MGMT_STATUS_FAILED; + goto complete; + } else { + eir_len = eir_append_data(rp->eir, eir_len, + EIR_CLASS_OF_DEV, + hdev->dev_class, 3); + } + break; + case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)): + if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) && + smp_generate_oob(hdev, hash, rand) < 0) { + hci_dev_unlock(hdev); + status = MGMT_STATUS_FAILED; + goto complete; + } + + /* This should return the active RPA, but since the RPA + * is only programmed on demand, it is really hard to fill + * this in at the moment. For now disallow retrieving + * local out-of-band data when privacy is in use. 
+ * + * Returning the identity address will not help here since + * pairing happens before the identity resolving key is + * known and thus the connection establishment happens + * based on the RPA and not the identity address. + */ + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) { + hci_dev_unlock(hdev); + status = MGMT_STATUS_REJECTED; + goto complete; } - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && - opcode != MGMT_OP_READ_CONFIG_INFO && - opcode != MGMT_OP_SET_EXTERNAL_CONFIG && - opcode != MGMT_OP_SET_PUBLIC_ADDRESS) { - err = cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || + !bacmp(&hdev->bdaddr, BDADDR_ANY) || + (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && + bacmp(&hdev->static_addr, BDADDR_ANY))) { + memcpy(addr, &hdev->static_addr, 6); + addr[6] = 0x01; + } else { + memcpy(addr, &hdev->bdaddr, 6); + addr[6] = 0x00; } + + eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_BDADDR, + addr, sizeof(addr)); + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) + role = 0x02; + else + role = 0x01; + + eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_ROLE, + &role, sizeof(role)); + + if (hci_dev_test_flag(hdev, HCI_SC_ENABLED)) { + eir_len = eir_append_data(rp->eir, eir_len, + EIR_LE_SC_CONFIRM, + hash, sizeof(hash)); + + eir_len = eir_append_data(rp->eir, eir_len, + EIR_LE_SC_RANDOM, + rand, sizeof(rand)); + } + + flags = get_adv_discov_flags(hdev); + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + flags |= LE_AD_NO_BREDR; + + eir_len = eir_append_data(rp->eir, eir_len, EIR_FLAGS, + &flags, sizeof(flags)); + break; } - if (opcode >= ARRAY_SIZE(mgmt_handlers) || - mgmt_handlers[opcode].func == NULL) { - BT_DBG("Unknown op %u", opcode); - err = cmd_status(sk, index, opcode, - MGMT_STATUS_UNKNOWN_COMMAND); + hci_dev_unlock(hdev); + + hci_sock_set_flag(sk, HCI_MGMT_OOB_DATA_EVENTS); + + status = MGMT_STATUS_SUCCESS; + +complete: + rp->type = cp->type; + rp->eir_len = 
cpu_to_le16(eir_len); + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, + status, rp, sizeof(*rp) + eir_len); + if (err < 0 || status) goto done; + + err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev, + rp, sizeof(*rp) + eir_len, + HCI_MGMT_OOB_DATA_EVENTS, sk); + +done: + kfree(rp); + + return err; +} + +static u32 get_supported_adv_flags(struct hci_dev *hdev) +{ + u32 flags = 0; + + flags |= MGMT_ADV_FLAG_CONNECTABLE; + flags |= MGMT_ADV_FLAG_DISCOV; + flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; + flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + + if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) + flags |= MGMT_ADV_FLAG_TX_POWER; + + return flags; +} + +static int read_adv_features(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_adv_features *rp; + size_t rp_len; + int err, i; + bool instance; + struct adv_info *adv_instance; + u32 supported_flags; + + BT_DBG("%s", hdev->name); + + if (!lmp_le_capable(hdev)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, + MGMT_STATUS_REJECTED); + + hci_dev_lock(hdev); + + rp_len = sizeof(*rp); + + instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE); + if (instance) + rp_len += hdev->adv_instance_cnt; + + rp = kmalloc(rp_len, GFP_ATOMIC); + if (!rp) { + hci_dev_unlock(hdev); + return -ENOMEM; } - if (hdev && (opcode <= MGMT_OP_READ_INDEX_LIST || - opcode == MGMT_OP_READ_UNCONF_INDEX_LIST)) { - err = cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + supported_flags = get_supported_adv_flags(hdev); + + rp->supported_flags = cpu_to_le32(supported_flags); + rp->max_adv_data_len = HCI_MAX_AD_LENGTH; + rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; + rp->max_instances = HCI_MAX_ADV_INSTANCES; + + if (instance) { + i = 0; + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + if (i >= hdev->adv_instance_cnt) + break; + + rp->instance[i] = adv_instance->instance; + i++; + } + rp->num_instances = 
hdev->adv_instance_cnt; + } else { + rp->num_instances = 0; } - if (!hdev && (opcode > MGMT_OP_READ_INDEX_LIST && - opcode != MGMT_OP_READ_UNCONF_INDEX_LIST)) { - err = cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + hci_dev_unlock(hdev); + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, + MGMT_STATUS_SUCCESS, rp, rp_len); + + kfree(rp); + + return err; +} + +static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, + u8 len, bool is_adv_data) +{ + u8 max_len = HCI_MAX_AD_LENGTH; + int i, cur_len; + bool flags_managed = false; + bool tx_power_managed = false; + u32 flags_params = MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | + MGMT_ADV_FLAG_MANAGED_FLAGS; + + if (is_adv_data && (adv_flags & flags_params)) { + flags_managed = true; + max_len -= 3; } - handler = &mgmt_handlers[opcode]; + if (is_adv_data && (adv_flags & MGMT_ADV_FLAG_TX_POWER)) { + tx_power_managed = true; + max_len -= 3; + } - if ((handler->var_len && len < handler->data_len) || - (!handler->var_len && len != handler->data_len)) { - err = cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_PARAMS); - goto done; + if (len > max_len) + return false; + + /* Make sure that the data is correctly formatted. */ + for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { + cur_len = data[i]; + + if (flags_managed && data[i + 1] == EIR_FLAGS) + return false; + + if (tx_power_managed && data[i + 1] == EIR_TX_POWER) + return false; + + /* If the current field length would exceed the total data + * length, then it's invalid. 
+ */ + if (i + cur_len >= len) + return false; + } + + return true; +} + +static void add_advertising_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct mgmt_pending_cmd *cmd; + struct mgmt_cp_add_advertising *cp; + struct mgmt_rp_add_advertising rp; + struct adv_info *adv_instance, *n; + u8 instance; + + BT_DBG("status %d", status); + + hci_dev_lock(hdev); + + cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev); + + if (status) + hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + if (!adv_instance->pending) + continue; + + if (!status) { + adv_instance->pending = false; + continue; + } + + instance = adv_instance->instance; + + if (hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); + + hci_remove_adv_instance(hdev, instance); + advertising_removed(cmd ? cmd->sk : NULL, hdev, instance); + } + + if (!cmd) + goto unlock; + + cp = cmd->param; + rp.instance = cp->instance; + + if (status) + mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status)); + else + mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_status(status), &rp, sizeof(rp)); + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + +void mgmt_adv_timeout_expired(struct hci_dev *hdev) +{ + u8 instance; + struct hci_request req; + + hdev->adv_instance_timeout = 0; + + instance = get_current_adv_instance(hdev); + if (instance == 0x00) + return; + + hci_dev_lock(hdev); + hci_req_init(&req, hdev); + + clear_adv_instance(hdev, &req, instance, false); + + if (list_empty(&hdev->adv_instances)) + disable_advertising(&req); + + if (!skb_queue_empty(&req.cmd_q)) + hci_req_run(&req, NULL); + + hci_dev_unlock(hdev); +} + +static int add_advertising(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_add_advertising *cp = data; + struct mgmt_rp_add_advertising rp; + u32 flags; + u32 supported_flags; + u8 status; + u16 timeout, duration; + 
unsigned int prev_instance_cnt = hdev->adv_instance_cnt; + u8 schedule_instance = 0; + struct adv_info *next_instance; + int err; + struct mgmt_pending_cmd *cmd; + struct hci_request req; + + BT_DBG("%s", hdev->name); + + status = mgmt_le_support(hdev); + if (status) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + status); + + flags = __le32_to_cpu(cp->flags); + timeout = __le16_to_cpu(cp->timeout); + duration = __le16_to_cpu(cp->duration); + + /* The current implementation only supports a subset of the specified + * flags. + */ + supported_flags = get_supported_adv_flags(hdev); + if (flags & ~supported_flags) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + + if (timeout && !hdev_is_powered(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_REJECTED); + goto unlock; + } + + if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || + pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || + pending_find(MGMT_OP_SET_LE, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_BUSY); + goto unlock; + } + + if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) || + !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len, + cp->scan_rsp_len, false)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + + err = hci_add_adv_instance(hdev, cp->instance, flags, + cp->adv_data_len, cp->data, + cp->scan_rsp_len, + cp->data + cp->adv_data_len, + timeout, duration); + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_FAILED); + goto unlock; + } + + /* Only trigger an advertising added event if a new instance was + * actually added. 
+ */ + if (hdev->adv_instance_cnt > prev_instance_cnt) + advertising_added(sk, hdev, cp->instance); + + hci_dev_set_flag(hdev, HCI_ADVERTISING_INSTANCE); + + if (hdev->cur_adv_instance == cp->instance) { + /* If the currently advertised instance is being changed then + * cancel the current advertising and schedule the next + * instance. If there is only one instance then the overridden + * advertising data will be visible right away. + */ + cancel_adv_timeout(hdev); + + next_instance = hci_get_next_instance(hdev, cp->instance); + if (next_instance) + schedule_instance = next_instance->instance; + } else if (!hdev->adv_instance_timeout) { + /* Immediately advertise the new instance if no other + * instance is currently being advertised. + */ + schedule_instance = cp->instance; + } + + /* If the HCI_ADVERTISING flag is set or the device isn't powered or + * there is no instance to be advertised then we have no HCI + * communication to make. Simply return. + */ + if (!hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !schedule_instance) { + rp.instance = cp->instance; + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + goto unlock; + } + + /* We're good to go, update advertising data, parameters, and start + * advertising. 
+ */ + cmd = mgmt_pending_add(sk, MGMT_OP_ADD_ADVERTISING, hdev, data, + data_len); + if (!cmd) { + err = -ENOMEM; + goto unlock; } - if (hdev) - mgmt_init_hdev(sk, hdev); + hci_req_init(&req, hdev); + + err = schedule_adv_instance(&req, schedule_instance, true); - cp = buf + sizeof(*hdr); + if (!err) + err = hci_req_run(&req, add_advertising_complete); - err = handler->func(sk, hdev, cp, len); if (err < 0) - goto done; + mgmt_pending_remove(cmd); - err = msglen; +unlock: + hci_dev_unlock(hdev); -done: - if (hdev) - hci_dev_put(hdev); + return err; +} + +static void remove_advertising_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct mgmt_pending_cmd *cmd; + struct mgmt_cp_remove_advertising *cp; + struct mgmt_rp_remove_advertising rp; + + BT_DBG("status %d", status); + + hci_dev_lock(hdev); + + /* A failure status here only means that we failed to disable + * advertising. Otherwise, the advertising instance has been removed, + * so report success. + */ + cmd = pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev); + if (!cmd) + goto unlock; + + cp = cmd->param; + rp.instance = cp->instance; + + mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS, + &rp, sizeof(rp)); + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + +static int remove_advertising(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_remove_advertising *cp = data; + struct mgmt_rp_remove_advertising rp; + struct mgmt_pending_cmd *cmd; + struct hci_request req; + int err; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_REMOVE_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + + if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) || + pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) || + pending_find(MGMT_OP_SET_LE, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, + 
MGMT_STATUS_BUSY); + goto unlock; + } + + if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + goto unlock; + } + + hci_req_init(&req, hdev); + + clear_adv_instance(hdev, &req, cp->instance, true); + + if (list_empty(&hdev->adv_instances)) + disable_advertising(&req); + + /* If no HCI commands have been collected so far or the HCI_ADVERTISING + * flag is set or the device isn't powered then we have no HCI + * communication to make. Simply return. + */ + if (skb_queue_empty(&req.cmd_q) || + !hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_ADVERTISING)) { + rp.instance = cp->instance; + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_ADVERTISING, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + goto unlock; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADVERTISING, hdev, data, + data_len); + if (!cmd) { + err = -ENOMEM; + goto unlock; + } + + err = hci_req_run(&req, remove_advertising_complete); + if (err < 0) + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); - kfree(buf); return err; } +static const struct hci_mgmt_handler mgmt_handlers[] = { + { NULL }, /* 0x0000 (no command) */ + { read_version, MGMT_READ_VERSION_SIZE, + HCI_MGMT_NO_HDEV | + HCI_MGMT_UNTRUSTED }, + { read_commands, MGMT_READ_COMMANDS_SIZE, + HCI_MGMT_NO_HDEV | + HCI_MGMT_UNTRUSTED }, + { read_index_list, MGMT_READ_INDEX_LIST_SIZE, + HCI_MGMT_NO_HDEV | + HCI_MGMT_UNTRUSTED }, + { read_controller_info, MGMT_READ_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, + { set_powered, MGMT_SETTING_SIZE }, + { set_discoverable, MGMT_SET_DISCOVERABLE_SIZE }, + { set_connectable, MGMT_SETTING_SIZE }, + { set_fast_connectable, MGMT_SETTING_SIZE }, + { set_bondable, MGMT_SETTING_SIZE }, + { set_link_security, MGMT_SETTING_SIZE }, + { set_ssp, MGMT_SETTING_SIZE }, + { set_hs, MGMT_SETTING_SIZE }, + { set_le, MGMT_SETTING_SIZE }, + { set_dev_class, MGMT_SET_DEV_CLASS_SIZE }, + { set_local_name, 
MGMT_SET_LOCAL_NAME_SIZE }, + { add_uuid, MGMT_ADD_UUID_SIZE }, + { remove_uuid, MGMT_REMOVE_UUID_SIZE }, + { load_link_keys, MGMT_LOAD_LINK_KEYS_SIZE, + HCI_MGMT_VAR_LEN }, + { load_long_term_keys, MGMT_LOAD_LONG_TERM_KEYS_SIZE, + HCI_MGMT_VAR_LEN }, + { disconnect, MGMT_DISCONNECT_SIZE }, + { get_connections, MGMT_GET_CONNECTIONS_SIZE }, + { pin_code_reply, MGMT_PIN_CODE_REPLY_SIZE }, + { pin_code_neg_reply, MGMT_PIN_CODE_NEG_REPLY_SIZE }, + { set_io_capability, MGMT_SET_IO_CAPABILITY_SIZE }, + { pair_device, MGMT_PAIR_DEVICE_SIZE }, + { cancel_pair_device, MGMT_CANCEL_PAIR_DEVICE_SIZE }, + { unpair_device, MGMT_UNPAIR_DEVICE_SIZE }, + { user_confirm_reply, MGMT_USER_CONFIRM_REPLY_SIZE }, + { user_confirm_neg_reply, MGMT_USER_CONFIRM_NEG_REPLY_SIZE }, + { user_passkey_reply, MGMT_USER_PASSKEY_REPLY_SIZE }, + { user_passkey_neg_reply, MGMT_USER_PASSKEY_NEG_REPLY_SIZE }, + { read_local_oob_data, MGMT_READ_LOCAL_OOB_DATA_SIZE }, + { add_remote_oob_data, MGMT_ADD_REMOTE_OOB_DATA_SIZE, + HCI_MGMT_VAR_LEN }, + { remove_remote_oob_data, MGMT_REMOVE_REMOTE_OOB_DATA_SIZE }, + { start_discovery, MGMT_START_DISCOVERY_SIZE }, + { stop_discovery, MGMT_STOP_DISCOVERY_SIZE }, + { confirm_name, MGMT_CONFIRM_NAME_SIZE }, + { block_device, MGMT_BLOCK_DEVICE_SIZE }, + { unblock_device, MGMT_UNBLOCK_DEVICE_SIZE }, + { set_device_id, MGMT_SET_DEVICE_ID_SIZE }, + { set_advertising, MGMT_SETTING_SIZE }, + { set_bredr, MGMT_SETTING_SIZE }, + { set_static_address, MGMT_SET_STATIC_ADDRESS_SIZE }, + { set_scan_params, MGMT_SET_SCAN_PARAMS_SIZE }, + { set_secure_conn, MGMT_SETTING_SIZE }, + { set_debug_keys, MGMT_SETTING_SIZE }, + { set_privacy, MGMT_SET_PRIVACY_SIZE }, + { load_irks, MGMT_LOAD_IRKS_SIZE, + HCI_MGMT_VAR_LEN }, + { get_conn_info, MGMT_GET_CONN_INFO_SIZE }, + { get_clock_info, MGMT_GET_CLOCK_INFO_SIZE }, + { add_device, MGMT_ADD_DEVICE_SIZE }, + { remove_device, MGMT_REMOVE_DEVICE_SIZE }, + { load_conn_param, MGMT_LOAD_CONN_PARAM_SIZE, + HCI_MGMT_VAR_LEN }, + { 
read_unconf_index_list, MGMT_READ_UNCONF_INDEX_LIST_SIZE, + HCI_MGMT_NO_HDEV | + HCI_MGMT_UNTRUSTED }, + { read_config_info, MGMT_READ_CONFIG_INFO_SIZE, + HCI_MGMT_UNCONFIGURED | + HCI_MGMT_UNTRUSTED }, + { set_external_config, MGMT_SET_EXTERNAL_CONFIG_SIZE, + HCI_MGMT_UNCONFIGURED }, + { set_public_address, MGMT_SET_PUBLIC_ADDRESS_SIZE, + HCI_MGMT_UNCONFIGURED }, + { start_service_discovery, MGMT_START_SERVICE_DISCOVERY_SIZE, + HCI_MGMT_VAR_LEN }, + { read_local_oob_ext_data, MGMT_READ_LOCAL_OOB_EXT_DATA_SIZE }, + { read_ext_index_list, MGMT_READ_EXT_INDEX_LIST_SIZE, + HCI_MGMT_NO_HDEV | + HCI_MGMT_UNTRUSTED }, + { read_adv_features, MGMT_READ_ADV_FEATURES_SIZE }, + { add_advertising, MGMT_ADD_ADVERTISING_SIZE, + HCI_MGMT_VAR_LEN }, + { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, +}; + void mgmt_index_added(struct hci_dev *hdev) { - if (hdev->dev_type != HCI_BREDR) - return; + struct mgmt_ev_ext_index ev; if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return; - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) - mgmt_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0, NULL); - else - mgmt_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0, NULL); + switch (hdev->dev_type) { + case HCI_BREDR: + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { + mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, + NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); + ev.type = 0x01; + } else { + mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0, + HCI_MGMT_INDEX_EVENTS); + ev.type = 0x00; + } + break; + case HCI_AMP: + ev.type = 0x02; + break; + default: + return; + } + + ev.bus = hdev->bus; + + mgmt_index_event(MGMT_EV_EXT_INDEX_ADDED, hdev, &ev, sizeof(ev), + HCI_MGMT_EXT_INDEX_EVENTS); } void mgmt_index_removed(struct hci_dev *hdev) { + struct mgmt_ev_ext_index ev; u8 status = MGMT_STATUS_INVALID_INDEX; - if (hdev->dev_type != HCI_BREDR) + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + switch (hdev->dev_type) { + case 
HCI_BREDR: + mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); + + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { + mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, + NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); + ev.type = 0x01; + } else { + mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, + HCI_MGMT_INDEX_EVENTS); + ev.type = 0x00; + } + break; + case HCI_AMP: + ev.type = 0x02; + break; + default: return; + } - mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); + ev.bus = hdev->bus; - if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) - mgmt_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0, NULL); - else - mgmt_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, NULL); + mgmt_index_event(MGMT_EV_EXT_INDEX_REMOVED, hdev, &ev, sizeof(ev), + HCI_MGMT_EXT_INDEX_EVENTS); } /* This function requires the caller holds hdev->lock */ @@ -6363,11 +7561,12 @@ static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode) static int powered_update_hci(struct hci_dev *hdev) { struct hci_request req; + struct adv_info *adv_instance; u8 link_sec; hci_req_init(&req, hdev); - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) && !lmp_host_ssp_capable(hdev)) { u8 mode = 0x01; @@ -6381,7 +7580,7 @@ static int powered_update_hci(struct hci_dev *hdev) } } - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && lmp_bredr_capable(hdev)) { struct hci_cp_write_le_host_supported cp; @@ -6402,24 +7601,41 @@ static int powered_update_hci(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. 
*/ - if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && + (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))) { update_adv_data(&req); update_scan_rsp_data(&req); } - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + hdev->cur_adv_instance == 0x00 && + !list_empty(&hdev->adv_instances)) { + adv_instance = list_first_entry(&hdev->adv_instances, + struct adv_info, list); + hdev->cur_adv_instance = adv_instance->instance; + } + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) enable_advertising(&req); + else if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && + hdev->cur_adv_instance) + schedule_adv_instance(&req, hdev->cur_adv_instance, + true); restart_le_actions(&req); } - link_sec = test_bit(HCI_LINK_SECURITY, &hdev->dev_flags); + link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY); if (link_sec != test_bit(HCI_AUTH, &hdev->flags)) hci_req_add(&req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(link_sec), &link_sec); if (lmp_bredr_capable(hdev)) { - write_fast_connectable(&req, false); + if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) + write_fast_connectable(&req, true); + else + write_fast_connectable(&req, false); __hci_update_page_scan(&req); update_class(&req); update_name(&req); @@ -6435,7 +7651,7 @@ int mgmt_powered(struct hci_dev *hdev, u8 powered) u8 status, zero_cod[] = { 0, 0, 0 }; int err; - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_MGMT)) return 0; if (powered) { @@ -6456,7 +7672,7 @@ int mgmt_powered(struct hci_dev *hdev, u8 powered) * been triggered, potentially causing misleading DISCONNECTED * status responses. 
*/ - if (test_bit(HCI_UNREGISTER, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) status = MGMT_STATUS_INVALID_INDEX; else status = MGMT_STATUS_NOT_POWERED; @@ -6464,8 +7680,8 @@ int mgmt_powered(struct hci_dev *hdev, u8 powered) mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) - mgmt_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - zero_cod, sizeof(zero_cod), NULL); + mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), NULL); new_settings: err = new_settings(hdev, match.sk); @@ -6478,10 +7694,10 @@ new_settings: void mgmt_set_powered_failed(struct hci_dev *hdev, int err) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; u8 status; - cmd = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); + cmd = pending_find(MGMT_OP_SET_POWERED, hdev); if (!cmd) return; @@ -6490,7 +7706,7 @@ void mgmt_set_powered_failed(struct hci_dev *hdev, int err) else status = MGMT_STATUS_FAILED; - cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED, status); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED, status); mgmt_pending_remove(cmd); } @@ -6506,17 +7722,23 @@ void mgmt_discoverable_timeout(struct hci_dev *hdev) * of a timeout triggered from general discoverable, it is * safe to unconditionally clear the flag. */ - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); - clear_bit(HCI_DISCOVERABLE, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hci_req_init(&req, hdev); - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { u8 scan = SCAN_PAGE; hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, sizeof(scan), &scan); } update_class(&req); - update_adv_data(&req); + + /* Advertising instances don't use the global discoverable setting, so + * only update AD if advertising was enabled using Set Advertising. 
+ */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) + update_adv_data(&req); + hci_req_run(&req, NULL); hdev->discov_timeout = 0; @@ -6569,7 +7791,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses - * without providing an indentity resolving key don't require + * without providing an identity resolving key don't require * to store long term keys. Their addresses will change the * next time around. * @@ -6595,7 +7817,12 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) if (key->type == SMP_LTK) ev.key.master = 1; - memcpy(ev.key.val, key->val, sizeof(key->val)); + /* Make sure we copy only the significant bytes based on the + * encryption key size, and set the rest of the value to zeroes. + */ + memcpy(ev.key.val, key->val, sizeof(key->enc_size)); + memset(ev.key.val + key->enc_size, 0, + sizeof(ev.key.val) - key->enc_size); mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL); } @@ -6609,7 +7836,7 @@ void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk) /* For identity resolving keys from devices that are already * using a public address or static random address, do not * ask for storing this key. The identity resolving key really - * is only mandatory for devices using resovlable random + * is only mandatory for devices using resolvable random * addresses. * * Storing all identity resolving keys has the downside that @@ -6638,7 +7865,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses - * without providing an indentity resolving key don't require + * without providing an identity resolving key don't require * to store signature resolving keys. Their addresses will change * the next time around. 
* @@ -6654,7 +7881,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr); ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type); - ev.key.master = csrk->master; + ev.key.type = csrk->type; memcpy(ev.key.val, csrk->val, sizeof(csrk->val)); mgmt_event(MGMT_EV_NEW_CSRK, hdev, &ev, sizeof(ev), NULL); @@ -6681,17 +7908,6 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, mgmt_event(MGMT_EV_NEW_CONN_PARAM, hdev, &ev, sizeof(ev), NULL); } -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, - u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, u32 flags, u8 *name, u8 name_len) { @@ -6729,7 +7945,7 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, sizeof(*ev) + eir_len, NULL); } -static void disconnect_rsp(struct pending_cmd *cmd, void *data) +static void disconnect_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct sock **sk = data; @@ -6741,7 +7957,7 @@ static void disconnect_rsp(struct pending_cmd *cmd, void *data) mgmt_pending_remove(cmd); } -static void unpair_device_rsp(struct pending_cmd *cmd, void *data) +static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct hci_dev *hdev = data; struct mgmt_cp_unpair_device *cp = cmd->param; @@ -6754,10 +7970,10 @@ static void unpair_device_rsp(struct pending_cmd *cmd, void *data) bool mgmt_powering_down(struct hci_dev *hdev) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; - cmd = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); + cmd = pending_find(MGMT_OP_SET_POWERED, hdev); if (!cmd) return false; @@ -6809,12 +8025,12 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, { u8 bdaddr_type = link_to_bdaddr(link_type, addr_type); 
struct mgmt_cp_disconnect *cp; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp, hdev); - cmd = mgmt_pending_find(MGMT_OP_DISCONNECT, hdev); + cmd = pending_find(MGMT_OP_DISCONNECT, hdev); if (!cmd) return; @@ -6864,9 +8080,9 @@ void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure) void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; - cmd = mgmt_pending_find(MGMT_OP_PIN_CODE_REPLY, hdev); + cmd = pending_find(MGMT_OP_PIN_CODE_REPLY, hdev); if (!cmd) return; @@ -6877,9 +8093,9 @@ void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; - cmd = mgmt_pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, hdev); + cmd = pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, hdev); if (!cmd) return; @@ -6922,9 +8138,9 @@ static int user_pairing_resp_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status, u8 opcode) { - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; - cmd = mgmt_pending_find(opcode, hdev); + cmd = pending_find(opcode, hdev); if (!cmd) return -ENOENT; @@ -6983,7 +8199,7 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status) { struct mgmt_ev_auth_failed ev; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; u8 status = mgmt_status(hci_status); bacpy(&ev.addr.bdaddr, &conn->dst); @@ -7014,11 +8230,9 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) } if (test_bit(HCI_AUTH, &hdev->flags)) - changed = !test_and_set_bit(HCI_LINK_SECURITY, - &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_LINK_SECURITY); else - changed = test_and_clear_bit(HCI_LINK_SECURITY, - 
&hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY); mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp, &match); @@ -7054,9 +8268,9 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) if (status) { u8 mgmt_err = mgmt_status(status); - if (enable && test_and_clear_bit(HCI_SSP_ENABLED, - &hdev->dev_flags)) { - clear_bit(HCI_HS_ENABLED, &hdev->dev_flags); + if (enable && hci_dev_test_and_clear_flag(hdev, + HCI_SSP_ENABLED)) { + hci_dev_clear_flag(hdev, HCI_HS_ENABLED); new_settings(hdev, NULL); } @@ -7066,14 +8280,14 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) } if (enable) { - changed = !test_and_set_bit(HCI_SSP_ENABLED, &hdev->dev_flags); + changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); } else { - changed = test_and_clear_bit(HCI_SSP_ENABLED, &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); if (!changed) - changed = test_and_clear_bit(HCI_HS_ENABLED, - &hdev->dev_flags); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_HS_ENABLED); else - clear_bit(HCI_HS_ENABLED, &hdev->dev_flags); + hci_dev_clear_flag(hdev, HCI_HS_ENABLED); } mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match); @@ -7086,8 +8300,8 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) hci_req_init(&req, hdev); - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { - if (test_bit(HCI_USE_DEBUG_KEYS, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { + if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) hci_req_add(&req, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(enable), &enable); update_eir(&req); @@ -7098,7 +8312,7 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) hci_req_run(&req, NULL); } -static void sk_lookup(struct pending_cmd *cmd, void *data) +static void sk_lookup(struct mgmt_pending_cmd *cmd, void *data) { struct cmd_lookup *match = data; @@ -7118,8 +8332,8 @@ void 
mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) - mgmt_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, 3, - NULL); + mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + dev_class, 3, NULL); if (match.sk) sock_put(match.sk); @@ -7128,7 +8342,7 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) { struct mgmt_cp_set_local_name ev; - struct pending_cmd *cmd; + struct mgmt_pending_cmd *cmd; if (status) return; @@ -7137,55 +8351,19 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) memcpy(ev.name, name, HCI_MAX_NAME_LENGTH); memcpy(ev.short_name, hdev->short_name, HCI_MAX_SHORT_NAME_LENGTH); - cmd = mgmt_pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); + cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); if (!cmd) { memcpy(hdev->dev_name, name, sizeof(hdev->dev_name)); /* If this is a HCI command related to powering on the * HCI dev don't send any mgmt signals. */ - if (mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) + if (pending_find(MGMT_OP_SET_POWERED, hdev)) return; } - mgmt_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), - cmd ? 
cmd->sk : NULL); -} - -void mgmt_read_local_oob_data_complete(struct hci_dev *hdev, u8 *hash192, - u8 *rand192, u8 *hash256, u8 *rand256, - u8 status) -{ - struct pending_cmd *cmd; - - BT_DBG("%s status %u", hdev->name, status); - - cmd = mgmt_pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev); - if (!cmd) - return; - - if (status) { - cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, - mgmt_status(status)); - } else { - struct mgmt_rp_read_local_oob_data rp; - size_t rp_size = sizeof(rp); - - memcpy(rp.hash192, hash192, sizeof(rp.hash192)); - memcpy(rp.rand192, rand192, sizeof(rp.rand192)); - - if (bredr_sc_enabled(hdev) && hash256 && rand256) { - memcpy(rp.hash256, hash256, sizeof(rp.hash256)); - memcpy(rp.rand256, rand256, sizeof(rp.rand256)); - } else { - rp_size -= sizeof(rp.hash256) + sizeof(rp.rand256); - } - - cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, 0, - &rp, rp_size); - } - - mgmt_pending_remove(cmd); + mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) @@ -7258,7 +8436,7 @@ static bool eir_has_uuids(u8 *eir, u16 eir_len, u16 uuid_count, u8 (*uuids)[16]) static void restart_le_scan(struct hci_dev *hdev) { /* If controller is not scanning we are done. */ - if (!test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return; if (time_after(jiffies + DISCOV_LE_RESTART_DELAY, @@ -7270,14 +8448,58 @@ static void restart_le_scan(struct hci_dev *hdev) DISCOV_LE_RESTART_DELAY); } +static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, + u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) +{ + /* If a RSSI threshold has been specified, and + * HCI_QUIRK_STRICT_DUPLICATE_FILTER is not set, then all results with + * a RSSI smaller than the RSSI threshold will be dropped. If the quirk + * is set, let it through for further processing, as we might need to + * restart the scan. 
+ * + * For BR/EDR devices (pre 1.2) providing no RSSI during inquiry, + * the results are also dropped. + */ + if (hdev->discovery.rssi != HCI_RSSI_INVALID && + (rssi == HCI_RSSI_INVALID || + (rssi < hdev->discovery.rssi && + !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)))) + return false; + + if (hdev->discovery.uuid_count != 0) { + /* If a list of UUIDs is provided in filter, results with no + * matching UUID should be dropped. + */ + if (!eir_has_uuids(eir, eir_len, hdev->discovery.uuid_count, + hdev->discovery.uuids) && + !eir_has_uuids(scan_rsp, scan_rsp_len, + hdev->discovery.uuid_count, + hdev->discovery.uuids)) + return false; + } + + /* If duplicate filtering does not report RSSI changes, then restart + * scanning to ensure updated result with updated RSSI values. + */ + if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) { + restart_le_scan(hdev); + + /* Validate RSSI value against the RSSI threshold once more. */ + if (hdev->discovery.rssi != HCI_RSSI_INVALID && + rssi < hdev->discovery.rssi) + return false; + } + + return true; +} + void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) { char buf[512]; - struct mgmt_ev_device_found *ev = (void *) buf; + struct mgmt_ev_device_found *ev = (void *)buf; size_t ev_size; - bool match; /* Don't send events for a non-kernel initiated discovery. With * LE one exception is if we have pend_le_reports > 0 in which @@ -7290,21 +8512,12 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, return; } - /* When using service discovery with a RSSI threshold, then check - * if such a RSSI threshold is specified. If a RSSI threshold has - * been specified, and HCI_QUIRK_STRICT_DUPLICATE_FILTER is not set, - * then all results with a RSSI smaller than the RSSI threshold will be - * dropped. 
If the quirk is set, let it through for further processing, - * as we might need to restart the scan. - * - * For BR/EDR devices (pre 1.2) providing no RSSI during inquiry, - * the results are also dropped. - */ - if (hdev->discovery.rssi != HCI_RSSI_INVALID && - (rssi == HCI_RSSI_INVALID || - (rssi < hdev->discovery.rssi && - !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)))) - return; + if (hdev->discovery.result_filtering) { + /* We are using service discovery */ + if (!is_filter_match(hdev, rssi, eir, eir_len, scan_rsp, + scan_rsp_len)) + return; + } /* Make sure that the buffer is big enough. The 5 extra bytes * are for the potential CoD field. @@ -7331,87 +8544,17 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, ev->rssi = rssi; ev->flags = cpu_to_le32(flags); - if (eir_len > 0) { - /* When using service discovery and a list of UUID is - * provided, results with no matching UUID should be - * dropped. In case there is a match the result is - * kept and checking possible scan response data - * will be skipped. - */ - if (hdev->discovery.uuid_count > 0) { - match = eir_has_uuids(eir, eir_len, - hdev->discovery.uuid_count, - hdev->discovery.uuids); - /* If duplicate filtering does not report RSSI changes, - * then restart scanning to ensure updated result with - * updated RSSI values. - */ - if (match && test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, - &hdev->quirks)) - restart_le_scan(hdev); - } else { - match = true; - } - - if (!match && !scan_rsp_len) - return; - + if (eir_len > 0) /* Copy EIR or advertising data into event */ memcpy(ev->eir, eir, eir_len); - } else { - /* When using service discovery and a list of UUID is - * provided, results with empty EIR or advertising data - * should be dropped since they do not match any UUID. 
- */ - if (hdev->discovery.uuid_count > 0 && !scan_rsp_len) - return; - - match = false; - } if (dev_class && !eir_has_data_type(ev->eir, eir_len, EIR_CLASS_OF_DEV)) eir_len = eir_append_data(ev->eir, eir_len, EIR_CLASS_OF_DEV, dev_class, 3); - if (scan_rsp_len > 0) { - /* When using service discovery and a list of UUID is - * provided, results with no matching UUID should be - * dropped if there is no previous match from the - * advertising data. - */ - if (hdev->discovery.uuid_count > 0) { - if (!match && !eir_has_uuids(scan_rsp, scan_rsp_len, - hdev->discovery.uuid_count, - hdev->discovery.uuids)) - return; - - /* If duplicate filtering does not report RSSI changes, - * then restart scanning to ensure updated result with - * updated RSSI values. - */ - if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, - &hdev->quirks)) - restart_le_scan(hdev); - } - + if (scan_rsp_len > 0) /* Append scan response data to event */ memcpy(ev->eir + eir_len, scan_rsp, scan_rsp_len); - } else { - /* When using service discovery and a list of UUID is - * provided, results with empty scan response and no - * previous matched advertising data should be dropped. - */ - if (hdev->discovery.uuid_count > 0 && !match) - return; - } - - /* Validate the reported RSSI value against the RSSI threshold once more - * incase HCI_QUIRK_STRICT_DUPLICATE_FILTER forced a restart of LE - * scanning. 
- */ - if (hdev->discovery.rssi != HCI_RSSI_INVALID && - rssi < hdev->discovery.rssi) - return; ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len); ev_size = sizeof(*ev) + eir_len + scan_rsp_len; @@ -7463,11 +8606,40 @@ static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) void mgmt_reenable_advertising(struct hci_dev *hdev) { struct hci_request req; + u8 instance; - if (!test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && + !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) return; + instance = get_current_adv_instance(hdev); + hci_req_init(&req, hdev); - enable_advertising(&req); + + if (instance) { + schedule_adv_instance(&req, instance, true); + } else { + update_adv_data(&req); + update_scan_rsp_data(&req); + enable_advertising(&req); + } + hci_req_run(&req, adv_enable_complete); } + +static struct hci_mgmt_chan chan = { + .channel = HCI_CHANNEL_CONTROL, + .handler_count = ARRAY_SIZE(mgmt_handlers), + .handlers = mgmt_handlers, + .hdev_init = mgmt_init_hdev, +}; + +int mgmt_init(void) +{ + return hci_mgmt_chan_register(&chan); +} + +void mgmt_exit(void) +{ + hci_mgmt_chan_unregister(&chan); +} diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c new file mode 100644 index 000000000000..8c30c7eb8bef --- /dev/null +++ b/net/bluetooth/mgmt_util.c @@ -0,0 +1,210 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + + Copyright (C) 2015 Intel Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> + +#include "mgmt_util.h" + +int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, + void *data, u16 data_len, int flag, struct sock *skip_sk) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + + skb = alloc_skb(sizeof(*hdr) + data_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = (void *) skb_put(skb, sizeof(*hdr)); + hdr->opcode = cpu_to_le16(event); + if (hdev) + hdr->index = cpu_to_le16(hdev->id); + else + hdr->index = cpu_to_le16(MGMT_INDEX_NONE); + hdr->len = cpu_to_le16(data_len); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + /* Time stamp */ + __net_timestamp(skb); + + hci_send_to_channel(channel, skb, flag, skip_sk); + kfree_skb(skb); + + return 0; +} + +int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + struct mgmt_ev_cmd_status *ev; + int err; + + BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status); + + skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = (void *) skb_put(skb, sizeof(*hdr)); + + hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(sizeof(*ev)); + + ev = (void *) skb_put(skb, sizeof(*ev)); + ev->status = status; + ev->opcode = cpu_to_le16(cmd); + + err = sock_queue_rcv_skb(sk, 
skb); + if (err < 0) + kfree_skb(skb); + + return err; +} + +int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, + void *rp, size_t rp_len) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + struct mgmt_ev_cmd_complete *ev; + int err; + + BT_DBG("sock %p", sk); + + skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = (void *) skb_put(skb, sizeof(*hdr)); + + hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); + + ev = (void *) skb_put(skb, sizeof(*ev) + rp_len); + ev->opcode = cpu_to_le16(cmd); + ev->status = status; + + if (rp) + memcpy(ev->data, rp, rp_len); + + err = sock_queue_rcv_skb(sk, skb); + if (err < 0) + kfree_skb(skb); + + return err; +} + +struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, + struct hci_dev *hdev) +{ + struct mgmt_pending_cmd *cmd; + + list_for_each_entry(cmd, &hdev->mgmt_pending, list) { + if (hci_sock_get_channel(cmd->sk) != channel) + continue; + if (cmd->opcode == opcode) + return cmd; + } + + return NULL; +} + +struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel, + u16 opcode, + struct hci_dev *hdev, + const void *data) +{ + struct mgmt_pending_cmd *cmd; + + list_for_each_entry(cmd, &hdev->mgmt_pending, list) { + if (cmd->user_data != data) + continue; + if (cmd->opcode == opcode) + return cmd; + } + + return NULL; +} + +void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, + void (*cb)(struct mgmt_pending_cmd *cmd, void *data), + void *data) +{ + struct mgmt_pending_cmd *cmd, *tmp; + + list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { + if (opcode > 0 && cmd->opcode != opcode) + continue; + + cb(cmd, data); + } +} + +struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, + struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_pending_cmd *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if 
(!cmd) + return NULL; + + cmd->opcode = opcode; + cmd->index = hdev->id; + + cmd->param = kmemdup(data, len, GFP_KERNEL); + if (!cmd->param) { + kfree(cmd); + return NULL; + } + + cmd->param_len = len; + + cmd->sk = sk; + sock_hold(sk); + + list_add(&cmd->list, &hdev->mgmt_pending); + + return cmd; +} + +void mgmt_pending_free(struct mgmt_pending_cmd *cmd) +{ + sock_put(cmd->sk); + kfree(cmd->param); + kfree(cmd); +} + +void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) +{ + list_del(&cmd->list); + mgmt_pending_free(cmd); +} diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h new file mode 100644 index 000000000000..6559f189213c --- /dev/null +++ b/net/bluetooth/mgmt_util.h @@ -0,0 +1,53 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2015 Intel Coropration + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +struct mgmt_pending_cmd { + struct list_head list; + u16 opcode; + int index; + void *param; + size_t param_len; + struct sock *sk; + void *user_data; + int (*cmd_complete)(struct mgmt_pending_cmd *cmd, u8 status); +}; + +int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, + void *data, u16 data_len, int flag, struct sock *skip_sk); +int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status); +int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, + void *rp, size_t rp_len); + +struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, + struct hci_dev *hdev); +struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel, + u16 opcode, + struct hci_dev *hdev, + const void *data); +void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, + void (*cb)(struct mgmt_pending_cmd *cmd, void *data), + void *data); +struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, + struct hci_dev *hdev, + void *data, u16 len); +void mgmt_pending_free(struct mgmt_pending_cmd *cmd); +void mgmt_pending_remove(struct mgmt_pending_cmd *cmd); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 4fea24275b17..29709fbfd1f5 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -200,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock) BT_DBG(""); - err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); + err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); if (!err) { struct sock *sk = (*sock)->sk; sk->sk_data_ready = rfcomm_l2data_ready; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 3c6d2c8ac1a4..7511df72347f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -269,12 +269,12 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct 
socket *sock, int proto, gfp_t prio) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); if (!sk) return NULL; @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, sock->ops = &rfcomm_sock_ops; - sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -334,16 +334,19 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { - struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sockaddr_rc sa; struct sock *sk = sock->sk; - int chan = sa->rc_channel; - int err = 0; - - BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr); + int len, err = 0; if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + memset(&sa, 0, sizeof(sa)); + len = min_t(unsigned int, sizeof(sa), addr_len); + memcpy(&sa, addr, len); + + BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr); + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -358,12 +361,13 @@ static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr write_lock(&rfcomm_sk_list.lock); - if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) { + if (sa.rc_channel && + __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ - bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr); - rfcomm_pi(sk)->channel = chan; + bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr); + rfcomm_pi(sk)->channel = sa.rc_channel; sk->sk_state = BT_BOUND; } @@ -549,8 +553,8 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * return 0; } -static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct 
msghdr *msg, size_t len) +static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; @@ -615,8 +619,8 @@ done: return sent; } -static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; @@ -627,7 +631,7 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, return 0; } - len = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags); + len = bt_sock_stream_recvmsg(sock, msg, size, flags); lock_sock(sk); if (!(flags & MSG_PEEK) && len > 0) @@ -969,7 +973,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * goto done; } - sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0); if (!sk) goto done; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 76321b546e84..688a040c5626 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -460,11 +460,11 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); if (!sk) return NULL; @@ -501,7 +501,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &sco_sock_ops; - sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -688,8 +688,8 @@ static int 
sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len return 0; } -static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; int err; @@ -758,8 +758,8 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) } } -static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { struct sock *sk = sock->sk; struct sco_pinfo *pi = sco_pi(sk); @@ -777,7 +777,7 @@ static int sco_sock_recvmsg(struct kiocb *iocb, struct socket *sock, release_sock(sk); - return bt_sock_recvmsg(iocb, sock, msg, len, flags); + return bt_sock_recvmsg(sock, msg, len, flags); } static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) @@ -1026,7 +1026,7 @@ static void sco_conn_ready(struct sco_conn *conn) bh_lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, - BTPROTO_SCO, GFP_ATOMIC); + BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { bh_unlock_sock(parent); sco_conn_unlock(conn); @@ -1083,9 +1083,13 @@ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) return lm; } -void sco_connect_cfm(struct hci_conn *hcon, __u8 status) +static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) { + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return; + BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); + if (!status) { struct sco_conn *conn; @@ -1096,14 +1100,17 @@ void sco_connect_cfm(struct hci_conn *hcon, __u8 status) sco_conn_del(hcon, bt_to_errno(status)); } -void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) +static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return; + 
BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); } -int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) { struct sco_conn *conn = hcon->sco_data; @@ -1114,14 +1121,19 @@ int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) if (skb->len) { sco_recv_frame(conn, skb); - return 0; + return; } drop: kfree_skb(skb); - return 0; } +static struct hci_cb sco_cb = { + .name = "SCO", + .connect_cfm = sco_connect_cfm, + .disconn_cfm = sco_disconn_cfm, +}; + static int sco_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; @@ -1203,6 +1215,8 @@ int __init sco_init(void) BT_INFO("SCO socket layer initialized"); + hci_register_cb(&sco_cb); + if (IS_ERR_OR_NULL(bt_debugfs)) return 0; @@ -1216,12 +1230,14 @@ error: return err; } -void __exit sco_exit(void) +void sco_exit(void) { bt_procfs_cleanup(&init_net, "sco"); debugfs_remove(sco_debugfs); + hci_unregister_cb(&sco_cb); + bt_sock_unregister(BTPROTO_SCO); proto_unregister(&sco_proto); diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index 378f4064952c..dc688f13e496 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -21,6 +21,8 @@ SOFTWARE IS DISCLAIMED. 
*/ +#include <linux/debugfs.h> + #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -154,6 +156,21 @@ static int __init test_ecdh_sample(const u8 priv_a[32], const u8 priv_b[32], return 0; } +static char test_ecdh_buffer[32]; + +static ssize_t test_ecdh_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(user_buf, count, ppos, test_ecdh_buffer, + strlen(test_ecdh_buffer)); +} + +static const struct file_operations test_ecdh_fops = { + .open = simple_open, + .read = test_ecdh_read, + .llseek = default_llseek, +}; + static int __init test_ecdh(void) { ktime_t calltime, delta, rettime; @@ -165,19 +182,19 @@ static int __init test_ecdh(void) err = test_ecdh_sample(priv_a_1, priv_b_1, pub_a_1, pub_b_1, dhkey_1); if (err) { BT_ERR("ECDH sample 1 failed"); - return err; + goto done; } err = test_ecdh_sample(priv_a_2, priv_b_2, pub_a_2, pub_b_2, dhkey_2); if (err) { BT_ERR("ECDH sample 2 failed"); - return err; + goto done; } err = test_ecdh_sample(priv_a_3, priv_a_3, pub_a_3, pub_a_3, dhkey_3); if (err) { BT_ERR("ECDH sample 3 failed"); - return err; + goto done; } rettime = ktime_get(); @@ -186,7 +203,17 @@ static int __init test_ecdh(void) BT_INFO("ECDH test passed in %llu usecs", duration); - return 0; +done: + if (!err) + snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer), + "PASS (%llu usecs)\n", duration); + else + snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer), "FAIL\n"); + + debugfs_create_file("selftest_ecdh", 0444, bt_debugfs, NULL, + &test_ecdh_fops); + + return err; } #else diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c09a821f381d..ad82324f710f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -33,6 +33,9 @@ #include "ecc.h" #include "smp.h" +#define SMP_DEV(hdev) \ + ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) + /* Low-level debug macros to be used for stuff that we don't want * accidentially in dmesg, i.e. 
the values of the various crypto keys * and the inputs & outputs of crypto functions. @@ -52,7 +55,7 @@ #define SMP_TIMEOUT msecs_to_jiffies(30000) -#define AUTH_REQ_MASK(dev) (test_bit(HCI_SC_ENABLED, &(dev)->dev_flags) ? \ +#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \ 0x1f : 0x07) #define KEY_DIST_MASK 0x07 @@ -70,7 +73,22 @@ enum { SMP_FLAG_DEBUG_KEY, SMP_FLAG_WAIT_USER, SMP_FLAG_DHKEY_PENDING, - SMP_FLAG_OOB, + SMP_FLAG_REMOTE_OOB, + SMP_FLAG_LOCAL_OOB, +}; + +struct smp_dev { + /* Secure Connections OOB data */ + u8 local_pk[64]; + u8 local_sk[32]; + u8 local_rand[16]; + bool debug_key; + + u8 min_key_size; + u8 max_key_size; + + struct crypto_blkcipher *tfm_aes; + struct crypto_hash *tfm_cmac; }; struct smp_chan { @@ -84,7 +102,8 @@ struct smp_chan { u8 rrnd[16]; /* SMP Pairing Random (remote) */ u8 pcnf[16]; /* SMP Pairing Confirm */ u8 tk[16]; /* SMP Temporary Key */ - u8 rr[16]; + u8 rr[16]; /* Remote OOB ra/rb value */ + u8 lr[16]; /* Local OOB ra/rb value */ u8 enc_key_size; u8 remote_key_dist; bdaddr_t id_addr; @@ -358,6 +377,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) uint8_t tmp[16], data[16]; int err; + SMP_DBG("k %16phN r %16phN", k, r); + if (!tfm) { BT_ERR("tfm %p", tfm); return -EINVAL; @@ -387,6 +408,8 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); + SMP_DBG("r %16phN", r); + return err; } @@ -397,6 +420,10 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], u8 p1[16], p2[16]; int err; + SMP_DBG("k %16phN r %16phN", k, r); + SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra); + SMP_DBG("preq %7phN pres %7phN", preq, pres); + memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ @@ -405,10 +432,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], memcpy(p1 + 2, preq, 7); memcpy(p1 + 9, pres, 7); - /* p2 = padding || ia || ra 
*/ - memcpy(p2, ra, 6); - memcpy(p2 + 6, ia, 6); - memset(p2 + 12, 0, 4); + SMP_DBG("p1 %16phN", p1); /* res = r XOR p1 */ u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); @@ -420,6 +444,13 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16], return err; } + /* p2 = padding || ia || ra */ + memcpy(p2, ra, 6); + memcpy(p2 + 6, ia, 6); + memset(p2 + 12, 0, 4); + + SMP_DBG("p2 %16phN", p2); + /* res = res XOR p2 */ u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); @@ -478,18 +509,18 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; - struct crypto_blkcipher *tfm; + struct smp_dev *smp; u8 hash[3]; int err; if (!chan || !chan->data) return false; - tfm = chan->data; + smp = chan->data; BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); - err = smp_ah(tfm, irk, &bdaddr->b[3], hash); + err = smp_ah(smp->tfm_aes, irk, &bdaddr->b[3], hash); if (err) return false; @@ -499,20 +530,20 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; - struct crypto_blkcipher *tfm; + struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; - tfm = chan->data; + smp = chan->data; get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ rpa->b[5] |= 0x40; /* Set second most significant bit */ - err = smp_ah(tfm, irk, &rpa->b[3], rpa->b); + err = smp_ah(smp->tfm_aes, irk, &rpa->b[3], rpa->b); if (err < 0) return err; @@ -521,6 +552,53 @@ int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) return 0; } +int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) +{ + struct l2cap_chan *chan = hdev->smp_data; + struct smp_dev *smp; + int err; + + if (!chan || !chan->data) + return -EOPNOTSUPP; + + smp = chan->data; + + if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { + BT_DBG("Using 
debug keys"); + memcpy(smp->local_pk, debug_pk, 64); + memcpy(smp->local_sk, debug_sk, 32); + smp->debug_key = true; + } else { + while (true) { + /* Generate local key pair for Secure Connections */ + if (!ecc_make_key(smp->local_pk, smp->local_sk)) + return -EIO; + + /* This is unlikely, but we need to check that + * we didn't accidentially generate a debug key. + */ + if (memcmp(smp->local_sk, debug_sk, 32)) + break; + } + smp->debug_key = false; + } + + SMP_DBG("OOB Public Key X: %32phN", smp->local_pk); + SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32); + SMP_DBG("OOB Private Key: %32phN", smp->local_sk); + + get_random_bytes(smp->local_rand, 16); + + err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->local_pk, + smp->local_rand, 0, hash); + if (err < 0) + return err; + + memcpy(rand, smp->local_rand, 16); + + return 0; +} + static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { struct l2cap_chan *chan = conn->smp; @@ -589,7 +667,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0, oob_flag = SMP_OOB_NOT_PRESENT; - if (test_bit(HCI_BONDABLE, &conn->hcon->hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; authreq |= SMP_AUTH_BONDING; @@ -597,18 +675,18 @@ static void build_pairing_cmd(struct l2cap_conn *conn, authreq &= ~SMP_AUTH_BONDING; } - if (test_bit(HCI_RPA_RESOLVING, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; - if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; - if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags) && + if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) && (authreq & SMP_AUTH_SC)) { struct oob_data *oob_data; u8 bdaddr_type; - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, 
HCI_SSP_ENABLED)) { local_dist |= SMP_DIST_LINK_KEY; remote_dist |= SMP_DIST_LINK_KEY; } @@ -621,10 +699,12 @@ static void build_pairing_cmd(struct l2cap_conn *conn, oob_data = hci_find_remote_oob_data(hdev, &hcon->dst, bdaddr_type); if (oob_data && oob_data->present) { - set_bit(SMP_FLAG_OOB, &smp->flags); + set_bit(SMP_FLAG_REMOTE_OOB, &smp->flags); oob_flag = SMP_OOB_PRESENT; memcpy(smp->rr, oob_data->rand256, 16); memcpy(smp->pcnf, oob_data->hash256, 16); + SMP_DBG("OOB Remote Confirmation: %16phN", smp->pcnf); + SMP_DBG("OOB Remote Random: %16phN", smp->rr); } } else { @@ -634,7 +714,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; - req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->max_key_size = SMP_DEV(hdev)->max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -645,7 +725,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; - rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->max_key_size = SMP_DEV(hdev)->max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); @@ -656,10 +736,11 @@ static void build_pairing_cmd(struct l2cap_conn *conn, static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { struct l2cap_chan *chan = conn->smp; + struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; - if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || - (max_key_size < SMP_MIN_ENC_KEY_SIZE)) + if (max_key_size > SMP_DEV(hdev)->max_key_size || + max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; smp->enc_key_size = max_key_size; @@ -681,9 +762,9 @@ static void smp_chan_destroy(struct l2cap_conn *conn) complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); 
mgmt_smp_complete(hcon, complete); - kfree(smp->csrk); - kfree(smp->slave_csrk); - kfree(smp->link_key); + kzfree(smp->csrk); + kzfree(smp->slave_csrk); + kzfree(smp->link_key); crypto_free_blkcipher(smp->tfm_aes); crypto_free_hash(smp->tfm_cmac); @@ -692,7 +773,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) * support hasn't been explicitly enabled. */ if (smp->ltk && smp->ltk->type == SMP_LTK_P256_DEBUG && - !test_bit(HCI_KEEP_DEBUG_KEYS, &hcon->hdev->dev_flags)) { + !hci_dev_test_flag(hcon->hdev, HCI_KEEP_DEBUG_KEYS)) { list_del_rcu(&smp->ltk->list); kfree_rcu(smp->ltk, rcu); smp->ltk = NULL; @@ -717,7 +798,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn) } chan->data = NULL; - kfree(smp); + kzfree(smp); hci_conn_drop(hcon); } @@ -818,6 +899,12 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, return 0; } + /* If this function is used for SC -> legacy fallback we + * can only recover the just-works case. + */ + if (test_bit(SMP_FLAG_SC, &smp->flags)) + return -EINVAL; + /* Not Just Works/Confirm results in MITM Authentication */ if (smp->method != JUST_CFM) { set_bit(SMP_FLAG_MITM_AUTH, &smp->flags); @@ -917,13 +1004,10 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk); - memset(stk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; - hci_le_start_enc(hcon, ediv, rand, stk); + hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { @@ -936,9 +1020,6 @@ static u8 smp_random(struct smp_chan *smp) smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk); - memset(stk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; else @@ -1052,7 +1133,7 @@ static void smp_notify_keys(struct l2cap_conn 
*conn) /* Don't keep debug keys around if the relevant * flag is not set. */ - if (!test_bit(HCI_KEEP_DEBUG_KEYS, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS) && key->type == HCI_LK_DEBUG_COMBINATION) { list_del_rcu(&key->list); kfree_rcu(key, rcu); @@ -1076,9 +1157,6 @@ static void sc_add_ltk(struct smp_chan *smp) else auth = 0; - memset(smp->tk + smp->enc_key_size, 0, - SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); - smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, key_type, auth, smp->tk, smp->enc_key_size, 0, 0); @@ -1097,13 +1175,13 @@ static void sc_generate_link_key(struct smp_chan *smp) return; if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { - kfree(smp->link_key); + kzfree(smp->link_key); smp->link_key = NULL; return; } if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) { - kfree(smp->link_key); + kzfree(smp->link_key); smp->link_key = NULL; return; } @@ -1200,7 +1278,14 @@ static void smp_distribute_keys(struct smp_chan *smp) __le16 ediv; __le64 rand; - get_random_bytes(enc.ltk, sizeof(enc.ltk)); + /* Make sure we generate only the significant amount of + * bytes based on the encryption key size, and set the rest + * of the value to zeroes. 
+ */ + get_random_bytes(enc.ltk, smp->enc_key_size); + memset(enc.ltk + smp->enc_key_size, 0, + sizeof(enc.ltk) - smp->enc_key_size); + get_random_bytes(&ediv, sizeof(ediv)); get_random_bytes(&rand, sizeof(rand)); @@ -1252,7 +1337,10 @@ static void smp_distribute_keys(struct smp_chan *smp) csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); if (csrk) { - csrk->master = 0x00; + if (hcon->sec_level > BT_SECURITY_MEDIUM) + csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED; + else + csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED; memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); } smp->slave_csrk = csrk; @@ -1297,7 +1385,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(smp->tfm_aes)) { BT_ERR("Unable to create ECB crypto context"); - kfree(smp); + kzfree(smp); return NULL; } @@ -1305,7 +1393,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) if (IS_ERR(smp->tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); crypto_free_blkcipher(smp->tfm_aes); - kfree(smp); + kzfree(smp); return NULL; } @@ -1601,15 +1689,15 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, struct hci_dev *hdev = conn->hcon->hdev; u8 local_dist = 0, remote_dist = 0; - if (test_bit(HCI_BONDABLE, &hdev->dev_flags)) { + if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; } - if (test_bit(HCI_RPA_RESOLVING, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; - if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) + if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; if (!rsp) { @@ -1617,7 +1705,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; - req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->max_key_size = conn->hcon->enc_key_size; smp->remote_key_dist = 
remote_dist; @@ -1626,7 +1714,7 @@ static void build_bredr_pairing_cmd(struct smp_chan *smp, memset(rsp, 0, sizeof(*rsp)); - rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; @@ -1661,22 +1749,29 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) /* We didn't start the pairing, so match remote */ auth = req->auth_req & AUTH_REQ_MASK(hdev); - if (!test_bit(HCI_BONDABLE, &hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; - if (test_bit(HCI_SC_ONLY, &hdev->dev_flags) && !(auth & SMP_AUTH_SC)) + if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); + /* If the remote side's OOB flag is set it means it has + * successfully received our local OOB data - therefore set the + * flag to indicate that local OOB is in use. + */ + if (req->oob_flag == SMP_OOB_PRESENT) + set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); + /* SMP over BR/EDR requires special treatment */ if (conn->hcon->type == ACL_LINK) { /* We must have a BR/EDR SC link */ if (!test_bit(HCI_CONN_AES_CCM, &conn->hcon->flags) && - !test_bit(HCI_FORCE_BREDR_SMP, &hdev->dbg_flags)) + !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return SMP_CROSS_TRANSP_NOT_ALLOWED; set_bit(SMP_FLAG_SC, &smp->flags); @@ -1734,14 +1829,19 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) clear_bit(SMP_FLAG_INITIATOR, &smp->flags); + /* Strictly speaking we shouldn't allow Pairing Confirm for the + * SC case, however some implementations incorrectly copy RFU auth + * req bits from our security request, which may create a false + * positive SC enablement. 
+ */ + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); + if (test_bit(SMP_FLAG_SC, &smp->flags)) { SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; /* Wait for Public Key from Initiating Device */ return 0; - } else { - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); } /* Request setup of TK */ @@ -1758,7 +1858,26 @@ static u8 sc_send_public_key(struct smp_chan *smp) BT_DBG(""); - if (test_bit(HCI_USE_DEBUG_KEYS, &hdev->dev_flags)) { + if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { + struct l2cap_chan *chan = hdev->smp_data; + struct smp_dev *smp_dev; + + if (!chan || !chan->data) + return SMP_UNSPECIFIED; + + smp_dev = chan->data; + + memcpy(smp->local_pk, smp_dev->local_pk, 64); + memcpy(smp->local_sk, smp_dev->local_sk, 32); + memcpy(smp->lr, smp_dev->local_rand, 16); + + if (smp_dev->debug_key) + set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); + + goto done; + } + + if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { BT_DBG("Using debug keys"); memcpy(smp->local_pk, debug_pk, 64); memcpy(smp->local_sk, debug_sk, 32); @@ -1777,8 +1896,9 @@ static u8 sc_send_public_key(struct smp_chan *smp) } } +done: SMP_DBG("Local Public Key X: %32phN", smp->local_pk); - SMP_DBG("Local Public Key Y: %32phN", &smp->local_pk[32]); + SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32); SMP_DBG("Local Private Key: %32phN", smp->local_sk); smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk); @@ -1813,9 +1933,16 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) auth = rsp->auth_req & AUTH_REQ_MASK(hdev); - if (test_bit(HCI_SC_ONLY, &hdev->dev_flags) && !(auth & SMP_AUTH_SC)) + if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; + /* If the remote side's OOB flag is set it means it has + * successfully received our local OOB data - therefore set the + * flag to indicate that local OOB is in use. 
+ */ + if (rsp->oob_flag == SMP_OOB_PRESENT) + set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); + smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], rsp, sizeof(*rsp)); @@ -1882,10 +2009,6 @@ static u8 sc_check_confirm(struct smp_chan *smp) BT_DBG(""); - /* Public Key exchange must happen before any other steps */ - if (!test_bit(SMP_FLAG_REMOTE_PK, &smp->flags)) - return SMP_UNSPECIFIED; - if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); @@ -1898,6 +2021,47 @@ static u8 sc_check_confirm(struct smp_chan *smp) return 0; } +/* Work-around for some implementations that incorrectly copy RFU bits + * from our security request and thereby create the impression that + * we're doing SC when in fact the remote doesn't support it. + */ +static int fixup_sc_false_positive(struct smp_chan *smp) +{ + struct l2cap_conn *conn = smp->conn; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + struct smp_cmd_pairing *req, *rsp; + u8 auth; + + /* The issue is only observed when we're in slave role */ + if (hcon->out) + return SMP_UNSPECIFIED; + + if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { + BT_ERR("Refusing SMP SC -> legacy fallback in SC-only mode"); + return SMP_UNSPECIFIED; + } + + BT_ERR("Trying to fall back to legacy SMP"); + + req = (void *) &smp->preq[1]; + rsp = (void *) &smp->prsp[1]; + + /* Rebuild key dist flags which may have been cleared for SC */ + smp->remote_key_dist = (req->init_key_dist & rsp->resp_key_dist); + + auth = req->auth_req & AUTH_REQ_MASK(hdev); + + if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) { + BT_ERR("Failed to fall back to legacy SMP"); + return SMP_UNSPECIFIED; + } + + clear_bit(SMP_FLAG_SC, &smp->flags); + + return 0; +} + static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; @@ -1911,8 +2075,19 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct 
sk_buff *skb) memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); - if (test_bit(SMP_FLAG_SC, &smp->flags)) - return sc_check_confirm(smp); + if (test_bit(SMP_FLAG_SC, &smp->flags)) { + int ret; + + /* Public Key exchange must happen before any other steps */ + if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags)) + return sc_check_confirm(smp); + + BT_ERR("Unexpected SMP Pairing Confirm"); + + ret = fixup_sc_false_positive(smp); + if (ret) + return ret; + } if (conn->hcon->out) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), @@ -1923,8 +2098,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); - else - set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); + + set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } @@ -2032,7 +2207,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return true; - hci_le_start_enc(hcon, key->ediv, key->rand, key->val); + hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; /* We never store STKs for master role, so clear this flag */ @@ -2083,7 +2258,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) auth = rp->auth_req & AUTH_REQ_MASK(hdev); - if (test_bit(HCI_SC_ONLY, &hdev->dev_flags) && !(auth & SMP_AUTH_SC)) + if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) @@ -2104,7 +2279,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) if (!smp) return SMP_UNSPECIFIED; - if (!test_bit(HCI_BONDABLE, &hcon->hdev->dev_flags) && + if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; @@ -2137,8 +2312,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) return 1; 
chan = conn->smp; + if (!chan) { + BT_ERR("SMP security requested but not available"); + return 1; + } - if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) + if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) @@ -2167,7 +2346,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq = seclevel_to_authreq(sec_level); - if (test_bit(HCI_SC_ENABLED, &hcon->hdev->dev_flags)) + if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) authreq |= SMP_AUTH_SC; /* Require MITM if IO Capability allows or the security level @@ -2352,7 +2531,10 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); if (csrk) { - csrk->master = 0x01; + if (conn->hcon->sec_level > BT_SECURITY_MEDIUM) + csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED; + else + csrk->type = MGMT_CSRK_REMOTE_UNAUTHENTICATED; memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; @@ -2368,7 +2550,8 @@ static u8 sc_select_method(struct smp_chan *smp) struct smp_cmd_pairing *local, *remote; u8 local_mitm, remote_mitm, local_io, remote_io, method; - if (test_bit(SMP_FLAG_OOB, &smp->flags)) + if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags) || + test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) return REQ_OOB; /* The preq/prsp contain the raw Pairing Request/Response PDUs @@ -2422,6 +2605,16 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(smp->remote_pk, key, 64); + if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) { + err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk, + smp->rr, 0, cfm.confirm_val); + if (err) + return SMP_UNSPECIFIED; + + if (memcmp(cfm.confirm_val, smp->pcnf, 16)) + return SMP_CONFIRM_FAILED; + } + /* Non-initiating device sends its public key after receiving * the key from the initiating device. 
*/ @@ -2432,7 +2625,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) } SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); - SMP_DBG("Remote Public Key Y: %32phN", &smp->remote_pk[32]); + SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); if (!ecdh_shared_secret(smp->remote_pk, smp->local_sk, smp->dhkey)) return SMP_UNSPECIFIED; @@ -2470,14 +2663,6 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) } if (smp->method == REQ_OOB) { - err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk, - smp->rr, 0, cfm.confirm_val); - if (err) - return SMP_UNSPECIFIED; - - if (memcmp(cfm.confirm_val, smp->pcnf, 16)) - return SMP_CONFIRM_FAILED; - if (hcon->out) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); @@ -2550,6 +2735,8 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); + else if (smp->method == REQ_OOB) + memcpy(r, smp->lr, 16); err = smp_f6(smp->tfm_cmac, smp->mackey, smp->rrnd, smp->prnd, r, io_cap, remote_addr, local_addr, e); @@ -2572,7 +2759,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) sc_add_ltk(smp); if (hcon->out) { - hci_le_start_enc(hcon, 0, 0, smp->tk); + hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } @@ -2600,7 +2787,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) if (skb->len < 1) return -EILSEQ; - if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) { + if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) { reason = SMP_PAIRING_NOTSUPP; goto done; } @@ -2738,16 +2925,16 @@ static void bredr_pairing(struct l2cap_chan *chan) return; /* Secure Connections support must be enabled */ - if (!test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_SC_ENABLED)) return; /* BR/EDR must use 
Secure Connections for SMP */ if (!test_bit(HCI_CONN_AES_CCM, &hcon->flags) && - !test_bit(HCI_FORCE_BREDR_SMP, &hdev->dbg_flags)) + !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return; /* If our LE support is not enabled don't do anything */ - if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; /* Don't bother if remote LE support is not enabled */ @@ -2851,7 +3038,7 @@ static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, return ERR_PTR(-ENOMEM); skb->priority = HCI_PRIO_MAX; - bt_cb(skb)->chan = chan; + bt_cb(skb)->l2cap.chan = chan; return skb; } @@ -2924,51 +3111,65 @@ static const struct l2cap_ops smp_root_chan_ops = { static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; - struct crypto_blkcipher *tfm_aes; + struct smp_dev *smp; + struct crypto_blkcipher *tfm_aes; + struct crypto_hash *tfm_cmac; if (cid == L2CAP_CID_SMP_BREDR) { - tfm_aes = NULL; + smp = NULL; goto create_chan; } - tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, 0); + smp = kzalloc(sizeof(*smp), GFP_KERNEL); + if (!smp) + return ERR_PTR(-ENOMEM); + + tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_aes)) { - BT_ERR("Unable to create crypto context"); + BT_ERR("Unable to create ECB crypto context"); + kzfree(smp); return ERR_CAST(tfm_aes); } + tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_cmac)) { + BT_ERR("Unable to create CMAC crypto context"); + crypto_free_blkcipher(tfm_aes); + kzfree(smp); + return ERR_CAST(tfm_cmac); + } + + smp->tfm_aes = tfm_aes; + smp->tfm_cmac = tfm_cmac; + smp->min_key_size = SMP_MIN_ENC_KEY_SIZE; + smp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + create_chan: chan = l2cap_chan_create(); if (!chan) { - crypto_free_blkcipher(tfm_aes); + if (smp) { + crypto_free_blkcipher(smp->tfm_aes); + crypto_free_hash(smp->tfm_cmac); + kzfree(smp); + } return ERR_PTR(-ENOMEM); } - chan->data = tfm_aes; + 
chan->data = smp; l2cap_add_scid(chan, cid); l2cap_chan_set_defaults(chan); if (cid == L2CAP_CID_SMP) { - /* If usage of static address is forced or if the devices - * does not have a public address, then listen on the static - * address. - * - * In case BR/EDR has been disabled on a dual-mode controller - * and a static address has been configued, then listen on - * the static address instead. - */ - if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) || - !bacmp(&hdev->bdaddr, BDADDR_ANY) || - (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags) && - bacmp(&hdev->static_addr, BDADDR_ANY))) { - bacpy(&chan->src, &hdev->static_addr); - chan->src_type = BDADDR_LE_RANDOM; - } else { - bacpy(&chan->src, &hdev->bdaddr); + u8 bdaddr_type; + + hci_copy_identity_address(hdev, &chan->src, &bdaddr_type); + + if (bdaddr_type == ADDR_LE_DEV_PUBLIC) chan->src_type = BDADDR_LE_PUBLIC; - } + else + chan->src_type = BDADDR_LE_RANDOM; } else { bacpy(&chan->src, &hdev->bdaddr); chan->src_type = BDADDR_BREDR; @@ -2987,14 +3188,18 @@ create_chan: static void smp_del_chan(struct l2cap_chan *chan) { - struct crypto_blkcipher *tfm_aes; + struct smp_dev *smp; BT_DBG("chan %p", chan); - tfm_aes = chan->data; - if (tfm_aes) { + smp = chan->data; + if (smp) { chan->data = NULL; - crypto_free_blkcipher(tfm_aes); + if (smp->tfm_aes) + crypto_free_blkcipher(smp->tfm_aes); + if (smp->tfm_cmac) + crypto_free_hash(smp->tfm_cmac); + kzfree(smp); } l2cap_chan_put(chan); @@ -3007,7 +3212,7 @@ static ssize_t force_bredr_smp_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_FORCE_BREDR_SMP, &hdev->dbg_flags) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 
'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -3029,7 +3234,7 @@ static ssize_t force_bredr_smp_write(struct file *file, if (strtobool(buf, &enable)) return -EINVAL; - if (enable == test_bit(HCI_FORCE_BREDR_SMP, &hdev->dbg_flags)) + if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return -EALREADY; if (enable) { @@ -3048,7 +3253,7 @@ static ssize_t force_bredr_smp_write(struct file *file, smp_del_chan(chan); } - change_bit(HCI_FORCE_BREDR_SMP, &hdev->dbg_flags); + hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP); return count; } @@ -3060,6 +3265,94 @@ static const struct file_operations force_bredr_smp_fops = { .llseek = default_llseek, }; +static ssize_t le_min_key_size_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[4]; + + snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size); + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t le_min_key_size_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + u8 key_size; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + sscanf(buf, "%hhu", &key_size); + + if (key_size > SMP_DEV(hdev)->max_key_size || + key_size < SMP_MIN_ENC_KEY_SIZE) + return -EINVAL; + + SMP_DEV(hdev)->min_key_size = key_size; + + return count; +} + +static const struct file_operations le_min_key_size_fops = { + .open = simple_open, + .read = le_min_key_size_read, + .write = le_min_key_size_write, + .llseek = default_llseek, +}; + +static ssize_t le_max_key_size_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[4]; + + snprintf(buf, sizeof(buf), "%2u\n", 
SMP_DEV(hdev)->max_key_size); + + return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); +} + +static ssize_t le_max_key_size_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + u8 key_size; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + sscanf(buf, "%hhu", &key_size); + + if (key_size > SMP_MAX_ENC_KEY_SIZE || + key_size < SMP_DEV(hdev)->min_key_size) + return -EINVAL; + + SMP_DEV(hdev)->max_key_size = key_size; + + return count; +} + +static const struct file_operations le_max_key_size_fops = { + .open = simple_open, + .read = le_max_key_size_read, + .write = le_max_key_size_write, + .llseek = default_llseek, +}; + int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; @@ -3084,6 +3377,11 @@ int smp_register(struct hci_dev *hdev) hdev->smp_data = chan; + debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev, + &le_min_key_size_fops); + debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev, + &le_max_key_size_fops); + /* If the controller does not support BR/EDR Secure Connections * feature, then the BR/EDR SMP channel shall not be present. 
* @@ -3367,6 +3665,21 @@ static int __init test_h6(struct crypto_hash *tfm_cmac) return 0; } +static char test_smp_buffer[32]; + +static ssize_t test_smp_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(user_buf, count, ppos, test_smp_buffer, + strlen(test_smp_buffer)); +} + +static const struct file_operations test_smp_fops = { + .open = simple_open, + .read = test_smp_read, + .llseek = default_llseek, +}; + static int __init run_selftests(struct crypto_blkcipher *tfm_aes, struct crypto_hash *tfm_cmac) { @@ -3379,49 +3692,49 @@ static int __init run_selftests(struct crypto_blkcipher *tfm_aes, err = test_ah(tfm_aes); if (err) { BT_ERR("smp_ah test failed"); - return err; + goto done; } err = test_c1(tfm_aes); if (err) { BT_ERR("smp_c1 test failed"); - return err; + goto done; } err = test_s1(tfm_aes); if (err) { BT_ERR("smp_s1 test failed"); - return err; + goto done; } err = test_f4(tfm_cmac); if (err) { BT_ERR("smp_f4 test failed"); - return err; + goto done; } err = test_f5(tfm_cmac); if (err) { BT_ERR("smp_f5 test failed"); - return err; + goto done; } err = test_f6(tfm_cmac); if (err) { BT_ERR("smp_f6 test failed"); - return err; + goto done; } err = test_g2(tfm_cmac); if (err) { BT_ERR("smp_g2 test failed"); - return err; + goto done; } err = test_h6(tfm_cmac); if (err) { BT_ERR("smp_h6 test failed"); - return err; + goto done; } rettime = ktime_get(); @@ -3430,7 +3743,17 @@ static int __init run_selftests(struct crypto_blkcipher *tfm_aes, BT_INFO("SMP test passed in %llu usecs", duration); - return 0; +done: + if (!err) + snprintf(test_smp_buffer, sizeof(test_smp_buffer), + "PASS (%llu usecs)\n", duration); + else + snprintf(test_smp_buffer, sizeof(test_smp_buffer), "FAIL\n"); + + debugfs_create_file("selftest_smp", 0444, bt_debugfs, NULL, + &test_smp_fops); + + return err; } int __init bt_selftest_smp(void) diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 60c5b73fcb4b..6cf872563ea7 
100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -188,6 +188,7 @@ int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr); int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa); +int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]); int smp_register(struct hci_dev *hdev); void smp_unregister(struct hci_dev *hdev); diff --git a/net/bridge/Makefile b/net/bridge/Makefile index fd7ee03c59b3..a1cda5d4718d 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -12,6 +12,8 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o +br_netfilter-y := br_netfilter_hooks.o +br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 02c24cf63c34..a1abe4936fe1 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -121,13 +121,13 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; -static int br_netdev_switch_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int br_switchdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) { - struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; - struct netdev_switch_notifier_fdb_info *fdb_info; + struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; rtnl_lock(); @@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused, br = p->br; switch (event) { - case NETDEV_SWITCH_FDB_ADD: + case SWITCHDEV_FDB_ADD: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid); if 
(err) err = notifier_from_errno(err); break; - case NETDEV_SWITCH_FDB_DEL: + case SWITCHDEV_FDB_DEL: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid); @@ -159,8 +159,8 @@ out: return err; } -static struct notifier_block br_netdev_switch_notifier = { - .notifier_call = br_netdev_switch_event, +static struct notifier_block br_switchdev_notifier = { + .notifier_call = br_switchdev_event, }; static void __net_exit br_net_exit(struct net *net) @@ -214,7 +214,7 @@ static int __init br_init(void) if (err) goto err_out3; - err = register_netdev_switch_notifier(&br_netdev_switch_notifier); + err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; @@ -235,7 +235,7 @@ static int __init br_init(void) return 0; err_out5: - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -253,7 +253,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ffd379db5938..4ff77a16956c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -25,6 +25,9 @@ #define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM) +const struct nf_br_ops __rcu *nf_br_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_br_ops); + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -33,16 +36,15 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct 
pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); + const struct nf_br_ops *nf_ops; u16 vid = 0; rcu_read_lock(); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { - br_nf_pre_routing_finish_bridge_slow(skb); + nf_ops = rcu_dereference(nf_br_ops); + if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) { rcu_read_unlock(); return NETDEV_TX_OK; } -#endif u64_stats_update_begin(&brstats->syncp); brstats->tx_packets++; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e0670d7054f9..9e9875da0a4f 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -24,6 +24,7 @@ #include <linux/atomic.h> #include <asm/unaligned.h> #include <linux/if_vlan.h> +#include <net/switchdev.h> #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; @@ -130,11 +131,27 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) } } +static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) +{ + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .u.fdb = { + .addr = f->addr.addr, + .vid = f->vlan_id, + }, + }; + + switchdev_port_obj_del(f->dst->dev, &obj); +} + static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) { if (f->is_static) fdb_del_hw_addr(br, f->addr.addr); + if (f->added_by_external_learn) + fdb_del_external_learn(f); + hlist_del_rcu(&f->hlist); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); @@ -313,9 +330,11 @@ void br_fdb_flush(struct net_bridge *br) /* Flush all entries referring to a specific port. 
* if do_all is set also flush static entries + * if vid is set delete all entries that match the vlan_id */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, + u16 vid, int do_all) { int i; @@ -330,8 +349,9 @@ void br_fdb_delete_by_port(struct net_bridge *br, if (f->dst != p) continue; - if (f->is_static && !do_all) - continue; + if (!do_all) + if (f->is_static || (vid && f->vlan_id != vid)) + continue; if (f->is_local) fdb_delete_local(br, p, f); @@ -736,6 +756,12 @@ static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, struct net_bridge_fdb_entry *fdb; bool modified = false; + /* If the port cannot learn allow only local and static entries */ + if (!(state & NUD_PERMANENT) && !(state & NUD_NOARP) && + !(source->state == BR_STATE_LEARNING || + source->state == BR_STATE_FORWARDING)) + return -EPERM; + fdb = fdb_find(head, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -796,9 +822,11 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p, int err = 0; if (ndm->ndm_flags & NTF_USE) { + local_bh_disable(); rcu_read_lock(); br_fdb_update(p->br, p, addr, vid, true); rcu_read_unlock(); + local_bh_enable(); } else { spin_lock_bh(&p->br->hash_lock); err = fdb_add_entry(p, addr, ndm->ndm_state, @@ -865,13 +893,15 @@ out: return err; } -static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan) +static int fdb_delete_by_addr_and_port(struct net_bridge_port *p, + const u8 *addr, u16 vlan) { + struct net_bridge *br = p->br; struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)]; struct net_bridge_fdb_entry *fdb; fdb = fdb_find(head, addr, vlan); - if (!fdb) + if (!fdb || fdb->dst != p) return -ENOENT; fdb_delete(br, fdb); @@ -884,7 +914,7 @@ static int __br_fdb_delete(struct net_bridge_port *p, int err; spin_lock_bh(&p->br->hash_lock); - err = fdb_delete_by_addr(p->br, addr, vid); + err = fdb_delete_by_addr_and_port(p, addr, vid); spin_unlock_bh(&p->br->hash_lock); 
return err; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index f96933a823e3..fa7bfced888e 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -35,25 +35,40 @@ static inline int should_deliver(const struct net_bridge_port *p, p->state == BR_STATE_FORWARDING; } -int br_dev_queue_push_xmit(struct sk_buff *skb) +int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb) { - /* ip_fragment doesn't copy the MAC header */ - if (nf_bridge_maybe_copy_header(skb) || - !is_skb_forwardable(skb->dev, skb)) { - kfree_skb(skb); - } else { - skb_push(skb, ETH_HLEN); - br_drop_fake_rtable(skb); - dev_queue_xmit(skb); + if (!is_skb_forwardable(skb->dev, skb)) + goto drop; + + skb_push(skb, ETH_HLEN); + br_drop_fake_rtable(skb); + skb_sender_cpu_clear(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD))) { + int depth; + + if (!__vlan_get_protocol(skb, skb->protocol, &depth)) + goto drop; + + skb_set_network_header(skb, depth); } + dev_queue_xmit(skb); + + return 0; + +drop: + kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); -int br_forward_finish(struct sk_buff *skb) +int br_forward_finish(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, sk, skb, + NULL, skb->dev, br_dev_queue_push_xmit); } @@ -77,7 +92,8 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) return; } - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, + NULL, skb->dev, br_forward_finish); } @@ -98,7 +114,8 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) skb->dev = to->dev; skb_forward_csum(skb); - NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, NULL, skb, + indev, skb->dev, 
br_forward_finish); } @@ -188,6 +205,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; + if ((p->flags & BR_PROXYARP_WIFI) && + BR_INPUT_SKB_CB(skb)->proxyarp_replied) + continue; prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 1849d96b3c91..a538cb1199a3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -249,7 +249,7 @@ static void del_nbp(struct net_bridge_port *p) list_del_rcu(&p->list); nbp_vlan_flush(p); - br_fdb_delete_by_port(br, p, 1); + br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); @@ -278,7 +278,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } - br_fdb_delete_by_port(br, NULL, 1); + br_fdb_delete_by_port(br, NULL, 0, 1); br_vlan_flush(br); del_timer_sync(&br->gc_timer); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e2aa7be3a847..f921a5dce22d 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -55,12 +55,13 @@ static int br_pass_frame_up(struct sk_buff *skb) if (!skb) return NET_RX_DROP; - return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, - netif_receive_skb); + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb, + indev, NULL, + netif_receive_skb_sk); } static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, - u16 vid) + u16 vid, struct net_bridge_port *p) { struct net_device *dev = br->dev; struct neighbour *n; @@ -68,6 +69,8 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, u8 *arpptr, *sha; __be32 sip, tip; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + if (dev->flags & IFF_NOARP) return; @@ -105,16 +108,19 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, } f = __br_fdb_get(br, n->ha, vid); - if (f) + if (f && ((p->flags & BR_PROXYARP) || + 
(f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, sha, n->ha, sha); + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } neigh_release(n); } } /* note: already called with rcu_read_lock */ -int br_handle_frame_finish(struct sk_buff *skb) +int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb) { const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p = br_port_get_rcu(skb->dev); @@ -153,12 +159,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; - if (is_broadcast_ether_addr(dest)) { - if (IS_ENABLED(CONFIG_INET) && - p->flags & BR_PROXYARP && - skb->protocol == htons(ETH_P_ARP)) - br_do_proxy_arp(skb, br, vid); + if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) + br_do_proxy_arp(skb, br, vid, p); + if (is_broadcast_ether_addr(dest)) { skb2 = skb; unicast = false; } else if (is_multicast_ether_addr(dest)) { @@ -204,7 +208,7 @@ drop: EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ -static int br_handle_local_finish(struct sk_buff *skb) +static int br_handle_local_finish(struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; @@ -274,8 +278,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } /* Deliver packet to local host only */ - if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, - NULL, br_handle_local_finish)) { + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb, + skb->dev, NULL, br_handle_local_finish)) { return RX_HANDLER_CONSUMED; /* consumed by filter */ } else { *pskb = skb; @@ -299,7 +303,8 @@ forward: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, NULL, skb, + skb->dev, NULL, br_handle_frame_finish); break; default: diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 
a9a4a1b7863d..8d423bc649b9 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -247,9 +247,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - spin_lock_bh(&br->lock); br_stp_set_bridge_priority(br, args[1]); - spin_unlock_bh(&br->lock); return 0; case BRCTL_SET_PORT_PRIORITY: diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 409608960899..c94321955db7 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -170,7 +170,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct br_port_msg *bpm; struct nlattr *nest, *nest2; - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; @@ -323,6 +323,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_htable *mdb; + unsigned long now = jiffies; int err; mdb = mlock_dereference(br->mdb, br); @@ -347,8 +348,9 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); + if (state == MDB_TEMPORARY) + mod_timer(&p->timer, now + br->multicast_membership_interval); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB); return 0; } @@ -371,6 +373,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; + memset(&ip, 0, sizeof(ip)); ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; @@ -417,20 +420,14 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || br->multicast_disabled) return -EINVAL; + memset(&ip, 0, sizeof(ip)); ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) { - if 
(timer_pending(&br->ip4_other_query.timer)) - return -EBUSY; - + if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) - } else { - if (timer_pending(&br->ip6_other_query.timer)) - return -EBUSY; - + else ip.u.ip6 = entry->addr.u.ip6; #endif - } spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -448,6 +445,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (p->port->state == BR_STATE_DISABLED) goto unlock; + entry->state = p->state; rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index c465876c7861..0b39dcc65b94 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -37,6 +37,18 @@ static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); +static void br_multicast_add_router(struct net_bridge *br, + struct net_bridge_port *port); +static void br_ip4_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + __be32 group, + __u16 vid); +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_leave_group(struct net_bridge *br, + struct net_bridge_port *port, + const struct in6_addr *group, + __u16 vid); +#endif unsigned int br_mdb_rehash_seq; static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) @@ -814,7 +826,8 @@ static void __br_multicast_send_query(struct net_bridge *br, if (port) { skb->dev = port->dev; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, + NULL, skb->dev, br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); @@ -935,6 +948,8 @@ void br_multicast_enable_port(struct net_bridge_port *port) #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&port->ip6_own_query); #endif + if (port->multicast_router == 2 && hlist_unhashed(&port->rlist)) + 
br_multicast_add_router(br, port); out: spin_unlock(&br->multicast_lock); @@ -974,9 +989,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group; - if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1008,9 +1020,15 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, continue; } - err = br_ip4_multicast_add_group(br, port, group, vid); - if (err) - break; + if ((type == IGMPV3_CHANGE_TO_INCLUDE || + type == IGMPV3_MODE_IS_INCLUDE) && + ntohs(grec->grec_nsrcs) == 0) { + br_ip4_multicast_leave_group(br, port, group, vid); + } else { + err = br_ip4_multicast_add_group(br, port, group, vid); + if (err) + break; + } } return err; @@ -1069,10 +1087,17 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, continue; } - err = br_ip6_multicast_add_group(br, port, &grec->grec_mca, - vid); - if (!err) - break; + if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || + grec->grec_type == MLD2_MODE_IS_INCLUDE) && + ntohs(*nsrcs) == 0) { + br_ip6_multicast_leave_group(br, port, &grec->grec_mca, + vid); + } else { + err = br_ip6_multicast_add_group(br, port, + &grec->grec_mca, vid); + if (!err) + break; + } } return err; @@ -1166,6 +1191,9 @@ static void br_multicast_add_router(struct net_bridge *br, struct net_bridge_port *p; struct hlist_node *slot = NULL; + if (!hlist_unhashed(&port->rlist)) + return; + hlist_for_each_entry(p, &br->router_list, rlist) { if ((unsigned long) port >= (unsigned long) p) break; @@ -1193,12 +1221,8 @@ static void br_multicast_mark_router(struct net_bridge *br, if (port->multicast_router != 1) return; - if (!hlist_unhashed(&port->rlist)) - goto timer; - br_multicast_add_router(br, port); -timer: mod_timer(&port->multicast_router_timer, now + br->multicast_querier_interval); } @@ -1247,25 +1271,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if 
(!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if (skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; } @@ -1328,12 +1341,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1357,14 +1364,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, is_general_query = group && ipv6_addr_any(group); - /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1417,8 +1416,7 @@ br_multicast_leave_group(struct net_bridge *br, spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED) || - timer_pending(&other_query->timer)) + (port && port->state == BR_STATE_DISABLED)) goto out; mdb = mlock_dereference(br->mdb, br); @@ -1426,6 +1424,31 @@ br_multicast_leave_group(struct net_bridge *br, if (!mp) goto out; + if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { + struct net_bridge_port_group __rcu **pp; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p->port != port) + continue; + + 
rcu_assign_pointer(*pp, p->next); + hlist_del_init(&p->mglist); + del_timer(&p->timer); + call_rcu_bh(&p->rcu, br_multicast_free_pg); + br_mdb_notify(br->dev, port, group, RTM_DELMDB); + + if (!mp->ports && !mp->mglist && + netif_running(br->dev)) + mod_timer(&mp->timer, jiffies); + } + goto out; + } + + if (timer_pending(&other_query->timer)) + goto out; + if (br->multicast_querier) { __br_multicast_send_query(br, port, &mp->addr); @@ -1451,28 +1474,6 @@ br_multicast_leave_group(struct net_bridge *br, } } - if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { - struct net_bridge_port_group __rcu **pp; - - for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; - pp = &p->next) { - if (p->port != port) - continue; - - rcu_assign_pointer(*pp, p->next); - hlist_del_init(&p->mglist); - del_timer(&p->timer); - call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, port, group, RTM_DELMDB); - - if (!mp->ports && !mp->mglist && - netif_running(br->dev)) - mod_timer(&mp->timer, jiffies); - } - goto out; - } - now = jiffies; time = now + br->multicast_last_member_count * br->multicast_last_member_interval; @@ -1556,74 +1557,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err; - /* We treat OOM as packet loss for now. 
*/ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; + err = ip_mc_check_igmp(skb, &skb_trimmed); - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) + if (err == -ENOMSG) { + if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } else if (err < 0) { + return err; } - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb2->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb2->csum = 0; - if (skb_checksum_complete(skb2)) - goto out; - } - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1632,21 +1581,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } -out: - 
__skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } @@ -1656,138 +1603,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err; - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; - - ip6h = ipv6_hdr(skb); - - /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); + err = ipv6_mc_check_mld(skb, &skb_trimmed); - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + if (err == -ENOMSG) { + if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - break; - 
default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. */ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; - } - - ip6h = ipv6_hdr(skb2); - - switch (skb2->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len, - IPPROTO_ICMPV6, skb2->csum)) - break; - /*FALLTHROUGH*/ - case CHECKSUM_NONE: - skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, - &ip6h->daddr, - skb2->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb2)) - goto out; + } else if (err < 0) { + return err; } - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb); - switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; } -out: - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } #endif @@ -1821,7 +1672,7 @@ static void br_multicast_query_expired(struct net_bridge *br, if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; - RCU_INIT_POINTER(querier, NULL); + RCU_INIT_POINTER(querier->port, NULL); br_multicast_send_query(br, NULL, 
query); spin_unlock(&br->multicast_lock); } @@ -1949,11 +1800,9 @@ out: int br_multicast_set_router(struct net_bridge *br, unsigned long val) { - int err = -ENOENT; + int err = -EINVAL; spin_lock_bh(&br->multicast_lock); - if (!netif_running(br->dev)) - goto unlock; switch (val) { case 0: @@ -1964,13 +1813,8 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) br->multicast_router = val; err = 0; break; - - default: - err = -EINVAL; - break; } -unlock: spin_unlock_bh(&br->multicast_lock); return err; @@ -1979,11 +1823,9 @@ unlock: int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) { struct net_bridge *br = p->br; - int err = -ENOENT; + int err = -EINVAL; spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED) - goto unlock; switch (val) { case 0: @@ -2005,13 +1847,8 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) br_multicast_add_router(br, p); break; - - default: - err = -EINVAL; - break; } -unlock: spin_unlock(&br->multicast_lock); return err; @@ -2116,15 +1953,11 @@ unlock: int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val) { - int err = -ENOENT; + int err = -EINVAL; u32 old; struct net_bridge_mdb_htable *mdb; spin_lock_bh(&br->multicast_lock); - if (!netif_running(br->dev)) - goto unlock; - - err = -EINVAL; if (!is_power_of_2(val)) goto unlock; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter_hooks.c index 0ee453fad3de..c8b9bcfe997e 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter_hooks.c @@ -34,6 +34,7 @@ #include <net/ip.h> #include <net/ipv6.h> +#include <net/addrconf.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> @@ -43,11 +44,6 @@ #include <linux/sysctl.h> #endif -#define skb_origaddr(skb) (((struct bridge_skb_cb *) \ - (skb->nf_bridge->data))->daddr.ipv4) -#define store_orig_dstaddr(skb) (skb_origaddr(skb) = ip_hdr(skb)->daddr) -#define dnat_took_place(skb) 
(skb_origaddr(skb) != ip_hdr(skb)->daddr) - #ifdef CONFIG_SYSCTL static struct ctl_table_header *brnf_sysctl_header; static int brnf_call_iptables __read_mostly = 1; @@ -112,12 +108,27 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) -{ - struct net_bridge_port *port; +/* largest possible L2 header, see br_nf_dev_queue_xmit() */ +#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) - port = br_port_get_rcu(dev); - return port ? &port->br->fake_rtable : NULL; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +struct brnf_frag_data { + char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; + u8 encap_size; + u8 size; + u16 vlan_tci; + __be16 vlan_proto; +}; + +static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); +#endif + +static void nf_bridge_info_free(struct sk_buff *skb) +{ + if (skb->nf_bridge) { + nf_bridge_put(skb->nf_bridge); + skb->nf_bridge = NULL; + } } static inline struct net_device *bridge_parent(const struct net_device *dev) @@ -128,15 +139,6 @@ static inline struct net_device *bridge_parent(const struct net_device *dev) return port ? 
port->br->dev : NULL; } -static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) -{ - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); - if (likely(skb->nf_bridge)) - atomic_set(&(skb->nf_bridge->use), 1); - - return skb->nf_bridge; -} - static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = skb->nf_bridge; @@ -154,12 +156,16 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) return nf_bridge; } -static inline void nf_bridge_push_encap_header(struct sk_buff *skb) +unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) { - unsigned int len = nf_bridge_encap_header_len(skb); - - skb_push(skb, len); - skb->network_header -= len; + switch (skb->protocol) { + case __cpu_to_be16(ETH_P_8021Q): + return VLAN_HLEN; + case __cpu_to_be16(ETH_P_PPP_SES): + return PPPOE_SES_HLEN; + default: + return 0; + } } static inline void nf_bridge_pull_encap_header(struct sk_buff *skb) @@ -178,20 +184,12 @@ static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb) skb->network_header += len; } -static inline void nf_bridge_save_header(struct sk_buff *skb) -{ - int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - - skb_copy_from_linear_data_offset(skb, -header_size, - skb->nf_bridge->data, header_size); -} - /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format */ -static int br_parse_ip_options(struct sk_buff *skb) +static int br_validate_ipv4(struct sk_buff *skb) { const struct iphdr *iph; struct net_device *dev = skb->dev; @@ -239,34 +237,18 @@ drop: return -1; } -/* PF_BRIDGE/PRE_ROUTING *********************************************/ -/* Undo the changes made for ip6tables PREROUTING and continue the - * bridge PRE_ROUTING hook. 
*/ -static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) +void nf_bridge_update_protocol(struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - struct rtable *rt; - - if (nf_bridge->mask & BRNF_PKT_TYPE) { - skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->mask ^= BRNF_PKT_TYPE; + switch (skb->nf_bridge->orig_proto) { + case BRNF_PROTO_8021Q: + skb->protocol = htons(ETH_P_8021Q); + break; + case BRNF_PROTO_PPPOE: + skb->protocol = htons(ETH_P_PPP_SES); + break; + case BRNF_PROTO_UNCHANGED: + break; } - nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - - rt = bridge_parent_rtable(nf_bridge->physindev); - if (!rt) { - kfree_skb(skb); - return 0; - } - skb_dst_set_noref(skb, &rt->dst); - - skb->dev = nf_bridge->physindev; - nf_bridge_update_protocol(skb); - nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, - br_handle_frame_finish, 1); - - return 0; } /* Obtain the correct destination MAC address, while preserving the original @@ -274,9 +256,8 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) * don't, we use the neighbour framework to find out. In both cases, we make * sure that br_handle_frame_finish() is called afterwards. 
*/ -static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) +int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct neighbour *neigh; struct dst_entry *dst; @@ -286,12 +267,13 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) dst = skb_dst(skb); neigh = dst_neigh_lookup_skb(dst, skb); if (neigh) { + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); int ret; if (neigh->hh.hh_len) { neigh_hh_bridge(&neigh->hh, skb); skb->dev = nf_bridge->physindev; - ret = br_handle_frame_finish(skb); + ret = br_handle_frame_finish(sk, skb); } else { /* the neighbour function below overwrites the complete * MAC header, so we save the Ethernet source address and @@ -299,7 +281,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) */ skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), - skb->nf_bridge->data, + nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->mask |= BRNF_BRIDGED_DNAT; @@ -314,8 +296,16 @@ free_skb: return 0; } +static inline bool +br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) +{ + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; +} + /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. + * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge @@ -352,24 +342,22 @@ free_skb: * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. 
*/ -static int br_nf_pre_routing_finish(struct sk_buff *skb) +static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct iphdr *iph = ip_hdr(skb); - struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; int err; - int frag_max_size; - frag_max_size = IPCB(skb)->frag_max_size; - BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; - if (nf_bridge->mask & BRNF_PKT_TYPE) { + if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->mask ^= BRNF_PKT_TYPE; + nf_bridge->pkt_otherhost = false; } - nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - if (dnat_took_place(skb)) { + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -405,7 +393,7 @@ bridged_dnat: nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - skb, skb->dev, NULL, + sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge, 1); return 0; @@ -425,7 +413,8 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + skb->dev, NULL, br_handle_frame_finish, 1); return 0; @@ -446,129 +435,29 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct } /* Some common code for IPv4/IPv6 */ -static struct net_device *setup_pre_routing(struct sk_buff *skb) +struct net_device *setup_pre_routing(struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = 
PACKET_HOST; - nf_bridge->mask |= BRNF_PKT_TYPE; + nf_bridge->pkt_otherhost = true; } nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; nf_bridge->physindev = skb->dev; skb->dev = brnf_get_logical_dev(skb, skb->dev); + if (skb->protocol == htons(ETH_P_8021Q)) - nf_bridge->mask |= BRNF_8021Q; + nf_bridge->orig_proto = BRNF_PROTO_8021Q; else if (skb->protocol == htons(ETH_P_PPP_SES)) - nf_bridge->mask |= BRNF_PPPoE; + nf_bridge->orig_proto = BRNF_PROTO_PPPOE; /* Must drop socket now because of tproxy. */ skb_orphan(skb); return skb->dev; } -/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */ -static int check_hbh_len(struct sk_buff *skb) -{ - unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); - u32 pkt_len; - const unsigned char *nh = skb_network_header(skb); - int off = raw - nh; - int len = (raw[1] + 1) << 3; - - if ((raw + len) - skb->data > skb_headlen(skb)) - goto bad; - - off += 2; - len -= 2; - - while (len > 0) { - int optlen = nh[off + 1] + 2; - - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; - break; - - case IPV6_TLV_PADN: - break; - - case IPV6_TLV_JUMBO: - if (nh[off + 1] != 4 || (off & 3) != 2) - goto bad; - pkt_len = ntohl(*(__be32 *) (nh + off + 2)); - if (pkt_len <= IPV6_MAXPLEN || - ipv6_hdr(skb)->payload_len) - goto bad; - if (pkt_len > skb->len - sizeof(struct ipv6hdr)) - goto bad; - if (pskb_trim_rcsum(skb, - pkt_len + sizeof(struct ipv6hdr))) - goto bad; - nh = skb_network_header(skb); - break; - default: - if (optlen > len) - goto bad; - break; - } - off += optlen; - len -= optlen; - } - if (len == 0) - return 0; -bad: - return -1; - -} - -/* Replicate the checks that IPv6 does on packet reception and pass the packet - * to ip6tables, which doesn't support NAT, so things are fairly simple. 
*/ -static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - const struct ipv6hdr *hdr; - u32 pkt_len; - - if (skb->len < sizeof(struct ipv6hdr)) - return NF_DROP; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return NF_DROP; - - hdr = ipv6_hdr(skb); - - if (hdr->version != 6) - return NF_DROP; - - pkt_len = ntohs(hdr->payload_len); - - if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return NF_DROP; - if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) - return NF_DROP; - } - if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) - return NF_DROP; - - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) - return NF_DROP; - if (!setup_pre_routing(skb)) - return NF_DROP; - - skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, - br_nf_pre_routing_finish_ipv6); - - return NF_STOLEN; -} - /* Direct IPv6 traffic to br_nf_pre_routing_ipv6. * Replicate the checks that IPv4 does on packet reception. * Set skb->dev to the bridge device (i.e. parent of the @@ -577,10 +466,9 @@ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, * address to be able to detect DNAT afterwards. 
*/ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { + struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); @@ -588,7 +476,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP; - p = br_port_get_rcu(in); + p = br_port_get_rcu(state->in); if (p == NULL) return NF_DROP; br = p->br; @@ -598,7 +486,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); - return br_nf_pre_routing_ipv6(ops, skb, in, out, okfn); + return br_nf_pre_routing_ipv6(ops, skb, state); } if (!brnf_call_iptables && !br->nf_call_iptables) @@ -609,7 +497,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, nf_bridge_pull_encap_header_rcsum(skb); - if (br_parse_ip_options(skb)) + if (br_validate_ipv4(skb)) return NF_DROP; nf_bridge_put(skb->nf_bridge); @@ -617,10 +505,14 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, return NF_DROP; if (!setup_pre_routing(skb)) return NF_DROP; - store_orig_dstaddr(skb); + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; + skb->protocol = htons(ETH_P_IP); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, + skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; @@ -636,25 +528,30 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, * prevent this from happening. 
*/ static unsigned int br_nf_local_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { br_drop_fake_rtable(skb); return NF_ACCEPT; } /* PF_BRIDGE/FORWARD *************************************************/ -static int br_nf_forward_finish(struct sk_buff *skb) +static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *in; if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { + + if (skb->protocol == htons(ETH_P_IP)) + nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; + + if (skb->protocol == htons(ETH_P_IPV6)) + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + in = nf_bridge->physindev; - if (nf_bridge->mask & BRNF_PKT_TYPE) { + if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; - nf_bridge->mask ^= BRNF_PKT_TYPE; + nf_bridge->pkt_otherhost = false; } nf_bridge_update_protocol(skb); } else { @@ -662,8 +559,8 @@ static int br_nf_forward_finish(struct sk_buff *skb) } nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, in, - skb->dev, br_forward_finish, 1); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb, + in, skb->dev, br_forward_finish, 1); return 0; } @@ -675,9 +572,7 @@ static int br_nf_forward_finish(struct sk_buff *skb) * bridge ports. 
*/ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; struct net_device *parent; @@ -691,7 +586,11 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, if (!nf_bridge_unshare(skb)) return NF_DROP; - parent = bridge_parent(out); + nf_bridge = nf_bridge_info_get(skb); + if (!nf_bridge) + return NF_DROP; + + parent = bridge_parent(state->out); if (!parent) return NF_DROP; @@ -704,40 +603,45 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops, nf_bridge_pull_encap_header(skb); - nf_bridge = skb->nf_bridge; if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; - nf_bridge->mask |= BRNF_PKT_TYPE; + nf_bridge->pkt_otherhost = true; } - if (pf == NFPROTO_IPV4 && br_parse_ip_options(skb)) - return NF_DROP; + if (pf == NFPROTO_IPV4) { + if (br_validate_ipv4(skb)) + return NF_DROP; + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + } + + if (pf == NFPROTO_IPV6) { + if (br_validate_ipv6(skb)) + return NF_DROP; + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + } - /* The physdev module checks on this */ - nf_bridge->mask |= BRNF_BRIDGED; nf_bridge->physoutdev = skb->dev; if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(pf, NF_INET_FORWARD, skb, brnf_get_logical_dev(skb, in), parent, - br_nf_forward_finish); + NF_HOOK(pf, NF_INET_FORWARD, NULL, skb, + brnf_get_logical_dev(skb, state->in), + parent, br_nf_forward_finish); return NF_STOLEN; } static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct net_bridge_port *p; struct net_bridge *br; struct net_device **d = (struct net_device 
**)(skb->cb); - p = br_port_get_rcu(out); + p = br_port_get_rcu(state->out); if (p == NULL) return NF_ACCEPT; br = p->br; @@ -756,55 +660,157 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, nf_bridge_push_encap_header(skb); return NF_ACCEPT; } - *d = (struct net_device *)in; - NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in, - (struct net_device *)out, br_nf_forward_finish); + *d = state->in; + NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb, + state->in, state->out, br_nf_forward_finish); return NF_STOLEN; } +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb) +{ + struct brnf_frag_data *data; + int err; + + data = this_cpu_ptr(&brnf_frag_data_storage); + err = skb_cow_head(skb, data->size); + + if (err) { + kfree_skb(skb); + return 0; + } + + if (data->vlan_tci) { + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + } + + skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); + __skb_push(skb, data->encap_size); + + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); +} +#endif + #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) -static int br_nf_dev_queue_xmit(struct sk_buff *skb) +static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) { - int ret; - int frag_max_size; + unsigned int mtu = ip_skb_dst_mtu(skb); + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; - /* This is wrong! We should preserve the original fragment - * boundaries by preserving frag_list rather than refragmenting. 
- */ - if (skb->protocol == htons(ETH_P_IP) && - skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && - !skb_is_gso(skb)) { - frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - if (br_parse_ip_options(skb)) - /* Drop invalid packet */ - return NF_DROP; - IPCB(skb)->frag_max_size = frag_max_size; - ret = ip_fragment(skb, br_dev_queue_push_xmit); - } else - ret = br_dev_queue_push_xmit(skb); + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } - return ret; + return ip_do_fragment(sk, skb, output); } -#else -static int br_nf_dev_queue_xmit(struct sk_buff *skb) +#endif + +static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { - return br_dev_queue_push_xmit(skb); + if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + return PPPOE_SES_HLEN; + return 0; } + +static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge; + unsigned int mtu_reserved; + + mtu_reserved = nf_bridge_mtu_reduction(skb); + + if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); + } + + nf_bridge = nf_bridge_info_get(skb); + +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + /* This is wrong! We should preserve the original fragment + * boundaries by preserving frag_list rather than refragmenting. 
+ */ + if (skb->protocol == htons(ETH_P_IP)) { + struct brnf_frag_data *data; + + if (br_validate_ipv4(skb)) + goto drop; + + IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit); + } #endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + if (skb->protocol == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + struct brnf_frag_data *data; + + if (br_validate_ipv6(skb)) + goto drop; + + IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; + + nf_bridge_update_protocol(skb); + + data = this_cpu_ptr(&brnf_frag_data_storage); + data->encap_size = nf_bridge_encap_header_len(skb); + data->size = ETH_HLEN + data->encap_size; + + skb_copy_from_linear_data_offset(skb, -data->size, data->mac, + data->size); + + if (v6ops) + return v6ops->fragment(sk, skb, br_nf_push_frag_xmit); + + kfree_skb(skb); + return -EMSGSIZE; + } +#endif + nf_bridge_info_free(skb); + return br_dev_queue_push_xmit(sk, skb); + drop: + kfree_skb(skb); + return 0; +} /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *realoutdev = bridge_parent(skb->dev); u_int8_t pf; - if (!nf_bridge || !(nf_bridge->mask & BRNF_BRIDGED)) + /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in + * on a bridge, but was 
delivered locally and is now being routed: + * + * POST_ROUTING was already invoked from the ip stack. + */ + if (!nf_bridge || !nf_bridge->physoutdev) return NF_ACCEPT; if (!realoutdev) @@ -821,17 +827,17 @@ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, * about the value of skb->pkt_type. */ if (skb->pkt_type == PACKET_OTHERHOST) { skb->pkt_type = PACKET_HOST; - nf_bridge->mask |= BRNF_PKT_TYPE; + nf_bridge->pkt_otherhost = true; } nf_bridge_pull_encap_header(skb); - nf_bridge_save_header(skb); if (pf == NFPROTO_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); - NF_HOOK(pf, NF_INET_POST_ROUTING, skb, NULL, realoutdev, + NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb, + NULL, realoutdev, br_nf_dev_queue_xmit); return NF_STOLEN; @@ -842,9 +848,7 @@ static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops, * for the second time. */ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { if (skb->nf_bridge && !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { @@ -854,6 +858,46 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, return NF_ACCEPT; } +/* This is called when br_netfilter has called into iptables/netfilter, + * and DNAT has taken place on a bridge-forwarded packet. + * + * neigh->output has created a new MAC header, with local br0 MAC + * as saddr. + * + * This restores the original MAC saddr of the bridged packet + * before invoking bridge forward logic to transmit the packet. 
+ */ +static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + skb_pull(skb, ETH_HLEN); + nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; + + BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); + + skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN), + nf_bridge->neigh_header, + ETH_HLEN - ETH_ALEN); + skb->dev = nf_bridge->physindev; + + nf_bridge->physoutdev = NULL; + br_handle_frame_finish(NULL, skb); +} + +static int br_nf_dev_xmit(struct sk_buff *skb) +{ + if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { + br_nf_pre_routing_finish_bridge_slow(skb); + return 1; + } + return 0; +} + +static const struct nf_br_ops br_ops = { + .br_dev_xmit_hook = br_nf_dev_xmit, +}; + void br_netfilter_enable(void) { } @@ -991,12 +1035,14 @@ static int __init br_netfilter_init(void) return -ENOMEM; } #endif + RCU_INIT_POINTER(nf_br_ops, &br_ops); printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; } static void __exit br_netfilter_fini(void) { + RCU_INIT_POINTER(nf_br_ops, NULL); nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c new file mode 100644 index 000000000000..13b7d1e3d185 --- /dev/null +++ b/net/bridge/br_netfilter_ipv6.c @@ -0,0 +1,245 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek <buytenh@gnu.org> + * Bart De Schuymer <bdschuym@pandora.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_arp.h> +#include <linux/in_route.h> +#include <linux/inetdevice.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/route.h> +#include <net/netfilter/br_netfilter.h> + +#include <asm/uaccess.h> +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +/* We only check the length. A bridge shouldn't do any hop-by-hop stuff + * anyway + */ +static int br_nf_check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1); + u32 pkt_len; + const unsigned char *nh = skb_network_header(skb); + int off = raw - nh; + int len = (raw[1] + 1) << 3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (nh[off + 1] != 4 || (off & 3) != 2) + goto bad; + pkt_len = ntohl(*(__be32 *)(nh + off + 2)); + if (pkt_len <= IPV6_MAXPLEN || + ipv6_hdr(skb)->payload_len) + goto bad; + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pskb_trim_rcsum(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + nh = skb_network_header(skb); + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; +} + +int br_validate_ipv6(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + struct net_device *dev = skb->dev; + struct inet6_dev *idev = 
__in6_dev_get(skb->dev); + u32 pkt_len; + u8 ip6h_len = sizeof(struct ipv6hdr); + + if (!pskb_may_pull(skb, ip6h_len)) + goto inhdr_error; + + if (skb->len < ip6h_len) + goto drop; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto inhdr_error; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + ip6h_len > skb->len) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { + IP6_INC_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) + goto drop; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* No IP options in IPv6 header; however it should be + * checked if some next headers need special treatment + */ + return 0; + +inhdr_error: + IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + +static inline bool +br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) +{ + return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, + sizeof(ipv6_hdr(skb)->daddr)) != 0; +} + +/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables + * PREROUTING and continue the bridge PRE_ROUTING hook. See comment + * for br_nf_pre_routing_finish(), same logic is used here but + * equivalent IPv6 function ip6_route_input() called indirectly. 
+ */ +static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct rtable *rt; + struct net_device *dev = skb->dev; + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + + nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; + + if (nf_bridge->pkt_otherhost) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->pkt_otherhost = false; + } + nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { + skb_dst_drop(skb); + v6ops->route_input(skb); + + if (skb_dst(skb)->error) { + kfree_skb(skb); + return 0; + } + + if (skb_dst(skb)->dev == dev) { + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, + sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); + skb->pkt_type = PACKET_HOST; + } else { + rt = bridge_parent_rtable(nf_bridge->physindev); + if (!rt) { + kfree_skb(skb); + return 0; + } + skb_dst_set_noref(skb, &rt->dst); + } + + skb->dev = nf_bridge->physindev; + nf_bridge_update_protocol(skb); + nf_bridge_push_encap_header(skb); + NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb, + skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +/* Replicate the checks that IPv6 does on packet reception and pass the packet + * to ip6tables. 
+ */ +unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_bridge_info *nf_bridge; + + if (br_validate_ipv6(skb)) + return NF_DROP; + + nf_bridge_put(skb->nf_bridge); + if (!nf_bridge_alloc(skb)) + return NF_DROP; + if (!setup_pre_routing(skb)) + return NF_DROP; + + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; + + skb->protocol = htons(ETH_P_IPV6); + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb, + skb->dev, NULL, + br_nf_pre_routing_finish_ipv6); + + return NF_STOLEN; +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4fbcea0e7ecb..3da5525eb8a2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -22,6 +22,85 @@ #include "br_private.h" #include "br_private_stp.h" +static int br_get_num_vlan_infos(const struct net_port_vlans *pv, + u32 filter_mask) +{ + u16 vid_range_start = 0, vid_range_end = 0; + u16 vid_range_flags = 0; + u16 pvid, vid, flags; + int num_vlans = 0; + + if (filter_mask & RTEXT_FILTER_BRVLAN) + return pv->num_vlans; + + if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) + return 0; + + /* Count number of vlan info's + */ + pvid = br_get_pvid(pv); + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + flags = 0; + if (vid == pvid) + flags |= BRIDGE_VLAN_INFO_PVID; + + if (test_bit(vid, pv->untagged_bitmap)) + flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + if (vid_range_start == 0) { + goto initvars; + } else if ((vid - vid_range_end) == 1 && + flags == vid_range_flags) { + vid_range_end = vid; + continue; + } else { + if ((vid_range_end - vid_range_start) > 0) + num_vlans += 2; + else + num_vlans += 1; + } +initvars: + vid_range_start = vid; + vid_range_end = vid; + vid_range_flags = flags; + } + + if (vid_range_start != 0) { + if ((vid_range_end - vid_range_start) > 0) + num_vlans += 2; + else + num_vlans += 1; + } + + return num_vlans; +} + +static size_t 
br_get_link_af_size_filtered(const struct net_device *dev, + u32 filter_mask) +{ + struct net_port_vlans *pv; + int num_vlan_infos; + + rcu_read_lock(); + if (br_port_exists(dev)) + pv = nbp_get_vlan_info(br_port_get_rcu(dev)); + else if (dev->priv_flags & IFF_EBRIDGE) + pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev)); + else + pv = NULL; + if (pv) + num_vlan_infos = br_get_num_vlan_infos(pv, filter_mask); + else + num_vlan_infos = 0; + rcu_read_unlock(); + + if (!num_vlan_infos) + return 0; + + /* Each VLAN is returned in bridge_vlan_info along with flags */ + return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info)); +} + static inline size_t br_port_info_size(void) { return nla_total_size(1) /* IFLA_BRPORT_STATE */ @@ -36,7 +115,7 @@ static inline size_t br_port_info_size(void) + 0; } -static inline size_t br_nlmsg_size(void) +static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ @@ -45,7 +124,9 @@ static inline size_t br_nlmsg_size(void) + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(1) /* IFLA_OPERSTATE */ - + nla_total_size(br_port_info_size()); /* IFLA_PROTINFO */ + + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */ + + nla_total_size(br_get_link_af_size_filtered(dev, + filter_mask)); /* IFLA_AF_SPEC */ } static int br_port_fill_attrs(struct sk_buff *skb, @@ -62,7 +143,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || - nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP))) + nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || + nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, + !!(p->flags & BR_PROXYARP_WIFI))) return 
-EMSGSIZE; return 0; @@ -222,8 +305,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; if (event == RTM_NEWLINK && port) { @@ -280,6 +363,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) struct net *net; struct sk_buff *skb; int err = -ENOBUFS; + u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; if (!port) return; @@ -288,11 +372,11 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) br_debug(port->br, "port %u(%s) event %d\n", (unsigned int)port->port_no, port->dev->name, event); - skb = nlmsg_new(br_nlmsg_size(), GFP_ATOMIC); + skb = nlmsg_new(br_nlmsg_size(port->dev, filter), GFP_ATOMIC); if (skb == NULL) goto errout; - err = br_fill_ifinfo(skb, port, 0, 0, event, 0, 0, port->dev); + err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, port->dev); if (err < 0) { /* -EMSGSIZE implies BUG in br_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -310,7 +394,7 @@ errout: * Dump information about all ports, in response to GETLINK */ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u32 filter_mask) + struct net_device *dev, u32 filter_mask, int nlflags) { struct net_bridge_port *port = br_port_get_rtnl(dev); @@ -318,7 +402,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) return 0; - return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI, + return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, filter_mask, dev); } @@ -373,6 +457,8 @@ static int br_afspec(struct net_bridge *br, if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo = nla_data(attr); + if (!vinfo->vid || vinfo->vid >= 
VLAN_VID_MASK) + return -EINVAL; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (vinfo_start) return -EINVAL; @@ -471,6 +557,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); @@ -501,7 +588,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) struct nlattr *afspec; struct net_bridge_port *p; struct nlattr *tb[IFLA_BRPORT_MAX + 1]; - int err = 0, ret_offload = 0; + int err = 0; protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO); afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); @@ -543,16 +630,6 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) afspec, RTM_SETLINK); } - if (p && !(flags & BRIDGE_FLAGS_SELF)) { - /* set bridge attributes in hardware if supported - */ - ret_offload = netdev_switch_port_bridge_setlink(dev, nlh, - flags); - if (ret_offload && ret_offload != -EOPNOTSUPP) - br_warn(p->br, "error setting attrs on port %u(%s)\n", - (unsigned int)p->port_no, p->dev->name); - } - if (err == 0) br_ifinfo_notify(RTM_NEWLINK, p); out: @@ -564,7 +641,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *afspec; struct net_bridge_port *p; - int err = 0, ret_offload = 0; + int err = 0; afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (!afspec) @@ -583,16 +660,6 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) */ br_ifinfo_notify(RTM_NEWLINK, p); - if (p && !(flags & BRIDGE_FLAGS_SELF)) { - /* del bridge attributes in hardware - */ - ret_offload = netdev_switch_port_bridge_dellink(dev, nlh, - flags); - if 
(ret_offload && ret_offload != -EOPNOTSUPP) - br_warn(p->br, "error deleting attrs on port %u (%s)\n", - (unsigned int)p->port_no, p->dev->name); - } - return err; } static int br_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -626,9 +693,17 @@ static int br_port_slave_changelink(struct net_device *brdev, struct nlattr *tb[], struct nlattr *data[]) { + struct net_bridge *br = netdev_priv(brdev); + int ret; + if (!data) return 0; - return br_setport(br_port_get_rtnl(dev), data); + + spin_lock_bh(&br->lock); + ret = br_setport(br_port_get_rtnl(dev), data); + spin_unlock_bh(&br->lock); + + return ret; } static int br_port_fill_slave_info(struct sk_buff *skb, @@ -648,6 +723,9 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, + [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, + [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, + [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -677,6 +755,24 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } + if (data[IFLA_BR_AGEING_TIME]) { + u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]); + + br->ageing_time = clock_t_to_jiffies(ageing_time); + } + + if (data[IFLA_BR_STP_STATE]) { + u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); + + br_stp_set_enabled(br, stp_enabled); + } + + if (data[IFLA_BR_PRIORITY]) { + u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]); + + br_stp_set_bridge_priority(br, priority); + } + return 0; } @@ -685,6 +781,9 @@ static size_t br_get_size(const struct net_device *brdev) return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ + nla_total_size(sizeof(u32)) + /* 
IFLA_BR_STP_STATE */ + nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ 0; } @@ -694,10 +793,16 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 forward_delay = jiffies_to_clock_t(br->forward_delay); u32 hello_time = jiffies_to_clock_t(br->hello_time); u32 age_time = jiffies_to_clock_t(br->max_age); + u32 ageing_time = jiffies_to_clock_t(br->ageing_time); + u32 stp_enabled = br->stp_enabled; + u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || - nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || + nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || + nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || + nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) return -EMSGSIZE; return 0; diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 387cb3bd017c..20cbb727df4d 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -54,7 +54,6 @@ static unsigned int fake_mtu(const struct dst_entry *dst) static struct dst_ops fake_dst_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .update_pmtu = fake_update_pmtu, .redirect = fake_redirect, .cow_metrics = fake_cow_metrics, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index de0919975a25..8b21146b24a0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -18,6 +18,7 @@ #include <linux/netpoll.h> #include <linux/u64_stats_sync.h> #include <net/route.h> +#include <net/ip6_fib.h> #include <linux/if_vlan.h> #define BR_HASH_BITS 8 @@ -33,8 +34,8 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 -/* Don't allow forwarding control protocols like STP and LLDP */ -#define BR_GROUPFWD_RESTRICTED 0x4007u +/* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ +#define BR_GROUPFWD_RESTRICTED 0x0007u /* The 
Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u @@ -214,7 +215,10 @@ struct net_bridge spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct rtable fake_rtable; + union { + struct rtable fake_rtable; + struct rt6_info fake_rt6_info; + }; bool nf_call_iptables; bool nf_call_ip6tables; bool nf_call_arptables; @@ -304,7 +308,7 @@ struct br_input_skb_cb { int mrouters_only; #endif - u16 frag_max_size; + bool proxyarp_replied; #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; @@ -383,7 +387,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(unsigned long arg); void br_fdb_delete_by_port(struct net_bridge *br, - const struct net_bridge_port *p, int do_all); + const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, const unsigned char *addr, __u16 vid); int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); @@ -409,10 +413,10 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); -int br_dev_queue_push_xmit(struct sk_buff *skb); +int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0); -int br_forward_finish(struct sk_buff *skb); +int br_forward_finish(struct sock *sk, struct sk_buff *skb); void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast); void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, struct sk_buff *skb2, bool unicast); @@ -430,7 +434,7 @@ void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge *br); /* br_input.c */ -int 
br_handle_frame_finish(struct sk_buff *skb); +int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); rx_handler_result_t br_handle_frame(struct sk_buff **pskb); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) @@ -762,6 +766,11 @@ static inline int br_vlan_enabled(struct net_bridge *br) } #endif +struct nf_br_ops { + int (*br_dev_xmit_hook)(struct sk_buff *skb); +}; +extern const struct nf_br_ops __rcu *nf_br_ops; + /* br_netfilter.c */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_nf_core_init(void); @@ -822,7 +831,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, - u32 filter_mask); + u32 filter_mask, int nlflags); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index fb3ebe615513..ed74ffaa851f 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -39,10 +39,14 @@ void br_log_state(const struct net_bridge_port *p) void br_set_state(struct net_bridge_port *p, unsigned int state) { + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_STP_STATE, + .u.stp_state = state, + }; int err; p->state = state; - err = netdev_switch_port_stp_update(p->dev, state); + err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); @@ -205,8 +209,9 @@ void br_transmit_config(struct net_bridge_port *p) br_send_config_bpdu(p, &bpdu); p->topology_change_ack = 0; p->config_pending = 0; - mod_timer(&p->hold_timer, - round_jiffies(jiffies + BR_HOLD_TIME)); + if (p->br->stp_enabled == BR_KERNEL_STP) + mod_timer(&p->hold_timer, + round_jiffies(jiffies + BR_HOLD_TIME)); } } @@ -424,7 +429,6 @@ static void 
br_make_forwarding(struct net_bridge_port *p) else br_set_state(p, BR_STATE_LEARNING); - br_multicast_enable_port(p); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -458,6 +462,12 @@ void br_port_state_selection(struct net_bridge *br) } } + if (p->state != BR_STATE_BLOCKING) + br_multicast_enable_port(p); + /* Multicast is not disabled for the port when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. + */ if (p->state == BR_STATE_FORWARDING) ++liveports; } diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index bdb459d21ad8..534fc4cd263e 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -54,8 +54,9 @@ static void br_send_bpdu(struct net_bridge_port *p, skb_reset_mac_header(skb); - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, - dev_queue_xmit); + NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb, + NULL, skb->dev, + dev_queue_xmit_sk); } static inline void br_set_ticks(unsigned char *dest, int j) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 41146872c1b4..4ca449a16132 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -48,7 +48,8 @@ void br_stp_enable_bridge(struct net_bridge *br) struct net_bridge_port *p; spin_lock_bh(&br->lock); - mod_timer(&br->hello_timer, jiffies + br->hello_time); + if (br->stp_enabled == BR_KERNEL_STP) + mod_timer(&br->hello_timer, jiffies + br->hello_time); mod_timer(&br->gc_timer, jiffies + HZ/10); br_config_bpdu_generation(br); @@ -111,7 +112,7 @@ void br_stp_disable_port(struct net_bridge_port *p) del_timer(&p->forward_delay_timer); del_timer(&p->hold_timer); - br_fdb_delete_by_port(br, p, 0); + br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); @@ -127,6 +128,7 @@ static void br_stp_start(struct net_bridge *br) int r; char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; char *envp[] = { NULL }; + struct 
net_bridge_port *p; r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); @@ -140,6 +142,10 @@ static void br_stp_start(struct net_bridge *br) if (r == 0) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); + /* Stop hello and hold timers */ + del_timer(&br->hello_timer); + list_for_each_entry(p, &br->port_list, list) + del_timer(&p->hold_timer); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, "using kernel STP\n"); @@ -156,12 +162,17 @@ static void br_stp_stop(struct net_bridge *br) int r; char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL }; char *envp[] = { NULL }; + struct net_bridge_port *p; if (br->stp_enabled == BR_USER_STP) { r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); br_info(br, "userspace STP stopped, return code %d\n", r); /* To start timers on any ports left in blocking */ + mod_timer(&br->hello_timer, jiffies + br->hello_time); + list_for_each_entry(p, &br->port_list, list) + mod_timer(&p->hold_timer, + round_jiffies(jiffies + BR_HOLD_TIME)); spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); @@ -243,12 +254,13 @@ bool br_stp_recalculate_bridge_id(struct net_bridge *br) return true; } -/* called under bridge lock */ +/* Acquires and releases bridge lock */ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) { struct net_bridge_port *p; int wasroot; + spin_lock_bh(&br->lock); wasroot = br_is_root_bridge(br); list_for_each_entry(p, &br->port_list, list) { @@ -266,6 +278,7 @@ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); + spin_unlock_bh(&br->lock); } /* called under bridge lock */ diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 4fcaa67750fd..5f0f5af0ec35 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -40,7 +40,9 @@ static void br_hello_timer_expired(unsigned long arg) if 
(br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); - mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); + if (br->stp_enabled != BR_USER_STP) + mod_timer(&br->hello_timer, + round_jiffies(jiffies + br->hello_time)); } spin_unlock(&br->lock); } @@ -97,7 +99,9 @@ static void br_forward_delay_timer_expired(unsigned long arg) netif_carrier_on(br->dev); } br_log_state(p); + rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, p); + rcu_read_unlock(); spin_unlock(&br->lock); } diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 2de5d91199e8..efe415ad842a 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -160,7 +160,7 @@ static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { - br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry + br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); @@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); +BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -215,6 +216,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_fast_leave, #endif &brport_attr_proxyarp, + &brport_attr_proxyarp_wifi, NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 13013fe8db24..0d41f81838ff 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2,6 +2,7 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <net/switchdev.h> #include "br_private.h" @@ -36,6 +37,36 @@ static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) clear_bit(vid, v->untagged_bitmap); } +static int 
__vlan_vid_add(struct net_device *dev, struct net_bridge *br, + u16 vid, u16 flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + /* If driver uses VLAN ndo ops, use 8021q to install vid + * on device, otherwise try switchdev ops to install vid. + */ + + if (ops->ndo_vlan_rx_add_vid) { + err = vlan_vid_add(dev, br->vlan_proto, vid); + } else { + struct switchdev_obj vlan_obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .u.vlan = { + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }, + }; + + err = switchdev_port_obj_add(dev, &vlan_obj); + if (err == -EOPNOTSUPP) + err = 0; + } + + return err; +} + static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) { struct net_bridge_port *p = NULL; @@ -62,7 +93,7 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ - err = vlan_vid_add(dev, br->vlan_proto, vid); + err = __vlan_vid_add(dev, br, vid, flags); if (err) return err; } @@ -86,6 +117,30 @@ out_filt: return err; } +static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* If driver uses VLAN ndo ops, use 8021q to delete vid + * on device, otherwise try switchdev ops to delete vid. 
+ */ + + if (ops->ndo_vlan_rx_kill_vid) { + vlan_vid_del(dev, br->vlan_proto, vid); + } else { + struct switchdev_obj vlan_obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .u.vlan = { + .vid_begin = vid, + .vid_end = vid, + }, + }; + + switchdev_port_obj_del(dev, &vlan_obj); + } +} + static int __vlan_del(struct net_port_vlans *v, u16 vid) { if (!test_bit(vid, v->vlan_bitmap)) @@ -96,7 +151,7 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) if (v->port_idx) { struct net_bridge_port *p = v->parent.port; - vlan_vid_del(p->dev, p->br->vlan_proto, vid); + __vlan_vid_del(p->dev, p->br, vid); } clear_bit(vid, v->vlan_bitmap); @@ -686,6 +741,7 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) return -EINVAL; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); + br_fdb_delete_by_port(port->br, port, vid, 0); return __vlan_del(pv, vid); } diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 071d87214dde..0c40570069ba 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -164,8 +164,10 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) !(info->bitmask & EBT_STP_MASK)) return -EINVAL; /* Make sure the match only receives stp frames */ - if (!ether_addr_equal(e->destmac, bridge_ula) || - !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC)) + if (!par->nft_compat && + (!ether_addr_equal(e->destmac, bridge_ula) || + !ether_addr_equal(e->destmsk, msk) || + !(e->bitmask & EBT_DESTMAC))) return -EINVAL; return 0; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index ce205aabf9c5..8a3f63b2e807 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -58,20 +58,18 @@ static const struct ebt_table frame_filter = { static unsigned int ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct 
sk_buff *)) + const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, in, out, - dev_net(in)->xt.frame_filter); + return ebt_do_table(ops->hooknum, skb, state->in, state->out, + dev_net(state->in)->xt.frame_filter); } static unsigned int ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, in, out, - dev_net(out)->xt.frame_filter); + return ebt_do_table(ops->hooknum, skb, state->in, state->out, + dev_net(state->out)->xt.frame_filter); } static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index a0ac2984fb6c..c5ef5b1ab678 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -58,20 +58,18 @@ static struct ebt_table frame_nat = { static unsigned int ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, in, out, - dev_net(in)->xt.frame_nat); + return ebt_do_table(ops->hooknum, skb, state->in, state->out, + dev_net(state->in)->xt.frame_nat); } static unsigned int ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return ebt_do_table(ops->hooknum, skb, in, out, - dev_net(out)->xt.frame_nat); + return ebt_do_table(ops->hooknum, skb, state->in, state->out, + dev_net(state->out)->xt.frame_nat); } static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 91180a7fc943..18ca4b24c418 100644 --- a/net/bridge/netfilter/ebtables.c +++ 
b/net/bridge/netfilter/ebtables.c @@ -6,7 +6,7 @@ * * ebtables.c,v 2.0, July, 2002 * - * This code is stongly inspired on the iptables code which is + * This code is strongly inspired by the iptables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * * This program is free software; you can redistribute it and/or @@ -139,7 +139,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO)) + if (FWINV2(eth_proto_is_802_3(ethproto), EBT_IPROTO)) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && FWINV2(e->ethproto != ethproto, EBT_IPROTO)) diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 19473a9371b8..a343e62442b1 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -67,47 +67,43 @@ EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate); static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out) + const struct nf_hook_state *state) { if (nft_bridge_iphdr_validate(skb)) - nft_set_pktinfo_ipv4(pkt, ops, skb, in, out); + nft_set_pktinfo_ipv4(pkt, ops, skb, state); else - nft_set_pktinfo(pkt, ops, skb, in, out); + nft_set_pktinfo(pkt, ops, skb, state); } static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out) + const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) if (nft_bridge_ip6hdr_validate(skb) && - nft_set_pktinfo_ipv6(pkt, ops, skb, in, out) == 0) + nft_set_pktinfo_ipv6(pkt, ops, skb, state) == 0) return; #endif - nft_set_pktinfo(pkt, ops, skb, in, out); + nft_set_pktinfo(pkt, ops, skb, state); } 
static unsigned int nft_do_chain_bridge(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_bridge_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + nft_bridge_set_pktinfo_ipv4(&pkt, ops, skb, state); break; case htons(ETH_P_IPV6): - nft_bridge_set_pktinfo_ipv6(&pkt, ops, skb, in, out); + nft_bridge_set_pktinfo_ipv6(&pkt, ops, skb, state); break; default: - nft_set_pktinfo(&pkt, ops, skb, in, out); + nft_set_pktinfo(&pkt, ops, skb, state); break; } diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 4f02109d708f..a21269b83f16 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -19,12 +19,12 @@ #include "../br_private.h" static void nft_meta_bridge_get_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct net_device *in = pkt->in, *out = pkt->out; - struct nft_data *dest = &data[priv->dreg]; + u32 *dest = &regs->data[priv->dreg]; const struct net_bridge_port *p; switch (priv->key) { @@ -40,12 +40,12 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, goto out; } - strncpy((char *)dest->data, p->br->dev->name, sizeof(dest->data)); + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); return; out: - return nft_meta_get_eval(expr, data, pkt); + return nft_meta_get_eval(expr, regs, pkt); err: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, @@ -53,27 +53,21 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); - int err; + 
unsigned int len; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_BRI_IIFNAME: case NFT_META_BRI_OIFNAME: + len = IFNAMSIZ; break; default: return nft_meta_get_init(ctx, expr, tb); } - priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); - if (err < 0) - return err; - - return 0; + priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); } static struct nft_expr_type nft_meta_bridge_type; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 3244aead0926..858d848564ee 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -21,6 +21,7 @@ #include <net/ip.h> #include <net/ip6_checksum.h> #include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv6.h> #include "../br_private.h" static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, @@ -36,7 +37,12 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, skb_pull(nskb, ETH_HLEN); } -static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, int hook) +/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) + * or the bridge port (NF_BRIDGE PREROUTING). 
+ */ +static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, + const struct net_device *dev, + int hook) { struct sk_buff *nskb; struct iphdr *niph; @@ -65,11 +71,12 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, int hook) nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(oldskb->dev), nskb); + br_deliver(br_port_get_rcu(dev), nskb); } -static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, int hook, - u8 code) +static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) { struct sk_buff *nskb; struct iphdr *niph; @@ -77,8 +84,9 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, int hook, unsigned int len; void *payload; __wsum csum; + u8 proto; - if (!nft_bridge_iphdr_validate(oldskb)) + if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb)) return; /* IP header checks: fragment. */ @@ -91,7 +99,17 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, int hook, if (!pskb_may_pull(oldskb, len)) return; - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), 0)) + if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len))) + return; + + if (ip_hdr(oldskb)->protocol == IPPROTO_TCP || + ip_hdr(oldskb)->protocol == IPPROTO_UDP) + proto = ip_hdr(oldskb)->protocol; + else + proto = 0; + + if (!skb_csum_unnecessary(oldskb) && + nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto)) return; nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) + @@ -120,11 +138,13 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, int hook, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(oldskb->dev), nskb); + br_deliver(br_port_get_rcu(dev), nskb); } static void nft_reject_br_send_v6_tcp_reset(struct net *net, - struct sk_buff *oldskb, int hook) + struct sk_buff *oldskb, + const struct net_device *dev, + int hook) { struct sk_buff *nskb; const struct tcphdr *oth; @@ -152,12 
+172,37 @@ static void nft_reject_br_send_v6_tcp_reset(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(oldskb->dev), nskb); + br_deliver(br_port_get_rcu(dev), nskb); +} + +static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int thoff; + __be16 fo; + u8 proto = ip6h->nexthdr; + + if (skb->csum_bad) + return false; + + if (skb_csum_unnecessary(skb)) + return true; + + if (ip6h->payload_len && + pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) + return false; + + thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); + if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) + return false; + + return nf_ip6_checksum(skb, hook, thoff, proto) == 0; } static void nft_reject_br_send_v6_unreach(struct net *net, - struct sk_buff *oldskb, int hook, - u8 code) + struct sk_buff *oldskb, + const struct net_device *dev, + int hook, u8 code) { struct sk_buff *nskb; struct ipv6hdr *nip6h; @@ -176,6 +221,9 @@ static void nft_reject_br_send_v6_unreach(struct net *net, if (!pskb_may_pull(oldskb, len)) return; + if (!reject6_br_csum_ok(oldskb, hook)) + return; + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmp6hdr) + LL_MAX_HEADER + len, GFP_ATOMIC); if (!nskb) @@ -205,12 +253,12 @@ static void nft_reject_br_send_v6_unreach(struct net *net, nft_reject_br_push_etherhdr(oldskb, nskb); - br_deliver(br_port_get_rcu(oldskb->dev), nskb); + br_deliver(br_port_get_rcu(dev), nskb); } static void nft_reject_bridge_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); struct net *net = dev_net((pkt->in != NULL) ? 
pkt->in : pkt->out); @@ -224,16 +272,16 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, case htons(ETH_P_IP): switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nft_reject_br_send_v4_unreach(pkt->skb, + nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, pkt->ops->hooknum, priv->icmp_code); break; case NFT_REJECT_TCP_RST: - nft_reject_br_send_v4_tcp_reset(pkt->skb, + nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in, pkt->ops->hooknum); break; case NFT_REJECT_ICMPX_UNREACH: - nft_reject_br_send_v4_unreach(pkt->skb, + nft_reject_br_send_v4_unreach(pkt->skb, pkt->in, pkt->ops->hooknum, nft_reject_icmp_code(priv->icmp_code)); break; @@ -242,16 +290,16 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, case htons(ETH_P_IPV6): switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nft_reject_br_send_v6_unreach(net, pkt->skb, + nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in, pkt->ops->hooknum, priv->icmp_code); break; case NFT_REJECT_TCP_RST: - nft_reject_br_send_v6_tcp_reset(net, pkt->skb, + nft_reject_br_send_v6_tcp_reset(net, pkt->skb, pkt->in, pkt->ops->hooknum); break; case NFT_REJECT_ICMPX_UNREACH: - nft_reject_br_send_v6_unreach(net, pkt->skb, + nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in, pkt->ops->hooknum, nft_reject_icmpv6_code(priv->icmp_code)); break; @@ -262,7 +310,7 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr, break; } out: - data[NFT_REG_VERDICT].verdict = NF_DROP; + regs->verdict.code = NF_DROP; } static int nft_reject_bridge_validate(const struct nft_ctx *ctx, @@ -323,6 +371,8 @@ static int nft_reject_bridge_dump(struct sk_buff *skb, if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a6e2da0bc718..cc858919108e 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -121,12 +121,13 @@ static void caif_flow_ctrl(struct sock *sk, int mode) 
* Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are * not dropped, but CAIF is sending flow off instead. */ -static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); + bool queued = false; if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { @@ -139,7 +140,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) err = sk_filter(sk, skb); if (err) - return err; + goto out; + if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); @@ -147,21 +149,16 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. - */ spin_lock_irqsave(&list->lock, flags); - if (!sock_flag(sk, SOCK_DEAD)) + queued = !sock_flag(sk, SOCK_DEAD); + if (queued) __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); - - if (!sock_flag(sk, SOCK_DEAD)) +out: + if (queued) sk->sk_data_ready(sk); else kfree_skb(skb); - return 0; } /* Packet Receive Callback function called from CAIF Stack */ @@ -271,8 +268,8 @@ static void caif_check_flow_release(struct sock *sk) * Copied from unix_dgram_recvmsg, but removed credit checks, * changed locking, address handling and added MSG_TRUNC. 
*/ -static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t len, int flags) +static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m, + size_t len, int flags) { struct sock *sk = sock->sk; @@ -330,6 +327,10 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); + + if (sock_flag(sk, SOCK_DEAD)) + break; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } @@ -343,9 +344,8 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) * Copied from unix_stream_recvmsg, but removed credit checks, * changed locking calls, changed address handling. */ -static int caif_stream_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; int copied = 0; @@ -374,6 +374,10 @@ static int caif_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; lock_sock(sk); + if (sock_flag(sk, SOCK_DEAD)) { + err = -ECONNRESET; + goto unlock; + } skb = skb_dequeue(&sk->sk_receive_queue); caif_check_flow_release(sk); @@ -511,8 +515,8 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, } /* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */ -static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -586,8 +590,8 @@ err: * Changed removed permission handling and added waiting for flow on * and other minor adaptations. 
*/ -static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); @@ -1048,7 +1052,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ - sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot); + sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; diff --git a/net/can/af_can.c b/net/can/af_can.c index 32d710eaf1fc..166d436196c1 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -89,6 +89,8 @@ struct timer_list can_stattimer; /* timer for statistics update */ struct s_stats can_stats; /* packet statistics */ struct s_pstats can_pstats; /* receive list statistics */ +static atomic_t skbcounter = ATOMIC_INIT(0); + /* * af_can socket functions */ @@ -179,7 +181,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, sock->ops = cp->ops; - sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; @@ -679,6 +681,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) can_stats.rx_frames++; can_stats.rx_frames_delta++; + /* create non-zero unique skb identifier together with *skb */ + while (!(can_skb_prv(skb)->skbcnt)) + can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); + rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ diff --git a/net/can/bcm.c b/net/can/bcm.c index ee9ffd956552..a1ba6875c2a2 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -261,6 +261,7 @@ static void bcm_can_tx(struct bcm_op *op) can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; 
memcpy(skb_put(skb, CFSIZ), cf, CFSIZ); @@ -328,7 +329,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, * containing the interface index. */ - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; @@ -1217,6 +1218,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) } can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ @@ -1231,8 +1233,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) /* * bcm_sendmsg - process BCM commands (opcodes) from the userspace */ -static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size) +static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); @@ -1535,8 +1536,8 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, return 0; } -static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; diff --git a/net/can/gw.c b/net/can/gw.c index a6f448e18ea8..455168718c2e 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -110,6 +110,7 @@ struct cf_mod { void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; + u32 uid; }; @@ -548,6 +549,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } + if (gwj->mod.uid) { + if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) + goto cancel; + } + if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, 
CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) @@ -619,6 +625,7 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, + [CGW_MOD_UID] = { .type = NLA_U32 }, }; /* check for common and gwtype specific attributes */ @@ -761,6 +768,10 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, else mod->csumfunc.xor = cgw_csum_xor_neg; } + + if (tb[CGW_MOD_UID]) { + nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); + } } if (gwtype == CGW_TYPE_CAN_CAN) { @@ -802,6 +813,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtcanmsg *r; struct cgw_job *gwj; + struct cf_mod mod; + struct can_can_gw ccgw; u8 limhops = 0; int err = 0; @@ -819,6 +832,36 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; + err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); + if (err < 0) + return err; + + if (mod.uid) { + + ASSERT_RTNL(); + + /* check for updating an existing job with identical uid */ + hlist_for_each_entry(gwj, &cgw_list, list) { + + if (gwj->mod.uid != mod.uid) + continue; + + /* interfaces & filters must be identical */ + if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) + return -EINVAL; + + /* update modifications with disabled softirq & quit */ + local_bh_disable(); + memcpy(&gwj->mod, &mod, sizeof(mod)); + local_bh_enable(); + return 0; + } + } + + /* ifindex == 0 is not allowed for job creation */ + if (!ccgw.src_idx || !ccgw.dst_idx) + return -ENODEV; + gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; @@ -828,18 +871,14 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; + gwj->limit_hops = limhops; - err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw, - &limhops); - if (err < 0) - goto 
out; + /* insert already parsed information */ + memcpy(&gwj->mod, &mod, sizeof(mod)); + memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; - /* ifindex == 0 is not allowed for job creation */ - if (!gwj->ccgw.src_idx || !gwj->ccgw.dst_idx) - goto out; - gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx); if (!gwj->src.dev) @@ -856,8 +895,6 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->dst.dev->type != ARPHRD_CAN) goto out; - gwj->limit_hops = limhops; - ASSERT_RTNL(); err = cgw_register_filter(gwj); @@ -931,8 +968,15 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->limit_hops != limhops) continue; - if (memcmp(&gwj->mod, &mod, sizeof(mod))) - continue; + /* we have a match when uid is enabled and identical */ + if (gwj->mod.uid || mod.uid) { + if (gwj->mod.uid != mod.uid) + continue; + } else { + /* no uid => check for identical modifications */ + if (memcmp(&gwj->mod, &mod, sizeof(mod))) + continue; + } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) diff --git a/net/can/raw.c b/net/can/raw.c index 00c13ef23661..2e67b1423cd3 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -74,6 +74,12 @@ MODULE_ALIAS("can-proto-1"); * storing the single filter in dfilter, to avoid using dynamic memory. 
*/ +struct uniqframe { + int skbcnt; + const struct sk_buff *skb; + unsigned int join_rx_count; +}; + struct raw_sock { struct sock sk; int bound; @@ -82,10 +88,12 @@ struct raw_sock { int loopback; int recv_own_msgs; int fd_frames; + int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ can_err_mask_t err_mask; + struct uniqframe __percpu *uniq; }; /* @@ -95,8 +103,8 @@ struct raw_sock { */ static inline unsigned int *raw_flags(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) + - sizeof(unsigned int))); + sock_skb_cb_check_size(sizeof(struct sockaddr_can) + + sizeof(unsigned int)); /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); @@ -123,6 +131,26 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (!ro->fd_frames && oskb->len != CAN_MTU) return; + /* eliminate multiple filter matches for the same skb */ + if (this_cpu_ptr(ro->uniq)->skb == oskb && + this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { + if (ro->join_filters) { + this_cpu_inc(ro->uniq->join_rx_count); + /* drop frame until all enabled filters matched */ + if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) + return; + } else { + return; + } + } else { + this_cpu_ptr(ro->uniq)->skb = oskb; + this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; + this_cpu_ptr(ro->uniq)->join_rx_count = 1; + /* drop first frame to check all enabled filters? */ + if (ro->join_filters && ro->count > 1) + return; + } + /* clone the given skb to be able to enqueue it into the rcv queue */ skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) @@ -135,7 +163,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) * containing the interface index. 
*/ - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; @@ -296,6 +324,12 @@ static int raw_init(struct sock *sk) ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; + ro->join_filters = 0; + + /* alloc_percpu provides zero'ed memory */ + ro->uniq = alloc_percpu(struct uniqframe); + if (unlikely(!ro->uniq)) + return -ENOMEM; /* set notifier */ ro->notifier.notifier_call = raw_notifier; @@ -339,6 +373,7 @@ static int raw_release(struct socket *sock) ro->ifindex = 0; ro->bound = 0; ro->count = 0; + free_percpu(ro->uniq); sock_orphan(sk); sock->sk = NULL; @@ -583,6 +618,15 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; + case CAN_RAW_JOIN_FILTERS: + if (optlen != sizeof(ro->join_filters)) + return -EINVAL; + + if (copy_from_user(&ro->join_filters, optval, optlen)) + return -EFAULT; + + break; + default: return -ENOPROTOOPT; } @@ -647,6 +691,12 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->fd_frames; break; + case CAN_RAW_JOIN_FILTERS: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->join_filters; + break; + default: return -ENOPROTOOPT; } @@ -658,8 +708,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } -static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size) +static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); @@ -700,6 +749,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) @@ -728,8 +778,8 @@ send_failed: return err; } -static int raw_recvmsg(struct kiocb *iocb, struct 
socket *sock, - struct msghdr *msg, size_t size, int flags) +static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ec565508e904..f30329f72641 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -9,6 +9,7 @@ #include <keys/ceph-type.h> #include <linux/module.h> #include <linux/mount.h> +#include <linux/nsproxy.h> #include <linux/parser.h> #include <linux/sched.h> #include <linux/seq_file.h> @@ -16,8 +17,6 @@ #include <linux/statfs.h> #include <linux/string.h> #include <linux/vmalloc.h> -#include <linux/nsproxy.h> -#include <net/net_namespace.h> #include <linux/ceph/ceph_features.h> @@ -131,6 +130,13 @@ int ceph_compare_options(struct ceph_options *new_opt, int i; int ret; + /* + * Don't bother comparing options if network namespaces don't + * match. + */ + if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net))) + return -1; + ret = memcmp(opt1, opt2, ofs); if (ret) return ret; @@ -335,9 +341,6 @@ ceph_parse_options(char *options, const char *dev_name, int err = -ENOMEM; substring_t argstr[MAX_OPT_ARGS]; - if (current->nsproxy->net_ns != &init_net) - return ERR_PTR(-EINVAL); - opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) return ERR_PTR(-ENOMEM); @@ -352,8 +355,8 @@ ceph_parse_options(char *options, const char *dev_name, /* start with defaults */ opt->flags = CEPH_OPT_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ - opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] 
*/ @@ -439,13 +442,32 @@ ceph_parse_options(char *options, const char *dev_name, pr_warn("ignoring deprecated osdtimeout option\n"); break; case Opt_osdkeepalivetimeout: - opt->osd_keepalive_timeout = intval; + /* 0 isn't well defined right now, reject it */ + if (intval < 1 || intval > INT_MAX / 1000) { + pr_err("osdkeepalive out of range\n"); + err = -EINVAL; + goto out; + } + opt->osd_keepalive_timeout = + msecs_to_jiffies(intval * 1000); break; case Opt_osd_idle_ttl: - opt->osd_idle_ttl = intval; + /* 0 isn't well defined right now, reject it */ + if (intval < 1 || intval > INT_MAX / 1000) { + pr_err("osd_idle_ttl out of range\n"); + err = -EINVAL; + goto out; + } + opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000); break; case Opt_mount_timeout: - opt->mount_timeout = intval; + /* 0 is "wait forever" (i.e. infinite timeout) */ + if (intval < 0 || intval > INT_MAX / 1000) { + pr_err("mount_timeout out of range\n"); + err = -EINVAL; + goto out; + } + opt->mount_timeout = msecs_to_jiffies(intval * 1000); break; case Opt_share: @@ -490,6 +512,45 @@ out: } EXPORT_SYMBOL(ceph_parse_options); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) +{ + struct ceph_options *opt = client->options; + size_t pos = m->count; + + if (opt->name) + seq_printf(m, "name=%s,", opt->name); + if (opt->key) + seq_puts(m, "secret=<hidden>,"); + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, "fsid=%pU,", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, "noshare,"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, "nocrc,"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, "nocephx_require_signatures,"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, "notcp_nodelay,"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, "mount_timeout=%d,", + jiffies_to_msecs(opt->mount_timeout) / 1000); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, "osd_idle_ttl=%d,", + 
jiffies_to_msecs(opt->osd_idle_ttl) / 1000); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, "osdkeepalivetimeout=%d,", + jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000); + + /* drop redundant comma */ + if (m->count != pos) + m->count--; + + return 0; +} +EXPORT_SYMBOL(ceph_print_client_options); + u64 ceph_client_id(struct ceph_client *client) { return client->monc.auth->global_id; @@ -550,6 +611,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, fail_monc: ceph_monc_stop(&client->monc); fail: + ceph_messenger_fini(&client->msgr); kfree(client); return ERR_PTR(err); } @@ -563,8 +625,8 @@ void ceph_destroy_client(struct ceph_client *client) /* unmount */ ceph_osdc_stop(&client->osdc); - ceph_monc_stop(&client->monc); + ceph_messenger_fini(&client->msgr); ceph_debugfs_client_cleanup(client); @@ -589,8 +651,8 @@ static int have_mon_and_osd_map(struct ceph_client *client) */ int __ceph_open_session(struct ceph_client *client, unsigned long started) { - int err; - unsigned long timeout = client->options->mount_timeout * HZ; + unsigned long timeout = client->options->mount_timeout; + long err; /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); @@ -598,16 +660,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) return err; while (!have_mon_and_osd_map(client)) { - err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) - return err; + return -ETIMEDOUT; /* wait */ dout("mount waiting for mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, have_mon_and_osd_map(client) || (client->auth_err < 0), - timeout); - if (err == -EINTR || err == -ERESTARTSYS) + ceph_timeout_jiffies(timeout)); + if (err < 0) return err; if (client->auth_err < 0) return client->auth_err; @@ -684,5 +745,5 @@ module_exit(exit_ceph_lib); MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh 
<yehuda@hq.newdream.net>"); MODULE_AUTHOR("Patience Warnick <patience@newdream.net>"); -MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_DESCRIPTION("Ceph core library"); MODULE_LICENSE("GPL"); diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 16bc199d9a62..80d7c3a97cb8 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -1,15 +1,11 @@ - #ifdef __KERNEL__ # include <linux/slab.h> +# include <linux/crush/crush.h> #else -# include <stdlib.h> -# include <assert.h> -# define kfree(x) do { if (x) free(x); } while (0) -# define BUG_ON(x) assert(!(x)) +# include "crush_compat.h" +# include "crush.h" #endif -#include <linux/crush/crush.h> - const char *crush_bucket_alg_name(int alg) { switch (alg) { @@ -17,6 +13,7 @@ const char *crush_bucket_alg_name(int alg) case CRUSH_BUCKET_LIST: return "list"; case CRUSH_BUCKET_TREE: return "tree"; case CRUSH_BUCKET_STRAW: return "straw"; + case CRUSH_BUCKET_STRAW2: return "straw2"; default: return "unknown"; } } @@ -40,6 +37,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; case CRUSH_BUCKET_STRAW: return ((struct crush_bucket_straw *)b)->item_weights[p]; + case CRUSH_BUCKET_STRAW2: + return ((struct crush_bucket_straw2 *)b)->item_weights[p]; } return 0; } @@ -77,6 +76,14 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b); } +void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) +{ + kfree(b->item_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + void crush_destroy_bucket(struct crush_bucket *b) { switch (b->alg) { @@ -92,6 +99,9 @@ void crush_destroy_bucket(struct crush_bucket *b) case CRUSH_BUCKET_STRAW: crush_destroy_bucket_straw((struct crush_bucket_straw *)b); break; + case CRUSH_BUCKET_STRAW2: + crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); + break; } } @@ -120,6 +130,9 @@ void crush_destroy(struct crush_map *map) 
kfree(map->rules); } +#ifndef __KERNEL__ + kfree(map->choose_tries); +#endif kfree(map); } diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h new file mode 100644 index 000000000000..aae534c901a4 --- /dev/null +++ b/net/ceph/crush/crush_ln_table.h @@ -0,0 +1,164 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_CRUSH_LN_H +#define CEPH_CRUSH_LN_H + +#ifdef __KERNEL__ +# include <linux/types.h> +#else +# include "crush_compat.h" +#endif + +/* + * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) + * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) + */ +static __s64 __RH_LH_tbl[128*2+2] = { + 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, + 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, + 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, + 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll, + 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll, + 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll, + 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll, + 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell, + 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll, + 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll, + 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll, + 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll, + 
0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll, + 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll, + 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all, + 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll, + 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all, + 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell, + 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll, + 0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll, + 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll, + 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll, + 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll, + 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll, + 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll, + 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll, + 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell, + 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll, + 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll, + 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll, + 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll, + 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll, + 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll, + 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll, + 
0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll, + 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll, + 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll, + 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll, + 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll, + 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll, + 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll, + 0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll, + 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll, + 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll, + 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll, + 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll, + 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll, + 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll, + 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll, + 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll, + 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll, + 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll, + 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll, + 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell, + 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell, + 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll, + 
0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell, + 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll, + 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll, + 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll, + 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll, + 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll, + 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, + 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, + 0x0000800000000000ll, 0x0000ffff00000000ll, +}; + +/* + * LL_tbl[k] = 2^48*log2(1.0+k/2^15) + */ +static __s64 __LL_tbl[256] = { + 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, + 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, + 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, + 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull, + 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull, + 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull, + 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull, + 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull, + 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull, + 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull, + 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull, + 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull, + 
0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull, + 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull, + 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull, + 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull, + 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull, + 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull, + 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull, + 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull, + 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull, + 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull, + 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull, + 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull, + 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull, + 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull, + 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull, + 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull, + 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull, + 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull, + 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull, + 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull, + 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull, + 0x0000017d62ad97e2ull, 
0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull, + 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull, + 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull, + 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull, + 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull, + 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull, + 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull, + 0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull, + 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull, + 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull, + 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull, + 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull, + 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull, + 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull, + 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull, + 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull, + 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull, + 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull, + 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull, + 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull, + 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull, + 0x0000026d32f798a9ull, 0x00000271583758ebull, 
0x000002743601673bull, 0x0000027713c5c3b0ull, + 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull, + 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull, + 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull, + 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull, + 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull, + 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull, + 0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull, + 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull, + 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull, +}; + +#endif diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c index 5bb63e37a8a1..ed123af49eba 100644 --- a/net/ceph/crush/hash.c +++ b/net/ceph/crush/hash.c @@ -1,6 +1,8 @@ - -#include <linux/types.h> -#include <linux/crush/hash.h> +#ifdef __KERNEL__ +# include <linux/crush/hash.h> +#else +# include "hash.h" +#endif /* * Robert Jenkins' function for mixing 32-bit values diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a1ef53c04415..393bfb22d5bb 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -1,26 +1,30 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ #ifdef __KERNEL__ # include <linux/string.h> # include <linux/slab.h> # include <linux/bug.h> # include <linux/kernel.h> -# ifndef dprintk -# define dprintk(args...) 
-# endif +# include <linux/crush/crush.h> +# include <linux/crush/hash.h> #else -# include <string.h> -# include <stdio.h> -# include <stdlib.h> -# include <assert.h> -# define BUG_ON(x) assert(!(x)) -# define dprintk(args...) /* printf(args) */ -# define kmalloc(x, f) malloc(x) -# define kfree(x) free(x) +# include "crush_compat.h" +# include "crush.h" +# include "hash.h" #endif +#include "crush_ln_table.h" -#include <linux/crush/crush.h> -#include <linux/crush/hash.h> -#include <linux/crush/mapper.h> +#define dprintk(args...) /* printf(args) */ /* * Implement the core CRUSH mapping algorithm. @@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, int i; for (i = bucket->h.size-1; i >= 0; i--) { - __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], + __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i], r, bucket->h.id); w &= 0xffff; dprintk("list_choose i=%d x=%d r=%d item %d weight %x " @@ -238,6 +242,105 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, return bucket->h.items[high]; } +/* compute 2^44*log2(input+1) */ +static __u64 crush_ln(unsigned int xin) +{ + unsigned int x = xin, x1; + int iexpon, index1, index2; + __u64 RH, LH, LL, xl64, result; + + x++; + + /* normalize input */ + iexpon = 15; + while (!(x & 0x18000)) { + x <<= 1; + iexpon--; + } + + index1 = (x >> 8) << 1; + /* RH ~ 2^56/index1 */ + RH = __RH_LH_tbl[index1 - 256]; + /* LH ~ 2^48 * log2(index1/256) */ + LH = __RH_LH_tbl[index1 + 1 - 256]; + + /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */ + xl64 = (__s64)x * RH; + xl64 >>= 48; + x1 = xl64; + + result = iexpon; + result <<= (12 + 32); + + index2 = x1 & 0xff; + /* LL ~ 2^48*log2(1.0+index2/2^15) */ + LL = __LL_tbl[index2]; + + LH = LH + LL; + + LH >>= (48 - 12 - 32); + result += LH; + + return result; +} + + +/* + * straw2 + * + * for reference, see: + * + * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + * + */ 
+ +static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, + int x, int r) +{ + unsigned int i, high = 0; + unsigned int u; + unsigned int w; + __s64 ln, draw, high_draw = 0; + + for (i = 0; i < bucket->h.size; i++) { + w = bucket->item_weights[i]; + if (w) { + u = crush_hash32_3(bucket->h.hash, x, + bucket->h.items[i], r); + u &= 0xffff; + + /* + * for some reason slightly less than 0x10000 produces + * a slightly more accurate distribution... probably a + * rounding effect. + * + * the natural log lookup table maps [0,0xffff] + * (corresponding to real numbers [1/0x10000, 1] to + * [0, 0xffffffffffff] (corresponding to real numbers + * [-11.090355,0]). + */ + ln = crush_ln(u) - 0x1000000000000ll; + + /* + * divide by 16.16 fixed-point weight. note + * that the ln value is negative, so a larger + * weight means a larger (less negative) value + * for draw. + */ + draw = div64_s64(ln, w); + } else { + draw = S64_MIN; + } + + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + return bucket->h.items[high]; +} + + static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); @@ -255,12 +358,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) case CRUSH_BUCKET_STRAW: return bucket_straw_choose((struct crush_bucket_straw *)in, x, r); + case CRUSH_BUCKET_STRAW2: + return bucket_straw2_choose((struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } + /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -290,6 +397,7 @@ static int is_out(const struct crush_map *map, * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector + * @out_size: size of the out vector * @tries: number of attempts to make * @recurse_tries: number of attempts to have recursive chooseleaf make * @local_retries: 
localized retries @@ -304,6 +412,7 @@ static int crush_choose_firstn(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, + int out_size, unsigned int tries, unsigned int recurse_tries, unsigned int local_retries, @@ -322,6 +431,7 @@ static int crush_choose_firstn(const struct crush_map *map, int item = 0; int itemtype; int collide, reject; + int count = out_size; dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", recurse_to_leaf ? "_LEAF" : "", @@ -329,7 +439,7 @@ static int crush_choose_firstn(const struct crush_map *map, tries, recurse_tries, local_retries, local_fallback_retries, parent_r); - for (rep = outpos; rep < numrep; rep++) { + for (rep = outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -403,7 +513,7 @@ static int crush_choose_firstn(const struct crush_map *map, map->buckets[-1-item], weight, weight_max, x, outpos+1, 0, - out2, outpos, + out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, @@ -463,6 +573,11 @@ reject: dprintk("CHOOSE got %d\n", item); out[outpos] = item; outpos++; + count--; +#ifndef __KERNEL__ + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; +#endif } dprintk("CHOOSE returns %d\n", outpos); @@ -506,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map, } for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { +#ifdef DEBUG_INDEP + if (out2 && ftotal) { + dprintk("%u %d a: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out[rep]); + } + dprintk("\n"); + dprintk("%u %d b: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out2[rep]); + } + dprintk("\n"); + } +#endif for (rep = outpos; rep < endpos; rep++) { if (out[rep] != CRUSH_ITEM_UNDEF) continue; @@ -622,6 
+751,24 @@ static void crush_choose_indep(const struct crush_map *map, out2[rep] = CRUSH_ITEM_NONE; } } +#ifndef __KERNEL__ + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; +#endif +#ifdef DEBUG_INDEP + if (out2) { + dprintk("%u %d a: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out[rep]); + } + dprintk("\n"); + dprintk("%u %d b: ", ftotal, left); + for (rep = outpos; rep < endpos; rep++) { + dprintk(" %d", out2[rep]); + } + dprintk("\n"); + } +#endif } /** @@ -654,6 +801,7 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; + int out_size; /* * the original choose_total_tries value was off by one (it * counted "retries" and not "tries"). add one. @@ -685,8 +833,15 @@ int crush_do_rule(const struct crush_map *map, switch (curstep->op) { case CRUSH_RULE_TAKE: - w[0] = curstep->arg1; - wsize = 1; + if ((curstep->arg1 >= 0 && + curstep->arg1 < map->max_devices) || + (-1-curstep->arg1 < map->max_buckets && + map->buckets[-1-curstep->arg1])) { + w[0] = curstep->arg1; + wsize = 1; + } else { + dprintk(" bad take value %d\n", curstep->arg1); + } break; case CRUSH_RULE_SET_CHOOSE_TRIES: @@ -761,6 +916,7 @@ int crush_do_rule(const struct crush_map *map, x, numrep, curstep->arg2, o+osize, j, + result_max-osize, choose_tries, recurse_tries, choose_local_retries, @@ -770,11 +926,13 @@ int crush_do_rule(const struct crush_map *map, c+osize, 0); } else { + out_size = ((numrep < (result_max-osize)) ? 
+ numrep : (result_max-osize)); crush_choose_indep( map, map->buckets[-1-w[i]], weight, weight_max, - x, numrep, numrep, + x, out_size, numrep, curstep->arg2, o+osize, j, choose_tries, @@ -783,7 +941,7 @@ int crush_do_rule(const struct crush_map *map, recurse_to_leaf, c+osize, 0); - osize += numrep; + osize += out_size; } } @@ -815,5 +973,3 @@ int crush_do_rule(const struct crush_map *map, } return result_len; } - - diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 14d9995097cc..593dc2eabcc8 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -22,6 +22,7 @@ * .../monmap - current monmap * .../osdc - active osd requests * .../monc - mon client state + * .../client_options - libceph-only (i.e. not rbd or cephfs) options * .../dentry_lru - dump contents of dentry lru * .../caps - expose cap (reservation) stats * .../bdi - symlink to ../../bdi/something @@ -177,10 +178,24 @@ static int osdc_show(struct seq_file *s, void *pp) return 0; } +static int client_options_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + int ret; + + ret = ceph_print_client_options(s, client); + if (ret) + return ret; + + seq_putc(s, '\n'); + return 0; +} + CEPH_DEFINE_SHOW_FUNC(monmap_show) CEPH_DEFINE_SHOW_FUNC(osdmap_show) CEPH_DEFINE_SHOW_FUNC(monc_show) CEPH_DEFINE_SHOW_FUNC(osdc_show) +CEPH_DEFINE_SHOW_FUNC(client_options_show) int ceph_debugfs_init(void) { @@ -242,6 +257,14 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_osdmap) goto out; + client->debugfs_options = debugfs_create_file("client_options", + 0600, + client->debugfs_dir, + client, + &client_options_show_fops); + if (!client->debugfs_options) + goto out; + return 0; out: @@ -252,6 +275,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { dout("ceph_debugfs_client_cleanup %p\n", client); + debugfs_remove(client->debugfs_options); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); 
debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6b3f54ed65ba..e3be1d22a247 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -6,6 +6,7 @@ #include <linux/inet.h> #include <linux/kthread.h> #include <linux/net.h> +#include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> @@ -278,7 +279,6 @@ static void _ceph_msgr_exit(void) ceph_msgr_slab_exit(); BUG_ON(zero_page == NULL); - kunmap(zero_page); page_cache_release(zero_page); zero_page = NULL; } @@ -480,11 +480,11 @@ static int ceph_tcp_connect(struct ceph_connection *con) int ret; BUG_ON(con->sock); - ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret) return ret; - sock->sk->sk_allocation = GFP_NOFS | __GFP_MEMALLOC; + sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); @@ -505,8 +505,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); - con->error_msg = "connect error"; - return ret; } @@ -520,8 +518,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) ret); } - sk_set_memalloc(sock->sk); - con->sock = sock; return 0; } @@ -1549,7 +1545,7 @@ static int write_partial_message_data(struct ceph_connection *con) page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, - length, last_piece); + length, !last_piece); if (ret <= 0) { if (do_datacrc) msg->footer.data_crc = cpu_to_le32(crc); @@ -1736,17 +1732,17 @@ static int verify_hello(struct ceph_connection *con) static bool addr_is_blank(struct sockaddr_storage *ss) { + struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr; + struct in6_addr *addr6 = 
&((struct sockaddr_in6 *)ss)->sin6_addr; + switch (ss->ss_family) { case AF_INET: - return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; + return addr->s_addr == htonl(INADDR_ANY); case AF_INET6: - return - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; + return ipv6_addr_any(addr6); + default: + return true; } - return false; } static int addr_port(struct sockaddr_storage *ss) @@ -2147,12 +2143,10 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect got WAIT as client\n"); con->error_msg = "protocol error, got WAIT as client"; return -1; default: - pr_err("connect protocol error, will retry\n"); con->error_msg = "protocol error, garbage tag during connect"; return -1; } @@ -2284,8 +2278,7 @@ static int read_partial_message(struct ceph_connection *con) crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr " - " crc %u != expected %u\n", + pr_err("read_partial_message bad hdr crc %u != expected %u\n", crc, con->in_hdr.crc); return -EBADMSG; } @@ -2315,7 +2308,7 @@ static int read_partial_message(struct ceph_connection *con) pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); con->error_msg = "bad message sequence # for incoming message"; - return -EBADMSG; + return -EBADE; } /* allocate message? 
*/ @@ -2662,6 +2655,8 @@ more: switch (ret) { case -EBADMSG: con->error_msg = "bad crc"; + /* fall through */ + case -EBADE: ret = -EIO; break; case -EIO: @@ -2808,11 +2803,8 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); - unsigned long pflags = current->flags; bool fault; - current->flags |= PF_MEMALLOC; - mutex_lock(&con->mutex); while (true) { int ret; @@ -2843,7 +2835,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on read"; + if (!con->error_msg) + con->error_msg = "socket error on read"; fault = true; break; } @@ -2852,7 +2845,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on write"; + if (!con->error_msg) + con->error_msg = "socket error on write"; fault = true; } @@ -2866,8 +2860,6 @@ static void con_work(struct work_struct *work) con_fault_finish(con); con->ops->put(con); - - tsk_restore_flags(current, pflags, PF_MEMALLOC); } /* @@ -2876,11 +2868,13 @@ static void con_work(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + con->error_msg = NULL; + WARN_ON(con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN); @@ -2951,11 +2945,18 @@ void ceph_messenger_init(struct ceph_messenger *msgr, msgr->tcp_nodelay = tcp_nodelay; atomic_set(&msgr->stopping, 0); + write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } EXPORT_SYMBOL(ceph_messenger_init); +void 
ceph_messenger_fini(struct ceph_messenger *msgr) +{ + put_net(read_pnet(&msgr->net)); +} +EXPORT_SYMBOL(ceph_messenger_fini); + static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ @@ -3302,8 +3303,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) */ if (*skip) return 0; - con->error_msg = "error allocating memory for incoming message"; + con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 2b3cf05e87b0..9d6ff1215928 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -298,21 +298,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_request_next_osdmap); +/* + * Wait for an osdmap with a given epoch. + * + * @epoch: epoch to wait for + * @timeout: in jiffies, 0 means "wait forever" + */ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout) { unsigned long started = jiffies; - int ret; + long ret; mutex_lock(&monc->mutex); while (monc->have_osdmap < epoch) { mutex_unlock(&monc->mutex); - if (timeout != 0 && time_after_eq(jiffies, started + timeout)) + if (timeout && time_after_eq(jiffies, started + timeout)) return -ETIMEDOUT; ret = wait_event_interruptible_timeout(monc->client->auth_wq, - monc->have_osdmap >= epoch, timeout); + monc->have_osdmap >= epoch, + ceph_timeout_jiffies(timeout)); if (ret < 0) return ret; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 41a4abc7e98e..50033677c0fa 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -296,6 +296,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_CMPXATTR: ceph_osd_data_release(&op->xattr.osd_data); break; + case CEPH_OSD_OP_STAT: + ceph_osd_data_release(&op->raw_data_in); + break; default: break; } @@ -450,7 +453,7 @@ 
__CEPH_FORALL_OSD_OPS(GENERATE_CASE) */ static struct ceph_osd_req_op * _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode) + u16 opcode, u32 flags) { struct ceph_osd_req_op *op; @@ -460,14 +463,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, op = &osd_req->r_ops[which]; memset(op, 0, sizeof (*op)); op->op = opcode; + op->flags = flags; return op; } void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode) + unsigned int which, u16 opcode, u32 flags) { - (void)_osd_req_op_init(osd_req, which, opcode); + (void)_osd_req_op_init(osd_req, which, opcode, flags); } EXPORT_SYMBOL(osd_req_op_init); @@ -476,7 +480,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && @@ -515,7 +520,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update); void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; @@ -552,7 +558,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; @@ -585,7 +592,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 
cookie, u64 version, int flag) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + opcode, 0); BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); @@ -602,7 +610,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_write_size) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - CEPH_OSD_OP_SETALLOCHINT); + CEPH_OSD_OP_SETALLOCHINT, + 0); op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; @@ -786,7 +795,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { - osd_req_op_init(req, which, opcode); + osd_req_op_init(req, which, opcode, 0); } else { u32 object_size = le32_to_cpu(layout->fl_object_size); u32 object_base = off - objoff; @@ -1088,7 +1097,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc, BUG_ON(!list_empty(&osd->o_osd_lru)); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, @@ -1199,7 +1208,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, - osdc->client->options->osd_keepalive_timeout * HZ); + osdc->client->options->osd_keepalive_timeout); } static void __cancel_osd_timeout(struct ceph_osd_client *osdc) @@ -1306,8 +1315,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } - - list_del_init(&req->r_req_lru_item); /* can be on notarget */ ceph_osdc_put_request(req); } @@ -1569,10 +1576,9 @@ static void handle_timeout(struct 
work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); + struct ceph_options *opts = osdc->client->options; struct ceph_osd_request *req; struct ceph_osd *osd; - unsigned long keepalive = - osdc->client->options->osd_keepalive_timeout * HZ; struct list_head slow_osds; dout("timeout\n"); down_read(&osdc->map_sem); @@ -1588,7 +1594,8 @@ static void handle_timeout(struct work_struct *work) */ INIT_LIST_HEAD(&slow_osds); list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, req->r_stamp + keepalive)) + if (time_before(jiffies, + req->r_stamp + opts->osd_keepalive_timeout)) break; osd = req->r_osd; @@ -1615,8 +1622,7 @@ static void handle_osds_timeout(struct work_struct *work) struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, osds_timeout_work.work); - unsigned long delay = - osdc->client->options->osd_idle_ttl * HZ >> 2; + unsigned long delay = osdc->client->options->osd_idle_ttl / 4; dout("osds timeout\n"); down_read(&osdc->map_sem); @@ -2017,20 +2023,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, err = __map_request(osdc, req, force_resend || force_resend_writes); dout("__map_request returned %d\n", err); - if (err == 0) - continue; /* no change and no osd was specified */ if (err < 0) continue; /* hrm! */ - if (req->r_osd == NULL) { - dout("tid %llu maps to no valid osd\n", req->r_tid); - needmap++; /* request a newer map */ - continue; - } + if (req->r_osd == NULL || err > 0) { + if (req->r_osd == NULL) { + dout("lingering %p tid %llu maps to no osd\n", + req, req->r_tid); + /* + * A homeless lingering request makes + * no sense, as it's job is to keep + * a particular OSD connection open. + * Request a newer map and kick the + * request, knowing that it won't be + * resent until we actually get a map + * that can tell us where to send it. 
+ */ + needmap++; + } - dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + dout("kicking lingering %p tid %llu osd%d\n", req, + req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); + __register_request(osdc, req); + __unregister_linger_request(osdc, req); + } } reset_changed_osds(osdc); mutex_unlock(&osdc->request_mutex); @@ -2612,7 +2627,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->event_count = 0; schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); + round_jiffies_relative(osdc->client->options->osd_idle_ttl)); err = -ENOMEM; osdc->req_mempool = mempool_create_kmalloc_pool(10, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b8c3fde5b04f..4a3125836b64 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -89,7 +89,7 @@ static int crush_decode_tree_bucket(void **p, void *end, { int j; dout("crush_decode_tree_bucket %p to %p\n", *p, end); - ceph_decode_32_safe(p, end, b->num_nodes, bad); + ceph_decode_8_safe(p, end, b->num_nodes, bad); b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); if (b->node_weights == NULL) return -ENOMEM; @@ -122,6 +122,22 @@ bad: return -EINVAL; } +static int crush_decode_straw2_bucket(void **p, void *end, + struct crush_bucket_straw2 *b) +{ + int j; + dout("crush_decode_straw2_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) + b->item_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + static int skip_name_map(void **p, void *end) { int len; @@ -204,6 +220,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) case CRUSH_BUCKET_STRAW: size = sizeof(struct 
crush_bucket_straw); break; + case CRUSH_BUCKET_STRAW2: + size = sizeof(struct crush_bucket_straw2); + break; default: err = -EINVAL; goto bad; @@ -261,6 +280,12 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (err < 0) goto bad; break; + case CRUSH_BUCKET_STRAW2: + err = crush_decode_straw2_bucket(p, end, + (struct crush_bucket_straw2 *)b); + if (err < 0) + goto bad; + break; } } diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 096d91447e06..d4f5f220a8e5 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); } - if (is_vmalloc_addr(pages)) - vfree(pages); - else - kfree(pages); + kvfree(pages); } EXPORT_SYMBOL(ceph_put_page_vector); diff --git a/net/compat.c b/net/compat.c index f7bd286a8280..5cfd26a0006f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -31,10 +31,10 @@ #include <asm/uaccess.h> #include <net/compat.h> -ssize_t get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +int get_compat_msghdr(struct msghdr *kmsg, + struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) { compat_uptr_t uaddr, uiov, tmp3; compat_size_t nr_segs; @@ -79,13 +79,11 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, if (nr_segs > UIO_MAXIOV) return -EMSGSIZE; - err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE, - compat_ptr(uiov), nr_segs, - UIO_FASTIOV, *iov, iov); - if (err >= 0) - iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, - *iov, nr_segs, err); - return err; + kmsg->msg_iocb = NULL; + + return compat_import_iovec(save_addr ? READ : WRITE, + compat_ptr(uiov), nr_segs, + UIO_FASTIOV, iov, &kmsg->msg_iter); } /* Bleech... 
*/ @@ -515,25 +513,25 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, struct compat_group_req { __u32 gr_interface; struct __kernel_sockaddr_storage gr_group - __attribute__ ((aligned(4))); + __aligned(4); } __packed; struct compat_group_source_req { __u32 gsr_interface; struct __kernel_sockaddr_storage gsr_group - __attribute__ ((aligned(4))); + __aligned(4); struct __kernel_sockaddr_storage gsr_source - __attribute__ ((aligned(4))); + __aligned(4); } __packed; struct compat_group_filter { __u32 gf_interface; struct __kernel_sockaddr_storage gf_group - __attribute__ ((aligned(4))); + __aligned(4); __u32 gf_fmode; __u32 gf_numsrc; struct __kernel_sockaddr_storage gf_slist[1] - __attribute__ ((aligned(4))); + __aligned(4); } __packed; #define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ diff --git a/net/core/datagram.c b/net/core/datagram.c index df493d68330c..4967262b2707 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -131,6 +131,35 @@ out_noerr: goto out; } +static int skb_set_peeked(struct sk_buff *skb) +{ + struct sk_buff *nskb; + + if (skb->peeked) + return 0; + + /* We have to unshare an skb before modifying it. 
*/ + if (!skb_shared(skb)) + goto done; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + skb->prev->next = nskb; + skb->next->prev = nskb; + nskb->prev = skb->prev; + nskb->next = skb->next; + + consume_skb(skb); + skb = nskb; + +done: + skb->peeked = 1; + + return 0; +} + /** * __skb_recv_datagram - Receive a datagram skbuff * @sk: socket @@ -165,7 +194,9 @@ out_noerr: struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, int *peeked, int *off, int *err) { + struct sk_buff_head *queue = &sk->sk_receive_queue; struct sk_buff *skb, *last; + unsigned long cpu_flags; long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() @@ -184,8 +215,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ - unsigned long cpu_flags; - struct sk_buff_head *queue = &sk->sk_receive_queue; int _off = *off; last = (struct sk_buff *)queue; @@ -199,7 +228,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, _off -= skb->len; continue; } - skb->peeked = 1; + + error = skb_set_peeked(skb); + if (error) + goto unlock_err; + atomic_inc(&skb->users); } else __skb_unlink(skb, queue); @@ -223,6 +256,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, return NULL; +unlock_err: + spin_unlock_irqrestore(&queue->lock, cpu_flags); no_packet: *err = error; return NULL; @@ -622,7 +657,8 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); } - skb->csum_valid = !sum; + if (!skb_shared(skb)) + skb->csum_valid = !sum; return sum; } EXPORT_SYMBOL(__skb_checksum_complete_head); @@ -642,11 +678,13 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) netdev_rx_csum_fault(skb->dev); } - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - 
skb->csum_complete_sw = 1; - skb->csum_valid = !sum; + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } return sum; } @@ -673,7 +711,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (!chunk) return 0; - if (iov_iter_count(&msg->msg_iter) < chunk) { + if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) diff --git a/net/core/dev.c b/net/core/dev.c index 45109b70664e..a8e4dd430285 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,7 @@ #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> +#include <linux/netfilter_ingress.h> #include "net-sysfs.h" @@ -468,10 +469,14 @@ EXPORT_SYMBOL(dev_remove_pack); */ void dev_add_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct packet_offload *elem; spin_lock(&offload_lock); - list_add_rcu(&po->list, head); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); @@ -660,6 +665,23 @@ __setup("netdev=", netdev_boot_setup); *******************************************************************************/ /** + * dev_get_iflink - get 'iflink' value of a interface + * @dev: targeted interface + * + * Indicates the ifindex the interface is linked to. + * Physical interfaces have the same 'ifindex' and 'iflink' values. 
+ */ + +int dev_get_iflink(const struct net_device *dev) +{ + if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) + return dev->netdev_ops->ndo_get_iflink(dev); + + return dev->ifindex; +} +EXPORT_SYMBOL(dev_get_iflink); + +/** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find @@ -1385,7 +1407,7 @@ static int __dev_close(struct net_device *dev) return retval; } -static int dev_close_many(struct list_head *head) +int dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1399,11 +1421,13 @@ static int dev_close_many(struct list_head *head) list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); - list_del_init(&dev->close_list); + if (unlink) + list_del_init(&dev->close_list); } return 0; } +EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. @@ -1420,7 +1444,7 @@ int dev_close(struct net_device *dev) LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_close_many(&single); + dev_close_many(&single, true); list_del(&single); } return 0; @@ -1607,6 +1631,22 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); +#ifdef CONFIG_NET_INGRESS +static struct static_key ingress_needed __read_mostly; + +void net_inc_ingress_queue(void) +{ + static_key_slow_inc(&ingress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_ingress_queue); + +void net_dec_ingress_queue(void) +{ + static_key_slow_dec(&ingress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_ingress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL /* We are not allowed to call static_key_slow_dec() from irq context @@ -1679,21 +1719,15 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - 
if (skb_copy_ubufs(skb, GFP_ATOMIC)) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } - } - - if (unlikely(!is_skb_forwardable(dev, skb))) { + if (skb_orphan_frags(skb, GFP_ATOMIC) || + unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } skb_scrub_packet(skb, true); + skb->priority = 0; skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); @@ -1737,7 +1771,8 @@ static inline int deliver_skb(struct sk_buff *skb, static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, - struct net_device *dev, __be16 type, + struct net_device *orig_dev, + __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; @@ -1746,7 +1781,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb, if (ptype->type != type) continue; if (pt_prev) - deliver_skb(skb, pt_prev, dev); + deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; @@ -2309,6 +2344,34 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. 
+ */ +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; @@ -2559,12 +2622,26 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, return features; } +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return features; +} +EXPORT_SYMBOL(passthru_features_check); + +static netdev_features_t dflt_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return vlan_features_check(skb, features); +} + netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; u16 gso_segs = skb_shinfo(skb)->gso_segs; - __be16 protocol = skb->protocol; if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; @@ -2576,34 +2653,17 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb->encapsulation) features &= dev->hw_enc_features; - if (!skb_vlan_tag_present(skb)) { - if (unlikely(protocol == htons(ETH_P_8021Q) || - protocol == htons(ETH_P_8021AD))) { - struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } else { - goto finalize; - } - } - - features = netdev_intersect_features(features, - dev->vlan_features | - NETIF_F_HW_VLAN_CTAG_TX | 
- NETIF_F_HW_VLAN_STAG_TX); - - if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) + if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, - NETIF_F_SG | - NETIF_F_HIGHDMA | - NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | + dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); -finalize: if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); + else + features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } @@ -2675,7 +2735,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; - if (netif_needs_gso(dev, skb, features)) { + if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); @@ -2857,7 +2917,7 @@ EXPORT_SYMBOL(xmit_recursion); * dev_loopback_xmit - loop back @skb * @skb: buffer to transmit */ -int dev_loopback_xmit(struct sk_buff *skb) +int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); @@ -2870,6 +2930,84 @@ int dev_loopback_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[skb->sender_cpu - 1]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int 
queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb, + void *accel_priv) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); + else + queue_index = __netdev_pick_tx(dev, skb); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2995,11 +3133,11 @@ out: return rc; } -int dev_queue_xmit(struct sk_buff *skb) +int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } -EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(dev_queue_xmit_sk); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) { @@ -3041,7 +3179,7 @@ static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -3146,7 +3284,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of 
the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3154,14 +3292,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; @@ -3202,14 +3340,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; + unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) @@ -3310,6 +3448,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, local_irq_save(flags); rps_lock(sd); + if (!netif_running(skb->dev)) + goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (qlen) { @@ -3331,6 +3471,7 @@ enqueue: goto enqueue; } +drop: sd->dropped++; rps_unlock(sd); @@ -3482,68 +3623,47 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force 
sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. - * - */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); - int result = TC_ACT_OK; - struct Qdisc *q; - - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rcu_dereference(rxq->qdisc); - if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); - } - - return result; -} - static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - - if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) - goto out; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct tcf_result cl_res; + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out here. 
+ */ + if (!cl) + return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - switch (ing_filter(skb, rxq)) { + qdisc_skb_cb(skb)->pkt_len = skb->len; + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + qdisc_bstats_update_cpu(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; case TC_ACT_SHOT: + qdisc_qstats_drop_cpu(cl->q); case TC_ACT_STOLEN: + case TC_ACT_QUEUED: kfree_skb(skb); return NULL; + default: + break; } - -out: - skb->tc_verd = 0; +#endif /* CONFIG_NET_CLS_ACT */ return skb; } -#endif /** * netdev_rx_handler_register - register receive handler @@ -3616,6 +3736,22 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ +#ifdef CONFIG_NETFILTER_INGRESS + if (nf_hook_ingress_active(skb)) { + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return nf_hook_ingress(skb); + } +#endif /* CONFIG_NETFILTER_INGRESS */ + return 0; +} + static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; @@ -3638,8 +3774,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) pt_prev = NULL; - rcu_read_lock(); - another_round: skb->skb_iif = skb->dev->ifindex; @@ -3649,7 +3783,7 @@ another_round: skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) - goto unlock; + goto out; } #ifdef CONFIG_NET_CLS_ACT @@ -3675,13 +3809,20 @@ another_round: } skip_taps: +#ifdef CONFIG_NET_INGRESS + if (static_key_false(&ingress_needed)) { + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; + + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto out; + } +#endif #ifdef CONFIG_NET_CLS_ACT - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 
- if (!skb) - goto unlock; + skb->tc_verd = 0; ncls: #endif - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; @@ -3693,7 +3834,7 @@ ncls: if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) - goto unlock; + goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); @@ -3705,7 +3846,7 @@ ncls: switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; - goto unlock; + goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: @@ -3759,8 +3900,7 @@ drop: ret = NET_RX_DROP; } -unlock: - rcu_read_unlock(); +out: return ret; } @@ -3791,29 +3931,30 @@ static int __netif_receive_skb(struct sk_buff *skb) static int netif_receive_skb_internal(struct sk_buff *skb) { + int ret; + net_timestamp_check(netdev_tstamp_prequeue, skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; + rcu_read_lock(); + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; - int cpu, ret; - - rcu_read_lock(); - - cpu = get_rps_cpu(skb->dev, skb, &rflow); + int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } - rcu_read_unlock(); } #endif - return __netif_receive_skb(skb); + ret = __netif_receive_skb(skb); + rcu_read_unlock(); + return ret; } /** @@ -3831,13 +3972,13 @@ static int netif_receive_skb_internal(struct sk_buff *skb) * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ -int netif_receive_skb(struct sk_buff *skb) +int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb) { trace_netif_receive_skb_entry(skb); return netif_receive_skb_internal(skb); } -EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_receive_skb_sk); /* Network device is going away, flush any packets still pending * Called with irqs disabled. 
@@ -4358,8 +4499,10 @@ static int process_backlog(struct napi_struct *napi, int quota) struct sk_buff *skb; while ((skb = __skb_dequeue(&sd->process_queue))) { + rcu_read_lock(); local_irq_enable(); __netif_receive_skb(skb); + rcu_read_unlock(); local_irq_disable(); input_queue_head_incr(sd); if (++work >= quota) { @@ -5170,7 +5313,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) + if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) @@ -5914,6 +6057,24 @@ int dev_get_phys_port_id(struct net_device *dev, EXPORT_SYMBOL(dev_get_phys_port_id); /** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_name) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_name(dev, name, len); +} +EXPORT_SYMBOL(dev_get_phys_port_name); + +/** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * @@ -5970,13 +6131,14 @@ static void rollback_registered_many(struct list_head *head) /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head); + dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. 
*/ unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; + on_each_cpu(flush_backlog, dev, 1); } synchronize_net(); @@ -6247,7 +6409,8 @@ static int netif_alloc_netdev_queues(struct net_device *dev) struct netdev_queue *tx; size_t sz = count * sizeof(*tx); - BUG_ON(count < 1 || count > 0xffff); + if (count < 1 || count > 0xffff) + return -EINVAL; tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); if (!tx) { @@ -6263,6 +6426,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + /** * register_netdevice - register a network device * @dev: device to register @@ -6297,8 +6471,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - dev->iflink = -1; - ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; @@ -6328,9 +6500,6 @@ int register_netdevice(struct net_device *dev) else if (__dev_get_by_index(net, dev->ifindex)) goto err_uninit; - if (dev->iflink == -1) - dev->iflink = dev->ifindex; - /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). 
*/ @@ -6605,8 +6774,6 @@ void netdev_run_todo(void) dev->reg_state = NETREG_UNREGISTERED; - on_each_cpu(flush_backlog, dev, 1); - netdev_wait_allrefs(dev); /* paranoia */ @@ -6817,6 +6984,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_ingress_init(dev); + return dev; free_all: @@ -6843,8 +7013,6 @@ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - release_net(dev_net(dev)); - netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); @@ -7045,12 +7213,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_net_set(dev, net); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) { - int iflink = (dev->iflink == dev->ifindex); + if (__dev_get_by_index(net, dev->ifindex)) dev->ifindex = dev_new_index(net); - if (iflink) - dev->iflink = dev->ifindex; - } /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); diff --git a/net/core/dst.c b/net/core/dst.c index e956ce6d1378..002144bea935 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -284,7 +284,9 @@ void dst_release(struct dst_entry *dst) int newrefcnt; newrefcnt = atomic_dec_return(&dst->__refcnt); - WARN_ON(newrefcnt < 0); + if (unlikely(newrefcnt < 0)) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) call_rcu(&dst->rcu_head, dst_destroy_rcu); } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index aa378ecef186..b495ab1797fa 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", - [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = 
"hw-switch-offload", }; static const char @@ -107,6 +106,13 @@ rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_XOR_BIT] = "xor", }; +static const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -195,6 +201,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); + if (sset == ETH_SS_TUNABLES) + return ARRAY_SIZE(tunable_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -212,6 +221,8 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); + else if (stringset == ETH_SS_TUNABLES) + memcpy(data, tunable_strings, sizeof(tunable_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -790,7 +801,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) - dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); + dev_key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index e4fdc9dfb2c7..9a12668f7d62 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -31,7 +31,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; - r->fr_net = hold_net(ops->fro_net); + r->fr_net = ops->fro_net; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -116,7 +116,6 @@ static int 
__fib_rules_register(struct fib_rules_ops *ops) if (ops->family == o->family) goto errout; - hold_net(net); list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: @@ -160,15 +159,6 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) } } -static void fib_rules_put_rcu(struct rcu_head *head) -{ - struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); - struct net *net = ops->fro_net; - - release_net(net); - kfree(ops); -} - void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; @@ -178,7 +168,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops) spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); - call_rcu(&ops->rcu, fib_rules_put_rcu); + kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); @@ -303,7 +293,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) err = -ENOMEM; goto errout; } - rule->fr_net = hold_net(net); + rule->fr_net = net; if (tb[FRA_PRIORITY]) rule->pref = nla_get_u32(tb[FRA_PRIORITY]); @@ -423,7 +413,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) return 0; errout_free: - release_net(rule->fr_net); kfree(rule); errout: rules_ops_put(ops); @@ -492,6 +481,12 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (ops->delete) { + err = ops->delete(rule); + if (err) + goto errout; + } + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -517,8 +512,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); - if (ops->delete) - ops->delete(rule); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); diff --git a/net/core/filter.c b/net/core/filter.c index f6bdc2b1ba01..be3098fb65e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,6 +36,7 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <net/sock.h> +#include <net/flow_dissector.h> 
#include <linux/errno.h> #include <linux/timer.h> #include <asm/uaccess.h> @@ -45,6 +46,7 @@ #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> +#include <net/sch_generic.h> /** * sk_filter - run a packet through a socket filter @@ -150,10 +152,62 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return prandom_u32(); } +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + + case SKF_AD_VLAN_TAG: + case SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_tci)); + if (skb_field == SKF_AD_VLAN_TAG) { + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, + ~VLAN_TAG_PRESENT); + } else { + /* dst_reg >>= 12 */ + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); + /* dst_reg &= 1 */ + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + } + break; + } + + return insn - insn_buf; +} + static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; + u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -167,13 +221,8 @@ static bool 
convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - PKT_TYPE_OFFSET()); - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); -#ifdef __BIG_ENDIAN_BITFIELD - insn++; - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); -#endif + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: @@ -197,10 +246,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - - *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, mark)); + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: @@ -211,29 +258,30 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, queue_mapping)); + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: + cnt = convert_skb_access(SKF_AD_VLAN_TAG, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; - /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */ + case SKF_AD_OFF + SKF_AD_VLAN_TPID: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + + /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, vlan_tci)); - if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { - *insn = 
BPF_ALU32_IMM(BPF_AND, BPF_REG_A, - ~VLAN_TAG_PRESENT); - } else { - /* A >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12); - /* A &= 1 */ - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1); - } + offsetof(struct sk_buff, vlan_proto)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: @@ -309,8 +357,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -325,7 +373,8 @@ int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { - addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); + addrs = kcalloc(len, sizeof(*addrs), + GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } @@ -705,7 +754,8 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. 
*/ -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) { bool anc_found; int pc; @@ -779,7 +829,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(bpf_check_classic); static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) @@ -793,7 +842,9 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp, fkprog = fp->orig_prog; fkprog->len = fprog->len; - fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); + + fkprog->filter = kmemdup(fp->insns, fsize, + GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; @@ -814,7 +865,7 @@ static void bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); @@ -895,7 +946,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; @@ -942,7 +993,8 @@ out_err: return ERR_PTR(err); } -static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -955,6 +1007,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) return ERR_PTR(err); } + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. 
+ */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ @@ -1004,7 +1067,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = bpf_prepare_filter(fp); + fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1013,12 +1076,85 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) } EXPORT_SYMBOL_GPL(bpf_prog_create); +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. + * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. 
+ */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} + void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->prog = prog; + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + kfree(fp); + return -ENOMEM; + } + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + /** * sk_attach_filter - attach a socket filter * @fprog: the filter program @@ -1031,7 +1167,6 @@ EXPORT_SYMBOL_GPL(bpf_prog_destroy); */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { - struct sk_filter *fp, *old_fp; unsigned int fsize = bpf_classic_proglen(fprog); unsigned int bpf_fsize = bpf_prog_size(fprog->len); struct bpf_prog *prog; @@ -1064,40 +1199,24 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - prog = bpf_prepare_filter(prog); + prog = bpf_prepare_filter(prog, NULL); if (IS_ERR(prog)) return PTR_ERR(prog); - fp = kmalloc(sizeof(*fp), GFP_KERNEL); - if (!fp) { + err = __sk_attach_prog(prog, sk); + if (err < 0) { __bpf_prog_release(prog); - return -ENOMEM; - } - fp->prog = prog; - - atomic_set(&fp->refcnt, 0); - - if (!sk_filter_charge(sk, fp)) { - __sk_filter_release(fp); - return -ENOMEM; + return err; } - old_fp = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); - rcu_assign_pointer(sk->sk_filter, fp); - - if (old_fp) - sk_filter_uncharge(sk, old_fp); - return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); -#ifdef CONFIG_BPF_SYSCALL int sk_attach_bpf(u32 ufd, struct sock *sk) { - struct sk_filter *fp, *old_fp; struct bpf_prog *prog; + int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; @@ -1106,40 +1225,207 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - /* valid fd, but invalid program type */ + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); return -EINVAL; } - fp = kmalloc(sizeof(*fp), GFP_KERNEL); - if (!fp) { + err = __sk_attach_prog(prog, sk); + if (err < 0) { bpf_prog_put(prog); - return -ENOMEM; + return err; } - fp->prog = prog; - atomic_set(&fp->refcnt, 0); + return 0; +} - if (!sk_filter_charge(sk, fp)) { - __sk_filter_release(fp); - return -ENOMEM; +#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) + +static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + int offset = (int) r2; + void *from = (void *) (long) r3; + unsigned int len = (unsigned int) r4; + char buf[16]; + void *ptr; + + /* bpf verifier guarantees that: + * 'from' pointer points to bpf program stack + * 'len' bytes of it were initialized + * 'len' > 0 + * 'skb' is a valid pointer to 'struct sk_buff' + * + * so check for invalid 'offset' and too large 
'len' + */ + if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) + return -EFAULT; + + if (unlikely(skb_cloned(skb) && + !skb_clone_writable(skb, offset + len))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, buf); + if (unlikely(!ptr)) + return -EFAULT; + + if (BPF_RECOMPUTE_CSUM(flags)) + skb_postpull_rcsum(skb, ptr, len); + + memcpy(ptr, from, len); + + if (ptr == buf) + /* skb_store_bits cannot return -EFAULT here */ + skb_store_bits(skb, offset, ptr, len); + + if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + return 0; +} + +const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) +#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) + +static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + int offset = (int) r2; + __sum16 sum, *ptr; + + if (unlikely((u32) offset > 0xffff)) + return -EFAULT; + + if (unlikely(skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(sum)))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); + if (unlikely(!ptr)) + return -EFAULT; + + switch (BPF_HEADER_FIELD_SIZE(flags)) { + case 2: + csum_replace2(ptr, from, to); + break; + case 4: + csum_replace4(ptr, from, to); + break; + default: + return -EINVAL; } - old_fp = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); - rcu_assign_pointer(sk->sk_filter, fp); + if (ptr == &sum) + /* skb_store_bits guaranteed to not return -EFAULT here */ + skb_store_bits(skb, offset, ptr, sizeof(sum)); - if (old_fp) - sk_filter_uncharge(sk, old_fp); + return 0; +} + 
+const struct bpf_func_proto bpf_l3_csum_replace_proto = { + .func = bpf_l3_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + int offset = (int) r2; + __sum16 sum, *ptr; + + if (unlikely((u32) offset > 0xffff)) + return -EFAULT; + + if (unlikely(skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(sum)))) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); + if (unlikely(!ptr)) + return -EFAULT; + + switch (BPF_HEADER_FIELD_SIZE(flags)) { + case 2: + inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); + break; + case 4: + inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); + break; + default: + return -EINVAL; + } + + if (ptr == &sum) + /* skb_store_bits guaranteed to not return -EFAULT here */ + skb_store_bits(skb, offset, ptr, sizeof(sum)); return 0; } -/* allow socket filters to call - * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() - */ -static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id) +const struct bpf_func_proto bpf_l4_csum_replace_proto = { + .func = bpf_l4_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) + +static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); + if (unlikely(!dev)) + return -EINVAL; + + if 
(unlikely(!(dev->flags & IFF_UP))) + return -EINVAL; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb2)) + return -ENOMEM; + + if (BPF_IS_REDIRECT_INGRESS(flags)) + return dev_forward_skb(dev, skb2); + + skb2->dev = dev; + return dev_queue_xmit(skb2); +} + +const struct bpf_func_proto bpf_clone_redirect_proto = { + .func = bpf_clone_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -1148,39 +1434,239 @@ static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); default: return NULL; } } -static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type) +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id) { - /* skb fields cannot be accessed yet */ - return false; + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + default: + return sk_filter_func_proto(func_id); + } } -static struct bpf_verifier_ops sock_filter_ops = { - .get_func_proto = sock_filter_func_proto, - .is_valid_access = sock_filter_is_valid_access, +static bool __is_valid_access(int off, int size, 
enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + /* all __sk_buff fields are __u32 */ + if (size != 4) + return false; + + return true; +} + +static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + return __is_valid_access(off, size, type); +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, tc_index): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + return __is_valid_access(off, size, type); +} + +static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, len)); + break; + + case offsetof(struct __sk_buff, protocol): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, protocol)); + break; + + case offsetof(struct __sk_buff, vlan_proto): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_proto)); + break; + + case offsetof(struct __sk_buff, priority): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, 
priority)); + break; + + case offsetof(struct __sk_buff, ingress_ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, skb_iif)); + break; + + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + + case offsetof(struct __sk_buff, mark): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case offsetof(struct __sk_buff, pkt_type): + return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, queue_mapping): + return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_present): + return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, vlan_tci): + return convert_skb_access(SKF_AD_VLAN_TAG, + dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, cb[0]) ... 
+ offsetof(struct __sk_buff, cb[4]): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + + ctx_off -= offsetof(struct __sk_buff, cb[0]); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, tc_index): +#ifdef CONFIG_NET_SCHED + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + break; +#else + if (type == BPF_WRITE) + *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + else + *insn++ = BPF_MOV64_IMM(dst_reg, 0); + break; +#endif + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops sk_filter_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; -static struct bpf_prog_type_list tl = { - .ops = &sock_filter_ops, +static const struct bpf_verifier_ops tc_cls_act_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, +}; + +static struct bpf_prog_type_list sk_filter_type __read_mostly = { + .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; -static int __init register_sock_filter_ops(void) +static struct bpf_prog_type_list sched_cls_type __read_mostly = { + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, +}; + +static struct bpf_prog_type_list sched_act_type __read_mostly = { + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, +}; + +static int __init register_sk_filter_ops(void) { - bpf_register_prog_type(&tl); + bpf_register_prog_type(&sk_filter_type); + 
bpf_register_prog_type(&sched_cls_type); + bpf_register_prog_type(&sched_act_type); + return 0; } -late_initcall(register_sock_filter_ops); -#else -int sk_attach_bpf(u32 ufd, struct sock *sk) -{ - return -EOPNOTSUPP; -} -#endif +late_initcall(register_sk_filter_ops); + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2c35c02a931e..2a834c6179b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1,3 +1,4 @@ +#include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/ip.h> @@ -12,19 +13,60 @@ #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> -#include <net/flow_keys.h> +#include <linux/stddef.h> +#include <linux/if_ether.h> +#include <linux/mpls.h> +#include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -/* copy saddr & daddr, possibly using 64bit load/store - * Equivalent to : flow->src = iph->saddr; - * flow->dst = iph->daddr; - */ -static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + return flow_dissector->used_keys & (1 << key_id); +} + +static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1 << key_id); +} + +static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id, + void *target_container) +{ + return ((char *) target_container) + flow_dissector->offset[key_id]; +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) { - BUILD_BUG_ON(offsetof(typeof(*flow), dst) != - offsetof(typeof(*flow), src) + sizeof(flow->src)); - memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); + 
unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, key++) { + /* User should make sure that every key target offset is withing + * boundaries of unsigned short. + */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(skb_flow_dissector_uses_key(flow_dissector, + key->key_id)); + + skb_flow_dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. + */ + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); } +EXPORT_SYMBOL(skb_flow_dissector_init); /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -63,18 +105,31 @@ EXPORT_SYMBOL(__skb_flow_get_ports); /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * - * The function will try to retrieve the struct flow_keys from either the skbuff - * or a raw buffer specified by the rest parameters + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. 
*/ -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, __be16 proto, int nhoff, int hlen) { - u8 ip_proto; + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_keyid *key_keyid; + u8 ip_proto = 0; if (!data) { data = skb->data; @@ -83,7 +138,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, hlen = skb_headlen(skb); } - memset(flow, 0, sizeof(*flow)); + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs)); + } again: switch (proto) { @@ -100,14 +178,15 @@ ip: if (ip_is_fragment(iph)) ip_proto = 0; - /* skip the address processing if skb is NULL. The assumption - * here is that if there is no skb we are not looking for flow - * info but lengths and protocols. 
- */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; - iph_to_flow_copy_addrs(flow, iph); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; break; } case htons(ETH_P_IPV6): { @@ -123,25 +202,27 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - /* see comment above in IPv4 section */ - if (!skb) - break; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; + + key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); - flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } flow_label = ip6_flowlabel(iph); if (flow_label) { - /* Awesome, IPv6 packet has a flow label so we can - * use that to represent the ports without any - * further dissection. 
- */ - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->ports = flow_label; - flow->thoff = (u16)nhoff; - - return true; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); + } } break; @@ -155,6 +236,15 @@ ipv6: if (!vlan) return false; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID, + target_container); + + key_tags->vlan_id = skb_vlan_tag_get_id(skb); + } + proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); goto again; @@ -186,19 +276,58 @@ ipv6: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; - flow->src = hdr->srcnode; - flow->dst = 0; - flow->n_proto = proto; - flow->thoff = (u16)nhoff; + key_basic->n_proto = proto; + key_control->thoff = (u16)nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS, + target_container); + key_addrs->tipcaddrs.srcnode = hdr->srcnode; + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; + } + return true; + } + + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): { + struct mpls_label *hdr, _hdr[2]; +mpls: + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return false; + + if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> + MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = hdr[1].entry & + htonl(MPLS_LS_LABEL_MASK); + } + + key_basic->n_proto = proto; + 
key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + return true; + } + return true; } + case htons(ETH_P_FCOE): - flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; } +ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { struct gre_hdr { @@ -213,30 +342,65 @@ ipv6: * Only look inside GRE if version zero and no * routing */ - if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { - proto = hdr->proto; + if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + break; + + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) nhoff += 4; - if (hdr->flags & GRE_CSUM) - nhoff += 4; - if (hdr->flags & GRE_KEY) - nhoff += 4; - if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) - return false; - proto = eth->h_proto; - nhoff += sizeof(*eth); + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + data, hlen, &_keyid); + + if (!keyid) + return false; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + key_keyid->keyid = *keyid; } - goto again; + nhoff += 4; } - break; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + return false; + proto = eth->h_proto; + nhoff += sizeof(*eth); + } + goto again; + } + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, 
&_opthdr); + if (!opthdr) + return false; + + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; + + goto ip_proto_again; } case IPPROTO_IPIP: proto = htons(ETH_P_IP); @@ -244,18 +408,25 @@ ipv6: case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); goto ipv6; + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + goto mpls; default: break; } - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->thoff = (u16) nhoff; - - /* unless skb is set we don't need to record port info */ - if (skb) - flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + } return true; } @@ -267,27 +438,109 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) { - __flow_hash_secret_init(); - return jhash_3words(a, b, c, hashrnd); + return jhash2(words, length, keyval); } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +static inline void *flow_keys_hash_start(struct flow_keys *flow) { - u32 hash; + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); + return (void *)flow + FLOW_KEYS_HASH_OFFSET; +} + +static inline size_t flow_keys_hash_length(struct flow_keys *flow) +{ + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != + sizeof(*flow) - sizeof(flow->addrs)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= 
sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + diff -= sizeof(flow->addrs.tipcaddrs); + break; + } + return (sizeof(*flow) - diff) / sizeof(u32); +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + return flow->addrs.tipcaddrs.srcnode; + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->dst < (__force u32)keys->src) || - (((__force u32)keys->dst == (__force u32)keys->src) && - ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { - swap(keys->dst, keys->src); - swap(keys->port16[0], keys->port16[1]); +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; } +} +EXPORT_SYMBOL(flow_get_u32_dst); - hash = __flow_hash_3words((__force u32)keys->dst, - (__force u32)keys->src, - (__force u32)keys->ports); +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + addr_diff = (__force u32)keys->addrs.v4addrs.dst - + (__force u32)keys->addrs.v4addrs.src; + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = 
memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + swap(keys->ports.src, keys->ports.dst); + } + break; + } +} + +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) +{ + u32 hash; + + __flow_hash_consistentify(keys); + + hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -296,12 +549,52 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys) { - return __flow_hash_from_keys(keys); + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); -/* - * __skb_get_hash: calculate a flow hash based on src/dst addresses +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, u32 keyval) +{ + if (!skb_flow_dissect_flow_keys(skb, keys)) + return 0; + + return __flow_hash_from_keys(keys, keyval); +} + +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + +/** + * __skb_get_hash: calculate a flow hash + * @skb: sk_buff to calculate flow hash from + * + * This function calculates a flow hash 
based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. @@ -309,53 +602,34 @@ EXPORT_SYMBOL(flow_hash_from_keys); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; - if (!skb_flow_dissect(skb, &keys)) - return; + __flow_hash_secret_init(); - if (keys.ports) + hash = ___skb_get_hash(skb, &keys, hashrnd); + if (!hash) + return; + if (keys.ports.ports) skb->l4_hash = 1; - skb->sw_hash = 1; - - skb->hash = __flow_hash_from_keys(&keys); + skb->hash = hash; } EXPORT_SYMBOL(__skb_get_hash); -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. - */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues) +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) { - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } + struct flow_keys keys; - return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; + return ___skb_get_hash(skb, &keys, perturb); } -EXPORT_SYMBOL(__skb_tx_hash); +EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->thoff; + u32 poff = keys->control.thoff; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; @@ -396,8 +670,12 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, return poff; } -/* skb_get_poff() returns 
the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ @@ -405,86 +683,76 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, + .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLANID, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +}; + +static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = 
FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct flow_dissector flow_keys_buf_dissector __read_mostly; + +static int __init init_default_flow_dissectors(void) { -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); - - return queue_index; -#else - return -1; -#endif -} - -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - int queue_index = sk_tx_queue_get(sk); - - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); - if (new_index < 0) - new_index = skb_tx_hash(dev, skb); - - if (queue_index != new_index && sk && - rcu_access_pointer(sk->sk_dst_cache)) - sk_tx_queue_set(sk, new_index); - - queue_index = new_index; - } - - return queue_index; + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_buf_dissector, + flow_keys_buf_dissector_keys, + ARRAY_SIZE(flow_keys_buf_dissector_keys)); + return 0; } -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb, - void *accel_priv) -{ - int queue_index = 0; - -#ifdef CONFIG_XPS - if (skb->sender_cpu == 0) - skb->sender_cpu = raw_smp_processor_id() + 1; -#endif - - if (dev->real_num_tx_queues != 1) { - const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - 
queue_index = ops->ndo_select_queue(dev, skb, accel_priv, - __netdev_pick_tx); - else - queue_index = __netdev_pick_tx(dev, skb); - - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); - } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); -} +late_initcall_sync(init_default_flow_dissectors); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9dfb88a933e7..92d886f4adcb 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -66,7 +66,7 @@ NOTES. - * avbps is scaled by 2^5, avpps is scaled by 2^10. + * avbps and avpps are scaled by 2^5. * both values are reported as 32 bit unsigned values. bps can overflow for fast links : max speed being 34360Mbit/sec * Minimal interval is HZ/4=250msec (it is the greatest common divisor @@ -85,10 +85,10 @@ struct gen_estimator struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; int ewma_log; + u32 last_packets; + unsigned long avpps; u64 last_bytes; u64 avbps; - u32 last_packets; - u32 avpps; struct rcu_head e_rcu; struct rb_node node; struct gnet_stats_basic_cpu __percpu *cpu_bstats; @@ -118,8 +118,8 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { struct gnet_stats_basic_packed b = {0}; + unsigned long rate; u64 brate; - u32 rate; spin_lock(e->stats_lock); read_lock(&est_lock); @@ -133,10 +133,11 @@ static void est_timer(unsigned long arg) e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; - rate = (b.packets - e->last_packets)<<(12 - idx); + rate = b.packets - e->last_packets; + rate <<= (7 - idx); e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - e->rate_est->pps = (e->avpps+0x1FF)>>10; + e->rate_est->pps = (e->avpps + 0xF) >> 5; skip: read_unlock(&est_lock); spin_unlock(e->stats_lock); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 
49a9e3e06c08..982861607f88 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -40,7 +40,7 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { if (!netif_carrier_ok(dev)) - return (dev->ifindex != dev->iflink ? + return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); if (netif_dormant(dev)) @@ -89,7 +89,7 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (!netif_running(dev)) return false; - if (dev->ifindex != dev->iflink) + if (dev->ifindex != dev_get_iflink(dev)) return true; if (dev->priv_flags & IFF_TEAM_PORT) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 70fe9e10ac86..84195dacb8b6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -397,25 +397,15 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; - int key_len = tbl->key_len; - u32 hash_val; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock_bh(); - nht = rcu_dereference_bh(tbl->nht); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - - for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference_bh(n->next)) { - if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - if (!atomic_inc_not_zero(&n->refcnt)) - n = NULL; - NEIGH_CACHE_STAT_INC(tbl, hits); - break; - } + n = __neigh_lookup_noref(tbl, pkey, dev); + if (n) { + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock_bh(); @@ -601,7 +591,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (!n) goto out; - write_pnet(&n->net, hold_net(net)); + write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; if (dev) @@ -610,7 +600,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { if (dev) 
dev_put(dev); - release_net(net); kfree(n); n = NULL; goto out; @@ -644,7 +633,6 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, tbl->pdestructor(n); if (n->dev) dev_put(n->dev); - release_net(pneigh_net(n)); kfree(n); return 0; } @@ -667,7 +655,6 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); - release_net(pneigh_net(n)); kfree(n); continue; } @@ -830,10 +817,9 @@ out: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES); - if (!(n->nud_state & NUD_PROBE)) - max_probes += NEIGH_VAR(p, MCAST_PROBES); - return max_probes; + return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : + NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) @@ -927,6 +913,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); + notify = 1; next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { @@ -971,6 +958,8 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; + if (neigh->dead) + goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + @@ -1027,6 +1016,13 @@ out_unlock_bh: write_unlock(&neigh->lock); local_bh_enable(); return rc; + +out_dead: + if (neigh->nud_state & NUD_STALE) + goto out_unlock_bh; + write_unlock_bh(&neigh->lock); + kfree_skb(skb); + return 1; } EXPORT_SYMBOL(__neigh_event_send); @@ -1090,6 +1086,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; + if (neigh->dead) + goto out; if 
(!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1158,6 +1156,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (new != old) { neigh_del_timer(neigh); + if (new & NUD_PROBE) + atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? @@ -1239,6 +1239,8 @@ EXPORT_SYMBOL(neigh_update); */ void __neigh_set_probe_once(struct neighbour *neigh) { + if (neigh->dead) + return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; @@ -1263,10 +1265,10 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ -static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst) +static void neigh_hh_init(struct neighbour *n) { - struct net_device *dev = dst->dev; - __be16 prot = dst->ops->protocol; + struct net_device *dev = n->dev; + __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); @@ -1280,43 +1282,19 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst) write_unlock_bh(&n->lock); } -/* This function can be used in contexts, where only old dev_queue_xmit - * worked, f.e. if you want to override normal output path (eql, shaper), - * but resolution is not made yet. - */ - -int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - - __skb_pull(skb, skb_network_offset(skb)); - - if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, - skb->len) < 0 && - dev_rebuild_header(skb)) - return 0; - - return dev_queue_xmit(skb); -} -EXPORT_SYMBOL(neigh_compat_output); - /* Slow and careful. 
*/ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); int rc = 0; - if (!dst) - goto discard; - if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !neigh->hh.hh_len) - neigh_hh_init(neigh, dst); + neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); @@ -1332,8 +1310,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) } out: return rc; -discard: - neigh_dbg(1, "%s: dst=%p neigh=%p\n", __func__, dst, neigh); out_kfree_skb: rc = -EINVAL; kfree_skb(skb); @@ -1464,11 +1440,10 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); dev_hold(dev); p->dev = dev; - write_pnet(&p->net, hold_net(net)); + write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { - release_net(net); dev_put(dev); kfree(p); return NULL; @@ -1508,7 +1483,6 @@ EXPORT_SYMBOL(neigh_parms_release); static void neigh_parms_destroy(struct neigh_parms *parms) { - release_net(neigh_parms_net(parms)); kfree(parms); } @@ -1783,6 +1757,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_REPROBES, + NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || @@ -1942,6 +1918,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, 
[NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, @@ -2042,6 +2019,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; + case NDTPA_MCAST_REPROBES: + NEIGH_VAR_SET(p, MCAST_REPROBES, + nla_get_u32(tbp[i])); + break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); @@ -2427,6 +2408,40 @@ void __neigh_for_each_release(struct neigh_table *tbl, } EXPORT_SYMBOL(__neigh_for_each_release); +int neigh_xmit(int index, struct net_device *dev, + const void *addr, struct sk_buff *skb) +{ + int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { + struct neigh_table *tbl; + struct neighbour *neigh; + + tbl = neigh_tables[index]; + if (!tbl) + goto out; + neigh = __neigh_lookup_noref(tbl, addr, dev); + if (!neigh) + neigh = __neigh_create(tbl, addr, dev, false); + err = PTR_ERR(neigh); + if (IS_ERR(neigh)) + goto out_kfree_skb; + err = neigh->output(neigh, skb); + } + else if (index == NEIGH_LINK_TABLE) { + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + addr, NULL, skb->len); + if (err < 0) + goto out_kfree_skb; + err = dev_queue_xmit(skb); + } +out: + return err; +out_kfree_skb: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(neigh_xmit); + #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_first(struct seq_file *seq) @@ -2994,6 +3009,7 @@ static struct neigh_sysctl_table { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f2aa73bfb0e4..18b34d771ed4 100644 --- 
a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> +#include <linux/of.h> #include "net-sysfs.h" @@ -108,11 +109,19 @@ NETDEVICE_SHOW_RO(dev_id, fmt_hex); NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); -NETDEVICE_SHOW_RO(iflink, fmt_dec); NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, dev_get_iflink(ndev)); +} +static DEVICE_ATTR_RO(iflink); + static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { return sprintf(buf, fmt_dec, dev->name_assign_type); @@ -417,6 +426,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sprintf(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -427,11 +458,15 @@ static ssize_t phys_switch_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - ret = netdev_switch_parent_id_get(netdev, &ppid); + ret = switchdev_port_attr_get(netdev, &attr); if (!ret) - ret = sprintf(buf, "%*phN\n", 
ppid.id_len, ppid.id); + ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, + attr.u.ppid.id); } rtnl_unlock(); @@ -464,6 +499,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, NULL, }; @@ -950,6 +986,60 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, return sprintf(buf, "%lu", trans_timeout); } +#ifdef CONFIG_XPS +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + +static ssize_t show_tx_maxrate(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + return sprintf(buf, "%lu\n", queue->tx_maxrate); +} + +static ssize_t set_tx_maxrate(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + int err, index = get_netdev_queue_index(queue); + u32 rate = 0; + + err = kstrtou32(buf, 10, &rate); + if (err < 0) + return err; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = -EOPNOTSUPP; + if (dev->netdev_ops->ndo_set_tx_maxrate) + err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + + rtnl_unlock(); + if (!err) { + queue->tx_maxrate = rate; + return len; + } + return err; +} + +static struct netdev_queue_attribute queue_tx_maxrate = + __ATTR(tx_maxrate, S_IRUGO | S_IWUSR, + show_tx_maxrate, set_tx_maxrate); +#endif + static struct netdev_queue_attribute queue_trans_timeout = __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); @@ -1064,18 +1154,6 @@ static struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static unsigned int get_netdev_queue_index(struct netdev_queue *queue) -{ - struct net_device *dev = 
queue->dev; - unsigned int i; - - i = queue - dev->_tx; - BUG_ON(i >= dev->num_tx_queues); - - return i; -} - - static ssize_t show_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) { @@ -1152,6 +1230,7 @@ static struct attribute *netdev_queue_default_attrs[] = { &queue_trans_timeout.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &queue_tx_maxrate.attr, #endif NULL }; @@ -1374,6 +1453,30 @@ static struct class net_class = { .namespace = net_namespace, }; +#ifdef CONFIG_OF_NET +static int of_dev_node_match(struct device *dev, const void *data) +{ + int ret = 0; + + if (dev->parent) + ret = dev->parent->of_node == data; + + return ret == 0 ? dev->of_node == data : ret; +} + +struct net_device *of_find_net_device_by_node(struct device_node *np) +{ + struct device *dev; + + dev = class_find_device(&net_class, NULL, np, of_dev_node_match); + if (!dev) + return NULL; + + return to_net_dev(dev); +} +EXPORT_SYMBOL(of_find_net_device_by_node); +#endif + /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 70d3450588b2..2c2eb1b629b1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -16,7 +16,6 @@ #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/net_namespace.h> -#include <linux/rtnetlink.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> @@ -148,18 +147,17 @@ static void ops_free_list(const struct pernet_operations *ops, } } +/* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; - ASSERT_RTNL(); - if (reqid >= 0) { min = reqid; max = reqid + 1; } - return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). 
If net is equal to peer, the @@ -175,11 +173,16 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -static int __peernet2id(struct net *net, struct net *peer, bool alloc) +/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc + * is set to true, thus the caller knows that the new id must be notified via + * rtnl. + */ +static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + bool alloc_it = *alloc; - ASSERT_RTNL(); + *alloc = false; /* Magic value for id 0. */ if (id == NET_ID_ZERO) @@ -187,36 +190,77 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) if (id > 0) return id; - if (alloc) - return alloc_netid(net, peer, -1); + if (alloc_it) { + id = alloc_netid(net, peer, -1); + *alloc = true; + return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + } + + return NETNSA_NSID_NOT_ASSIGNED; +} + +/* should be called with nsid_lock held */ +static int __peernet2id(struct net *net, struct net *peer) +{ + bool no = false; - return -ENOENT; + return __peernet2id_alloc(net, peer, &no); } +static void rtnl_net_notifyid(struct net *net, int cmd, int id); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ +int peernet2id_alloc(struct net *net, struct net *peer) +{ + unsigned long flags; + bool alloc; + int id; + + spin_lock_irqsave(&net->nsid_lock, flags); + alloc = atomic_read(&peer->count) == 0 ? false : true; + id = __peernet2id_alloc(net, peer, &alloc); + spin_unlock_irqrestore(&net->nsid_lock, flags); + if (alloc && id >= 0) + rtnl_net_notifyid(net, RTM_NEWNSID, id); + return id; +} +EXPORT_SYMBOL(peernet2id_alloc); + +/* This function returns, if assigned, the id of a peer netns. */ int peernet2id(struct net *net, struct net *peer) { - bool alloc = atomic_read(&peer->count) == 0 ? 
false : true; + unsigned long flags; int id; - id = __peernet2id(net, peer, alloc); - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + spin_lock_irqsave(&net->nsid_lock, flags); + id = __peernet2id(net, peer); + spin_unlock_irqrestore(&net->nsid_lock, flags); + return id; +} + +/* This function returns true is the peer netns has an id assigned into the + * current netns. + */ +bool peernet_has_id(struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; } -EXPORT_SYMBOL(peernet2id); struct net *get_net_ns_by_id(struct net *net, int id) { + unsigned long flags; struct net *peer; if (id < 0) return NULL; rcu_read_lock(); + spin_lock_irqsave(&net->nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); + spin_unlock_irqrestore(&net->nsid_lock, flags); rcu_read_unlock(); return peer; @@ -237,10 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); - -#ifdef NETNS_REFCNT_DEBUG - atomic_set(&net->use_count, 0); -#endif + spin_lock_init(&net->nsid_lock); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -296,13 +337,6 @@ out_free: static void net_free(struct net *net) { -#ifdef NETNS_REFCNT_DEBUG - if (unlikely(atomic_read(&net->use_count) != 0)) { - pr_emerg("network namespace not free! 
Usage: %d\n", - atomic_read(&net->use_count)); - return; - } -#endif kfree(rcu_access_pointer(net->gen)); kmem_cache_free(net_cachep, net); } @@ -368,12 +402,19 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); + int id; + spin_lock_irq(&tmp->nsid_lock); + id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); + spin_unlock_irq(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); } + spin_lock_irq(&net->nsid_lock); idr_destroy(&net->netns_ids); + spin_unlock_irq(&net->nsid_lock); } rtnl_unlock(); @@ -501,6 +542,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + unsigned long flags; struct net *peer; int nsid, err; @@ -521,14 +563,19 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - if (__peernet2id(net, peer, false) >= 0) { + spin_lock_irqsave(&net->nsid_lock, flags); + if (__peernet2id(net, peer) >= 0) { + spin_unlock_irqrestore(&net->nsid_lock, flags); err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - if (err > 0) + spin_unlock_irqrestore(&net->nsid_lock, flags); + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; + } out: put_net(peer); return err; @@ -542,13 +589,10 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer) + int cmd, struct net *net, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - int id; - - ASSERT_RTNL(); nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); if (!nlh) @@ -557,10 +601,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - id = __peernet2id(net, peer, 
false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; - if (nla_put_s32(skb, NETNSA_NSID, id)) + if (nla_put_s32(skb, NETNSA_NSID, nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -576,8 +617,8 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct sk_buff *msg; - int err = -ENOBUFS; struct net *peer; + int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy); @@ -599,8 +640,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } + id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer); + RTM_NEWNSID, net, id); if (err < 0) goto err_out; @@ -614,6 +656,75 @@ out: return err; } +struct rtnl_net_dump_cb { + struct net *net; + struct sk_buff *skb; + struct netlink_callback *cb; + int idx; + int s_idx; +}; + +static int rtnl_net_dumpid_one(int id, void *peer, void *data) +{ + struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; + int ret; + + if (net_cb->idx < net_cb->s_idx) + goto cont; + + ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, + net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWNSID, net_cb->net, id); + if (ret < 0) + return ret; + +cont: + net_cb->idx++; + return 0; +} + +static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct rtnl_net_dump_cb net_cb = { + .net = net, + .skb = skb, + .cb = cb, + .idx = 0, + .s_idx = cb->args[0], + }; + unsigned long flags; + + spin_lock_irqsave(&net->nsid_lock, flags); + idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_irqrestore(&net->nsid_lock, flags); + + cb->args[0] = net_cb.idx; + return skb->len; +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id) +{ + struct sk_buff *msg; + int err = -ENOMEM; + + msg = nlmsg_new(rtnl_net_get_size(), 
GFP_KERNEL); + if (!msg) + goto out; + + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + if (err < 0) + goto err_out; + + rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + return; + +err_out: + nlmsg_free(msg); +out: + rtnl_set_sk_err(net, RTNLGRP_NSID, err); +} + static int __init net_ns_init(void) { struct net_generic *ng; @@ -648,7 +759,8 @@ static int __init net_ns_init(void) register_pernet_subsys(&net_ns_ops); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, + NULL); return 0; } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 1f2a126f4ffa..6441f47b1a8f 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -23,7 +23,8 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return css_cls_state(task_css(p, net_cls_cgrp_id)); + return css_cls_state(task_css_check(p, net_cls_cgrp_id, + rcu_read_lock_bh_held())); } EXPORT_SYMBOL_GPL(task_cls_state); diff --git a/net/core/netevent.c b/net/core/netevent.c index f17ccd291d39..8b3bc4fac613 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); */ int register_netevent_notifier(struct notifier_block *nb) { - int err; - - err = atomic_notifier_chain_register(&netevent_notif_chain, nb); - return err; + return atomic_notifier_chain_register(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_netevent_notifier); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 508155b283dd..1ebdf1c0d118 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -177,7 +177,7 @@ #include <asm/dma.h> #include <asm/div64.h> /* do_div */ -#define VERSION "2.74" +#define VERSION "2.75" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the 
max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) @@ -210,6 +210,10 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ +/* Xmit modes */ +#define M_START_XMIT 0 /* Default normal TX */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ + /* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -251,13 +255,14 @@ struct pktgen_dev { * we will do a random selection from within the range. */ __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - + int xmit_mode; int min_pkt_size; int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + struct page *page; u64 delay; /* nano-seconds */ @@ -507,7 +512,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warn("Unknown command: %s\n", data); + return -EINVAL; return count; } @@ -567,7 +572,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " dst_min: %s dst_max: %s\n", pkt_dev->dst_min, pkt_dev->dst_max); seq_printf(seq, - " src_min: %s src_max: %s\n", + " src_min: %s src_max: %s\n", pkt_dev->src_min, pkt_dev->src_max); } @@ -620,6 +625,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) + seq_puts(seq, " xmit_mode: netif_receive\n"); + seq_puts(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) @@ -1081,7 +1089,8 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; if ((value > 0) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || + !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; i += len; 
pkt_dev->clone_skb = value; @@ -1134,7 +1143,7 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; - if ((value > 1) && + if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 1 : value; @@ -1160,6 +1169,45 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "ERROR: node not possible"); return count; } + if (!strcmp(name, "xmit_mode")) { + char f[32]; + + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + + if (strcmp(f, "start_xmit") == 0) { + pkt_dev->xmit_mode = M_START_XMIT; + } else if (strcmp(f, "netif_receive") == 0) { + /* clone_skb set earlier, not supported in this mode */ + if (pkt_dev->clone_skb > 0) + return -ENOTSUPP; + + pkt_dev->xmit_mode = M_NETIF_RECEIVE; + + /* make sure new packet is allocated every time + * pktgen_xmit() is called + */ + pkt_dev->last_ok = 1; + + /* override clone_skb if user passed default value + * at module loading time + */ + pkt_dev->clone_skb = 0; + } else { + sprintf(pg_result, + "xmit_mode -:%s:- unknown\nAvailable modes: %s", + f, "start_xmit, netif_receive\n"); + return count; + } + sprintf(pg_result, "OK: xmit_mode=%s", f); + return count; + } if (!strcmp(name, "flag")) { char f[32]; memset(f, 0, 32); @@ -1267,6 +1315,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "NO_TIMESTAMP") == 0) pkt_dev->flags |= F_NO_TIMESTAMP; + else if (strcmp(f, "!NO_TIMESTAMP") == 0) + pkt_dev->flags &= ~F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", @@ -2212,8 +2263,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); @@ -2594,9 +2643,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { - int ret; - __u8 *eth; + struct ethhdr *eth; struct iphdr *iph; + int ret; nhead = x->props.header_len - skb_headroom(skb); if (nhead > 0) { @@ -2616,9 +2665,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, goto err; } /* restore ll */ - eth = (__u8 *) skb_push(skb, ETH_HLEN); - memcpy(eth, pkt_dev->hh, 12); - *(u16 *) &eth[12] = protocol; + eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); + eth->h_proto = protocol; /* Update IPv4 header len as well as checksum value */ iph = ip_hdr(skb); @@ -3317,6 +3366,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; + struct sk_buff *skb; int ret; /* If device is offline, then don't send */ @@ -3354,6 +3404,37 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { + skb = pkt_dev->skb; + skb->protocol = eth_type_trans(skb, skb->dev); + atomic_add(burst, &skb->users); + local_bh_disable(); + do { + ret = netif_receive_skb(skb); + if (ret == NET_RX_DROP) + pkt_dev->errors++; + pkt_dev->sofar++; + pkt_dev->seq_num++; + if (atomic_read(&skb->users) != burst) { + /* skb was queued by rps/rfs or taps, + * so cannot reuse this skb + */ + atomic_sub(burst - 1, &skb->users); + /* get out of the loop and wait + * until skb is consumed + */ + break; + } + /* skb was 'freed' by stack, so clean few + * bits and reuse
it + */ +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; /* reset reclass/redir ttl */ +#endif + } while (--burst > 0); + goto out; /* Skips xmit_mode M_START_XMIT */ + } + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3401,6 +3482,7 @@ xmit_more: unlock: HARD_TX_UNLOCK(odev, txq); +out: local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ @@ -3489,13 +3571,6 @@ static int pktgen_thread_worker(void *arg) pr_debug("%s removing thread\n", t->tsk->comm); pktgen_rem_thread(t); - /* Wait for kthread_stop */ - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } - __set_current_state(TASK_RUNNING); - return 0; } @@ -3687,6 +3762,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) } t->net = pn; + get_task_struct(p); wake_up_process(p); wait_for_completion(&t->start_done); @@ -3809,6 +3885,7 @@ static void __net_exit pg_net_exit(struct net *net) t = list_entry(q, struct pktgen_thread, th_list); list_del(&t->th_list); kthread_stop(t->tsk); + put_task_struct(t->tsk); kfree(t); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 04db318e6218..87b22c0bc08c 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -58,14 +58,14 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, return -ENOMEM; get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - rwlock_init(&queue->syn_wait_lock); + spin_lock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; lopt->max_qlen_log = ilog2(nr_table_entries); - write_lock_bh(&queue->syn_wait_lock); + spin_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; - write_unlock_bh(&queue->syn_wait_lock); + spin_unlock_bh(&queue->syn_wait_lock); return 0; } @@ -81,10 +81,10 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk( { struct listen_sock *lopt; - write_lock_bh(&queue->syn_wait_lock); + spin_lock_bh(&queue->syn_wait_lock); lopt = 
queue->listen_opt; queue->listen_opt = NULL; - write_unlock_bh(&queue->syn_wait_lock); + spin_unlock_bh(&queue->syn_wait_lock); return lopt; } @@ -94,21 +94,26 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - if (lopt->qlen != 0) { + if (listen_sock_qlen(lopt) != 0) { unsigned int i; for (i = 0; i < lopt->nr_table_entries; i++) { struct request_sock *req; + spin_lock_bh(&queue->syn_wait_lock); while ((req = lopt->syn_table[i]) != NULL) { lopt->syn_table[i] = req->dl_next; - lopt->qlen--; - reqsk_free(req); + atomic_inc(&lopt->qlen_dec); + if (del_timer(&req->rsk_timer)) + reqsk_put(req); + reqsk_put(req); } + spin_unlock_bh(&queue->syn_wait_lock); } } - WARN_ON(lopt->qlen != 0); + if (WARN_ON(listen_sock_qlen(lopt) != 0)) + pr_err("qlen %u\n", listen_sock_qlen(lopt)); kvfree(lopt); } @@ -153,24 +158,22 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * - * When a TFO req is created, it needs to sock_hold its listener to prevent - * the latter data structure from going away. - * - * This function also sets "treq->listener" to NULL and unreference listener - * socket. treq->listener is used by the listener so it is protected by the + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. 
*/ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { - struct sock *lsk = tcp_rsk(req)->listener; - struct fastopen_queue *fastopenq = - inet_csk(lsk)->icsk_accept_queue.fastopenq; + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq; tcp_sk(sk)->fastopen_rsk = NULL; spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; @@ -179,8 +182,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); - reqsk_free(req); + reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. @@ -190,7 +192,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, * * For more details see CoNext'11 "TCP Fast Open" paper. 
*/ - req->expires = jiffies + 60*HZ; + req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else @@ -201,5 +203,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); - sock_put(lsk); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7ebed55b5f7d..dc004b1e1f85 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -818,7 +818,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + - nla_total_size(sizeof(struct ifla_vf_link_state))); + nla_total_size(sizeof(struct ifla_vf_link_state)) + + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size(sizeof(__u64))); return size; } else return 0; @@ -982,19 +995,41 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_item_id psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - err 
= netdev_switch_parent_id_get(dev, &psid); + err = switchdev_port_attr_get(dev, &attr); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len, + attr.u.ppid.id)) return -EMSGSIZE; return 0; @@ -1037,8 +1072,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink)) || + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || (upper_dev && nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || @@ -1072,6 +1107,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_port_name_fill(skb, dev)) + goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; @@ -1097,7 +1135,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, && (ext_filter_mask & RTEXT_FILTER_VF)) { int i; - struct nlattr *vfinfo, *vf; + struct nlattr *vfinfo, *vf, *vfstats; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); @@ -1111,14 +1149,17 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_stats vf_stats; /* * Not all SR-IOV capable drivers support the - * spoofcheck query. Preset to -1 so the user - * space tool can detect that the driver didn't - * report anything. + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. 
*/ ivi.spoofchk = -1; + ivi.rss_query_en = -1; memset(ivi.mac, 0, sizeof(ivi.mac)); /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero @@ -1131,7 +1172,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = - vf_linkstate.vf = ivi.vf; + vf_linkstate.vf = + vf_rss_query_en.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; @@ -1141,6 +1183,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -1155,8 +1198,35 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate)) + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en)) goto nla_put_failure; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, i, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + goto nla_put_failure; + nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); } nla_nest_end(skb, vfinfo); @@ -1175,7 
+1245,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id(dev_net(dev), link_net); + int id = peernet2id_alloc(dev_net(dev), link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) goto nla_put_failure; @@ -1258,10 +1328,6 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; -static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { - [IFLA_VF_INFO] = { .type = NLA_NESTED }, -}; - static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, @@ -1269,6 +1335,17 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, + [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, + [IFLA_VF_STATS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { + [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, + [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { @@ -1407,85 +1484,98 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return 0; } -static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { - int rem, err = -EINVAL; - struct nlattr *vf; const struct net_device_ops *ops = 
dev->netdev_ops; + int err = -EINVAL; - nla_for_each_nested(vf, attr, rem) { - switch (nla_type(vf)) { - case IFLA_VF_MAC: { - struct ifla_vf_mac *ivm; - ivm = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_mac) - err = ops->ndo_set_vf_mac(dev, ivm->vf, - ivm->mac); - break; - } - case IFLA_VF_VLAN: { - struct ifla_vf_vlan *ivv; - ivv = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_vlan) - err = ops->ndo_set_vf_vlan(dev, ivv->vf, - ivv->vlan, - ivv->qos); - break; - } - case IFLA_VF_TX_RATE: { - struct ifla_vf_tx_rate *ivt; - struct ifla_vf_info ivf; - ivt = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_get_vf_config) - err = ops->ndo_get_vf_config(dev, ivt->vf, - &ivf); - if (err) - break; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivf.min_tx_rate, - ivt->rate); - break; - } - case IFLA_VF_RATE: { - struct ifla_vf_rate *ivt; - ivt = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivt->min_tx_rate, - ivt->max_tx_rate); - break; - } - case IFLA_VF_SPOOFCHK: { - struct ifla_vf_spoofchk *ivs; - ivs = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_spoofchk) - err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, - ivs->setting); - break; - } - case IFLA_VF_LINK_STATE: { - struct ifla_vf_link_state *ivl; - ivl = nla_data(vf); - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_link_state) - err = ops->ndo_set_vf_link_state(dev, ivl->vf, - ivl->link_state); - break; - } - default: - err = -EINVAL; - break; - } - if (err) - break; + if (tb[IFLA_VF_MAC]) { + struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN]) { + struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, + ivv->qos); + 
if (err < 0) + return err; + } + + if (tb[IFLA_VF_TX_RATE]) { + struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); + struct ifla_vf_info ivf; + + err = -EOPNOTSUPP; + if (ops->ndo_get_vf_config) + err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); + if (err < 0) + return err; + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, + ivt->rate); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_RATE]) { + struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_rate) + err = ops->ndo_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, + ivt->max_tx_rate); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_SPOOFCHK]) { + struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_spoofchk) + err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, + ivs->setting); + if (err < 0) + return err; } + + if (tb[IFLA_VF_LINK_STATE]) { + struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_link_state) + err = ops->ndo_set_vf_link_state(dev, ivl->vf, + ivl->link_state); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_RSS_QUERY_EN]) { + struct ifla_vf_rss_query_en *ivrssq_en; + + err = -EOPNOTSUPP; + ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); + if (ops->ndo_set_vf_rss_query_en) + err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, + ivrssq_en->setting); + if (err < 0) + return err; + } + return err; } @@ -1681,14 +1771,21 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { - if (nla_type(attr) != IFLA_VF_INFO) { + if (nla_type(attr) != IFLA_VF_INFO || + nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } - err = do_setvfinfo(dev, attr); + err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr, + 
ifla_vf_policy); + if (err < 0) + goto errout; + err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; @@ -1707,10 +1804,13 @@ static int do_setlink(const struct sk_buff *skb, goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { - if (nla_type(attr) != IFLA_VF_PORT) - continue; - err = nla_parse_nested(port, IFLA_PORT_MAX, - attr, ifla_port_policy); + if (nla_type(attr) != IFLA_VF_PORT || + nla_len(attr) < NLA_HDRLEN) { + err = -EINVAL; + goto errout; + } + err = nla_parse_nested(port, IFLA_PORT_MAX, attr, + ifla_port_policy); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { @@ -1815,6 +1915,42 @@ errout: return err; } +static int rtnl_group_dellink(const struct net *net, int group) +{ + struct net_device *dev, *aux; + LIST_HEAD(list_kill); + bool found = false; + + if (!group) + return -EPERM; + + for_each_netdev(net, dev) { + if (dev->group == group) { + const struct rtnl_link_ops *ops; + + found = true; + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + } + } + + if (!found) + return -ENODEV; + + for_each_netdev_safe(net, dev, aux) { + if (dev->group == group) { + const struct rtnl_link_ops *ops; + + ops = dev->rtnl_link_ops; + ops->dellink(dev, &list_kill); + } + } + unregister_netdevice_many(&list_kill); + + return 0; +} + static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -1838,6 +1974,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) dev = __dev_get_by_name(net, ifname); + else if (tb[IFLA_GROUP]) + return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP])); else return -EINVAL; @@ -1873,7 +2011,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, - char *ifname, unsigned char name_assign_type, + const char 
*ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; @@ -2337,6 +2475,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, { struct sk_buff *skb; + if (dev->reg_state != NETREG_REGISTERED) + return; + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); @@ -2345,7 +2486,7 @@ EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, - u8 *addr, u32 pid, u32 seq, + u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags) { @@ -2367,6 +2508,9 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; + if (vid) + if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2381,7 +2525,7 @@ static inline size_t rtnl_fdb_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); } -static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type) +static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2391,7 +2535,8 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type) if (!skb) goto errout; - err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0); + err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, + 0, 0, type, NTF_SELF, 0); if (err < 0) { kfree_skb(skb); goto errout; @@ -2526,7 +2671,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) nlh->nlmsg_flags); if (!err) { - rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH); + rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2627,7 +2772,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { - rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); + 
rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2652,7 +2797,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, if (*idx < cb->args[0]) goto skip; - err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, + err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI); @@ -2695,7 +2840,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; struct nlattr *tb[IFLA_MAX+1]; - struct net_device *bdev = NULL; struct net_device *br_dev = NULL; const struct net_device_ops *ops = NULL; const struct net_device_ops *cops = NULL; @@ -2719,7 +2863,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) return -ENODEV; ops = br_dev->netdev_ops; - bdev = br_dev; } for_each_netdev(net, dev) { @@ -2732,7 +2875,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) cops = br_dev->netdev_ops; } - bdev = dev; } else { if (dev != br_dev && !(dev->priv_flags & IFF_BRIDGE_PORT)) @@ -2742,7 +2884,6 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) !(dev->priv_flags & IFF_EBRIDGE)) continue; - bdev = br_dev; cops = ops; } @@ -2775,7 +2916,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask) + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; @@ -2783,8 +2928,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct nlattr *protinfo; u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); + int err = 0; - nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); + nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; @@ -2804,8 +2950,8 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); @@ -2823,6 +2969,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, goto nla_put_failure; } } + if (vlan_fill) { + err = vlan_fill(skb, dev, filter_mask); + if (err) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); @@ -2856,9 +3009,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return 0; nla_put_failure: nlmsg_cancel(skb, nlh); - return -EMSGSIZE; + return err ? 
err : -EMSGSIZE; } -EXPORT_SYMBOL(ndo_dflt_bridge_getlink); +EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2890,7 +3043,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask) < 0) + skb, portid, seq, dev, filter_mask, + NLM_F_MULTI) < 0) break; idx++; } @@ -2898,7 +3052,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0] && ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask) < 0) + filter_mask, + NLM_F_MULTI) < 0) break; idx++; } @@ -2939,7 +3094,7 @@ static int rtnl_bridge_notify(struct net_device *dev) goto errout; } - err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); + err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 51dd3193a33e..fd3ce461fbe6 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -154,7 +154,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + daddr[i]; + secret[i] = net_secret[i] + (__force u32)daddr[i]; secret[4] = net_secret[4] + (((__force u16)sport << 16) + (__force u16)dport); for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8e4ac97c8477..b6a19ca0f99e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -280,13 +280,14 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer + * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * @frag_size: size of data, or 0 if 
head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator. + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb); * before giving packet to stack. * RX rings only contains data buffers, not full skbs. */ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct skb_shared_info *shinfo; struct sk_buff *skb; @@ -311,7 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - skb->head_frag = frag_size != 0; atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -328,95 +328,37 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) return skb; } -EXPORT_SYMBOL(build_skb); - -struct netdev_alloc_cache { - struct page_frag frag; - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. - */ - unsigned int pagecnt_bias; -}; -static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); - -static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, - gfp_t gfp_mask) -{ - const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; - struct page *page = NULL; - gfp_t gfp = gfp_mask; - - if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); - nc->frag.size = PAGE_SIZE << (page ? 
order : 0); - } - - if (unlikely(!page)) - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - - nc->frag.page = page; - - return page; -} -static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, - unsigned int fragsz, gfp_t gfp_mask) +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head and skb->pfmemalloc + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc() + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) { - struct netdev_alloc_cache *nc = this_cpu_ptr(cache); - struct page *page = nc->frag.page; - unsigned int size; - int offset; - - if (unlikely(!page)) { -refill: - page = __page_frag_refill(nc, gfp_mask); - if (!page) - return NULL; - - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - - /* Even if we own the page, we do not use atomic_set(). - * This would break get_page_unless_zero() users. - */ - atomic_add(size - 1, &page->_count); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; - nc->frag.offset = size; - } + struct sk_buff *skb = __build_skb(data, frag_size); - offset = nc->frag.offset - fragsz; - if (unlikely(offset < 0)) { - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) - goto refill; - - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? 
nc->frag.size : PAGE_SIZE; - - /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; - offset = size - fragsz; + if (skb && frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; } - - nc->pagecnt_bias--; - nc->frag.offset = offset; - - return page_address(page) + offset; + return skb; } +EXPORT_SYMBOL(build_skb); + +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { + struct page_frag_cache *nc; unsigned long flags; void *data; local_irq_save(flags); - data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -436,7 +378,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + return __alloc_page_frag(nc, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -446,76 +390,70 @@ void *napi_alloc_frag(unsigned int fragsz) EXPORT_SYMBOL(napi_alloc_frag); /** - * __alloc_rx_skb - allocate an skbuff for rx + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb - * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for - * allocations in case we have to fallback to __alloc_skb() - * If SKB_ALLOC_NAPI is set, page fragment will be allocated - * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. 
The - * buffer has unspecified headroom built in. Users should allocate + * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ -static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, - int flags) +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, + gfp_t gfp_mask) { - struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct page_frag_cache *nc; + unsigned long flags; + struct sk_buff *skb; + bool pfmemalloc; + void *data; - if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data; + len += NET_SKB_PAD; - if (sk_memalloc_socks()) - gfp_mask |= __GFP_MEMALLOC; + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } - data = (flags & SKB_ALLOC_NAPI) ? - __napi_alloc_frag(fragsz, gfp_mask) : - __netdev_alloc_frag(fragsz, gfp_mask); + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); - if (likely(data)) { - skb = build_skb(data, fragsz); - if (unlikely(!skb)) - put_page(virt_to_head_page(data)); - } - } else { - skb = __alloc_skb(length, gfp_mask, - SKB_ALLOC_RX, NUMA_NO_NODE); - } - return skb; -} + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; -/** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has NET_SKB_PAD headroom built in. 
Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. - */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) -{ - struct sk_buff *skb; + local_irq_save(flags); - length += NET_SKB_PAD; - skb = __alloc_rx_skb(length, gfp_mask, 0); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; + local_irq_restore(flags); + + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; } + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + +skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); @@ -533,19 +471,49 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. 
*/ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, - unsigned int length, gfp_t gfp_mask) +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, + gfp_t gfp_mask) { + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; + void *data; - length += NET_SKB_PAD + NET_IP_ALIGN; - skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + len += NET_SKB_PAD + NET_IP_ALIGN; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - skb->dev = napi->dev; + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; } + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + data = __alloc_page_frag(nc, len, gfp_mask); + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; + } + + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (nc->pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + +skb_fail: return skb; } EXPORT_SYMBOL(__napi_alloc_skb); @@ -593,10 +561,12 @@ static void skb_clone_fraglist(struct sk_buff *skb) static void skb_free_head(struct sk_buff *skb) { + unsigned char *head = skb->head; + if (skb->head_frag) - put_page(virt_to_head_page(skb->head)); + skb_free_frag(head); else - kfree(skb->head); + kfree(head); } static void skb_release_data(struct sk_buff *skb) @@ -1900,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + /* Drop the socket lock, otherwise we have reverse + * locking 
dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. It does NOT handle frag lists within * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly. */ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -1921,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; /* @@ -1944,23 +1937,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, } done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. 
- */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - } + if (spd.nr_pages) + ret = splice_cb(sk, pipe, &spd); return ret; } +EXPORT_SYMBOL_GPL(skb_splice_bits); /** * skb_store_bits - store bits from kernel buffer to skb @@ -2865,7 +2847,6 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) * @from: search offset * @to: search limit * @config: textsearch configuration - * @state: uninitialized textsearch state variable * * Finds a pattern in the skb data according to the specified * textsearch configuration. Use textsearch_next() to retrieve @@ -2873,17 +2854,17 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) * to the first occurrence or UINT_MAX if no match was found. */ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, - unsigned int to, struct ts_config *config, - struct ts_state *state) + unsigned int to, struct ts_config *config) { + struct ts_state state; unsigned int ret; config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; - skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); - ret = textsearch_find(config, state); + ret = textsearch_find(config, &state); return (ret <= to - from ? 
ret : UINT_MAX); } EXPORT_SYMBOL(skb_find_text); @@ -2946,6 +2927,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL(skb_append_datato_frags); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update @@ -3207,10 +3206,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); - struct sk_buff *nskb, *lp, *p = *head; unsigned int len = skb_gro_len(skb); + struct sk_buff *lp, *p = *head; unsigned int delta_truesize; - unsigned int headroom; if (unlikely(p->len + len >= 65536)) return -E2BIG; @@ -3277,48 +3275,6 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } - /* switch back to head shinfo */ - pinfo = skb_shinfo(p); - - if (pinfo->frag_list) - goto merge; - if (skb_gro_len(p) != pinfo->gso_size) - return -E2BIG; - - headroom = skb_headroom(p); - nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); - if (unlikely(!nskb)) - return -ENOMEM; - - __copy_skb_header(nskb, p); - nskb->mac_len = p->mac_len; - - skb_reserve(nskb, headroom); - __skb_put(nskb, skb_gro_offset(p)); - - skb_set_mac_header(nskb, skb_mac_header(p) - p->data); - skb_set_network_header(nskb, skb_network_offset(p)); - skb_set_transport_header(nskb, skb_transport_offset(p)); - - __skb_pull(p, skb_gro_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), - 
p->data - skb_mac_header(p)); - - skb_shinfo(nskb)->frag_list = p; - skb_shinfo(nskb)->gso_size = pinfo->gso_size; - pinfo->gso_size = 0; - __skb_header_release(p); - NAPI_GRO_CB(nskb)->last = p; - - nskb->data_len += p->len; - nskb->truesize += p->truesize; - nskb->len += p->len; - - *head = nskb; - nskb->next = p->next; - p->next = NULL; - - p = nskb; merge: delta_truesize = skb->truesize; @@ -3796,7 +3752,6 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); - /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set @@ -4057,6 +4012,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup); +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. 
+ */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) { + kfree_skb(skb); + return NULL; + } else if (skb->len == len) { + return skb; + } + + skb_chk = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. 
+ */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + __sum16 ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + return NULL; + + if (!pskb_may_pull(skb_chk, offset)) { + kfree_skb(skb_chk); + return NULL; + } + + __skb_pull(skb_chk, offset); + ret = skb_chkf(skb_chk); + __skb_push(skb_chk, offset); + + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", @@ -4169,19 +4211,21 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - if (xnet) - skb_orphan(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - skb->mark = 0; skb_sender_cpu_clear(skb); - skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); nf_reset_trace(skb); + + if (!xnet) + return; + + skb_orphan(skb); + skb->mark = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); @@ -4423,7 +4467,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, while (order) { if (npages >= 1 << order) { - page = alloc_pages(gfp_mask | + page = alloc_pages((gfp_mask & ~__GFP_WAIT) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, diff --git a/net/core/sock.c b/net/core/sock.c index 71e3e5f1eaa0..193901d09757 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -131,6 +131,7 @@ #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> +#include <linux/sock_diag.h> #include <linux/filter.h> @@ -354,15 +355,12 @@ void sk_clear_memalloc(struct sock *sk) /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward - * progress of swapping. 
However, if SOCK_MEMALLOC is cleared while - * it has rmem allocations there is a risk that the user of the - * socket cannot make forward progress due to exceeding the rmem - * limits. By rights, sk_clear_memalloc() should only be called - * on sockets being torn down but warn and reset the accounting if - * that assumption breaks. + * progress of swapping. SOCK_MEMALLOC may be cleared while + * it has rmem allocations due to the last swapfile being deactivated + * but there is a risk that the socket is unusable due to exceeding + * the rmem limits. Reclaim the reserves and obey rmem limits again. */ - if (WARN_ON(sk->sk_forward_alloc)) - sk_mem_reclaim(sk); + sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); @@ -466,7 +464,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb_dst_force(skb); spin_lock_irqsave(&list->lock, flags); - skb->dropcount = atomic_read(&sk->sk_drops); + sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock_irqrestore(&list->lock, flags); @@ -947,8 +945,6 @@ set_rcvbuf: sk->sk_mark = val; break; - /* We implement the SO_SNDLOWAT etc to - not be settable (1003.1g 5.3) */ case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -1253,6 +1249,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; default: + /* We implement the SO_SNDLOWAT etc to not be settable + * (1003.1g 7). + */ return -ENOPROTOOPT; } @@ -1395,9 +1394,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx); * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? 
*/ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; @@ -1410,7 +1410,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1421,7 +1424,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +void sk_destruct(struct sock *sk) { struct sk_filter *filter; @@ -1444,10 +1447,19 @@ static void __sk_free(struct sock *sk) if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } +static void __sk_free(struct sock *sk) +{ + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* @@ -1460,26 +1472,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. 
- */ -void sk_release_kernel(struct sock *sk) -{ - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_release(sk->sk_socket); - release_net(sock_net(sk)); - sock_net_set(sk, get_net(&init_net)); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); - static void sk_update_clone(const struct sock *sk, struct sock *newsk) { if (mem_cgroup_sockets_enabled && sk->sk_cgrp) @@ -1505,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); @@ -1557,6 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); + atomic64_set(&newsk->sk_cookie, 0); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -1594,6 +1588,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + u32 max_segs = 1; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1605,9 +1601,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1684,19 +1681,6 @@ void sock_efree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_efree); -#ifdef CONFIG_INET -void sock_edemux(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put(inet_twsk(sk)); - else - sock_put(sk); -} -EXPORT_SYMBOL(sock_edemux); -#endif - kuid_t sock_i_uid(struct sock *sk) { 
kuid_t uid; @@ -1895,7 +1879,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER) { - pfrag->page = alloc_pages(gfp | __GFP_COMP | + pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { @@ -1984,20 +1968,21 @@ static void __release_sock(struct sock *sk) * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long + * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ -int sk_wait_data(struct sock *sk, long *timeo) +int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { int rc; DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); finish_wait(sk_sleep(sk), &wait); return rc; @@ -2095,12 +2080,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_reclaim - reclaim memory_allocated * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reclaim(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + amount >>= SK_MEM_QUANTUM_SHIFT; + sk_memory_allocated_sub(sk, amount); + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) @@ -2186,15 +2172,14 @@ int 
sock_no_getsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(sock_no_getsockopt); -int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t len) +int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_sendmsg); -int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, - size_t len, int flags) +int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, + int flags) { return -EOPNOTSUPP; } @@ -2286,7 +2271,6 @@ static void sock_def_write_space(struct sock *sk) static void sock_def_destruct(struct sock *sk) { - kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) @@ -2566,14 +2550,14 @@ int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, EXPORT_SYMBOL(compat_sock_common_getsockopt); #endif -int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; - err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; @@ -2750,6 +2734,42 @@ static inline void release_proto_idx(struct proto *prot) } #endif +static void req_prot_cleanup(struct request_sock_ops *rsk_prot) +{ + if (!rsk_prot) + return; + kfree(rsk_prot->slab_name); + rsk_prot->slab_name = NULL; + if (rsk_prot->slab) { + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; + } +} + +static int req_prot_init(const struct proto *prot) +{ + struct request_sock_ops *rsk_prot = prot->rsk_prot; + + if (!rsk_prot) + return 0; + + rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", + prot->name); + if (!rsk_prot->slab_name) + return -ENOMEM; + + rsk_prot->slab = 
kmem_cache_create(rsk_prot->slab_name, + rsk_prot->obj_size, 0, + 0, NULL); + + if (!rsk_prot->slab) { + pr_crit("%s: Can't create request sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + return 0; +} + int proto_register(struct proto *prot, int alloc_slab) { if (alloc_slab) { @@ -2763,21 +2783,8 @@ int proto_register(struct proto *prot, int alloc_slab) goto out; } - if (prot->rsk_prot != NULL) { - prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); - if (prot->rsk_prot->slab_name == NULL) - goto out_free_sock_slab; - - prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, - prot->rsk_prot->obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); - - if (prot->rsk_prot->slab == NULL) { - pr_crit("%s: Can't create request sock SLAB cache!\n", - prot->name); - goto out_free_request_sock_slab_name; - } - } + if (req_prot_init(prot)) + goto out_free_request_sock_slab; if (prot->twsk_prot != NULL) { prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); @@ -2789,8 +2796,7 @@ int proto_register(struct proto *prot, int alloc_slab) kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, - SLAB_HWCACHE_ALIGN | - prot->slab_flags, + prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; @@ -2806,14 +2812,8 @@ int proto_register(struct proto *prot, int alloc_slab) out_free_timewait_sock_slab_name: kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: - if (prot->rsk_prot && prot->rsk_prot->slab) { - kmem_cache_destroy(prot->rsk_prot->slab); - prot->rsk_prot->slab = NULL; - } -out_free_request_sock_slab_name: - if (prot->rsk_prot) - kfree(prot->rsk_prot->slab_name); -out_free_sock_slab: + req_prot_cleanup(prot->rsk_prot); + kmem_cache_destroy(prot->slab); prot->slab = NULL; out: @@ -2833,11 +2833,7 @@ void proto_unregister(struct proto *prot) prot->slab = NULL; } - if (prot->rsk_prot != NULL && prot->rsk_prot->slab != 
NULL) { - kmem_cache_destroy(prot->rsk_prot->slab); - kfree(prot->rsk_prot->slab_name); - prot->rsk_prot->slab = NULL; - } + req_prot_cleanup(prot->rsk_prot); if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { kmem_cache_destroy(prot->twsk_prot->twsk_slab); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index ad704c757bb4..d79866c5f8bc 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -5,6 +5,9 @@ #include <net/net_namespace.h> #include <linux/module.h> #include <net/sock.h> +#include <linux/kernel.h> +#include <linux/tcp.h> +#include <linux/workqueue.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> @@ -12,23 +15,41 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); +static struct workqueue_struct *broadcast_wq; -int sock_diag_check_cookie(void *sk, __u32 *cookie) +static u64 sock_gen_cookie(struct sock *sk) { - if ((cookie[0] != INET_DIAG_NOCOOKIE || - cookie[1] != INET_DIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) - return -ESTALE; - else + while (1) { + u64 res = atomic64_read(&sk->sk_cookie); + + if (res) + return res; + res = atomic64_inc_return(&sock_net(sk)->cookie_gen); + atomic64_cmpxchg(&sk->sk_cookie, 0, res); + } +} + +int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) +{ + u64 res; + + if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE) return 0; + + res = sock_gen_cookie(sk); + if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1]) + return -ESTALE; + + return 0; } EXPORT_SYMBOL_GPL(sock_diag_check_cookie); -void sock_diag_save_cookie(void *sk, __u32 *cookie) +void sock_diag_save_cookie(struct sock *sk, __u32 *cookie) { - cookie[0] = (u32)(unsigned long)sk; - cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + u64 res = sock_gen_cookie(sk); + + cookie[0] 
= (u32)res; + cookie[1] = (u32)(res >> 32); } EXPORT_SYMBOL_GPL(sock_diag_save_cookie); @@ -84,6 +105,62 @@ out: } EXPORT_SYMBOL(sock_diag_put_filterinfo); +struct broadcast_sk { + struct sock *sk; + struct work_struct work; +}; + +static size_t sock_diag_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ +} + +static void sock_diag_broadcast_destroy_work(struct work_struct *work) +{ + struct broadcast_sk *bsk = + container_of(work, struct broadcast_sk, work); + struct sock *sk = bsk->sk; + const struct sock_diag_handler *hndl; + struct sk_buff *skb; + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + int err = -1; + + WARN_ON(group == SKNLGRP_NONE); + + skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out; + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[sk->sk_family]; + if (hndl && hndl->get_info) + err = hndl->get_info(skb, sk); + mutex_unlock(&sock_diag_table_mutex); + + if (!err) + nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, + GFP_KERNEL); + else + kfree_skb(skb); +out: + sk_destruct(sk); + kfree(bsk); +} + +void sock_diag_broadcast_destroy(struct sock *sk) +{ + /* Note, this function is often called from an interrupt context. 
*/ + struct broadcast_sk *bsk = + kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + if (!bsk) + return sk_destruct(sk); + bsk->sk = sk; + INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); + queue_work(broadcast_wq, &bsk->work); +} + void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); @@ -194,10 +271,32 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } +static int sock_diag_bind(struct net *net, int group) +{ + switch (group) { + case SKNLGRP_INET_TCP_DESTROY: + case SKNLGRP_INET_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + case SKNLGRP_INET6_TCP_DESTROY: + case SKNLGRP_INET6_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET6]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + } + return 0; +} + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { + .groups = SKNLGRP_MAX, .input = sock_diag_rcv, + .bind = sock_diag_bind, + .flags = NL_CFG_F_NONROOT_RECV, }; net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); @@ -217,12 +316,15 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { + broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } static void __exit sock_diag_exit(void) { unregister_pernet_subsys(&diag_net_ops); + destroy_workqueue(broadcast_wq); } module_init(sock_diag_init); diff --git a/net/core/stream.c b/net/core/stream.c index 301c05f26060..d70f77a0c889 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -119,6 +119,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; + bool noblock = (*timeo_p ? 
false : true); DEFINE_WAIT(wait); if (sk_stream_memory_free(sk)) @@ -131,8 +132,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) + if (!*timeo_p) { + if (noblock) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); goto do_nonblock; + } if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8ce351ffceb1..95b6139d710c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,7 +24,6 @@ static int zero = 0; static int one = 1; -static int ushort_max = USHRT_MAX; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; @@ -403,7 +402,6 @@ static struct ctl_table netns_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .extra1 = &zero, - .extra2 = &ushort_max, .proc_handler = proc_dointvec_minmax }, { } diff --git a/net/core/utils.c b/net/core/utils.c index 7b803884c162..a7732a068043 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -304,13 +304,15 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, int pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), - to)); + csum_replace4(sum, from, to); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); + skb->csum = ~csum_add(csum_sub(~(skb->csum), + (__force __wsum)from), + (__force __wsum)to); } else if (pseudohdr) - *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), - to)); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), + (__force __wsum)from), + (__force __wsum)to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 93ea80196f0e..5b21f6f88e97 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -177,6 +177,8 @@ static 
const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, + [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, + [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, }; static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = { @@ -1030,7 +1032,7 @@ nla_put_failure: return err; } -/* Handle IEEE 802.1Qaz GET commands. */ +/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *ieee, *app; @@ -1067,6 +1069,32 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) } } + if (ops->ieee_getqcn) { + struct ieee_qcn qcn; + + memset(&qcn, 0, sizeof(qcn)); + err = ops->ieee_getqcn(netdev, &qcn); + if (!err) { + err = nla_put(skb, DCB_ATTR_IEEE_QCN, + sizeof(qcn), &qcn); + if (err) + return -EMSGSIZE; + } + } + + if (ops->ieee_getqcnstats) { + struct ieee_qcn_stats qcn_stats; + + memset(&qcn_stats, 0, sizeof(qcn_stats)); + err = ops->ieee_getqcnstats(netdev, &qcn_stats); + if (!err) { + err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, + sizeof(qcn_stats), &qcn_stats); + if (err) + return -EMSGSIZE; + } + } + if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); @@ -1379,8 +1407,9 @@ int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, } EXPORT_SYMBOL(dcbnl_cee_notify); -/* Handle IEEE 802.1Qaz SET commands. If any requested operation can not - * be completed the entire msg is aborted and error value is returned. +/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. + * If any requested operation can not be completed + * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. 
*/ @@ -1417,6 +1446,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, goto err; } + if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { + struct ieee_qcn *qcn = + nla_data(ieee[DCB_ATTR_IEEE_QCN]); + + err = ops->ieee_setqcn(netdev, qcn); + if (err) + goto err; + } + if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e4c144fa706f..bebc735f5afc 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -280,8 +280,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct request_sock **prev); + struct request_sock *req); int dccp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); @@ -310,16 +309,15 @@ int compat_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); #endif int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); -int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size); -int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int nonblock, int flags, - int *addr_len); +int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); unsigned int dccp_poll(struct file *file, struct socket *sock, poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +void dccp_req_err(struct sock *sk, u64 seq); struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); int dccp_send_reset(struct sock *sk, enum 
dccp_reset_codes code); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 028fc43aacbd..2d84303ea6bf 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -49,13 +49,14 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, struct nlattr *bc) { inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); } -static int dccp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) +static int dccp_diag_dump_one(struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) { return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req); } @@ -65,6 +66,7 @@ static const struct inet_diag_handler dccp_diag_handler = { .dump_one = dccp_diag_dump_one, .idiag_get_info = dccp_diag_get_info, .idiag_type = IPPROTO_DCCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init dccp_diag_init(void) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e45b968613a4..ccf4c5629b3c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -89,10 +89,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (inet->inet_saddr == 0) inet->inet_saddr = fl4->saddr; - inet->inet_rcv_saddr = inet->inet_saddr; - + sk_rcv_saddr_set(sk, inet->inet_saddr); inet->inet_dport = usin->sin_port; - inet->inet_daddr = daddr; + sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) @@ -196,6 +195,32 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) dst->ops->redirect(dst, sk, skb); } +void dccp_req_err(struct sock *sk, u64 seq) + { + struct request_sock *req = inet_reqsk(sk); + struct net *net = sock_net(sk); + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. 
+ */ + WARN_ON(req->sk); + + if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + reqsk_put(req); + } else { + /* + * Still in RESPOND, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + } +} +EXPORT_SYMBOL(dccp_req_err); + /* * This routine is called by the ICMP module when it gets some sort of error * condition. If err < 0 then the socket should be closed and the error @@ -228,10 +253,11 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = inet_lookup(net, &dccp_hashinfo, - iph->daddr, dh->dccph_dport, - iph->saddr, dh->dccph_sport, inet_iif(skb)); - if (sk == NULL) { + sk = __inet_lookup_established(net, &dccp_hashinfo, + iph->daddr, dh->dccph_dport, + iph->saddr, ntohs(dh->dccph_sport), + inet_iif(skb)); + if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } @@ -240,6 +266,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) inet_twsk_put(inet_twsk(sk)); return; } + seq = dccp_hdr_seq(dh); + if (sk->sk_state == DCCP_NEW_SYN_RECV) + return dccp_req_err(sk, seq); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -252,7 +281,6 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) goto out; dp = dccp_sk(sk); - seq = dccp_hdr_seq(dh); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); @@ -289,35 +317,6 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req , **prev; - case DCCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - req = inet_csk_search_req(sk, &prev, dh->dccph_dport, - iph->daddr, iph->saddr); - if (!req) - goto out; - - /* - * ICMPs are not backlogged, hence we cannot get an 
established - * socket here. - */ - WARN_ON(req->sk); - - if (!between48(seq, dccp_rsk(req)->dreq_iss, - dccp_rsk(req)->dreq_gss)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - /* - * Still in RESPOND, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(sk, req, prev); - goto out; - case DCCP_REQUESTING: case DCCP_RESPOND: if (!sock_owned_by_user(sk)) { @@ -408,8 +407,8 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; newinet->inet_opt = ireq->opt; ireq->opt = NULL; @@ -449,14 +448,15 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) const struct dccp_hdr *dh = dccp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *nsk; - struct request_sock **prev; /* Find possible connection requests. 
*/ - struct request_sock *req = inet_csk_search_req(sk, &prev, - dh->dccph_sport, + struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport, iph->saddr, iph->daddr); - if (req != NULL) - return dccp_check_req(sk, skb, req, prev); - + if (req) { + nsk = dccp_check_req(sk, skb, req); + if (!nsk) + reqsk_put(req); + return nsk; + } nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, iph->saddr, dh->dccph_sport, iph->daddr, dh->dccph_dport, @@ -575,7 +575,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +void dccp_syn_ack_timeout(const struct request_sock *req) { } EXPORT_SYMBOL(dccp_syn_ack_timeout); @@ -624,7 +624,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp_request_sock_ops); + req = inet_reqsk_alloc(&dccp_request_sock_ops, sk); if (req == NULL) goto drop; @@ -639,8 +639,10 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; ireq = inet_rsk(req); - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + ireq->ireq_family = AF_INET; + ireq->ir_iif = sk->sk_bound_dev_if; /* * Step 3: Process LISTEN state diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6bcaa33cd804..5165571f397a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -40,19 +40,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; -static void dccp_v6_hash(struct sock *sk) -{ - if (sk->sk_state != DCCP_CLOSED) { - if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { - inet_hash(sk); - return; - } - local_bh_disable(); - __inet6_hash(sk, NULL); - local_bh_enable(); - } -} - /* add 
pseudo-header to DCCP checksum stored in skb->csum */ static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, const struct in6_addr *saddr, @@ -98,11 +85,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } - sk = inet6_lookup(net, &dccp_hashinfo, - &hdr->daddr, dh->dccph_dport, - &hdr->saddr, dh->dccph_sport, inet6_iif(skb)); + sk = __inet6_lookup_established(net, &dccp_hashinfo, + &hdr->daddr, dh->dccph_dport, + &hdr->saddr, ntohs(dh->dccph_sport), + inet6_iif(skb)); - if (sk == NULL) { + if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; @@ -112,6 +100,9 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, inet_twsk_put(inet_twsk(sk)); return; } + seq = dccp_hdr_seq(dh); + if (sk->sk_state == DCCP_NEW_SYN_RECV) + return dccp_req_err(sk, seq); bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -121,7 +112,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; dp = dccp_sk(sk); - seq = dccp_hdr_seq(dh); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); @@ -162,32 +152,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Might be for an request_sock */ switch (sk->sk_state) { - struct request_sock *req, **prev; - case DCCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - req = inet6_csk_search_req(sk, &prev, dh->dccph_dport, - &hdr->daddr, &hdr->saddr, - inet6_iif(skb)); - if (req == NULL) - goto out; - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - WARN_ON(req->sk != NULL); - - if (!between48(seq, dccp_rsk(req)->dreq_iss, - dccp_rsk(req)->dreq_gss)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - inet_csk_reqsk_queue_drop(sk, req, prev); - goto out; - case DCCP_REQUESTING: case DCCP_RESPOND: /* Cannot happen. 
It can, it SYNs are crossed. --ANK */ @@ -330,17 +294,17 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); const struct ipv6hdr *iph = ipv6_hdr(skb); + struct request_sock *req; struct sock *nsk; - struct request_sock **prev; - /* Find possible connection requests. */ - struct request_sock *req = inet6_csk_search_req(sk, &prev, - dh->dccph_sport, - &iph->saddr, - &iph->daddr, - inet6_iif(skb)); - if (req != NULL) - return dccp_check_req(sk, skb, req, prev); + req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr, + &iph->daddr, inet6_iif(skb)); + if (req) { + nsk = dccp_check_req(sk, skb, req); + if (!nsk) + reqsk_put(req); + return nsk; + } nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, &iph->saddr, dh->dccph_sport, &iph->daddr, ntohs(dh->dccph_dport), @@ -386,7 +350,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet_reqsk_alloc(&dccp6_request_sock_ops); + req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk); if (req == NULL) goto drop; @@ -403,6 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ireq_family = AF_INET6; if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || @@ -469,11 +434,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr); - - ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - - newsk->sk_v6_rcv_saddr = newnp->saddr; + newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; newsk->sk_backlog_rcv = dccp_v4_do_rcv; @@ -591,7 +552,7 @@ static struct sock *dccp_v6_request_recv_sock(struct 
sock *sk, dccp_done(newsk); goto out; } - __inet6_hash(newsk, NULL); + __inet_hash(newsk, NULL); return newsk; @@ -916,9 +877,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_backlog_rcv = dccp_v6_do_rcv; goto failure; } - ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); - + np->saddr = sk->sk_v6_rcv_saddr; return err; } @@ -1061,7 +1020,7 @@ static struct proto dccp_v6_prot = { .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, - .hash = dccp_v6_hash, + .hash = inet_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b50dc436db1f..30addee2dd03 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -27,28 +27,16 @@ struct inet_timewait_death_row dccp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, - .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock), .hashinfo = &dccp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&dccp_death_row), - .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&dccp_death_row), }; EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; + struct inet_timewait_sock *tw; - if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &dccp_death_row, state); if (tw != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (state == DCCP_TIME_WAIT) timeo = DCCP_TIMEWAIT_LEN; - inet_twsk_schedule(tw, 
&dccp_death_row, timeo, - DCCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -152,8 +139,7 @@ EXPORT_SYMBOL_GPL(dccp_create_openreq_child); * as an request_sock. */ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct request_sock **prev) + struct request_sock *req) { struct sock *child = NULL; struct dccp_request_sock *dreq = dccp_rsk(req); @@ -200,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, if (child == NULL) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req, prev); - inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); out: return child; @@ -212,7 +197,7 @@ drop: if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) req->rsk_ops->send_reset(sk, skb); - inet_csk_reqsk_queue_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req); goto out; } diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 595ddf0459db..d8346d0eadeb 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -72,8 +72,7 @@ static void printl(const char *fmt, ...) 
wake_up(&dccpw.wait); } -static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) +static int jdccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { const struct inet_sock *inet = inet_sk(sk); struct ccid3_hc_tx_sock *hc = NULL; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e171b780b499..b5cf13a28009 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -741,8 +741,7 @@ static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) return 0; } -int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { const struct dccp_sock *dp = dccp_sk(sk); const int flags = msg->msg_flags; @@ -806,8 +805,8 @@ out_discard: EXPORT_SYMBOL_GPL(dccp_sendmsg); -int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len) +int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) { const struct dccp_hdr *dh; long timeo; @@ -887,7 +886,7 @@ verify_sock_status: break; } - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 1cd46a345cb0..3ef7acef3ce8 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -161,33 +161,11 @@ out: sock_put(sk); } -/* - * Timer for listening sockets - */ -static void dccp_response_timer(struct sock *sk) -{ - inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, - DCCP_RTO_MAX); -} - static void dccp_keepalive_timer(unsigned long data) { struct sock *sk = (struct sock *)data; - /* Only process if socket is not in use. */ - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. 
*/ - inet_csk_reset_keepalive_timer(sk, HZ / 20); - goto out; - } - - if (sk->sk_state == DCCP_LISTEN) { - dccp_response_timer(sk); - goto out; - } -out: - bh_unlock_sock(sk); + pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 810228646de3..675cf94e04f8 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -468,10 +468,10 @@ static struct proto dn_proto = { .obj_size = sizeof(struct dn_sock), }; -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp) +static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) { struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto); + struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); if (!sk) goto out; @@ -693,7 +693,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, } - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL) + if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) return -ENOBUFS; sk->sk_protocol = protocol; @@ -1096,7 +1096,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) cb = DN_SKB_CB(skb); sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation); + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); if (newsk == NULL) { release_sock(sk); kfree_skb(skb); @@ -1669,8 +1669,8 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int } -static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); @@ -1905,8 +1905,7 @@ static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk, return skb; } -static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr 
*msg, size_t size) +static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 7ca7c3143da3..4507b188fc51 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -49,41 +49,17 @@ #include <net/dn_route.h> static int dn_neigh_construct(struct neighbour *); -static void dn_long_error_report(struct neighbour *, struct sk_buff *); -static void dn_short_error_report(struct neighbour *, struct sk_buff *); -static int dn_long_output(struct neighbour *, struct sk_buff *); -static int dn_short_output(struct neighbour *, struct sk_buff *); -static int dn_phase3_output(struct neighbour *, struct sk_buff *); - - -/* - * For talking to broadcast devices: Ethernet & PPP - */ -static const struct neigh_ops dn_long_ops = { - .family = AF_DECnet, - .error_report = dn_long_error_report, - .output = dn_long_output, - .connected_output = dn_long_output, -}; +static void dn_neigh_error_report(struct neighbour *, struct sk_buff *); +static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb); /* - * For talking to pointopoint and multidrop devices: DDCMP and X.25 + * Operations for adding the link layer header. 
*/ -static const struct neigh_ops dn_short_ops = { +static const struct neigh_ops dn_neigh_ops = { .family = AF_DECnet, - .error_report = dn_short_error_report, - .output = dn_short_output, - .connected_output = dn_short_output, -}; - -/* - * For talking to DECnet phase III nodes - */ -static const struct neigh_ops dn_phase3_ops = { - .family = AF_DECnet, - .error_report = dn_short_error_report, /* Can use short version here */ - .output = dn_phase3_output, - .connected_output = dn_phase3_output, + .error_report = dn_neigh_error_report, + .output = dn_neigh_output, + .connected_output = dn_neigh_output, }; static u32 dn_neigh_hash(const void *pkey, @@ -93,11 +69,18 @@ static u32 dn_neigh_hash(const void *pkey, return jhash_2words(*(__u16 *)pkey, 0, hash_rnd[0]); } +static bool dn_key_eq(const struct neighbour *neigh, const void *pkey) +{ + return neigh_key_eq16(neigh, pkey); +} + struct neigh_table dn_neigh_table = { .family = PF_DECnet, .entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)), .key_len = sizeof(__le16), + .protocol = cpu_to_be16(ETH_P_DNA_RT), .hash = dn_neigh_hash, + .key_eq = dn_key_eq, .constructor = dn_neigh_construct, .id = "dn_neigh_cache", .parms ={ @@ -146,16 +129,9 @@ static int dn_neigh_construct(struct neighbour *neigh) __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); - - if (dn_db->use_long) - neigh->ops = &dn_long_ops; - else - neigh->ops = &dn_short_ops; rcu_read_unlock(); - if (dn->flags & DN_NDFLAG_P3) - neigh->ops = &dn_phase3_ops; - + neigh->ops = &dn_neigh_ops; neigh->nud_state = NUD_NOARP; neigh->output = neigh->ops->connected_output; @@ -187,24 +163,16 @@ static int dn_neigh_construct(struct neighbour *neigh) return 0; } -static void dn_long_error_report(struct neighbour *neigh, struct sk_buff *skb) -{ - printk(KERN_DEBUG "dn_long_error_report: called\n"); - kfree_skb(skb); -} - - -static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb) +static void dn_neigh_error_report(struct 
neighbour *neigh, struct sk_buff *skb) { - printk(KERN_DEBUG "dn_short_error_report: called\n"); + printk(KERN_DEBUG "dn_neigh_error_report: called\n"); kfree_skb(skb); } -static int dn_neigh_output_packet(struct sk_buff *skb) +static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; - struct neighbour *neigh = rt->n; struct net_device *dev = neigh->dev; char mac_addr[ETH_ALEN]; unsigned int seq; @@ -226,7 +194,20 @@ static int dn_neigh_output_packet(struct sk_buff *skb) return err; } -static int dn_long_output(struct neighbour *neigh, struct sk_buff *skb) +static int dn_neigh_output_packet(struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct dn_route *rt = (struct dn_route *)dst; + struct neighbour *neigh = rt->n; + + return neigh->output(neigh, skb); +} + +/* + * For talking to broadcast devices: Ethernet & PPP + */ +static int dn_long_output(struct neighbour *neigh, struct sock *sk, + struct sk_buff *skb) { struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; @@ -265,11 +246,15 @@ static int dn_long_output(struct neighbour *neigh, struct sk_buff *skb) skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL, - neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, + NULL, neigh->dev, dn_neigh_output_packet); } -static int dn_short_output(struct neighbour *neigh, struct sk_buff *skb) +/* + * For talking to pointopoint and multidrop devices: DDCMP and X.25 + */ +static int dn_short_output(struct neighbour *neigh, struct sock *sk, + struct sk_buff *skb) { struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; @@ -301,15 +286,17 @@ static int dn_short_output(struct neighbour *neigh, struct sk_buff *skb) skb_reset_network_header(skb); - 
return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL, - neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, + NULL, neigh->dev, dn_neigh_output_packet); } /* - * Phase 3 output is the same is short output, execpt that + * For talking to DECnet phase III nodes + * Phase 3 output is the same as short output, execpt that * it clears the area bits before transmission. */ -static int dn_phase3_output(struct neighbour *neigh, struct sk_buff *skb) +static int dn_phase3_output(struct neighbour *neigh, struct sock *sk, + struct sk_buff *skb) { struct net_device *dev = neigh->dev; int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; @@ -340,8 +327,34 @@ static int dn_phase3_output(struct neighbour *neigh, struct sk_buff *skb) skb_reset_network_header(skb); - return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL, - neigh->dev, dn_neigh_output_packet); + return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb, + NULL, neigh->dev, dn_neigh_output_packet); +} + +int dn_to_neigh_output(struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct dn_route *rt = (struct dn_route *) dst; + struct neighbour *neigh = rt->n; + struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_dev *dn_db; + bool use_long; + + rcu_read_lock(); + dn_db = rcu_dereference(neigh->dev->dn_ptr); + if (dn_db == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + use_long = dn_db->use_long; + rcu_read_unlock(); + + if (dn->flags & DN_NDFLAG_P3) + return dn_phase3_output(neigh, sk, skb); + if (use_long) + return dn_long_output(neigh, sk, skb); + else + return dn_short_output(neigh, sk, skb); } /* @@ -362,7 +375,7 @@ void dn_neigh_pointopoint_hello(struct sk_buff *skb) /* * Ethernet router hello message received */ -int dn_neigh_router_hello(struct sk_buff *skb) +int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb) { struct rtnode_hello_message *msg = (struct rtnode_hello_message 
*)skb->data; @@ -424,7 +437,7 @@ int dn_neigh_router_hello(struct sk_buff *skb) /* * Endnode hello message received */ -int dn_neigh_endnode_hello(struct sk_buff *skb) +int dn_neigh_endnode_hello(struct sock *sk, struct sk_buff *skb) { struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; struct neighbour *neigh; diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index fe5f01485d33..a321eac9fd0c 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -714,7 +714,7 @@ out: return ret; } -static int dn_nsp_rx_packet(struct sk_buff *skb) +static int dn_nsp_rx_packet(struct sock *sk2, struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); struct sock *sk = NULL; @@ -814,7 +814,8 @@ free_out: int dn_nsp_rx(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, NULL, skb, + skb->dev, NULL, dn_nsp_rx_packet); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 3b81092771f8..03227ffd19ce 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -136,7 +136,6 @@ int decnet_dst_gc_interval = 2; static struct dst_ops dn_dst_ops = { .family = PF_DECnet, - .protocol = cpu_to_be16(ETH_P_DNA_RT), .gc_thresh = 128, .gc = dn_dst_gc, .check = dn_dst_check, @@ -513,7 +512,7 @@ static int dn_return_long(struct sk_buff *skb) * * Returns: result of input function if route is found, error code otherwise */ -static int dn_route_rx_packet(struct sk_buff *skb) +static int dn_route_rx_packet(struct sock *sk, struct sk_buff *skb) { struct dn_skb_cb *cb; int err; @@ -574,7 +573,8 @@ static int dn_route_rx_long(struct sk_buff *skb) ptr++; cb->hops = *ptr++; /* Visit Count */ - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb, + skb->dev, NULL, dn_route_rx_packet); drop_it: @@ -601,7 +601,8 @@ static int dn_route_rx_short(struct sk_buff *skb) ptr += 2; 
cb->hops = *ptr & 0x3f; - return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb, + skb->dev, NULL, dn_route_rx_packet); drop_it: @@ -609,7 +610,7 @@ drop_it: return NET_RX_DROP; } -static int dn_route_discard(struct sk_buff *skb) +static int dn_route_discard(struct sock *sk, struct sk_buff *skb) { /* * I know we drop the packet here, but thats considered success in @@ -619,7 +620,7 @@ static int dn_route_discard(struct sk_buff *skb) return NET_RX_SUCCESS; } -static int dn_route_ptp_hello(struct sk_buff *skb) +static int dn_route_ptp_hello(struct sock *sk, struct sk_buff *skb) { dn_dev_hello(skb); dn_neigh_pointopoint_hello(skb); @@ -705,22 +706,22 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type switch (flags & DN_RT_CNTL_MSK) { case DN_RT_PKT_HELO: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - skb, skb->dev, NULL, + NULL, skb, skb->dev, NULL, dn_route_ptp_hello); case DN_RT_PKT_L1RT: case DN_RT_PKT_L2RT: return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE, - skb, skb->dev, NULL, + NULL, skb, skb->dev, NULL, dn_route_discard); case DN_RT_PKT_ERTH: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - skb, skb->dev, NULL, + NULL, skb, skb->dev, NULL, dn_neigh_router_hello); case DN_RT_PKT_EEDH: return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO, - skb, skb->dev, NULL, + NULL, skb, skb->dev, NULL, dn_neigh_endnode_hello); } } else { @@ -743,15 +744,6 @@ out: return NET_RX_DROP; } -static int dn_to_neigh_output(struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct dn_route *rt = (struct dn_route *) dst; - struct neighbour *n = rt->n; - - return n->output(n, skb); -} - static int dn_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -778,7 +770,8 @@ static int dn_output(struct sock *sk, struct sk_buff *skb) cb->rt_flags |= DN_RT_F_IE; cb->hops = 0; - return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, skb, NULL, dev, + return 
NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, sk, skb, + NULL, dev, dn_to_neigh_output); error: @@ -826,7 +819,8 @@ static int dn_forward(struct sk_buff *skb) if (rt->rt_flags & RTCF_DOREDIRECT) cb->rt_flags |= DN_RT_F_IE; - return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, skb, dev, skb->dev, + return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, NULL, skb, + dev, skb->dev, dn_to_neigh_output); drop: diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index e4d9560a910b..af34fc9bdf69 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -89,9 +89,7 @@ static void dnrmg_send_peer(struct sk_buff *skb) static unsigned int dnrmg_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { dnrmg_send_peer(skb); return NF_ACCEPT; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 5f8ac404535b..ff7736f7ff42 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -5,9 +5,12 @@ config HAVE_NET_DSA # Drivers must select NET_DSA and the appropriate tagging format config NET_DSA - tristate - depends on HAVE_NET_DSA + tristate "Distributed Switch Architecture" + depends on HAVE_NET_DSA && NET_SWITCHDEV select PHYLIB + ---help--- + Say Y if you want to enable support for the hardware switches supported + by the Distributed Switch Architecture. 
if NET_DSA diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 4dea2e0681d1..b445d492c115 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -20,6 +20,7 @@ #include <linux/of.h> #include <linux/of_mdio.h> #include <linux/of_platform.h> +#include <linux/of_net.h> #include <linux/sysfs.h> #include "dsa_priv.h" @@ -123,7 +124,7 @@ static ssize_t temp1_max_store(struct device *dev, return count; } -static DEVICE_ATTR(temp1_max, S_IRUGO, temp1_max_show, temp1_max_store); +static DEVICE_ATTR_RW(temp1_max); static ssize_t temp1_max_alarm_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -158,8 +159,8 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, if (index == 1) { if (!drv->get_temp_limit) mode = 0; - else if (drv->set_temp_limit) - mode |= S_IWUSR; + else if (!drv->set_temp_limit) + mode &= ~S_IWUSR; } else if (index == 2 && !drv->get_temp_alarm) { mode = 0; } @@ -175,43 +176,14 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); #endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ -static struct dsa_switch * -dsa_switch_setup(struct dsa_switch_tree *dst, int index, - struct device *parent, struct device *host_dev) +static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { - struct dsa_chip_data *pd = dst->pd->chip + index; - struct dsa_switch_driver *drv; - struct dsa_switch *ds; - int ret; - char *name; - int i; + struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_tree *dst = ds->dst; + struct dsa_chip_data *pd = ds->pd; bool valid_name_found = false; - - /* - * Probe for switch model. - */ - drv = dsa_switch_probe(host_dev, pd->sw_addr, &name); - if (drv == NULL) { - netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", - index); - return ERR_PTR(-EINVAL); - } - netdev_info(dst->master_netdev, "[%d]: detected a %s switch\n", - index, name); - - - /* - * Allocate and initialise switch state. 
- */ - ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); - if (ds == NULL) - return ERR_PTR(-ENOMEM); - - ds->dst = dst; - ds->index = index; - ds->pd = dst->pd->chip + index; - ds->drv = drv; - ds->master_dev = host_dev; + int index = ds->index; + int i, ret; /* * Validate supplied switch configuration. @@ -256,7 +228,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, * switch. */ if (dst->cpu_switch == index) { - switch (drv->tag_protocol) { + switch (ds->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case DSA_TAG_PROTO_DSA: dst->rcv = dsa_netdev_ops.rcv; @@ -284,7 +256,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, goto out; } - dst->tag_protocol = drv->tag_protocol; + dst->tag_protocol = ds->tag_protocol; } /* @@ -314,19 +286,15 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, * Create network devices for physical switch ports. */ for (i = 0; i < DSA_MAX_PORTS; i++) { - struct net_device *slave_dev; - if (!(ds->phys_port_mask & (1 << i))) continue; - slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]); - if (slave_dev == NULL) { + ret = dsa_slave_create(ds, parent, i, pd->port_names[i]); + if (ret < 0) { netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s)\n", index, i, pd->port_names[i]); - continue; + ret = 0; } - - ds->ports[i] = slave_dev; } #ifdef CONFIG_NET_DSA_HWMON @@ -354,13 +322,57 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, } #endif /* CONFIG_NET_DSA_HWMON */ - return ds; + return ret; out_free: mdiobus_free(ds->slave_mii_bus); out: kfree(ds); - return ERR_PTR(ret); + return ret; +} + +static struct dsa_switch * +dsa_switch_setup(struct dsa_switch_tree *dst, int index, + struct device *parent, struct device *host_dev) +{ + struct dsa_chip_data *pd = dst->pd->chip + index; + struct dsa_switch_driver *drv; + struct dsa_switch *ds; + int ret; + char *name; + + /* + * Probe for switch model. 
+ */ + drv = dsa_switch_probe(host_dev, pd->sw_addr, &name); + if (drv == NULL) { + netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", + index); + return ERR_PTR(-EINVAL); + } + netdev_info(dst->master_netdev, "[%d]: detected a %s switch\n", + index, name); + + + /* + * Allocate and initialise switch state. + */ + ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); + if (ds == NULL) + return ERR_PTR(-ENOMEM); + + ds->dst = dst; + ds->index = index; + ds->pd = pd; + ds->drv = drv; + ds->tag_protocol = drv->tag_protocol; + ds->master_dev = host_dev; + + ret = dsa_switch_setup_one(ds, parent); + if (ret) + return ERR_PTR(ret); + + return ds; } static void dsa_switch_destroy(struct dsa_switch *ds) @@ -378,7 +390,7 @@ static int dsa_switch_suspend(struct dsa_switch *ds) /* Suspend slave network devices */ for (i = 0; i < DSA_MAX_PORTS; i++) { - if (!(ds->phys_port_mask & (1 << i))) + if (!dsa_is_port_initialized(ds, i)) continue; ret = dsa_slave_suspend(ds->ports[i]); @@ -404,7 +416,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) /* Resume slave network devices */ for (i = 0; i < DSA_MAX_PORTS; i++) { - if (!(ds->phys_port_mask & (1 << i))) + if (!dsa_is_port_initialized(ds, i)) continue; ret = dsa_slave_resume(ds->ports[i]); @@ -558,12 +570,12 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) kfree(pd->chip); } -static int dsa_of_probe(struct platform_device *pdev) +static int dsa_of_probe(struct device *dev) { - struct device_node *np = pdev->dev.of_node; + struct device_node *np = dev->of_node; struct device_node *child, *mdio, *ethernet, *port, *link; struct mii_bus *mdio_bus; - struct platform_device *ethernet_dev; + struct net_device *ethernet_dev; struct dsa_platform_data *pd; struct dsa_chip_data *cd; const char *port_name; @@ -578,22 +590,22 @@ static int dsa_of_probe(struct platform_device *pdev) mdio_bus = of_mdio_find_bus(mdio); if (!mdio_bus) - return -EINVAL; + return -EPROBE_DEFER; ethernet = 
of_parse_phandle(np, "dsa,ethernet", 0); if (!ethernet) return -EINVAL; - ethernet_dev = of_find_device_by_node(ethernet); + ethernet_dev = of_find_net_device_by_node(ethernet); if (!ethernet_dev) - return -ENODEV; + return -EPROBE_DEFER; pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return -ENOMEM; - pdev->dev.platform_data = pd; - pd->netdev = &ethernet_dev->dev; + dev->platform_data = pd; + pd->of_netdev = ethernet_dev; pd->nr_chips = of_get_available_child_count(np); if (pd->nr_chips > DSA_MAX_SWITCHES) pd->nr_chips = DSA_MAX_SWITCHES; @@ -618,10 +630,10 @@ static int dsa_of_probe(struct platform_device *pdev) continue; cd->sw_addr = be32_to_cpup(sw_addr); - if (cd->sw_addr > PHY_MAX_ADDR) + if (cd->sw_addr >= PHY_MAX_ADDR) continue; - if (!of_property_read_u32(np, "eeprom-length", &eeprom_len)) + if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) cd->eeprom_len = eeprom_len; for_each_available_child_of_node(child, port) { @@ -630,6 +642,8 @@ static int dsa_of_probe(struct platform_device *pdev) continue; port_index = be32_to_cpup(port_reg); + if (port_index >= DSA_MAX_PORTS) + break; port_name = of_get_property(port, "label", NULL); if (!port_name) @@ -654,8 +668,6 @@ static int dsa_of_probe(struct platform_device *pdev) goto out_free_chip; } - if (port_index == DSA_MAX_PORTS) - break; } } @@ -665,72 +677,35 @@ out_free_chip: dsa_of_free_platform_data(pd); out_free: kfree(pd); - pdev->dev.platform_data = NULL; + dev->platform_data = NULL; return ret; } -static void dsa_of_remove(struct platform_device *pdev) +static void dsa_of_remove(struct device *dev) { - struct dsa_platform_data *pd = pdev->dev.platform_data; + struct dsa_platform_data *pd = dev->platform_data; - if (!pdev->dev.of_node) + if (!dev->of_node) return; dsa_of_free_platform_data(pd); kfree(pd); } #else -static inline int dsa_of_probe(struct platform_device *pdev) +static inline int dsa_of_probe(struct device *dev) { return 0; } -static inline void dsa_of_remove(struct
platform_device *pdev) +static inline void dsa_of_remove(struct device *dev) { } #endif -static int dsa_probe(struct platform_device *pdev) +static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, + struct device *parent, struct dsa_platform_data *pd) { - struct dsa_platform_data *pd = pdev->dev.platform_data; - struct net_device *dev; - struct dsa_switch_tree *dst; - int i, ret; - - pr_notice_once("Distributed Switch Architecture driver version %s\n", - dsa_driver_version); - - if (pdev->dev.of_node) { - ret = dsa_of_probe(pdev); - if (ret) - return ret; - - pd = pdev->dev.platform_data; - } - - if (pd == NULL || pd->netdev == NULL) - return -EINVAL; - - dev = dev_to_net_device(pd->netdev); - if (dev == NULL) { - ret = -EINVAL; - goto out; - } - - if (dev->dsa_ptr != NULL) { - dev_put(dev); - ret = -EEXIST; - goto out; - } - - dst = kzalloc(sizeof(*dst), GFP_KERNEL); - if (dst == NULL) { - dev_put(dev); - ret = -ENOMEM; - goto out; - } - - platform_set_drvdata(pdev, dst); + int i; dst->pd = pd; dst->master_netdev = dev; @@ -740,7 +715,7 @@ static int dsa_probe(struct platform_device *pdev) for (i = 0; i < pd->nr_chips; i++) { struct dsa_switch *ds; - ds = dsa_switch_setup(dst, i, &pdev->dev, pd->chip[i].host_dev); + ds = dsa_switch_setup(dst, i, parent, pd->chip[i].host_dev); if (IS_ERR(ds)) { netdev_err(dev, "[%d]: couldn't create dsa switch instance (error %ld)\n", i, PTR_ERR(ds)); @@ -768,18 +743,67 @@ static int dsa_probe(struct platform_device *pdev) dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); add_timer(&dst->link_poll_timer); } +} + +static int dsa_probe(struct platform_device *pdev) +{ + struct dsa_platform_data *pd = pdev->dev.platform_data; + struct net_device *dev; + struct dsa_switch_tree *dst; + int ret; + + pr_notice_once("Distributed Switch Architecture driver version %s\n", + dsa_driver_version); + + if (pdev->dev.of_node) { + ret = dsa_of_probe(&pdev->dev); + if (ret) + return ret; + + pd = 
pdev->dev.platform_data; + } + + if (pd == NULL || (pd->netdev == NULL && pd->of_netdev == NULL)) + return -EINVAL; + + if (pd->of_netdev) { + dev = pd->of_netdev; + dev_hold(dev); + } else { + dev = dev_to_net_device(pd->netdev); + } + if (dev == NULL) { + ret = -EPROBE_DEFER; + goto out; + } + + if (dev->dsa_ptr != NULL) { + dev_put(dev); + ret = -EEXIST; + goto out; + } + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (dst == NULL) { + dev_put(dev); + ret = -ENOMEM; + goto out; + } + + platform_set_drvdata(pdev, dst); + + dsa_setup_dst(dst, dev, &pdev->dev, pd); return 0; out: - dsa_of_remove(pdev); + dsa_of_remove(&pdev->dev); return ret; } -static int dsa_remove(struct platform_device *pdev) +static void dsa_remove_dst(struct dsa_switch_tree *dst) { - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); int i; if (dst->link_poll_needed) @@ -793,8 +817,14 @@ static int dsa_remove(struct platform_device *pdev) if (ds != NULL) dsa_switch_destroy(ds); } +} - dsa_of_remove(pdev); +static int dsa_remove(struct platform_device *pdev) +{ + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + + dsa_remove_dst(dst); + dsa_of_remove(&pdev->dev); return 0; } @@ -821,6 +851,10 @@ static struct packet_type dsa_pack_type __read_mostly = { .func = dsa_switch_rcv, }; +static struct notifier_block dsa_netdevice_nb __read_mostly = { + .notifier_call = dsa_slave_netdevice_event, +}; + #ifdef CONFIG_PM_SLEEP static int dsa_suspend(struct device *d) { @@ -879,6 +913,8 @@ static int __init dsa_init_module(void) { int rc; + register_netdevice_notifier(&dsa_netdevice_nb); + rc = platform_driver_register(&dsa_driver); if (rc) return rc; @@ -891,6 +927,7 @@ module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { + unregister_netdevice_notifier(&dsa_netdevice_nb); dev_remove_pack(&dsa_pack_type); platform_driver_unregister(&dsa_driver); } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index dc9756d3154c..d5f1f9b862ea 100644 --- a/net/dsa/dsa_priv.h 
+++ b/net/dsa/dsa_priv.h @@ -45,6 +45,8 @@ struct dsa_slave_priv { int old_link; int old_pause; int old_duplex; + + struct net_device *bridge_dev; }; /* dsa.c */ @@ -53,11 +55,12 @@ extern char dsa_driver_version[]; /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); -struct net_device *dsa_slave_create(struct dsa_switch *ds, - struct device *parent, - int port, char *name); +int dsa_slave_create(struct dsa_switch *ds, struct device *parent, + int port, char *name); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); +int dsa_slave_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr); /* tag_dsa.c */ extern const struct dsa_device_ops dsa_netdev_ops; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f23deadf42a0..0917123790ea 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -10,10 +10,14 @@ #include <linux/list.h> #include <linux/etherdevice.h> +#include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/of_net.h> #include <linux/of_mdio.h> +#include <net/rtnetlink.h> +#include <net/switchdev.h> +#include <linux/if_bridge.h> #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -51,13 +55,16 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) /* slave device handling ****************************************************/ -static int dsa_slave_init(struct net_device *dev) +static int dsa_slave_get_iflink(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - dev->iflink = p->parent->dst->master_netdev->ifindex; + return p->parent->dst->master_netdev->ifindex; +} - return 0; +static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) +{ + return !!p->bridge_dev; } static int dsa_slave_open(struct net_device *dev) @@ -65,6 +72,8 @@ static int dsa_slave_open(struct net_device *dev) struct 
dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; struct dsa_switch *ds = p->parent; + u8 stp_state = dsa_port_is_bridged(p) ? + BR_STATE_BLOCKING : BR_STATE_FORWARDING; int err; if (!(master->flags & IFF_UP)) @@ -93,6 +102,9 @@ static int dsa_slave_open(struct net_device *dev) goto clear_promisc; } + if (ds->drv->port_stp_update) + ds->drv->port_stp_update(ds, p->port, stp_state); + if (p->phy) phy_start(p->phy); @@ -100,7 +112,7 @@ static int dsa_slave_open(struct net_device *dev) clear_promisc: if (dev->flags & IFF_PROMISC) - dev_set_promiscuity(master, 0); + dev_set_promiscuity(master, -1); clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(master, -1); @@ -133,6 +145,9 @@ static int dsa_slave_close(struct net_device *dev) if (ds->drv->port_disable) ds->drv->port_disable(ds, p->port, p->phy); + if (ds->drv->port_stp_update) + ds->drv->port_stp_update(ds, p->port, BR_STATE_DISABLED); + return 0; } @@ -184,6 +199,105 @@ out: return 0; } +static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, u16 nlm_flags) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->fdb_add) + ret = ds->drv->fdb_add(ds, p->port, addr, vid); + + return ret; +} + +static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->fdb_del) + ret = ds->drv->fdb_del(ds, p->port, addr, vid); + + return ret; +} + +static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb, + const unsigned char *addr, u16 vid, + bool is_static, + u32 portid, u32 seq, int type, + unsigned int flags) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, portid, seq, type, 
sizeof(*ndm), flags); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = NTF_EXT_LEARNED; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dev->ifindex; + ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; + + if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + goto nla_put_failure; + + if (vid && nla_put_u16(skb, NDA_VLAN, vid)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +/* Dump information about entries, in response to GETNEIGH */ +static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, + struct net_device *filter_dev, int idx) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + unsigned char addr[ETH_ALEN] = { 0 }; + int ret; + + if (!ds->drv->fdb_getnext) + return -EOPNOTSUPP; + + for (; ; idx++) { + bool is_static; + + ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static); + if (ret < 0) + break; + + if (idx < cb->args[0]) + continue; + + ret = dsa_slave_fill_info(dev, skb, addr, 0, + is_static, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI); + if (ret < 0) + break; + } + + return idx; +} + static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -194,6 +308,116 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } +/* Return a bitmask of all ports being currently bridged within a given bridge + * device. Note that on leave, the mask will still return the bitmask of ports + * currently bridged, prior to port removal, and this is exactly what we want. 
+ */ +static u32 dsa_slave_br_port_mask(struct dsa_switch *ds, + struct net_device *bridge) +{ + struct dsa_slave_priv *p; + unsigned int port; + u32 mask = 0; + + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!dsa_is_port_initialized(ds, port)) + continue; + + p = netdev_priv(ds->ports[port]); + + if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT && + p->bridge_dev == bridge) + mask |= 1 << port; + } + + return mask; +} + +static int dsa_slave_stp_update(struct net_device *dev, u8 state) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->port_stp_update) + ret = ds->drv->port_stp_update(ds, p->port, state); + + return ret; +} + +static int dsa_slave_port_attr_set(struct net_device *dev, + struct switchdev_attr *attr) +{ + int ret = 0; + + switch (attr->id) { + case SWITCHDEV_ATTR_PORT_STP_STATE: + if (attr->trans == SWITCHDEV_TRANS_COMMIT) + ret = dsa_slave_stp_update(dev, attr->u.stp_state); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + return ret; +} + +static int dsa_slave_bridge_port_join(struct net_device *dev, + struct net_device *br) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + p->bridge_dev = br; + + if (ds->drv->port_join_bridge) + ret = ds->drv->port_join_bridge(ds, p->port, + dsa_slave_br_port_mask(ds, br)); + + return ret; +} + +static int dsa_slave_bridge_port_leave(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + + if (ds->drv->port_leave_bridge) + ret = ds->drv->port_leave_bridge(ds, p->port, + dsa_slave_br_port_mask(ds, p->bridge_dev)); + + p->bridge_dev = NULL; + + /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, + * so allow it to be in BR_STATE_FORWARDING to be kept functional + */ + dsa_slave_stp_update(dev, BR_STATE_FORWARDING); + + return ret; +} + +static int 
dsa_slave_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + switch (attr->id) { + case SWITCHDEV_ATTR_PORT_PARENT_ID: + attr->u.ppid.id_len = sizeof(ds->index); + memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -462,14 +686,22 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { }; static const struct net_device_ops dsa_slave_netdev_ops = { - .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, + .ndo_fdb_add = dsa_slave_fdb_add, + .ndo_fdb_del = dsa_slave_fdb_del, + .ndo_fdb_dump = dsa_slave_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, + .ndo_get_iflink = dsa_slave_get_iflink, +}; + +static const struct switchdev_ops dsa_slave_switchdev_ops = { + .switchdev_port_attr_get = dsa_slave_port_attr_get, + .switchdev_port_attr_set = dsa_slave_port_attr_set, }; static void dsa_slave_adjust_link(struct net_device *dev) @@ -513,6 +745,24 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, } /* slave device setup *******************************************************/ +static int dsa_slave_phy_connect(struct dsa_slave_priv *p, + struct net_device *slave_dev, + int addr) +{ + struct dsa_switch *ds = p->parent; + + p->phy = ds->slave_mii_bus->phy_map[addr]; + if (!p->phy) + return -ENODEV; + + /* Use already configured phy mode */ + p->phy_interface = p->phy->interface; + phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, + p->phy_interface); + + return 0; +} + static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct 
net_device *slave_dev) { @@ -546,10 +796,25 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, if (ds->drv->get_phy_flags) phy_flags = ds->drv->get_phy_flags(ds, p->port); - if (phy_dn) - p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, phy_flags, - p->phy_interface); + if (phy_dn) { + ret = of_mdio_parse_addr(&slave_dev->dev, phy_dn); + /* If this PHY address is part of phys_mii_mask, which means + * that we need to divert reads and writes to/from it, then we + * want to bind this device using the slave MII bus created by + * DSA to make that happen. + */ + if (!phy_is_fixed && ret >= 0 && + (ds->phys_mii_mask & (1 << ret))) { + ret = dsa_slave_phy_connect(p, slave_dev, ret); + if (ret) + return ret; + } else { + p->phy = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, + phy_flags, + p->phy_interface); + } + } if (p->phy && phy_is_fixed) fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); @@ -558,14 +823,9 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, * MDIO bus instead */ if (!p->phy) { - p->phy = ds->slave_mii_bus->phy_map[p->port]; - if (!p->phy) - return -ENODEV; - - /* Use already configured phy mode */ - p->phy_interface = p->phy->interface; - phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); + ret = dsa_slave_phy_connect(p, slave_dev, p->port); + if (ret) + return ret; } else { netdev_info(slave_dev, "attached PHY at address %d [%s]\n", p->phy->addr, p->phy->drv->name); @@ -574,12 +834,19 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, return 0; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); - netif_device_detach(slave_dev); - if (p->phy) 
{ phy_stop(p->phy); p->old_pause = -1; @@ -605,9 +872,8 @@ int dsa_slave_resume(struct net_device *slave_dev) return 0; } -struct net_device * -dsa_slave_create(struct dsa_switch *ds, struct device *parent, - int port, char *name) +int dsa_slave_create(struct dsa_switch *ds, struct device *parent, + int port, char *name) { struct net_device *master = ds->dst->master_netdev; struct net_device *slave_dev; @@ -617,13 +883,17 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, NET_NAME_UNKNOWN, ether_setup); if (slave_dev == NULL) - return slave_dev; + return -ENOMEM; slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; slave_dev->netdev_ops = &dsa_slave_netdev_ops; + slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; + + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); SET_NETDEV_DEV(slave_dev, parent); slave_dev->dev.of_node = ds->pd->port_dn[port]; @@ -667,19 +937,64 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { free_netdev(slave_dev); - return NULL; + return ret; } + ds->ports[port] = slave_dev; ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); phy_disconnect(p->phy); + ds->ports[port] = NULL; free_netdev(slave_dev); - return NULL; + return ret; } netif_carrier_off(slave_dev); - return slave_dev; + return 0; +} + +static bool dsa_slave_dev_check(struct net_device *dev) +{ + return dev->netdev_ops == &dsa_slave_netdev_ops; +} + +static int dsa_slave_master_changed(struct net_device *dev) +{ + struct net_device *master = netdev_master_upper_dev_get(dev); + struct dsa_slave_priv *p = netdev_priv(dev); + int err = 0; + + if (master && master->rtnl_link_ops && + !strcmp(master->rtnl_link_ops->kind, "bridge")) + 
err = dsa_slave_bridge_port_join(dev, master); + else if (dsa_port_is_bridged(p)) + err = dsa_slave_bridge_port_leave(dev); + + return err; +} + +int dsa_slave_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev; + int err = 0; + + switch (event) { + case NETDEV_CHANGEUPPER: + dev = netdev_notifier_info_to_dev(ptr); + if (!dsa_slave_dev_check(dev)) + goto out; + + err = dsa_slave_master_changed(dev); + if (err) + netdev_warn(dev, "failed to reflect master change\n"); + + break; + } + +out: + return NOTIFY_DONE; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 238f38d21641..77e0f0e7a88e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -58,6 +58,7 @@ #include <net/ipv6.h> #include <net/ip.h> #include <net/dsa.h> +#include <net/flow_dissector.h> #include <linux/uaccess.h> __setup("ether=", netdev_boot_setup); @@ -104,7 +105,7 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, */ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { - memset(eth->h_dest, 0, ETH_ALEN); + eth_zero_addr(eth->h_dest); return ETH_HLEN; } @@ -113,39 +114,6 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, EXPORT_SYMBOL(eth_header); /** - * eth_rebuild_header- rebuild the Ethernet MAC header. - * @skb: socket buffer to update - * - * This is called after an ARP or IPV6 ndisc it's resolution on this - * sk_buff. We now let protocol (ARP) fill in the other fields. - * - * This routine CANNOT use cached dst->neigh! - * Really, it is used only when dst->neigh is wrong. 
- */ -int eth_rebuild_header(struct sk_buff *skb) -{ - struct ethhdr *eth = (struct ethhdr *)skb->data; - struct net_device *dev = skb->dev; - - switch (eth->h_proto) { -#ifdef CONFIG_INET - case htons(ETH_P_IP): - return arp_find(eth->h_dest, skb); -#endif - default: - netdev_dbg(dev, - "%s: unable to resolve type %X addresses.\n", - dev->name, ntohs(eth->h_proto)); - - memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); - break; - } - - return 0; -} -EXPORT_SYMBOL(eth_rebuild_header); - -/** * eth_get_headlen - determine the the length of header for an ethernet frame * @data: pointer to start of frame * @len: total length of frame @@ -163,9 +131,9 @@ u32 eth_get_headlen(void *data, unsigned int len) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!__skb_flow_dissect(NULL, &keys, data, - eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.thoff, sizeof(*eth)); + if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, + sizeof(*eth), len)) + return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); @@ -189,10 +157,11 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); + + eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - eth = eth_hdr(skb); - if (unlikely(is_multicast_ether_addr(eth->h_dest))) { + if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else @@ -211,7 +180,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); - if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* @@ -369,7 +338,6 @@ EXPORT_SYMBOL(eth_validate_addr); const struct header_ops eth_header_ops ____cacheline_aligned = { .create = eth_header, 
.parse = eth_header_parse, - .rebuild = eth_rebuild_header, .cache = eth_header_cache, .cache_update = eth_header_cache_update, }; @@ -391,7 +359,7 @@ void ether_setup(struct net_device *dev) dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; - memset(dev->broadcast, 0xFF, ETH_ALEN); + eth_broadcast_addr(dev->broadcast); } EXPORT_SYMBOL(ether_setup); @@ -502,6 +470,7 @@ EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), + .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 055fbb71ba6f..f20a387a1011 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -55,27 +55,6 @@ LIST_HEAD(lowpan_devices); static int lowpan_open_count; -static __le16 lowpan_get_pan_id(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); -} - -static __le16 lowpan_get_short_addr(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); -} - -static u8 lowpan_get_dsn(const struct net_device *dev) -{ - struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; - - return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); -} - static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -103,17 +82,11 @@ static const struct net_device_ops lowpan_netdev_ops = { .ndo_start_xmit = lowpan_xmit, }; -static struct ieee802154_mlme_ops lowpan_mlme = { - .get_pan_id = lowpan_get_pan_id, - .get_short_addr = lowpan_get_short_addr, - .get_dsn = lowpan_get_dsn, -}; - static void lowpan_setup(struct net_device *dev) { dev->addr_len = IEEE802154_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN); - dev->type 
= ARPHRD_IEEE802154; + dev->type = ARPHRD_6LOWPAN; /* Frame Control + Sequence Number + Address fields + Security Header */ dev->hard_header_len = 2 + 1 + 20 + 14; dev->needed_tailroom = 2; /* FCS */ @@ -124,8 +97,8 @@ static void lowpan_setup(struct net_device *dev) dev->netdev_ops = &lowpan_netdev_ops; dev->header_ops = &lowpan_header_ops; - dev->ml_priv = &lowpan_mlme; dev->destructor = free_netdev; + dev->features |= NETIF_F_NETNS_LOCAL; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -148,10 +121,11 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, pr_debug("adding new link\n"); - if (!tb[IFLA_LINK]) + if (!tb[IFLA_LINK] || + !net_eq(dev_net(dev), &init_net)) return -EINVAL; /* find and hold real wpan device */ - real_dev = dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + real_dev = dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; if (real_dev->type != ARPHRD_IEEE802154) { diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index f46e4d1306f2..214d44aef35b 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -207,7 +207,7 @@ found: } else { fq->q.meat += skb->len; } - add_frag_mem_limit(&fq->q, skb->truesize); + add_frag_mem_limit(fq->q.net, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { @@ -287,7 +287,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, clone->data_len = clone->len; head->data_len -= clone->len; head->len -= clone->len; - add_frag_mem_limit(&fq->q, clone->truesize); + add_frag_mem_limit(fq->q.net, clone->truesize); } WARN_ON(head == NULL); @@ -310,7 +310,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, } fp = next; } - sub_frag_mem_limit(&fq->q, sum_truesize); + sub_frag_mem_limit(fq->q.net, sum_truesize); head->next = NULL; head->dev = dev; 
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 2349070bd534..2597abbf7f4b 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -190,6 +190,7 @@ err: static int lowpan_header(struct sk_buff *skb, struct net_device *dev) { + struct wpan_dev *wpan_dev = lowpan_dev_info(dev)->real_dev->ieee802154_ptr; struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; @@ -207,7 +208,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) /* prepare wpan address data */ sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + sa.pan_id = wpan_dev->pan_id; sa.extended_addr = ieee802154_devaddr_from_raw(saddr); /* intra-PAN communications */ @@ -223,7 +224,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) } else { da.mode = IEEE802154_ADDR_LONG; da.extended_addr = ieee802154_devaddr_from_raw(daddr); - cb->ackreq = true; + cb->ackreq = wpan_dev->frame_retries >= 0; } return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index 05dab2957cd4..4adfd4d5471b 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -3,7 +3,9 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o obj-y += 6lowpan/ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \ - header_ops.o sysfs.o nl802154.o + header_ops.o sysfs.o nl802154.o trace.o ieee802154_socket-y := socket.o +CFLAGS_trace.o := -I$(src) + ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 18bc7e738507..b0248e934230 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -25,6 +25,9 @@ #include "sysfs.h" #include "core.h" +/* name for sysfs, %d is appended */ +#define PHY_NAME "phy" + /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg802154_rdev_list); int cfg802154_rdev_list_generation; @@ -118,11 +121,9 @@ 
wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wpan_phy_idx--; - mutex_init(&rdev->wpan_phy.pib_lock); - INIT_LIST_HEAD(&rdev->wpan_dev_list); device_initialize(&rdev->wpan_phy.dev); - dev_set_name(&rdev->wpan_phy.dev, "wpan-phy%d", rdev->wpan_phy_idx); + dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx); rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; @@ -225,6 +226,7 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: + dev->features |= NETIF_F_NETNS_LOCAL; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 9105265920fe..3503c38954f9 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -76,7 +76,6 @@ nla_put_failure: nlmsg_free(msg); return -ENOBUFS; } -EXPORT_SYMBOL(ieee802154_nl_start_confirm); static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct net_device *dev) @@ -98,8 +97,10 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, BUG_ON(!phy); get_device(&phy->dev); - short_addr = ops->get_short_addr(dev); - pan_id = ops->get_pan_id(dev); + rtnl_lock(); + short_addr = dev->ieee802154_ptr->short_addr; + pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || @@ -118,12 +119,12 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, rtnl_unlock(); if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER, - params.transmit_power) || + params.transmit_power / 100) || nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) || nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, params.cca.mode) || 
nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL, - params.cca_ed_level) || + params.cca_ed_level / 100) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES, params.csma_retries) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE, @@ -167,10 +168,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (!dev) return NULL; - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. - */ - if (dev->type != ARPHRD_IEEE802154 || dev->mtu != IEEE802154_MTU) { + if (dev->type != ARPHRD_IEEE802154) { dev_put(dev); return NULL; } @@ -245,7 +243,9 @@ int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info) addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); - addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + rtnl_lock(); + addr.pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), @@ -282,7 +282,9 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); } - addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + rtnl_lock(); + addr.pan_id = dev->ieee802154_ptr->pan_id; + rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); @@ -450,11 +452,7 @@ int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; for_each_netdev(net, dev) { - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. 
- */ - if (idx < s_idx || dev->type != ARPHRD_IEEE802154 || - dev->mtu != IEEE802154_MTU) + if (idx < s_idx || dev->type != ARPHRD_IEEE802154) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, @@ -511,7 +509,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) ops->get_mac_params(dev, ¶ms); if (info->attrs[IEEE802154_ATTR_TXPOWER]) - params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]); + params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100; if (info->attrs[IEEE802154_ATTR_LBT_ENABLED]) params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]); @@ -520,7 +518,7 @@ int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]); if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) - params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]); + params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100; if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES]) params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]); @@ -784,11 +782,7 @@ ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, int rc; for_each_netdev(net, dev) { - /* Check on mtu is currently a hacked solution because lowpan - * and wpan have the same ARPHRD type. 
- */ - if (idx < first_dev || dev->type != ARPHRD_IEEE802154 || - dev->mtu != IEEE802154_MTU) + if (idx < first_dev || dev->type != ARPHRD_IEEE802154) goto skip; data.ops = ieee802154_mlme_ops(dev); diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 1b9d25f6e898..77d73014bde3 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -50,26 +50,26 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, if (!hdr) goto out; - mutex_lock(&phy->pib_lock); + rtnl_lock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) || nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel)) goto nla_put_failure; for (i = 0; i < 32; i++) { - if (phy->channels_supported[i]) - buf[pages++] = phy->channels_supported[i] | (i << 27); + if (phy->supported.channels[i]) + buf[pages++] = phy->supported.channels[i] | (i << 27); } if (pages && nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, pages * sizeof(uint32_t), buf)) goto nla_put_failure; - mutex_unlock(&phy->pib_lock); + rtnl_unlock(); kfree(buf); genlmsg_end(msg, hdr); return 0; nla_put_failure: - mutex_unlock(&phy->pib_lock); + rtnl_unlock(); genlmsg_cancel(msg, hdr); out: kfree(buf); @@ -175,6 +175,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) int rc = -ENOBUFS; struct net_device *dev; int type = __IEEE802154_DEV_INVALID; + unsigned char name_assign_type; pr_debug("%s\n", __func__); @@ -190,8 +191,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ + name_assign_type = NET_NAME_USER; } else { devname = "wpan%d"; + name_assign_type = NET_NAME_ENUM; } if (strlen(devname) >= IFNAMSIZ) @@ -221,7 +224,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) } dev = 
rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname, - type); + name_assign_type, type); if (IS_ERR(dev)) { rc = PTR_ERR(dev); goto nla_put_failure; diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index a4daf91b8d0a..68f24016860c 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -207,10 +207,11 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_PAGE] = { .type = NLA_U8, }, [NL802154_ATTR_CHANNEL] = { .type = NLA_U8, }, - [NL802154_ATTR_TX_POWER] = { .type = NLA_S8, }, + [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, }, [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, }, + [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, }, [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, }, @@ -225,6 +226,10 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, }, [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, }, + + [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, + + [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, }; /* message building helper */ @@ -236,6 +241,28 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, } static int +nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask) +{ + struct nlattr *nl_flags = nla_nest_start(msg, attr); + int i; + + if (!nl_flags) + return -ENOBUFS; + + i = 0; + while (mask) { + if ((mask & 1) && nla_put_flag(msg, i)) + return -ENOBUFS; + + mask >>= 1; + i++; + } + + nla_nest_end(msg, nl_flags); + return 0; +} + +static int nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, struct sk_buff *msg) { @@ -248,7 +275,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, for (page = 0; page <= IEEE802154_MAX_PAGE; page++) { if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL, - rdev->wpan_phy.channels_supported[page])) + 
rdev->wpan_phy.supported.channels[page])) return -ENOBUFS; } nla_nest_end(msg, nl_page); @@ -256,12 +283,100 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, return 0; } +static int +nl802154_put_capabilities(struct sk_buff *msg, + struct cfg802154_registered_device *rdev) +{ + const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported; + struct nlattr *nl_caps, *nl_channels; + int i; + + nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS); + if (!nl_caps) + return -ENOBUFS; + + nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS); + if (!nl_channels) + return -ENOBUFS; + + for (i = 0; i <= IEEE802154_MAX_PAGE; i++) { + if (caps->channels[i]) { + if (nl802154_put_flags(msg, i, caps->channels[i])) + return -ENOBUFS; + } + } + + nla_nest_end(msg, nl_channels); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { + struct nlattr *nl_ed_lvls; + + nl_ed_lvls = nla_nest_start(msg, + NL802154_CAP_ATTR_CCA_ED_LEVELS); + if (!nl_ed_lvls) + return -ENOBUFS; + + for (i = 0; i < caps->cca_ed_levels_size; i++) { + if (nla_put_s32(msg, i, caps->cca_ed_levels[i])) + return -ENOBUFS; + } + + nla_nest_end(msg, nl_ed_lvls); + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { + struct nlattr *nl_tx_pwrs; + + nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS); + if (!nl_tx_pwrs) + return -ENOBUFS; + + for (i = 0; i < caps->tx_powers_size; i++) { + if (nla_put_s32(msg, i, caps->tx_powers[i])) + return -ENOBUFS; + } + + nla_nest_end(msg, nl_tx_pwrs); + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { + if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES, + caps->cca_modes) || + nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS, + caps->cca_opts)) + return -ENOBUFS; + } + + if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) || + nla_put_u8(msg, 
NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) || + nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS, + caps->min_csma_backoffs) || + nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS, + caps->max_csma_backoffs) || + nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES, + caps->min_frame_retries) || + nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES, + caps->max_frame_retries) || + nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES, + caps->iftypes) || + nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt)) + return -ENOBUFS; + + nla_nest_end(msg, nl_caps); + + return 0; +} + static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, enum nl802154_commands cmd, struct sk_buff *msg, u32 portid, u32 seq, int flags) { + struct nlattr *nl_cmds; void *hdr; + int i; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) @@ -286,25 +401,76 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, rdev->wpan_phy.current_channel)) goto nla_put_failure; - /* supported channels array */ + /* TODO remove this behaviour, we still keep support it for a while + * so users can change the behaviour to the new one. 
+ */ if (nl802154_send_wpan_phy_channels(rdev, msg)) goto nla_put_failure; /* cca mode */ - if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, - rdev->wpan_phy.cca.mode)) - goto nla_put_failure; + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { + if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, + rdev->wpan_phy.cca.mode)) + goto nla_put_failure; + + if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { + if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, + rdev->wpan_phy.cca.opt)) + goto nla_put_failure; + } + } + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { + if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, + rdev->wpan_phy.transmit_power)) + goto nla_put_failure; + } - if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { - if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, - rdev->wpan_phy.cca.opt)) + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { + if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL, + rdev->wpan_phy.cca_ed_level)) goto nla_put_failure; } - if (nla_put_s8(msg, NL802154_ATTR_TX_POWER, - rdev->wpan_phy.transmit_power)) + if (nl802154_put_capabilities(msg, rdev)) + goto nla_put_failure; + + nl_cmds = nla_nest_start(msg, NL802154_ATTR_SUPPORTED_COMMANDS); + if (!nl_cmds) goto nla_put_failure; + i = 0; +#define CMD(op, n) \ + do { \ + if (rdev->ops->op) { \ + i++; \ + if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \ + goto nla_put_failure; \ + } \ + } while (0) + + CMD(add_virtual_intf, NEW_INTERFACE); + CMD(del_virtual_intf, DEL_INTERFACE); + CMD(set_channel, SET_CHANNEL); + CMD(set_pan_id, SET_PAN_ID); + CMD(set_short_addr, SET_SHORT_ADDR); + CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT); + CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS); + CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES); + CMD(set_lbt_mode, SET_LBT_MODE); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) + CMD(set_tx_power, SET_TX_POWER); + + if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) + CMD(set_cca_ed_level, SET_CCA_ED_LEVEL); + + if 
(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) + CMD(set_cca_mode, SET_CCA_MODE); + +#undef CMD + nla_nest_end(msg, nl_cmds); + finish: genlmsg_end(msg, hdr); return 0; @@ -575,7 +741,8 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL802154_ATTR_IFTYPE]) { type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]); - if (type > NL802154_IFTYPE_MAX) + if (type > NL802154_IFTYPE_MAX || + !(rdev->wpan_phy.supported.iftypes & BIT(type))) return -EINVAL; } @@ -589,7 +756,7 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) return rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL802154_ATTR_IFNAME]), - type, extended_addr); + NET_NAME_USER, type, extended_addr); } static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info) @@ -625,7 +792,8 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info) channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]); /* check 802.15.4 constraints */ - if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL) + if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL || + !(rdev->wpan_phy.supported.channels[page] & BIT(channel))) return -EINVAL; return rdev_set_channel(rdev, page, channel); @@ -636,12 +804,17 @@ static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_phy_cca cca; + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)) + return -EOPNOTSUPP; + if (!info->attrs[NL802154_ATTR_CCA_MODE]) return -EINVAL; cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]); /* checking 802.15.4 constraints */ - if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX) + if (cca.mode < NL802154_CCA_ENERGY || + cca.mode > NL802154_CCA_ATTR_MAX || + !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode))) return -EINVAL; if (cca.mode == NL802154_CCA_ENERGY_CARRIER) { @@ -649,13 +822,58 @@ 
static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) return -EINVAL; cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]); - if (cca.opt > NL802154_CCA_OPT_ATTR_MAX) + if (cca.opt > NL802154_CCA_OPT_ATTR_MAX || + !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt))) return -EINVAL; } return rdev_set_cca_mode(rdev, &cca); } +static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + s32 ed_level; + int i; + + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)) + return -EOPNOTSUPP; + + if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL]) + return -EINVAL; + + ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]); + + for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) { + if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i]) + return rdev_set_cca_ed_level(rdev, ed_level); + } + + return -EINVAL; +} + +static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + s32 power; + int i; + + if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)) + return -EOPNOTSUPP; + + if (!info->attrs[NL802154_ATTR_TX_POWER]) + return -EINVAL; + + power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]); + + for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) { + if (power == rdev->wpan_phy.supported.tx_powers[i]) + return rdev_set_tx_power(rdev, power); + } + + return -EINVAL; +} + static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; @@ -668,14 +886,22 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) return -EBUSY; /* don't change address fields on monitor */ - if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) - return -EINVAL; - - if (!info->attrs[NL802154_ATTR_PAN_ID]) + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || + 
!info->attrs[NL802154_ATTR_PAN_ID]) return -EINVAL; pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); + /* TODO + * I am not sure about to check here on broadcast pan_id. + * Broadcast is a valid setting, comment from 802.15.4: + * If this value is 0xffff, the device is not associated. + * + * This could useful to simple deassociate an device. + */ + if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + return -EINVAL; + return rdev_set_pan_id(rdev, wpan_dev, pan_id); } @@ -691,14 +917,27 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) return -EBUSY; /* don't change address fields on monitor */ - if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) - return -EINVAL; - - if (!info->attrs[NL802154_ATTR_SHORT_ADDR]) + if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || + !info->attrs[NL802154_ATTR_SHORT_ADDR]) return -EINVAL; short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); + /* TODO + * I am not sure about to check here on broadcast short_addr. + * Broadcast is a valid setting, comment from 802.15.4: + * A value of 0xfffe indicates that the device has + * associated but has not been allocated an address. A + * value of 0xffff indicates that the device does not + * have a short address. + * + * I think we should allow to set these settings but + * don't allow to allow socket communication with it. 
+ */ + if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || + short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) + return -EINVAL; + return rdev_set_short_addr(rdev, wpan_dev, short_addr); } @@ -722,7 +961,11 @@ nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info) max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]); /* check 802.15.4 constraints */ - if (max_be < 3 || max_be > 8 || min_be > max_be) + if (min_be < rdev->wpan_phy.supported.min_minbe || + min_be > rdev->wpan_phy.supported.max_minbe || + max_be < rdev->wpan_phy.supported.min_maxbe || + max_be > rdev->wpan_phy.supported.max_maxbe || + min_be > max_be) return -EINVAL; return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be); @@ -747,7 +990,8 @@ nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info) info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]); /* check 802.15.4 constraints */ - if (max_csma_backoffs > 5) + if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs || + max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs) return -EINVAL; return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs); @@ -771,7 +1015,8 @@ nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info) info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]); /* check 802.15.4 constraints */ - if (max_frame_retries < -1 || max_frame_retries > 7) + if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries || + max_frame_retries > rdev->wpan_phy.supported.max_frame_retries) return -EINVAL; return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries); @@ -791,6 +1036,9 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) return -EINVAL; mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) + return -EINVAL; + return rdev_set_lbt_mode(rdev, wpan_dev, mode); } @@ -937,6 +1185,22 @@ static const struct genl_ops 
nl802154_ops[] = { NL802154_FLAG_NEED_RTNL, }, { + .cmd = NL802154_CMD_SET_CCA_ED_LEVEL, + .doit = nl802154_set_cca_ed_level, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_SET_TX_POWER, + .doit = nl802154_set_tx_power, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | + NL802154_FLAG_NEED_RTNL, + }, + { .cmd = NL802154_CMD_SET_PAN_ID, .doit = nl802154_set_pan_id, .policy = nl802154_policy, diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 7c46732fad2b..b2155a123f6c 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -4,13 +4,16 @@ #include <net/cfg802154.h> #include "core.h" +#include "trace.h" static inline struct net_device * rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, - const char *name, int type) + const char *name, + unsigned char name_assign_type, + int type) { return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name, - type); + name_assign_type, type); } static inline void @@ -22,75 +25,154 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { - return rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type, + int ret; + + trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type, extended_addr); + ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, + name_assign_type, type, + extended_addr); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_del_virtual_intf(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { - return rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); + int ret; + + 
trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev); + ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel) { - return rdev->ops->set_channel(&rdev->wpan_phy, page, channel); + int ret; + + trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel); + ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_cca_mode(struct cfg802154_registered_device *rdev, const struct wpan_phy_cca *cca) { - return rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); + int ret; + + trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca); + ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +static inline int +rdev_set_cca_ed_level(struct cfg802154_registered_device *rdev, s32 ed_level) +{ + int ret; + + trace_802154_rdev_set_cca_ed_level(&rdev->wpan_phy, ed_level); + ret = rdev->ops->set_cca_ed_level(&rdev->wpan_phy, ed_level); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +static inline int +rdev_set_tx_power(struct cfg802154_registered_device *rdev, + s32 power) +{ + int ret; + + trace_802154_rdev_set_tx_power(&rdev->wpan_phy, power); + ret = rdev->ops->set_tx_power(&rdev->wpan_phy, power); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_pan_id(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 pan_id) { - return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + int ret; + + trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_short_addr(struct 
cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, __le16 short_addr) { - return rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + int ret; + + trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { - return rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, + int ret; + + trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev, min_be, max_be); + ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, + min_be, max_be); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { - return rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, - max_csma_backoffs); + int ret; + + trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev, + max_csma_backoffs); + ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, + max_csma_backoffs); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, s8 max_frame_retries) { - return rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, + int ret; + + trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev, max_frame_retries); + ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, + max_frame_retries); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } static inline int rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, bool mode) { - return rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + int 
ret; + + trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; } #endif /* __CFG802154_RDEV_OPS */ diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 2878d8ca6d3b..b6eacf30ee7a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -64,10 +64,8 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) if (tmp->type != ARPHRD_IEEE802154) continue; - pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp); - short_addr = - ieee802154_mlme_ops(tmp)->get_short_addr(tmp); - + pan_id = tmp->ieee802154_ptr->pan_id; + short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; @@ -98,12 +96,12 @@ static int ieee802154_sock_release(struct socket *sock) return 0; } -static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; - return sk->sk_prot->sendmsg(iocb, sk, msg, len); + return sk->sk_prot->sendmsg(sk, msg, len); } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, @@ -228,15 +226,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) goto out; } - if (dev->type != ARPHRD_IEEE802154) { - err = -ENODEV; - goto out_put; - } - sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); -out_put: dev_put(dev); out: release_sock(sk); @@ -255,8 +247,7 @@ static int raw_disconnect(struct sock *sk, int flags) return 0; } -static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; @@ -287,7 +278,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, if (size > 
mtu) { pr_debug("size = %Zu, mtu = %u\n", size, mtu); - err = -EINVAL; + err = -EMSGSIZE; goto out_dev; } @@ -327,8 +318,8 @@ out: return err; } -static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; @@ -615,8 +606,7 @@ static int dgram_disconnect(struct sock *sk, int flags) return 0; } -static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) +static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; @@ -715,9 +705,8 @@ out: return err; } -static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, - int flags, int *addr_len) +static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; @@ -742,6 +731,12 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, sock_recv_ts_and_drops(msg, sk, skb); if (saddr) { + /* Clear the implicit padding in struct sockaddr_ieee802154 + * (16 bits between 'family' and 'addr') and in struct + * ieee802154_addr_sa (16 bits at the end of the structure). 
+ */ + memset(saddr, 0, sizeof(*saddr)); + saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); @@ -800,9 +795,9 @@ static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); - pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev); - hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr); + pan_id = dev->ieee802154_ptr->pan_id; + short_addr = dev->ieee802154_ptr->short_addr; + hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { @@ -1017,7 +1012,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, } rc = -ENOMEM; - sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto); + sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c index dff55c2d87f3..133b4280660c 100644 --- a/net/ieee802154/sysfs.c +++ b/net/ieee802154/sysfs.c @@ -48,49 +48,6 @@ static ssize_t name_show(struct device *dev, } static DEVICE_ATTR_RO(name); -#define MASTER_SHOW_COMPLEX(name, format_string, args...) 
\ -static ssize_t name ## _show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); \ - int ret; \ - \ - mutex_lock(&phy->pib_lock); \ - ret = snprintf(buf, PAGE_SIZE, format_string "\n", args); \ - mutex_unlock(&phy->pib_lock); \ - return ret; \ -} \ -static DEVICE_ATTR_RO(name) - -#define MASTER_SHOW(field, format_string) \ - MASTER_SHOW_COMPLEX(field, format_string, phy->field) - -MASTER_SHOW(current_channel, "%d"); -MASTER_SHOW(current_page, "%d"); -MASTER_SHOW(transmit_power, "%d +- 1 dB"); -MASTER_SHOW_COMPLEX(cca_mode, "%d", phy->cca.mode); - -static ssize_t channels_supported_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); - int ret; - int i, len = 0; - - mutex_lock(&phy->pib_lock); - for (i = 0; i < 32; i++) { - ret = snprintf(buf + len, PAGE_SIZE - len, - "%#09x\n", phy->channels_supported[i]); - if (ret < 0) - break; - len += ret; - } - mutex_unlock(&phy->pib_lock); - return len; -} -static DEVICE_ATTR_RO(channels_supported); - static void wpan_phy_release(struct device *dev) { struct cfg802154_registered_device *rdev = dev_to_rdev(dev); @@ -101,12 +58,6 @@ static void wpan_phy_release(struct device *dev) static struct attribute *pmib_attrs[] = { &dev_attr_index.attr, &dev_attr_name.attr, - /* below will be removed soon */ - &dev_attr_current_channel.attr, - &dev_attr_current_page.attr, - &dev_attr_channels_supported.attr, - &dev_attr_transmit_power.attr, - &dev_attr_cca_mode.attr, NULL, }; ATTRIBUTE_GROUPS(pmib); diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c new file mode 100644 index 000000000000..95f997fad755 --- /dev/null +++ b/net/ieee802154/trace.c @@ -0,0 +1,7 @@ +#include <linux/module.h> + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h new file mode 100644 index 
000000000000..9b5f0eb36696 --- /dev/null +++ b/net/ieee802154/trace.h @@ -0,0 +1,277 @@ +/* Based on net/wireless/trace.h */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cfg802154 + +#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __RDEV_CFG802154_OPS_TRACE + +#include <linux/tracepoint.h> + +#include <net/cfg802154.h> + +#define MAXNAME 32 +#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) +#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \ + wpan_phy_name(wpan_phy), \ + MAXNAME) +#define WPAN_PHY_PR_FMT "%s" +#define WPAN_PHY_PR_ARG __entry->wpan_phy_name + +#define WPAN_DEV_ENTRY __field(u32, identifier) +#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \ + ? wpan_dev->identifier : 0) +#define WPAN_DEV_PR_FMT "wpan_dev(%u)" +#define WPAN_DEV_PR_ARG (__entry->identifier) + +#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ + __field(enum nl802154_cca_opts, cca_opt) +#define WPAN_CCA_ASSIGN \ + do { \ + (__entry->cca_mode) = cca->mode; \ + (__entry->cca_opt) = cca->opt; \ + } while (0) +#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d" +#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt + +#define BOOL_TO_STR(bo) (bo) ? "true" : "false" + +/************************************************************* + * rdev->ops traces * + *************************************************************/ + +TRACE_EVENT(802154_rdev_add_virtual_intf, + TP_PROTO(struct wpan_phy *wpan_phy, char *name, + enum nl802154_iftype type, __le64 extended_addr), + TP_ARGS(wpan_phy, name, type, extended_addr), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __string(vir_intf_name, name ? name : "<noname>") + __field(enum nl802154_iftype, type) + __field(__le64, extended_addr) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __assign_str(vir_intf_name, name ? 
name : "<noname>"); + __entry->type = type; + __entry->extended_addr = extended_addr; + ), + TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, extended addr: 0x%llx", + WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type, + __le64_to_cpu(__entry->extended_addr)) +); + +TRACE_EVENT(802154_rdev_del_virtual_intf, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev), + TP_ARGS(wpan_phy, wpan_dev), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG) +); + +TRACE_EVENT(802154_rdev_set_channel, + TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel), + TP_ARGS(wpan_phy, page, channel), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(u8, page) + __field(u8, channel) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->page = page; + __entry->channel = channel; + ), + TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG, + __entry->page, __entry->channel) +); + +TRACE_EVENT(802154_rdev_set_tx_power, + TP_PROTO(struct wpan_phy *wpan_phy, s32 power), + TP_ARGS(wpan_phy, power), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(s32, power) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->power = power; + ), + TP_printk(WPAN_PHY_PR_FMT ", mbm: %d", WPAN_PHY_PR_ARG, + __entry->power) +); + +TRACE_EVENT(802154_rdev_set_cca_mode, + TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca), + TP_ARGS(wpan_phy, cca), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_CCA_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_CCA_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG, + WPAN_CCA_PR_ARG) +); + +TRACE_EVENT(802154_rdev_set_cca_ed_level, + TP_PROTO(struct wpan_phy *wpan_phy, s32 ed_level), + TP_ARGS(wpan_phy, ed_level), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(s32, ed_level) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->ed_level = 
ed_level; + ), + TP_printk(WPAN_PHY_PR_FMT ", ed level: %d", WPAN_PHY_PR_ARG, + __entry->ed_level) +); + +DECLARE_EVENT_CLASS(802154_le16_template, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(__le16, le16arg) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->le16arg = le16arg; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, + __le16_to_cpu(__entry->le16arg)) +); + +DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg) +); + +DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + __le16 le16arg), + TP_ARGS(wpan_phy, wpan_dev, le16arg), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", short addr: 0x%04x", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, + __le16_to_cpu(__entry->le16arg)) +); + +TRACE_EVENT(802154_rdev_set_backoff_exponent, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + u8 min_be, u8 max_be), + TP_ARGS(wpan_phy, wpan_dev, min_be, max_be), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(u8, min_be) + __field(u8, max_be) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->min_be = min_be; + __entry->max_be = max_be; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", min be: %d, max be: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be) +); + +TRACE_EVENT(802154_rdev_set_csma_backoffs, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + u8 max_csma_backoffs), + TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(u8, max_csma_backoffs) + ), + TP_fast_assign( + 
WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->max_csma_backoffs = max_csma_backoffs; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", max csma backoffs: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->max_csma_backoffs) +); + +TRACE_EVENT(802154_rdev_set_max_frame_retries, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + s8 max_frame_retries), + TP_ARGS(wpan_phy, wpan_dev, max_frame_retries), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(s8, max_frame_retries) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->max_frame_retries = max_frame_retries; + ), + + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", max frame retries: %d", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, __entry->max_frame_retries) +); + +TRACE_EVENT(802154_rdev_set_lbt_mode, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + bool mode), + TP_ARGS(wpan_phy, wpan_dev, mode), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(bool, mode) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->mode = mode; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", lbt mode: %s", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode)) +); + +TRACE_EVENT(802154_rdev_return_int, + TP_PROTO(struct wpan_phy *wpan_phy, int ret), + TP_ARGS(wpan_phy, ret), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + __field(int, ret) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + __entry->ret = ret; + ), + TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG, + __entry->ret) +); + +#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index bd2901604842..6fb3c90ad726 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,8 +331,8 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE - tristate "Generic Network Virtualization Encapsulation (Geneve)" +config GENEVE_CORE + tristate "Generic Network Virtualization Encapsulation library" depends on INET select NET_UDP_TUNNEL ---help--- @@ -615,6 +615,22 @@ config TCP_CONG_DCTCP For further details see: http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf +config TCP_CONG_CDG + tristate "CAIA Delay-Gradient (CDG)" + default n + ---help--- + CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies + the TCP sender in order to: + + o Use the delay gradient as a congestion signal. + o Back off with an average probability that is independent of the RTT. + o Coexist with flows that use loss-based congestion control. + o Tolerate packet loss unrelated to congestion. + + For further details see: + D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + delay gradients." In Networking 2011. 
Preprint: http://goo.gl/No3vdg + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -646,6 +662,9 @@ choice config DEFAULT_DCTCP bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_CDG + bool "CDG" if TCP_CONG_CDG=y + config DEFAULT_RENO bool "Reno" endchoice @@ -668,6 +687,7 @@ config DEFAULT_TCP_CONG default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP + default "cdg" if DEFAULT_CDG default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 518c04ed666e..efc43f300b8c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o @@ -56,7 +57,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE) += geneve.o +obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d2e49baaff63..9532ee87151f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -217,7 +217,7 @@ int inet_listen(struct socket *sock, int backlog) * shutdown() (rather than close()). 
*/ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && - inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { + !inet_csk(sk)->icsk_accept_queue.fastopenq) { if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) err = fastopen_init_queue(sk, backlog); else if ((sysctl_tcp_fastopen & @@ -228,6 +228,8 @@ int inet_listen(struct socket *sock, int backlog) err = 0; if (err) goto out; + + tcp_fastopen_init_key_once(true); } err = inet_csk_listen_start(sk, backlog); if (err) @@ -314,11 +316,11 @@ lookup_protocol: answer_flags = answer->flags; rcu_read_unlock(); - WARN_ON(answer_prot->slab == NULL); + WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); - if (sk == NULL) + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); + if (!sk) goto out; err = 0; @@ -488,7 +490,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; @@ -716,8 +719,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_getname); -int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size) +int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -728,7 +730,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->sendmsg(iocb, sk, msg, size); + return sk->sk_prot->sendmsg(sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); @@ -750,8 +752,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, } EXPORT_SYMBOL(inet_sendpage); -int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, 
int flags) +int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; int addr_len = 0; @@ -759,7 +761,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, sock_rps_record_flow(sk); - err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; @@ -1270,7 +1272,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (udpfrag) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); - if (skb->next != NULL) + if (skb->next) iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; } else { @@ -1431,7 +1433,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(family, type, protocol, &sock); + int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; @@ -1441,8 +1443,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, * we do not wish this socket to see incoming packets. 
*/ (*sk)->sk_prot->unhash(*sk); - - sk_change_net(*sk, net); } return rc; } @@ -1598,7 +1598,7 @@ static __net_init int inet_init_net(struct net *net) */ seqlock_init(&net->ipv4.ip_local_ports.lock); net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 61000; + net->ipv4.ip_local_ports.range[1] = 60999; seqlock_init(&net->ipv4.ping_group_range.lock); /* @@ -1675,7 +1675,7 @@ static int __init inet_init(void) struct list_head *r; int rc = -EINVAL; - BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); + sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); rc = proto_register(&tcp_prot, 1); if (rc) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 205e1472aa78..6c8b1fbafce8 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -122,6 +122,7 @@ * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); +static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -149,18 +150,12 @@ static const struct neigh_ops arp_direct_ops = { .connected_output = neigh_direct_output, }; -static const struct neigh_ops arp_broken_ops = { - .family = AF_INET, - .solicit = arp_solicit, - .error_report = arp_error_report, - .output = neigh_compat_output, - .connected_output = neigh_compat_output, -}; - struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, + .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, + .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .id = "arp_cache", @@ -216,7 +211,12 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { - return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd); + return arp_hashfn(pkey, dev, hash_rnd); +} + +static bool arp_key_eq(const struct 
neighbour *neigh, const void *pkey) +{ + return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) @@ -228,7 +228,7 @@ static int arp_constructor(struct neighbour *neigh) rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); - if (in_dev == NULL) { + if (!in_dev) { rcu_read_unlock(); return -EINVAL; } @@ -260,35 +260,6 @@ static int arp_constructor(struct neighbour *neigh) in old paradigm. */ -#if 1 - /* So... these "amateur" devices are hopeless. - The only thing, that I can say now: - It is very sad that we need to keep ugly obsolete - code to make them happy. - - They should be moved to more reasonable state, now - they use rebuild_header INSTEAD OF hard_start_xmit!!! - Besides that, they are sort of out of date - (a lot of redundant clones/copies, useless in 2.1), - I wonder why people believe that they work. - */ - switch (dev->type) { - default: - break; - case ARPHRD_ROSE: -#if IS_ENABLED(CONFIG_AX25) - case ARPHRD_AX25: -#if IS_ENABLED(CONFIG_NETROM) - case ARPHRD_NETROM: -#endif - neigh->ops = &arp_broken_ops; - neigh->output = neigh->ops->output; - return 0; -#else - break; -#endif - } -#endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); @@ -433,71 +404,6 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) return flag; } -/* OBSOLETE FUNCTIONS */ - -/* - * Find an arp mapping in the cache. If not found, post a request. - * - * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, - * even if it exists. It is supposed that skb->dev was mangled - * by a virtual device (eql, shaper). Nobody but broken devices - * is allowed to use this function, it is scheduled to be removed. 
--ANK - */ - -static int arp_set_predefined(int addr_hint, unsigned char *haddr, - __be32 paddr, struct net_device *dev) -{ - switch (addr_hint) { - case RTN_LOCAL: - pr_debug("arp called for own IP address\n"); - memcpy(haddr, dev->dev_addr, dev->addr_len); - return 1; - case RTN_MULTICAST: - arp_mc_map(paddr, haddr, dev, 1); - return 1; - case RTN_BROADCAST: - memcpy(haddr, dev->broadcast, dev->addr_len); - return 1; - } - return 0; -} - - -int arp_find(unsigned char *haddr, struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - __be32 paddr; - struct neighbour *n; - - if (!skb_dst(skb)) { - pr_debug("arp_find is called with dst==NULL\n"); - kfree_skb(skb); - return 1; - } - - paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); - if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, - paddr, dev)) - return 0; - - n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); - - if (n) { - n->used = jiffies; - if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { - neigh_ha_snapshot(haddr, n, dev); - neigh_release(n); - return 0; - } - neigh_release(n); - } else - kfree_skb(skb); - return 1; -} -EXPORT_SYMBOL(arp_find); - -/* END OF OBSOLETE FUNCTIONS */ - /* * Check if we can use proxy ARP for this path */ @@ -569,7 +475,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev, */ /* - * Create an arp packet. If (dest_hw == NULL), we create a broadcast + * Create an arp packet. If dest_hw is not set, we create a broadcast * message. 
*/ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, @@ -589,7 +495,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); - if (skb == NULL) + if (!skb) return NULL; skb_reserve(skb, hlen); @@ -597,9 +503,9 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); - if (src_hw == NULL) + if (!src_hw) src_hw = dev->dev_addr; - if (dest_hw == NULL) + if (!dest_hw) dest_hw = dev->broadcast; /* @@ -663,7 +569,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, break; #endif default: - if (target_hw != NULL) + if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); @@ -685,7 +591,8 @@ EXPORT_SYMBOL(arp_create); void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb, + NULL, skb->dev, dev_queue_xmit_sk); } EXPORT_SYMBOL(arp_xmit); @@ -708,7 +615,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); - if (skb == NULL) + if (!skb) return; arp_xmit(skb); @@ -719,7 +626,7 @@ EXPORT_SYMBOL(arp_send); * Process an arp request. */ -static int arp_process(struct sk_buff *skb) +static int arp_process(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -738,7 +645,7 @@ static int arp_process(struct sk_buff *skb) * is ARP'able. 
*/ - if (in_dev == NULL) + if (!in_dev) goto out; arp = arp_hdr(skb); @@ -902,7 +809,7 @@ static int arp_process(struct sk_buff *skb) is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && inet_addr_type(net, sip) == RTN_UNICAST; - if (n == NULL && + if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); @@ -940,7 +847,7 @@ out: static void parp_redo(struct sk_buff *skb) { - arp_process(skb); + arp_process(NULL, skb); } @@ -973,7 +880,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb, + dev, NULL, arp_process); consumeskb: consume_skb(skb); @@ -994,7 +902,7 @@ out_of_mem: static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { - if (dev == NULL) { + if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } @@ -1020,7 +928,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -ENODEV; } if (mask) { - if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL) + if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; } @@ -1041,7 +949,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; - if (dev == NULL) { + if (!dev) { struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); if (IS_ERR(rt)) @@ -1109,14 +1017,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { - read_lock_bh(&neigh->lock); - memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); - r->arp_flags = arp_state_to_flags(neigh); - read_unlock_bh(&neigh->lock); - r->arp_ha.sa_family = dev->type; - strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + if 
(!(neigh->nud_state & NUD_NOARP)) { + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + r->arp_ha.sa_family = dev->type; + strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + err = 0; + } neigh_release(neigh); - err = 0; } return err; } @@ -1161,7 +1071,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (dev == NULL) { + if (!dev) { struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -1210,7 +1120,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (r.arp_dev[0]) { err = -ENODEV; dev = __dev_get_by_name(net, r.arp_dev); - if (dev == NULL) + if (!dev) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e361ea6f3fc8..bdb2a07ec363 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -255,7 +255,7 @@ static int __init cipso_v4_cache_init(void) cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, sizeof(struct cipso_v4_map_cache_bkt), GFP_KERNEL); - if (cipso_v4_cache == NULL) + if (!cipso_v4_cache) return -ENOMEM; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { @@ -339,7 +339,7 @@ static int cipso_v4_cache_check(const unsigned char *key, secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CIPSOV4; - if (prev_entry == NULL) { + if (!prev_entry) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } @@ -393,10 +393,10 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr, cipso_ptr_len = cipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) + if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); - if (entry->key == NULL) { + if (!entry->key) { ret_val = -ENOMEM; goto 
cache_add_failure; } @@ -502,7 +502,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, atomic_set(&doi_def->refcount, 1); spin_lock(&cipso_v4_doi_list_lock); - if (cipso_v4_doi_search(doi_def->doi) != NULL) { + if (cipso_v4_doi_search(doi_def->doi)) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; @@ -513,7 +513,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); - if (audit_buf != NULL) { + if (audit_buf) { const char *type_str; switch (doi_type) { case CIPSO_V4_MAP_TRANS: @@ -547,7 +547,7 @@ doi_add_return: */ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) { - if (doi_def == NULL) + if (!doi_def) return; switch (doi_def->type) { @@ -598,7 +598,7 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) { + if (!doi_def) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; @@ -617,7 +617,7 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); - if (audit_buf != NULL) { + if (audit_buf) { audit_log_format(audit_buf, " cipso_doi=%u res=%u", doi, ret_val == 0 ? 
1 : 0); @@ -644,7 +644,7 @@ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) + if (!doi_def) goto doi_getdef_return; if (!atomic_inc_not_zero(&doi_def->refcount)) doi_def = NULL; @@ -664,7 +664,7 @@ doi_getdef_return: */ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) { - if (doi_def == NULL) + if (!doi_def) return; if (!atomic_dec_and_test(&doi_def->refcount)) @@ -1642,7 +1642,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) rcu_read_lock(); doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); - if (doi_def == NULL) { + if (!doi_def) { err_offset = 2; goto validate_return_locked; } @@ -1736,7 +1736,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) * not the loopback device drop the packet. Further, * there is no legitimate reason for setting this from * userspace so reject it if skb is NULL. */ - if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) { + if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } @@ -1897,7 +1897,7 @@ int cipso_v4_sock_setattr(struct sock *sk, * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ - if (sk == NULL) + if (!sk) return 0; /* We allocate the maximum CIPSO option size here so we are probably @@ -1905,7 +1905,7 @@ int cipso_v4_sock_setattr(struct sock *sk, * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); - if (buf == NULL) { + if (!buf) { ret_val = -ENOMEM; goto socket_setattr_failure; } @@ -1921,7 +1921,7 @@ int cipso_v4_sock_setattr(struct sock *sk, * set the IPOPT_CIPSO option. 
*/ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); - if (opt == NULL) { + if (!opt) { ret_val = -ENOMEM; goto socket_setattr_failure; } @@ -1981,7 +1981,7 @@ int cipso_v4_req_setattr(struct request_sock *req, * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); - if (buf == NULL) { + if (!buf) { ret_val = -ENOMEM; goto req_setattr_failure; } @@ -1997,7 +1997,7 @@ int cipso_v4_req_setattr(struct request_sock *req, * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); - if (opt == NULL) { + if (!opt) { ret_val = -ENOMEM; goto req_setattr_failure; } @@ -2102,7 +2102,7 @@ void cipso_v4_sock_delattr(struct sock *sk) sk_inet = inet_sk(sk); opt = rcu_dereference_protected(sk_inet->inet_opt, 1); - if (opt == NULL || opt->opt.cipso == 0) + if (!opt || opt->opt.cipso == 0) return; hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); @@ -2128,7 +2128,7 @@ void cipso_v4_req_delattr(struct request_sock *req) req_inet = inet_rsk(req); opt = req_inet->opt; - if (opt == NULL || opt->opt.cipso == 0) + if (!opt || opt->opt.cipso == 0) return; cipso_v4_delopt(&req_inet->opt); @@ -2157,7 +2157,7 @@ int cipso_v4_getattr(const unsigned char *cipso, doi = get_unaligned_be32(&cipso[2]); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) + if (!doi_def) goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 90c0e8386116..574fad9cca05 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -20,7 +20,7 @@ #include <net/route.h> #include <net/tcp_states.h> -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet 
= inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; @@ -39,8 +39,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); - lock_sock(sk); - oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { @@ -82,9 +80,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_set(sk, &rt->dst); err = 0; out: - release_sock(sk); return err; } +EXPORT_SYMBOL(__ip4_datagram_connect); + +int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip4_datagram_connect(sk, uaddr, addr_len); + release_sock(sk); + return res; +} EXPORT_SYMBOL(ip4_datagram_connect); /* Because UDP xmit path can manipulate sk_dst_cache without holding diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3a8985c94581..2d9cb1748f81 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -107,7 +107,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static u32 inet_addr_hash(struct net *net, __be32 addr) +static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = (__force u32) addr ^ net_hash_mix(net); @@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } +static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) +{ + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ifa->ifa_address, + .imr_ifindex = ifa->ifa_dev->dev->ifindex, + }; + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = ip_mc_join_group(sk, &mreq); + else + ret = ip_mc_leave_group(sk, &mreq); + release_sock(sk); + + return ret; +} + static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -565,7 +585,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) ifm = 
nlmsg_data(nlh); in_dev = inetdev_by_index(net, ifm->ifa_index); - if (in_dev == NULL) { + if (!in_dev) { err = -ENODEV; goto errout; } @@ -573,7 +593,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && - ifa->ifa_local != nla_get_be32(tb[IFA_LOCAL])) + ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) @@ -581,9 +601,11 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || - !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) + !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; + if (ipv4_is_multicast(ifa->ifa_address)) + ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -733,21 +755,21 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); err = -EINVAL; - if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL) + if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL]) goto errout; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; - if (dev == NULL) + if (!dev) goto errout; in_dev = __in_dev_get_rtnl(dev); err = -ENOBUFS; - if (in_dev == NULL) + if (!in_dev) goto errout; ifa = inet_alloc_ifa(); - if (ifa == NULL) + if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. 
@@ -758,7 +780,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, neigh_parms_data_state_setall(in_dev->arp_parms); in_dev_hold(in_dev); - if (tb[IFA_ADDRESS] == NULL) + if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; INIT_HLIST_NODE(&ifa->hash); @@ -769,11 +791,11 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; - ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]); - ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]); + ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); + ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) - ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); + ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); @@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) * userspace already relies on not having to provide this. 
*/ set_ifa_lifetime(ifa, valid_lft, prefered_lft); + if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ip_mc_config(net->ipv4.mc_autojoin_sk, + true, ifa); + + if (ret < 0) { + inet_free_ifa(ifa); + return ret; + } + } return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); } else { inet_free_ifa(ifa); @@ -851,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); - blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } return 0; } @@ -1259,7 +1289,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 addr = 0; struct net_device *dev; - if (in_dev != NULL) + if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); @@ -1309,7 +1339,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) if (named++ == 0) goto skip; dot = strchr(old, ':'); - if (dot == NULL) { + if (!dot) { sprintf(old, ":%d", named); dot = old; } @@ -1478,7 +1508,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, u32 preferred, valid; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); @@ -1510,11 +1540,11 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && - nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || + nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && - nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || + nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && - nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || + nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || @@ -1597,7 
+1627,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); - if (skb == NULL) + if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); @@ -1634,7 +1664,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); - if (nla == NULL) + if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) @@ -1709,6 +1739,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); return size; } @@ -1723,7 +1755,7 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; ncm = nlmsg_data(nlh); @@ -1749,6 +1781,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -1765,7 +1801,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, @@ -1788,6 +1824,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] 
= { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -1822,10 +1859,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, break; default: dev = __dev_get_by_index(net, ifindex); - if (dev == NULL) + if (!dev) goto errout; in_dev = __in_dev_get_rtnl(dev); - if (in_dev == NULL) + if (!in_dev) goto errout; devconf = &in_dev->cnf; break; @@ -1833,7 +1870,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, @@ -2017,6 +2054,12 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, ifindex, cnf); } + if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + ifindex, cnf); + } } return ret; @@ -2138,6 +2181,8 @@ static struct devinet_sysctl_table { "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), + DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, + "ignore_routes_with_linkdown"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2184,7 +2229,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) { struct devinet_sysctl_table *t = cnf->sysctl; - if (t == NULL) + if (!t) return; cnf->sysctl = NULL; @@ -2245,16 +2290,16 @@ static __net_init int devinet_init_net(struct net *net) if (!net_eq(net, &init_net)) { all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); - if (all == NULL) + if (!all) goto err_alloc_all; dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); - if (dflt == NULL) 
+ if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); - if (tbl == NULL) + if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; @@ -2274,7 +2319,7 @@ static __net_init int devinet_init_net(struct net *net) err = -ENOMEM; forw_hdr = register_net_sysctl(net, "net/ipv4", tbl); - if (forw_hdr == NULL) + if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 60173d4d3a0e..477937465a20 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -49,7 +49,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) len = ALIGN(len, crypto_tfm_ctx_alignment()); } - len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len += sizeof(struct aead_request) + crypto_aead_reqsize(aead); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; @@ -68,17 +68,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } -static inline struct aead_givcrypt_request *esp_tmp_givreq( - struct crypto_aead *aead, u8 *iv) -{ - struct aead_givcrypt_request *req; - - req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), - crypto_tfm_ctx_alignment()); - aead_givcrypt_set_tfm(req, aead); - return req; -} - static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) { struct aead_request *req; @@ -97,14 +86,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static inline struct scatterlist *esp_givreq_sg( - struct crypto_aead *aead, struct aead_givcrypt_request *req) -{ - return (void *)ALIGN((unsigned long)(req + 1) + - crypto_aead_reqsize(aead), - __alignof__(struct scatterlist)); -} - static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -113,14 
+94,37 @@ static void esp_output_done(struct crypto_async_request *base, int err) xfrm_output_resume(skb, err); } +/* Move ESP header back into place. */ +static void esp_restore_header(struct sk_buff *skb, unsigned int offset) +{ + struct ip_esp_hdr *esph = (void *)(skb->data + offset); + void *tmp = ESP_SKB_CB(skb)->tmp; + __be32 *seqhi = esp_tmp_seqhi(tmp); + + esph->seq_no = esph->spi; + esph->spi = *seqhi; +} + +static void esp_output_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); +} + +static void esp_output_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_output_restore_header(skb); + esp_output_done(base, err); +} + static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { int err; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct aead_givcrypt_request *req; + struct aead_request *req; struct scatterlist *sg; - struct scatterlist *asg; struct sk_buff *trailer; void *tmp; u8 *iv; @@ -129,17 +133,19 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int clen; int alen; int plen; + int ivlen; int tfclen; int nfrags; int assoclen; - int sglists; int seqhilen; __be32 *seqhi; + __be64 seqno; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); tfclen = 0; if (x->tfcpad) { @@ -160,16 +166,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) nfrags = err; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) { err = -ENOMEM; goto error; @@ -177,9 +181,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); - req = esp_tmp_givreq(aead, 
iv); - asg = esp_givreq_sg(aead, req); - sg = asg + sglists; + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); /* Fill padding... */ tail = skb_tail_pointer(trailer); @@ -235,36 +238,53 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) *skb_mac_header(skb) = IPPROTO_UDP; } - esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + aead_request_set_callback(req, 0, esp_output_done, skb); + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. + */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); + *seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + } + + esph->spi = x->id.spi; + sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, - esph->enc_data + crypto_aead_ivsize(aead) - skb->data, - clen + alen); + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); - if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); - - aead_givcrypt_set_callback(req, 0, esp_output_done, skb); - aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, assoclen); - aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); + aead_request_set_ad(req, assoclen); + + seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + memset(iv, 0, ivlen); + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; - err = crypto_aead_givencrypt(req); - if (err == 
-EINPROGRESS) + err = crypto_aead_encrypt(req); + + switch (err) { + case -EINPROGRESS: goto error; - if (err == -EBUSY) + case -EBUSY: err = NET_XMIT_DROP; + break; + + case 0: + if ((x->props.flags & XFRM_STATE_ESN)) + esp_output_restore_header(skb); + } kfree(tmp); @@ -363,6 +383,20 @@ static void esp_input_done(struct crypto_async_request *base, int err) xfrm_input_resume(skb, esp_input_done2(skb, err)); } +static void esp_input_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, 0); + __skb_pull(skb, 4); +} + +static void esp_input_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_input_restore_header(skb); + esp_input_done(base, err); +} + /* * Note: detecting truncated vs. non-truncated authentication data is very * expensive, so we only support truncated data, which is the recommended @@ -374,19 +408,18 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; - int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int ivlen = crypto_aead_ivsize(aead); + int elen = skb->len - sizeof(*esph) - ivlen; int nfrags; int assoclen; - int sglists; int seqhilen; __be32 *seqhi; void *tmp; u8 *iv; struct scatterlist *sg; - struct scatterlist *asg; int err = -EINVAL; - if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) + if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) goto out; if (elen <= 0) @@ -399,17 +432,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = err; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } err = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) goto out; @@ -417,36 +448,39 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) seqhi = 
esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); - asg = esp_req_sg(aead, req); - sg = asg + sglists; + sg = esp_req_sg(aead, req); skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr *)skb->data; - /* Get ivec. This can be wrong, check against another impls. */ - iv = esph->enc_data; - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + aead_request_set_callback(req, 0, esp_input_done, skb); + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. + */ if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi); + aead_request_set_callback(req, 0, esp_input_done_esn, skb); + } - aead_request_set_callback(req, 0, esp_input_done, skb); - aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, assoclen); + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); + aead_request_set_ad(req, assoclen); err = crypto_aead_decrypt(req); if (err == -EINPROGRESS) goto out; + if ((x->props.flags & XFRM_STATE_ESN)) + esp_input_restore_header(skb); + err = esp_input_done2(skb, err); out: @@ -518,10 +552,16 @@ static void esp_destroy(struct xfrm_state *x) static int esp_init_aead(struct xfrm_state *x) { + char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = -ENAMETOOLONG; + if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", + x->geniv, x->aead->alg_name) >= 
CRYPTO_MAX_ALG_NAME) + goto error; + + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -553,22 +593,26 @@ static int esp_init_authenc(struct xfrm_state *x) int err; err = -EINVAL; - if (x->ealg == NULL) + if (!x->ealg) goto error; err = -ENAMETOOLONG; if ((x->props.flags & XFRM_STATE_ESN)) { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authencesn(%s,%s)", + "%s%sauthencesn(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } else { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authenc(%s,%s)", + "%s%sauthenc(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 23b9b3e86f4c..6bbc54940eb4 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -52,12 +52,12 @@ static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; - local_table = fib_trie_table(RT_TABLE_LOCAL); - if (local_table == NULL) + main_table = fib_trie_table(RT_TABLE_MAIN, NULL); + if (!main_table) return -ENOMEM; - main_table = fib_trie_table(RT_TABLE_MAIN); - if (main_table == NULL) + local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); + if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, @@ -67,14 +67,14 @@ static int __net_init fib4_rules_init(struct net *net) return 0; fail: - fib_free_table(local_table); + fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { - struct fib_table *tb; + struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) @@ -83,23 +83,23 @@ struct fib_table 
*fib_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib_trie_table(id); + if (id == RT_TABLE_LOCAL) + alias = fib_new_table(net, RT_TABLE_MAIN); + + tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_LOCAL: - net->ipv4.fib_local = tb; + rcu_assign_pointer(net->ipv4.fib_local, tb); break; - case RT_TABLE_MAIN: - net->ipv4.fib_main = tb; + rcu_assign_pointer(net->ipv4.fib_main, tb); break; - case RT_TABLE_DEFAULT: - net->ipv4.fib_default = tb; + rcu_assign_pointer(net->ipv4.fib_default, tb); break; - default: break; } @@ -129,16 +129,62 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ +static void fib_replace_table(struct net *net, struct fib_table *old, + struct fib_table *new) +{ +#ifdef CONFIG_IP_MULTIPLE_TABLES + switch (new->tb_id) { + case RT_TABLE_LOCAL: + rcu_assign_pointer(net->ipv4.fib_local, new); + break; + case RT_TABLE_MAIN: + rcu_assign_pointer(net->ipv4.fib_main, new); + break; + case RT_TABLE_DEFAULT: + rcu_assign_pointer(net->ipv4.fib_default, new); + break; + default: + break; + } + +#endif + /* replace the old table in the hlist */ + hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); +} + +int fib_unmerge(struct net *net) +{ + struct fib_table *old, *new; + + /* attempt to fetch local table if it has been allocated */ + old = fib_get_table(net, RT_TABLE_LOCAL); + if (!old) + return 0; + + new = fib_trie_unmerge(old); + if (!new) + return -ENOMEM; + + /* replace merged table with clean table */ + if (new != old) { + fib_replace_table(net, old, new); + fib_free_table(old); + } + + return 0; +} + static void fib_flush(struct net *net) { int flushed = 0; - struct fib_table *tb; - struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *tmp; + struct fib_table *tb; + + 
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(tb); } @@ -146,6 +192,19 @@ static void fib_flush(struct net *net) rt_cache_flush(net); } +void fib_flush_external(struct net *net) +{ + struct fib_table *tb; + struct hlist_head *head; + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, head, tb_hlist) + fib_table_flush_external(tb); + } +} + /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. @@ -221,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; - if (!fib_lookup(net, &fl4, &res)) + if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { scope = RT_SCOPE_LINK; @@ -260,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? 
skb->mark : 0; net = dev_net(dev); - if (fib_lookup(net, &fl4, &res)) + if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) @@ -295,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl4, &res) == 0) { + if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } @@ -427,7 +486,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) if (strcmp(ifa->ifa_label, devname) == 0) break; - if (ifa == NULL) + if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } @@ -455,7 +514,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, int len = 0; mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL); - if (mx == NULL) + if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) @@ -617,7 +676,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; tb = fib_get_table(net, cfg.fc_table); - if (tb == NULL) { + if (!tb) { err = -ESRCH; goto errout; } @@ -639,7 +698,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; tb = fib_new_table(net, cfg.fc_table); - if (tb == NULL) { + if (!tb) { err = -ENOBUFS; goto errout; } @@ -665,10 +724,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_e = cb->args[1]; + rcu_read_lock(); + for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -682,6 +743,8 @@ next: } } out: + rcu_read_unlock(); + cb->args[1] = e; cb->args[0] = h; @@ -716,7 +779,7 @@ static void fib_magic(int 
cmd, int type, __be32 dst, int dst_len, struct in_ifad else tb = fib_new_table(net, RT_TABLE_LOCAL); - if (tb == NULL) + if (!tb) return; cfg.fc_table = tb->tb_id; @@ -743,7 +806,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); - if (prim == NULL) { + if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } @@ -797,7 +860,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); - if (prim == NULL) { + if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } @@ -967,7 +1030,7 @@ static void nl_fib_input(struct sk_buff *skb) return; skb = netlink_skb_clone(skb, GFP_KERNEL); - if (skb == NULL) + if (!skb) return; nlh = nlmsg_hdr(skb); @@ -988,7 +1051,7 @@ static int __net_init nl_fib_lookup_init(struct net *net) }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); - if (sk == NULL) + if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; @@ -1000,9 +1063,9 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, unsigned long event) { - if (fib_sync_down_dev(dev, force)) + if (fib_sync_down_dev(dev, event)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1018,7 +1081,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev)); @@ -1026,11 +1089,11 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); - if (ifa->ifa_dev->ifa_list == NULL) { + if 
(!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, event); } else { rt_cache_flush(dev_net(dev)); } @@ -1044,9 +1107,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct net *net = dev_net(dev); + unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, event); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1061,16 +1125,22 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_add_ifaddr(ifa); } endfor_ifa(in_dev); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(dev); + fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, event); break; - case NETDEV_CHANGEMTU: case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + fib_sync_up(dev, RTNH_F_LINKDOWN); + else + fib_sync_down_dev(dev, event); + /* fall through */ + case NETDEV_CHANGEMTU: rt_cache_flush(net); break; } @@ -1094,7 +1164,7 @@ static int __net_init ip_fib_net_init(struct net *net) size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); - if (net->ipv4.fib_table_hash == NULL) + if (!net->ipv4.fib_table_hash) return -ENOMEM; err = fib4_rules_init(net); @@ -1113,20 +1183,25 @@ static void ip_fib_net_exit(struct net *net) rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES - fib4_rules_exit(net); + RCU_INIT_POINTER(net->ipv4.fib_local, NULL); + RCU_INIT_POINTER(net->ipv4.fib_main, NULL); + RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif for (i = 0; i < FIB_TABLE_HASHSZ; i++) { - struct fib_table *tb; - struct hlist_head *head; + struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; + struct fib_table 
*tb; - head = &net->ipv4.fib_table_hash[i]; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(tb); fib_free_table(tb); } } + +#ifdef CONFIG_IP_MULTIPLE_TABLES + fib4_rules_exit(net); +#endif rtnl_unlock(); kfree(net->ipv4.fib_table_hash); } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 825981b1049a..9c02920725db 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -6,11 +6,14 @@ #include <net/ip_fib.h> struct fib_alias { - struct list_head fa_list; + struct hlist_node fa_list; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; u8 fa_state; + u8 fa_slen; + u32 tb_id; + s16 fa_default; struct rcu_head rcu; }; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index d3db718be51d..18123d50f576 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,11 +47,12 @@ struct fib4_rule { #endif }; -int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, + struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, - .flags = FIB_LOOKUP_NOREF, + .flags = flags, }; int err; @@ -153,7 +154,7 @@ static struct fib_table *fib_empty_table(struct net *net) u32 id; for (id = 1; id <= RT_TABLE_MAX; id++) - if (fib_get_table(net, id) == NULL) + if (!fib_get_table(net, id)) return fib_new_table(net, id); return NULL; } @@ -174,12 +175,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->tos & ~IPTOS_TOS_MASK) goto errout; + /* split local/main if they are not already split */ + err = fib_unmerge(net); + if (err) + goto errout; + if (rule->table == RT_TABLE_UNSPEC) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; table = fib_empty_table(net); - if (table == NULL) { + if (!table) { err = -ENOBUFS; goto errout; } @@ -189,10 +195,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } if (frh->src_len) - rule4->src = 
nla_get_be32(tb[FRA_SRC]); + rule4->src = nla_get_in_addr(tb[FRA_SRC]); if (frh->dst_len) - rule4->dst = nla_get_be32(tb[FRA_DST]); + rule4->dst = nla_get_in_addr(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) { @@ -209,21 +215,31 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; + fib_flush_external(rule->fr_net); + err = 0; errout: return err; } -static void fib4_rule_delete(struct fib_rule *rule) +static int fib4_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; -#ifdef CONFIG_IP_ROUTE_CLASSID - struct fib4_rule *rule4 = (struct fib4_rule *) rule; + int err; - if (rule4->tclassid) + /* split local/main if they are not already split */ + err = fib_unmerge(net); + if (err) + goto errout; + +#ifdef CONFIG_IP_ROUTE_CLASSID + if (((struct fib4_rule *)rule)->tclassid) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; + fib_flush_external(rule->fr_net); +errout: + return err; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, @@ -245,10 +261,10 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; #endif - if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC]))) + if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC]))) return 0; - if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST]))) + if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST]))) return 0; return 1; @@ -264,9 +280,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->tos = rule4->tos; if ((rule4->dst_len && - nla_put_be32(skb, FRA_DST, rule4->dst)) || + nla_put_in_addr(skb, FRA_DST, rule4->dst)) || (rule4->src_len && - nla_put_be32(skb, FRA_SRC, rule4->src))) + nla_put_in_addr(skb, FRA_SRC, rule4->src))) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid && diff --git a/net/ipv4/fib_semantics.c 
b/net/ipv4/fib_semantics.c index 1e2090ea663e..3a06586b170c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - release_net(fi->fib_net); if (fi->fib_metrics != (u32 *) dst_default_metrics) kfree(fi->fib_metrics); kfree(fi); @@ -267,7 +266,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; } endfor_nexthops(fi); @@ -319,7 +318,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_type == fi->fib_type && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -391,7 +390,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int err = -ENOBUFS; skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); - if (skb == NULL) + if (!skb) goto errout; err = fib_dump_info(skb, info->portid, seq, event, tb_id, @@ -469,7 +468,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; + nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0; #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? 
nla_get_u32(nla) : 0; @@ -504,7 +503,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (cfg->fc_mp == NULL) + if (!cfg->fc_mp) return 0; rtnh = cfg->fc_mp; @@ -524,7 +523,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_be32(nla) != nh->nh_gw) + if (nla && nla_get_in_addr(nla) != nh->nh_gw) return 1; #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); @@ -605,6 +604,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -622,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); if (err) { rcu_read_unlock(); return err; @@ -637,6 +639,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -647,7 +651,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); - if (in_dev == NULL) + if (!in_dev) goto out; err = -ENETDOWN; if (!(in_dev->dev->flags & IFF_UP)) @@ -655,6 +659,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->nh_dev)) + nh->nh_flags |= RTNH_F_LINKDOWN; err = 0; } out: @@ -714,8 +720,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *dest; unsigned int new_hash; - hlist_del(&fi->fib_hash); - new_hash = fib_info_hashfn(fi); dest = &new_info_hash[new_hash]; hlist_add_head(&fi->fib_hash, dest); @@ -732,8 +736,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, struct hlist_head *ldest; unsigned int new_hash; - hlist_del(&fi->fib_lhash); - new_hash = fib_laddr_hashfn(fi->fib_prefsrc); ldest = &new_laddrhash[new_hash]; hlist_add_head(&fi->fib_lhash, ldest); @@ -804,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); - if (fi == NULL) + if (!fi) goto failure; fib_info_cnt++; if (cfg->fc_mx) { @@ -814,7 +816,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else fi->fib_metrics = (u32 *) dst_default_metrics; - fi->fib_net = hold_net(net); + fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; @@ -922,14 +924,20 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_scope = RT_SCOPE_NOWHERE; nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); err = -ENODEV; - if (nh->nh_dev == NULL) + if (!nh->nh_dev) goto failure; } else { + int linkdown = 0; + change_nexthops(fi) { err = fib_check_nh(cfg, fi, nexthop_nh); if (err != 0) goto failure; + if 
(nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + linkdown++; } endfor_nexthops(fi) + if (linkdown == fi->fib_nhs) + fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc) { @@ -996,7 +1004,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); @@ -1016,7 +1024,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && - nla_put_be32(skb, RTA_DST, dst)) + nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) @@ -1025,15 +1033,23 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (fi->fib_prefsrc && - nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) + nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { + struct in_device *in_dev; + if (fi->fib_nh->nh_gw && - nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) + nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) goto nla_put_failure; + if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtm->rtm_flags |= RTNH_F_DEAD; + } #ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid && nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) @@ -1046,20 +1062,28 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, struct nlattr *mp; mp = nla_nest_start(skb, RTA_MULTIPATH); - if (mp == NULL) + if (!mp) goto nla_put_failure; for_nexthops(fi) { + struct in_device *in_dev; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); - if (rtnh == NULL) + if (!rtnh) goto nla_put_failure; rtnh->rtnh_flags 
= nh->nh_flags & 0xFF; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + in_dev = __in_dev_get_rtnl(nh->nh_dev); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rtnh->rtnh_flags |= RTNH_F_DEAD; + } rtnh->rtnh_hops = nh->nh_weight - 1; rtnh->rtnh_ifindex = nh->nh_oif; if (nh->nh_gw && - nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) + nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw)) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid && @@ -1094,7 +1118,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) struct hlist_head *head = &fib_info_laddrhash[hash]; struct fib_info *fi; - if (fib_info_laddrhash == NULL || local == 0) + if (!fib_info_laddrhash || local == 0) return 0; hlist_for_each_entry(fi, head, fib_lhash) { @@ -1108,7 +1132,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, int force) +int fib_sync_down_dev(struct net_device *dev, unsigned long event) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1117,7 +1141,8 @@ int fib_sync_down_dev(struct net_device *dev, int force) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (force) + if (event == NETDEV_UNREGISTER || + event == NETDEV_DOWN) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1134,7 +1159,15 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nexthop_nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + break; + } #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); fi->fib_power -= nexthop_nh->nh_power; @@ -1144,14 +1177,23 @@ int fib_sync_down_dev(struct net_device *dev, int force) dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nexthop_nh->nh_dev == dev) { + if (event == 
NETDEV_UNREGISTER && + nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { - fi->fib_flags |= RTNH_F_DEAD; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + fi->fib_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + fi->fib_flags |= RTNH_F_LINKDOWN; + break; + } ret++; } } @@ -1160,68 +1202,85 @@ int fib_sync_down_dev(struct net_device *dev, int force) } /* Must be invoked inside of an RCU protected region. */ -void fib_select_default(struct fib_result *res) +void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; - struct list_head *fa_head = res->fa_head; + struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; + u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; - struct fib_alias *fa; + struct fib_alias *fa, *fa1 = NULL; + u32 last_prio = res->fi->fib_priority; + u8 last_tos = 0; - list_for_each_entry_rcu(fa, fa_head, fa_list) { + hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; + if (fa->fa_slen != slen) + continue; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fa->tb_id != tb->tb_id) + continue; + if (next_fi->fib_priority > last_prio && + fa->fa_tos == last_tos) { + if (last_tos) + continue; + break; + } + if (next_fi->fib_flags & RTNH_F_DEAD) + continue; + last_tos = fa->fa_tos; + last_prio = next_fi->fib_priority; + if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); - if (fi == NULL) { + if (!fi) { if (next_fi != res->fi) break; + fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { + &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); - 
tb->tb_default = order; + fa1->fa_default = order; goto out; } fi = next_fi; order++; } - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; + if (order <= 0 || !fi) { + if (fa1) + fa1->fa_default = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { + fa1->fa_default)) { fib_result_assign(res, fi); - tb->tb_default = order; + fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); - tb->tb_default = last_idx; + fa1->fa_default = last_idx; out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev) +int fib_sync_up(struct net_device *dev, unsigned int nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1248,25 +1307,29 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & nh_flags)) { alive++; continue; } - if (nexthop_nh->nh_dev == NULL || + if (!nexthop_nh->nh_dev || !(nexthop_nh->nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); nexthop_nh->nh_power = 0; - nexthop_nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_flags &= ~nh_flags; spin_unlock_bh(&fib_multipath_lock); +#else + nexthop_nh->nh_flags &= ~nh_flags; +#endif } endfor_nexthops(fi) if (alive > 0) { - fi->fib_flags &= ~RTNH_F_DEAD; + fi->fib_flags &= ~nh_flags; ret++; } } @@ -1274,6 +1337,8 @@ int fib_sync_up(struct net_device *dev) return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* * The algorithm is suboptimal, but it provides really * fair weighted route distribution. 
@@ -1281,16 +1346,22 @@ int fib_sync_up(struct net_device *dev) void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; + struct in_device *in_dev; int w; spin_lock_bh(&fib_multipath_lock); if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - power += nexthop_nh->nh_weight; - nexthop_nh->nh_power = nexthop_nh->nh_weight; - } + in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev); + if (nexthop_nh->nh_flags & RTNH_F_DEAD) + continue; + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + continue; + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } endfor_nexthops(fi); fi->fib_power = power; if (power <= 0) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3daf0224ff2e..37c4bb89a708 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -72,6 +72,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -79,6 +80,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/switchdev.h> #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -88,38 +90,35 @@ typedef unsigned int t_key; -#define IS_TNODE(n) ((n)->bits) -#define IS_LEAF(n) (!(n)->bits) +#define IS_TRIE(n) ((n)->pos >= KEYLENGTH) +#define IS_TNODE(n) ((n)->bits) +#define IS_LEAF(n) (!(n)->bits) -#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) - -struct tnode { +struct key_vector { t_key key; - unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; - struct tnode __rcu *parent; - struct rcu_head rcu; union { - /* The fields in this struct are valid if bits > 0 (TNODE) */ - struct { - t_key empty_children; /* KEYLENGTH bits needed 
*/ - t_key full_children; /* KEYLENGTH bits needed */ - struct tnode __rcu *child[0]; - }; - /* This list pointer if valid if bits == 0 (LEAF) */ - struct hlist_head list; + /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ + struct hlist_head leaf; + /* This array is valid if (pos | bits) > 0 (TNODE) */ + struct key_vector __rcu *tnode[0]; }; }; -struct leaf_info { - struct hlist_node hlist; - int plen; - u32 mask_plen; /* ntohl(inet_make_mask(plen)) */ - struct list_head falh; +struct tnode { struct rcu_head rcu; + t_key empty_children; /* KEYLENGTH bits needed */ + t_key full_children; /* KEYLENGTH bits needed */ + struct key_vector __rcu *parent; + struct key_vector kv[1]; +#define tn_bits kv[0].bits }; +#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) +#define LEAF_SIZE TNODE_SIZE(1) + #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; @@ -142,13 +141,13 @@ struct trie_stat { }; struct trie { - struct tnode __rcu *trie; + struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; -static void resize(struct trie *t, struct tnode *tn); +static struct key_vector *resize(struct trie *t, struct key_vector *tn); static size_t tnode_free_size; /* @@ -161,41 +160,46 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; +static inline struct tnode *tn_info(struct key_vector *kv) +{ + return container_of(kv, struct tnode, kv[0]); +} + /* caller must hold RTNL */ -#define node_parent(n) rtnl_dereference((n)->parent) +#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) +#define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ -#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent) +#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) +#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* 
wrapper for rcu_assign_pointer */ -static inline void node_set_parent(struct tnode *n, struct tnode *tp) +static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) - rcu_assign_pointer(n->parent, tp); + rcu_assign_pointer(tn_info(n)->parent, tp); } -#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p) +#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ -static inline unsigned long tnode_child_length(const struct tnode *tn) +static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } -/* caller must hold RTNL */ -static inline struct tnode *tnode_get_child(const struct tnode *tn, - unsigned long i) -{ - return rtnl_dereference(tn->child[i]); -} +#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) -/* caller must hold RCU read lock or RTNL */ -static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, - unsigned long i) +static inline unsigned long get_index(t_key key, struct key_vector *kv) { - return rcu_dereference_rtnl(tn->child[i]); + unsigned long index = key ^ kv->key; + + if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) + return 0; + + return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is @@ -274,106 +278,108 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) } #define TNODE_KMALLOC_MAX \ - ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) + ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *)) +#define TNODE_VMALLOC_MAX \ + ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); - if (IS_LEAF(n)) + if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if 
(n->bits <= TNODE_KMALLOC_MAX) + else if (n->tn_bits <= TNODE_KMALLOC_MAX) kfree(n); else vfree(n); } -#define node_free(n) call_rcu(&n->rcu, __node_free_rcu) +#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) -static inline void free_leaf_info(struct leaf_info *leaf) +static struct tnode *tnode_alloc(int bits) { - kfree_rcu(leaf, rcu); -} + size_t size; + + /* verify bits is within bounds */ + if (bits > TNODE_VMALLOC_MAX) + return NULL; + + /* determine size and verify it is non-zero and didn't overflow */ + size = TNODE_SIZE(1ul << bits); -static struct tnode *tnode_alloc(size_t size) -{ if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } -static inline void empty_child_inc(struct tnode *n) +static inline void empty_child_inc(struct key_vector *n) { - ++n->empty_children ? : ++n->full_children; + ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; } -static inline void empty_child_dec(struct tnode *n) +static inline void empty_child_dec(struct key_vector *n) { - n->empty_children-- ? : n->full_children--; + tn_info(n)->empty_children-- ? 
: tn_info(n)->full_children--; } -static struct tnode *leaf_new(t_key key) +static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - if (l) { - l->parent = NULL; - /* set key and pos to reflect full key value - * any trailing zeros in the key should be ignored - * as the nodes are searched - */ - l->key = key; - l->slen = 0; - l->pos = 0; - /* set bits to 0 indicating we are not a tnode */ - l->bits = 0; + struct key_vector *l; + struct tnode *kv; - INIT_HLIST_HEAD(&l->list); - } - return l; -} + kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); + if (!kv) + return NULL; -static struct leaf_info *leaf_info_new(int plen) -{ - struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if (li) { - li->plen = plen; - li->mask_plen = ntohl(inet_make_mask(plen)); - INIT_LIST_HEAD(&li->falh); - } - return li; + /* initialize key vector */ + l = kv->kv; + l->key = key; + l->pos = 0; + l->bits = 0; + l->slen = fa->fa_slen; + + /* link leaf to fib alias */ + INIT_HLIST_HEAD(&l->leaf); + hlist_add_head(&fa->fa_list, &l->leaf); + + return l; } -static struct tnode *tnode_new(t_key key, int pos, int bits) +static struct key_vector *tnode_new(t_key key, int pos, int bits) { - size_t sz = offsetof(struct tnode, child[1ul << bits]); - struct tnode *tn = tnode_alloc(sz); unsigned int shift = pos + bits; + struct key_vector *tn; + struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - if (tn) { - tn->parent = NULL; - tn->slen = pos; - tn->pos = pos; - tn->bits = bits; - tn->key = (shift < KEYLENGTH) ? 
(key >> shift) << shift : 0; - if (bits == KEYLENGTH) - tn->full_children = 1; - else - tn->empty_children = 1ul << bits; - } + tnode = tnode_alloc(bits); + if (!tnode) + return NULL; + + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + + if (bits == KEYLENGTH) + tnode->full_children = 1; + else + tnode->empty_children = 1ul << bits; + + tn = tnode->kv; + tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; + tn->pos = pos; + tn->bits = bits; + tn->slen = pos; - pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct tnode *) << bits); return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct tnode *n) +static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } @@ -381,17 +387,18 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n) /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. 
*/ -static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) +static void put_child(struct key_vector *tn, unsigned long i, + struct key_vector *n) { - struct tnode *chi = tnode_get_child(tn, i); + struct key_vector *chi = get_child(tn, i); int isfull, wasfull; - BUG_ON(i >= tnode_child_length(tn)); + BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ - if (n == NULL && chi != NULL) + if (!n && chi) empty_child_inc(tn); - if (n != NULL && chi == NULL) + if (n && !chi) empty_child_dec(tn); /* update fullChildren */ @@ -399,23 +406,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) isfull = tnode_full(tn, n); if (wasfull && !isfull) - tn->full_children--; + tn_info(tn)->full_children--; else if (!wasfull && isfull) - tn->full_children++; + tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; - rcu_assign_pointer(tn->child[i], n); + rcu_assign_pointer(tn->tnode[i], n); } -static void update_children(struct tnode *tn) +static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ - for (i = tnode_child_length(tn); i;) { - struct tnode *inode = tnode_get_child(tn, --i); + for (i = child_length(tn); i;) { + struct key_vector *inode = get_child(tn, --i); if (!inode) continue; @@ -431,36 +438,37 @@ static void update_children(struct tnode *tn) } } -static inline void put_child_root(struct tnode *tp, struct trie *t, - t_key key, struct tnode *n) +static inline void put_child_root(struct key_vector *tp, t_key key, + struct key_vector *n) { - if (tp) - put_child(tp, get_index(key, tp), n); + if (IS_TRIE(tp)) + rcu_assign_pointer(tp->tnode[0], n); else - rcu_assign_pointer(t->trie, n); + put_child(tp, get_index(key, tp), n); } -static inline void tnode_free_init(struct tnode *tn) +static inline void tnode_free_init(struct key_vector *tn) { - tn->rcu.next = NULL; + tn_info(tn)->rcu.next = NULL; } -static inline void 
tnode_free_append(struct tnode *tn, struct tnode *n) +static inline void tnode_free_append(struct key_vector *tn, + struct key_vector *n) { - n->rcu.next = tn->rcu.next; - tn->rcu.next = &n->rcu; + tn_info(n)->rcu.next = tn_info(tn)->rcu.next; + tn_info(tn)->rcu.next = &tn_info(n)->rcu; } -static void tnode_free(struct tnode *tn) +static void tnode_free(struct key_vector *tn) { - struct callback_head *head = &tn->rcu; + struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; - tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); + tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); - tn = container_of(head, struct tnode, rcu); + tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= PAGE_SIZE * sync_pages) { @@ -469,14 +477,16 @@ static void tnode_free(struct tnode *tn) } } -static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) +static struct key_vector *replace(struct trie *t, + struct key_vector *oldtnode, + struct key_vector *tn) { - struct tnode *tp = node_parent(oldtnode); + struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); - put_child_root(tp, t, tn->key, tn); + put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); @@ -485,18 +495,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) tnode_free(oldtnode); /* resize children now that oldtnode is freed */ - for (i = tnode_child_length(tn); i;) { - struct tnode *inode = tnode_get_child(tn, --i); + for (i = child_length(tn); i;) { + struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) - resize(t, inode); + tn = resize(t, inode); } + + return tp; } -static int inflate(struct trie *t, struct tnode *oldtnode) +static struct key_vector *inflate(struct trie *t, + struct key_vector *oldtnode) { - struct tnode 
*tn; + struct key_vector *tn; unsigned long i; t_key m; @@ -504,7 +517,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode) tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) - return -ENOMEM; + goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); @@ -514,13 +527,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode) * point to existing tnodes and the links between our allocated * nodes. */ - for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) { - struct tnode *inode = tnode_get_child(oldtnode, --i); - struct tnode *node0, *node1; + for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { + struct key_vector *inode = get_child(oldtnode, --i); + struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ - if (inode == NULL) + if (!inode) continue; /* A leaf or an internal node with skipped bits */ @@ -534,8 +547,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode) /* An internal node with two children */ if (inode->bits == 1) { - put_child(tn, 2 * i + 1, tnode_get_child(inode, 1)); - put_child(tn, 2 * i, tnode_get_child(inode, 0)); + put_child(tn, 2 * i + 1, get_child(inode, 1)); + put_child(tn, 2 * i, get_child(inode, 0)); continue; } @@ -564,11 +577,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode) tnode_free_append(tn, node0); /* populate child pointers in new nodes */ - for (k = tnode_child_length(inode), j = k / 2; j;) { - put_child(node1, --j, tnode_get_child(inode, --k)); - put_child(node0, j, tnode_get_child(inode, j)); - put_child(node1, --j, tnode_get_child(inode, --k)); - put_child(node0, j, tnode_get_child(inode, j)); + for (k = child_length(inode), j = k / 2; j;) { + put_child(node1, --j, get_child(inode, --k)); + put_child(node0, j, get_child(inode, j)); + put_child(node1, --j, get_child(inode, --k)); + put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ @@ -581,25 +594,25 @@ static int inflate(struct trie 
*t, struct tnode *oldtnode) } /* setup the parent pointers into and out of this node */ - replace(t, oldtnode, tn); - - return 0; + return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); - return -ENOMEM; +notnode: + return NULL; } -static int halve(struct trie *t, struct tnode *oldtnode) +static struct key_vector *halve(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *tn; + struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) - return -ENOMEM; + goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); @@ -609,10 +622,10 @@ static int halve(struct trie *t, struct tnode *oldtnode) * point to existing tnodes and the links between our allocated * nodes. */ - for (i = tnode_child_length(oldtnode); i;) { - struct tnode *node1 = tnode_get_child(oldtnode, --i); - struct tnode *node0 = tnode_get_child(oldtnode, --i); - struct tnode *inode; + for (i = child_length(oldtnode); i;) { + struct key_vector *node1 = get_child(oldtnode, --i); + struct key_vector *node0 = get_child(oldtnode, --i); + struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { @@ -622,10 +635,8 @@ static int halve(struct trie *t, struct tnode *oldtnode) /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); - if (!inode) { - tnode_free(tn); - return -ENOMEM; - } + if (!inode) + goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ @@ -638,30 +649,36 @@ static int halve(struct trie *t, struct tnode *oldtnode) } /* setup the parent pointers into and out of this node */ - replace(t, oldtnode, tn); - - return 0; + return replace(t, oldtnode, tn); +nomem: + /* all pointers should be clean so we are done */ + tnode_free(tn); +notnode: + return NULL; } -static void collapse(struct trie *t, struct tnode *oldtnode) +static struct key_vector *collapse(struct 
trie *t, + struct key_vector *oldtnode) { - struct tnode *n, *tp; + struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ - for (n = NULL, i = tnode_child_length(oldtnode); !n && i;) - n = tnode_get_child(oldtnode, --i); + for (n = NULL, i = child_length(oldtnode); !n && i;) + n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); - put_child_root(tp, t, oldtnode->key, n); + put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); + + return tp; } -static unsigned char update_suffix(struct tnode *tn) +static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; @@ -671,8 +688,8 @@ static unsigned char update_suffix(struct tnode *tn) * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ - for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) { - struct tnode *n = tnode_get_child(tn, i); + for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { + struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; @@ -704,12 +721,12 @@ static unsigned char update_suffix(struct tnode *tn) * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with - * tnode_child_length() and instead of multiplying by 2 (since the + * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * - * The left-hand side may look a bit weird: tnode_child_length(tn) + * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. 
tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. @@ -719,10 +736,10 @@ static unsigned char update_suffix(struct tnode *tn) * A clearer way to write this would be: * * to_be_doubled = tn->full_children; - * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * - * new_child_length = tnode_child_length(tn) * 2; + * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; @@ -739,57 +756,57 @@ static unsigned char update_suffix(struct tnode *tn) * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= - * inflate_threshold * tnode_child_length(tn) * 2 + * inflate_threshold * child_length(tn) * 2 * * shorten again: - * 50 * (tn->full_children + tnode_child_length(tn) - + * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * - * tnode_child_length(tn) + * child_length(tn) * */ -static bool should_inflate(const struct tnode *tp, const struct tnode *tn) +static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= tp ? inflate_threshold : inflate_threshold_root; - used -= tn->empty_children; - used += tn->full_children; + threshold *= IS_TRIE(tp) ? 
inflate_threshold_root : inflate_threshold; + used -= tn_info(tn)->empty_children; + used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } -static bool should_halve(const struct tnode *tp, const struct tnode *tn) +static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= tp ? halve_threshold : halve_threshold_root; - used -= tn->empty_children; + threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; + used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } -static bool should_collapse(const struct tnode *tn) +static inline bool should_collapse(struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); - used -= tn->empty_children; + used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ - if ((tn->bits == KEYLENGTH) && tn->full_children) + if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ @@ -797,10 +814,13 @@ static bool should_collapse(const struct tnode *tn) } #define MAX_WORK 10 -static void resize(struct trie *t, struct tnode *tn) +static struct key_vector *resize(struct trie *t, struct key_vector *tn) { - struct tnode *tp = node_parent(tn); - struct tnode __rcu **cptr; +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie_use_stats __percpu *stats = t->stats; +#endif + struct key_vector *tp = node_parent(tn); + unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", @@ -810,183 +830,128 @@ static void 
resize(struct trie *t, struct tnode *tn) * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ - cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; - BUG_ON(tn != rtnl_dereference(*cptr)); + BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { - if (inflate(t, tn)) { + tp = inflate(t, tn); + if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); + this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; - tn = rtnl_dereference(*cptr); + tn = get_child(tp, cindex); } + /* update parent in case inflate failed */ + tp = node_parent(tn); + /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return; + return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { - if (halve(t, tn)) { + tp = halve(t, tn); + if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); + this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; - tn = rtnl_dereference(*cptr); + tn = get_child(tp, cindex); } /* Only one child remains */ - if (should_collapse(tn)) { - collapse(t, tn); - return; - } + if (should_collapse(tn)) + return collapse(t, tn); + + /* update parent in case halve failed */ + tp = node_parent(tn); /* Return if at least one deflate was run */ if (max_work != MAX_WORK) - return; + return tp; /* push the suffix length to the parent node */ if (tn->slen > tn->pos) { unsigned char slen = update_suffix(tn); - if (tp && (slen > tp->slen)) + if (slen > tp->slen) tp->slen = slen; } -} - -/* readside must use rcu_read_lock currently dump routines - via get_fa_head and dump */ - -static struct leaf_info *find_leaf_info(struct tnode *l, int plen) -{ - struct hlist_head *head = &l->list; - struct leaf_info *li; - - 
hlist_for_each_entry_rcu(li, head, hlist) - if (li->plen == plen) - return li; - - return NULL; -} -static inline struct list_head *get_fa_head(struct tnode *l, int plen) -{ - struct leaf_info *li = find_leaf_info(l, plen); - - if (!li) - return NULL; - - return &li->falh; + return tp; } -static void leaf_pull_suffix(struct tnode *l) +static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l) { - struct tnode *tp = node_parent(l); - - while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { + while ((tp->slen > tp->pos) && (tp->slen > l->slen)) { if (update_suffix(tp) > l->slen) break; tp = node_parent(tp); } } -static void leaf_push_suffix(struct tnode *l) +static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l) { - struct tnode *tn = node_parent(l); - /* if this is a new leaf then tn will be NULL and we can sort * out parent suffix lengths as a part of trie_rebalance */ - while (tn && (tn->slen < l->slen)) { + while (tn->slen < l->slen) { tn->slen = l->slen; tn = node_parent(tn); } } -static void remove_leaf_info(struct tnode *l, struct leaf_info *old) -{ - /* record the location of the previous list_info entry */ - struct hlist_node **pprev = old->hlist.pprev; - struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next); - - /* remove the leaf info from the list */ - hlist_del_rcu(&old->hlist); - - /* only access li if it is pointing at the last valid hlist_node */ - if (hlist_empty(&l->list) || (*pprev)) - return; - - /* update the trie with the latest suffix length */ - l->slen = KEYLENGTH - li->plen; - leaf_pull_suffix(l); -} - -static void insert_leaf_info(struct tnode *l, struct leaf_info *new) +/* rcu_read_lock needs to be hold by caller from readside */ +static struct key_vector *fib_find_node(struct trie *t, + struct key_vector **tp, u32 key) { - struct hlist_head *head = &l->list; - struct leaf_info *li = NULL, *last = NULL; - - if (hlist_empty(head)) { - hlist_add_head_rcu(&new->hlist, head); - } else { - 
hlist_for_each_entry(li, head, hlist) { - if (new->plen > li->plen) - break; - - last = li; - } - if (last) - hlist_add_behind_rcu(&new->hlist, &last->hlist); - else - hlist_add_before_rcu(&new->hlist, &li->hlist); - } + struct key_vector *pn, *n = t->kv; + unsigned long index = 0; - /* if we added to the tail node then we need to update slen */ - if (l->slen < (KEYLENGTH - new->plen)) { - l->slen = KEYLENGTH - new->plen; - leaf_push_suffix(l); - } -} + do { + pn = n; + n = get_child_rcu(n, index); -/* rcu_read_lock needs to be hold by caller from readside */ -static struct tnode *fib_find_node(struct trie *t, u32 key) -{ - struct tnode *n = rcu_dereference_rtnl(t->trie); + if (!n) + break; - while (n) { - unsigned long index = get_index(key, n); + index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) - return NULL; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) + if (index >= (1ul << n->bits)) { + n = NULL; break; + } - n = tnode_get_child_rcu(n, index); - } + /* keep searching until we find a perfect match leaf or NULL */ + } while (IS_TNODE(n)); + + *tp = pn; return n; } @@ -994,14 +959,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. 
*/ -static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) +static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, + u8 tos, u32 prio, u32 tb_id) { struct fib_alias *fa; if (!fah) return NULL; - list_for_each_entry(fa, fah, fa_list) { + hlist_for_each_entry(fa, fah, fa_list) { + if (fa->fa_slen < slen) + continue; + if (fa->fa_slen != slen) + break; + if (fa->tb_id > tb_id) + continue; + if (fa->tb_id != tb_id) + break; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1011,77 +985,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -static void trie_rebalance(struct trie *t, struct tnode *tn) +static void trie_rebalance(struct trie *t, struct key_vector *tn) { - struct tnode *tp; - - while ((tp = node_parent(tn)) != NULL) { - resize(t, tn); - tn = tp; - } - - /* Handle last (top) tnode */ - if (IS_TNODE(tn)) - resize(t, tn); + while (!IS_TRIE(tn)) + tn = resize(t, tn); } -/* only used from updater-side */ - -static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) +static int fib_insert_node(struct trie *t, struct key_vector *tp, + struct fib_alias *new, t_key key) { - struct list_head *fa_head = NULL; - struct tnode *l, *n, *tp = NULL; - struct leaf_info *li; + struct key_vector *n, *l; - li = leaf_info_new(plen); - if (!li) - return NULL; - fa_head = &li->falh; - - n = rtnl_dereference(t->trie); - - /* If we point to NULL, stop. Either the tree is empty and we should - * just put a new leaf in if, or we have reached an empty child slot, - * and we should just put our new leaf in that. - * - * If we hit a node with a key that does't match then we should stop - * and create a new tnode to replace that node and insert ourselves - * and the other node into the new tnode. 
- */ - while (n) { - unsigned long index = get_index(key, n); - - /* This bit of code is a bit tricky but it combines multiple - * checks into a single check. The prefix consists of the - * prefix plus zeros for the "bits" in the prefix. The index - * is the difference between the key and this value. From - * this we can actually derive several pieces of data. - * if !(index >> bits) - * we know the value is child index - * else - * we have a mismatch in skip bits and failed - */ - if (index >> n->bits) - break; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) { - /* Case 1: n is a leaf, and prefixes match*/ - insert_leaf_info(n, li); - return fa_head; - } - - tp = n; - n = tnode_get_child_rcu(n, index); - } - - l = leaf_new(key); - if (!l) { - free_leaf_info(li); - return NULL; - } + l = leaf_new(key, new); + if (!l) + goto noleaf; - insert_leaf_info(l, li); + /* retrieve child from parent node */ + n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. 
* @@ -1090,21 +1010,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) * leaves us in position for handling as case 3 */ if (n) { - struct tnode *tn; + struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); - if (!tn) { - free_leaf_info(li); - node_free(l); - return NULL; - } + if (!tn) + goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ - put_child_root(tp, t, key, tn); + put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ @@ -1112,69 +1029,94 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) } /* Case 3: n is NULL, and will just insert a new leaf */ - if (tp) { - NODE_INIT_PARENT(l, tp); - put_child(tp, get_index(key, tp), l); - trie_rebalance(t, tp); + NODE_INIT_PARENT(l, tp); + put_child_root(tp, key, l); + trie_rebalance(t, tp); + + return 0; +notnode: + node_free(l); +noleaf: + return -ENOMEM; +} + +static int fib_insert_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *new, + struct fib_alias *fa, t_key key) +{ + if (!l) + return fib_insert_node(t, tp, new, key); + + if (fa) { + hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { - rcu_assign_pointer(t->trie, l); + struct fib_alias *last; + + hlist_for_each_entry(last, &l->leaf, fa_list) { + if (new->fa_slen < last->fa_slen) + break; + if ((new->fa_slen == last->fa_slen) && + (new->tb_id > last->tb_id)) + break; + fa = last; + } + + if (fa) + hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); + else + hlist_add_head_rcu(&new->fa_list, &l->leaf); } - return fa_head; + /* if we added to the tail node then we need to update slen */ + if (l->slen < new->fa_slen) { + l->slen = new->fa_slen; + leaf_push_suffix(tp, l); + } + + return 0; } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. 
*/ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { - struct trie *t = (struct trie *) tb->tb_data; + struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; - struct list_head *fa_head = NULL; + struct key_vector *l, *tp; + unsigned int nlflags = 0; struct fib_info *fi; - int plen = cfg->fc_dst_len; + u8 plen = cfg->fc_dst_len; + u8 slen = KEYLENGTH - plen; u8 tos = cfg->fc_tos; - u32 key, mask; + u32 key; int err; - struct tnode *l; - if (plen > 32) + if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl(inet_make_mask(plen)); - - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - key = key & mask; - fi = fib_create_info(cfg); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } - l = fib_find_node(t, key); - fa = NULL; - - if (l) { - fa_head = get_fa_head(l, plen); - fa = fib_find_alias(fa_head, tos, fi->fib_priority); - } + l = fib_find_node(t, &tp, key); + fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, + tb->tb_id) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. + * insert to the tail of the section matching the suffix length + * of the new alias. 
*/ if (fa && fa->fa_tos == tos && @@ -1192,9 +1134,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) */ fa_match = NULL; fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, fa_head, fa_list) { - if (fa->fa_tos != tos) + hlist_for_each_entry_from(fa, fa_list) { + if ((fa->fa_slen != slen) || + (fa->tb_id != tb->tb_id) || + (fa->fa_tos != tos)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; @@ -1217,7 +1160,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) + if (!new_fa) goto out; fi_drop = fa->fa_info; @@ -1226,8 +1169,23 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; + new_fa->fa_slen = fa->fa_slen; + new_fa->tb_id = tb->tb_id; + new_fa->fa_default = -1; + + err = switchdev_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); + if (err) { + switchdev_fib_ipv4_abort(fi); + kmem_cache_free(fn_alias_kmem, new_fa); + goto out; + } + + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1245,7 +1203,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa_match) goto out; - if (!(cfg->fc_nlflags & NLM_F_APPEND)) + if (cfg->fc_nlflags & NLM_F_APPEND) + nlflags = NLM_F_APPEND; + else fa = fa_first; } err = -ENOENT; @@ -1254,37 +1214,41 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) + if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; - /* - * Insert new entry to the list. 
- */ - - if (!fa_head) { - fa_head = fib_insert_node(t, key, plen); - if (unlikely(!fa_head)) { - err = -ENOMEM; - goto out_free_new_fa; - } + new_fa->fa_slen = slen; + new_fa->tb_id = tb->tb_id; + new_fa->fa_default = -1; + + /* (Optionally) offload fib entry to switch hardware. */ + err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); + if (err) { + switchdev_fib_ipv4_abort(fi); + goto out_free_new_fa; } + /* Insert new entry to the list. */ + err = fib_insert_alias(t, tp, l, new_fa, fa, key); + if (err) + goto out_sw_fib_del; + if (!plen) tb->tb_num_default++; - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net); - rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, - &cfg->fc_nlinfo, 0); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, + &cfg->fc_nlinfo, nlflags); succeeded: return 0; +out_sw_fib_del: + switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1293,7 +1257,7 @@ err: return err; } -static inline t_key prefix_mismatch(t_key key, struct tnode *n) +static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; @@ -1304,16 +1268,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n) int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { - struct trie *t = (struct trie *)tb->tb_data; + struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); - struct tnode *n, *pn; - struct leaf_info *li; + struct key_vector *n, *pn; + struct fib_alias *fa; + unsigned long index; t_key cindex; - n = rcu_dereference(t->trie); + pn = t->kv; + cindex = 0; + + n = get_child_rcu(pn, cindex); if (!n) return -EAGAIN; @@ -1321,24 +1289,25 @@ int 
fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, this_cpu_inc(stats->gets); #endif - pn = n; - cindex = 0; - /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { - unsigned long index = get_index(key, n); + index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) + if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ @@ -1353,7 +1322,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, cindex = index; } - n = tnode_get_child_rcu(n, index); + n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } @@ -1361,7 +1330,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ - struct tnode __rcu **cptr = n->child; + struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of @@ -1393,13 +1362,17 @@ backtrace: while (!cindex) { t_key pkey = pn->key; - pn = node_parent_rcu(pn); - if (unlikely(!pn)) + /* If we don't have a parent then there is + * nothing for us to do as we do not have any + * further nodes to parse. 
+ */ + if (IS_TRIE(pn)) return -EAGAIN; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ + pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } @@ -1407,138 +1380,140 @@ backtrace: cindex &= cindex - 1; /* grab pointer for next child node */ - cptr = &pn->child[cindex]; + cptr = &pn->tnode[cindex]; } } found: + /* this line carries forward the xor from earlier in the function */ + index = key ^ n->key; + /* Step 3: Process the leaf, if that fails fall back to backtracing */ - hlist_for_each_entry_rcu(li, &n->list, hlist) { - struct fib_alias *fa; + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; - if ((key ^ n->key) & li->mask_plen) + if ((index >= (1ul << fa->fa_slen)) && + ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) continue; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fi->fib_dead) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (unlikely(err < 0)) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(stats->semantic_match_passed); +#endif + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - struct fib_info *fi = fa->fa_info; - int nhsel, err; - - if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) - continue; - if (fi->fib_dead) + if (nh->nh_flags & RTNH_F_DEAD) continue; - if (fa->fa_info->fib_scope < flp->flowi4_scope) + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + nh->nh_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - fib_alias_accessed(fa); - err = fib_props[fa->fa_type].error; - if (unlikely(err < 0)) { -#ifdef CONFIG_IP_FIB_TRIE_STATS - 
this_cpu_inc(stats->semantic_match_passed); -#endif - return err; - } - if (fi->fib_flags & RTNH_F_DEAD) + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; - - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&fi->fib_clntref); - - res->prefixlen = li->plen; - res->nh_sel = nhsel; - res->type = fa->fa_type; - res->scope = fi->fib_scope; - res->fi = fi; - res->table = tb; - res->fa_head = &li->falh; + + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&fi->fib_clntref); + + res->prefixlen = KEYLENGTH - fa->fa_slen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fi->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_passed); + this_cpu_inc(stats->semantic_match_passed); #endif - return err; - } + return err; } - + } #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_miss); + this_cpu_inc(stats->semantic_match_miss); #endif - } goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); -/* - * Remove the leaf and return parent. 
- */ -static void trie_leaf_remove(struct trie *t, struct tnode *l) +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old) { - struct tnode *tp = node_parent(l); + /* record the location of the previous list_info entry */ + struct hlist_node **pprev = old->fa_list.pprev; + struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); - pr_debug("entering trie_leaf_remove(%p)\n", l); + /* remove the fib_alias from the list */ + hlist_del_rcu(&old->fa_list); - if (tp) { - put_child(tp, get_index(l->key, tp), NULL); + /* if we emptied the list this leaf will be freed and we can sort + * out parent suffix lengths as a part of trie_rebalance + */ + if (hlist_empty(&l->leaf)) { + put_child_root(tp, l->key, NULL); + node_free(l); trie_rebalance(t, tp); - } else { - RCU_INIT_POINTER(t->trie, NULL); + return; } - node_free(l); + /* only access fa if it is pointing at the last valid hlist_node */ + if (*pprev) + return; + + /* update the trie with the latest suffix length */ + l->slen = fa->fa_slen; + leaf_pull_suffix(tp, l); } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. 
*/ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; - u32 key, mask; - int plen = cfg->fc_dst_len; - u8 tos = cfg->fc_tos; struct fib_alias *fa, *fa_to_delete; - struct list_head *fa_head; - struct tnode *l; - struct leaf_info *li; + struct key_vector *l, *tp; + u8 plen = cfg->fc_dst_len; + u8 slen = KEYLENGTH - plen; + u8 tos = cfg->fc_tos; + u32 key; - if (plen > 32) + if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); - mask = ntohl(inet_make_mask(plen)); - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - key = key & mask; - l = fib_find_node(t, key); - + l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; - li = find_leaf_info(l, plen); - - if (!li) - return -ESRCH; - - fa_head = &li->falh; - fa = fib_find_alias(fa_head, tos, 0); - + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, fa_head, fa_list) { + hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; - if (fa->fa_tos != tos) + if ((fa->fa_slen != slen) || + (fa->tb_id != tb->tb_id) || + (fa->fa_tos != tos)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && @@ -1557,240 +1532,391 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, - &cfg->fc_nlinfo, 0); + switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); - list_del_rcu(&fa->fa_list); + rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, + &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; - if (list_empty(fa_head)) { - remove_leaf_info(l, li); - free_leaf_info(li); - } - - if (hlist_empty(&l->list)) - 
trie_leaf_remove(t, l); + fib_remove_alias(t, tp, l, fa_to_delete); - if (fa->fa_state & FA_S_ACCESSED) + if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); + fib_release_info(fa_to_delete->fa_info); + alias_free_mem_rcu(fa_to_delete); return 0; } -static int trie_flush_list(struct list_head *head) +/* Scan for the next leaf starting at the provided key value */ +static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { - struct fib_alias *fa, *fa_node; - int found = 0; + struct key_vector *pn, *n = *tn; + unsigned long cindex; - list_for_each_entry_safe(fa, fa_node, head, fa_list) { - struct fib_info *fi = fa->fa_info; + /* this loop is meant to try and find the key in the trie */ + do { + /* record parent and next child index */ + pn = n; + cindex = key ? get_index(key, pn) : 0; - if (fi && (fi->fib_flags & RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); - found++; + if (cindex >> pn->bits) + break; + + /* descend into the next child */ + n = get_child_rcu(pn, cindex++); + if (!n) + break; + + /* guarantee forward progress on the keys */ + if (IS_LEAF(n) && (n->key >= key)) + goto found; + } while (IS_TNODE(n)); + + /* this loop will search for the next leaf with a greater key */ + while (!IS_TRIE(pn)) { + /* if we exhausted the parent node we will need to climb */ + if (cindex >= (1ul << pn->bits)) { + t_key pkey = pn->key; + + pn = node_parent_rcu(pn); + cindex = get_index(pkey, pn) + 1; + continue; } + + /* grab the next available node */ + n = get_child_rcu(pn, cindex++); + if (!n) + continue; + + /* no need to compare keys since we bumped the index */ + if (IS_LEAF(n)) + goto found; + + /* Rescan start scanning in new node */ + pn = n; + cindex = 0; } - return found; + + *tn = pn; + return NULL; /* Root of trie */ +found: + /* if we are at the limit for keys just return NULL for the tnode */ + 
*tn = pn; + return n; } -static int trie_flush_leaf(struct tnode *l) +static void fib_trie_free(struct fib_table *tb) { - int found = 0; - struct hlist_head *lih = &l->list; + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; struct hlist_node *tmp; - struct leaf_info *li = NULL; - unsigned char plen = KEYLENGTH; + struct fib_alias *fa; - hlist_for_each_entry_safe(li, tmp, lih, hlist) { - found += trie_flush_list(&li->falh); + /* walk trie in reverse order and free everything */ + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + n = pn; + pn = node_parent(pn); + + /* drop emptied tnode */ + put_child_root(pn, n->key, NULL); + node_free(n); + + cindex = get_index(pkey, pn); - if (list_empty(&li->falh)) { - hlist_del_rcu(&li->hlist); - free_leaf_info(li); continue; } - plen = li->plen; - } + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; - l->slen = KEYLENGTH - plen; + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; - return found; + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + } + + put_child_root(pn, n->key, NULL); + node_free(n); + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + free_percpu(t->stats); +#endif + kfree(tb); } -/* - * Scan for the next right leaf starting at node p->child[idx] - * Since we have back pointer, no recursion necessary. - */ -static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) +struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { - do { - unsigned long idx = c ? 
idx = get_index(c->key, p) + 1 : 0; + struct trie *ot = (struct trie *)oldtb->tb_data; + struct key_vector *l, *tp = ot->kv; + struct fib_table *local_tb; + struct fib_alias *fa; + struct trie *lt; + t_key key = 0; + + if (oldtb->tb_data == oldtb->__data) + return oldtb; + + local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); + if (!local_tb) + return NULL; + + lt = (struct trie *)local_tb->tb_data; + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { + struct key_vector *local_l = NULL, *local_tp; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + struct fib_alias *new_fa; - while (idx < tnode_child_length(p)) { - c = tnode_get_child_rcu(p, idx++); - if (!c) + if (local_tb->tb_id != fa->tb_id) continue; - if (IS_LEAF(c)) - return c; + /* clone fa for new local table */ + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (!new_fa) + goto out; + + memcpy(new_fa, fa, sizeof(*fa)); + + /* insert clone into table */ + if (!local_l) + local_l = fib_find_node(lt, &local_tp, l->key); - /* Rescan start scanning in new node */ - p = c; - idx = 0; + if (fib_insert_alias(lt, local_tp, local_l, new_fa, + NULL, l->key)) + goto out; } - /* Node empty, walk back up to parent */ - c = p; - } while ((p = node_parent_rcu(c)) != NULL); + /* stop loop if key wrapped back to 0 */ + key = l->key + 1; + if (key < l->key) + break; + } - return NULL; /* Root of trie */ + return local_tb; +out: + fib_trie_free(local_tb); + + return NULL; } -static struct tnode *trie_firstleaf(struct trie *t) +/* Caller must hold RTNL */ +void fib_table_flush_external(struct fib_table *tb) { - struct tnode *n = rcu_dereference_rtnl(t->trie); + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; - if (!n) - return NULL; + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; - if (IS_LEAF(n)) /* trie is just a leaf */ - return n; + if (!(cindex--)) { + 
t_key pkey = pn->key; - return leaf_walk_rcu(n, NULL); -} + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; -static struct tnode *trie_nextleaf(struct tnode *l) -{ - struct tnode *p = node_parent_rcu(l); + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); - if (!p) - return NULL; /* trie with just one leaf */ + continue; + } - return leaf_walk_rcu(p, l); -} + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; -static struct tnode *trie_leafindex(struct trie *t, int index) -{ - struct tnode *l = trie_firstleaf(t); + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; - while (l && index-- > 0) - l = trie_nextleaf(l); + continue; + } - return l; -} + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + /* if alias was cloned to local then we just + * need to remove the local copy from main + */ + if (tb->tb_id != fa->tb_id) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + continue; + } + /* record local slen */ + slen = fa->fa_slen; -/* - * Caller must hold RTNL. - */ + if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) + continue; + + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } + } +} + +/* Caller must hold RTNL. 
*/ int fib_table_flush(struct fib_table *tb) { - struct trie *t = (struct trie *) tb->tb_data; - struct tnode *l, *ll = NULL; + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; int found = 0; - for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { - found += trie_flush_leaf(l); + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; - if (ll) { - if (hlist_empty(&ll->list)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; + + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); + + continue; } - ll = l; - } + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; - if (ll) { - if (hlist_empty(&ll->list)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { + slen = fa->fa_slen; + continue; + } + + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); + hlist_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + found++; + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } } pr_debug("trie_flush found=%d\n", found); return found; } -void fib_free_table(struct fib_table *tb) +static void __trie_free_rcu(struct rcu_head *head) { + struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; - free_percpu(t->stats); + if (tb->tb_data == 
tb->__data) + free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, - struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) +void fib_free_table(struct fib_table *tb) { - int i, s_i; + call_rcu(&tb->rcu, __trie_free_rcu); +} + +static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) +{ + __be32 xkey = htonl(l->key); struct fib_alias *fa; - __be32 xkey = htonl(key); + int i, s_i; - s_i = cb->args[5]; + s_i = cb->args[4]; i = 0; /* rcu_read_lock is hold by caller */ - - list_for_each_entry_rcu(fa, fah, fa_list) { + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (i < s_i) { i++; continue; } + if (tb->tb_id != fa->tb_id) { + i++; + continue; + } + if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, tb->tb_id, fa->fa_type, xkey, - plen, + KEYLENGTH - fa->fa_slen, fa->fa_tos, fa->fa_info, NLM_F_MULTI) < 0) { - cb->args[5] = i; - return -1; - } - i++; - } - cb->args[5] = i; - return skb->len; -} - -static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) -{ - struct leaf_info *li; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - - /* rcu_read_lock is hold by caller */ - hlist_for_each_entry_rcu(li, &l->list, hlist) { - if (i < s_i) { - i++; - continue; - } - - if (i > s_i) - cb->args[5] = 0; - - if (list_empty(&li->falh)) - continue; - - if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) { cb->args[4] = i; return -1; } @@ -1801,44 +1927,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, return skb->len; } +/* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - struct tnode *l; - struct trie *t = (struct trie *) tb->tb_data; - t_key key = cb->args[2]; - int 
count = cb->args[3]; - - rcu_read_lock(); + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ - if (count == 0) - l = trie_firstleaf(t); - else { - /* Normally, continue from last key, but if that is missing - * fallback to using slow rescan - */ - l = fib_find_node(t, key); - if (!l) - l = trie_leafindex(t, count); - } + int count = cb->args[2]; + t_key key = cb->args[3]; - while (l) { - cb->args[2] = l->key; + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { - cb->args[3] = count; - rcu_read_unlock(); + cb->args[3] = key; + cb->args[2] = count; return -1; } ++count; - l = trie_nextleaf(l); + key = l->key + 1; + memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); + + /* stop loop if key wrapped back to 0 */ + if (key < l->key) + break; } - cb->args[3] = count; - rcu_read_unlock(); + + cb->args[3] = key; + cb->args[2] = count; return skb->len; } @@ -1850,28 +1970,33 @@ void __init fib_trie_init(void) 0, SLAB_PANIC, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", - max(sizeof(struct tnode), - sizeof(struct leaf_info)), + LEAF_SIZE, 0, SLAB_PANIC, NULL); } - -struct fib_table *fib_trie_table(u32 id) +struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; + size_t sz = sizeof(*tb); - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), - GFP_KERNEL); - if (tb == NULL) + if (!alias) + sz += sizeof(struct trie); + + tb = kzalloc(sz, GFP_KERNEL); + if (!tb) return NULL; tb->tb_id = id; - tb->tb_default = -1; tb->tb_num_default = 0; + tb->tb_data = (alias ? 
alias->__data : tb->__data); + + if (alias) + return tb; t = (struct trie *) tb->tb_data; - RCU_INIT_POINTER(t->trie, NULL); + t->kv[0].pos = KEYLENGTH; + t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { @@ -1888,65 +2013,64 @@ struct fib_table *fib_trie_table(u32 id) struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; - struct tnode *tnode; + struct key_vector *tnode; unsigned int index; unsigned int depth; }; -static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) +static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; - struct tnode *tn = iter->tnode; - struct tnode *p; - - /* A single entry routing table */ - if (!tn) - return NULL; + struct key_vector *pn = iter->tnode; + t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); -rescan: - while (cindex < tnode_child_length(tn)) { - struct tnode *n = tnode_get_child_rcu(tn, cindex); - if (n) { + while (!IS_TRIE(pn)) { + while (cindex < child_length(pn)) { + struct key_vector *n = get_child_rcu(pn, cindex++); + + if (!n) + continue; + if (IS_LEAF(n)) { - iter->tnode = tn; - iter->index = cindex + 1; + iter->tnode = pn; + iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } + return n; } - ++cindex; - } - - /* Current node exhausted, pop back up */ - p = node_parent_rcu(tn); - if (p) { - cindex = get_index(tn->key, p) + 1; - tn = p; + /* Current node exhausted, pop back up */ + pkey = pn->key; + pn = node_parent_rcu(pn); + cindex = get_index(pkey, pn) + 1; --iter->depth; - goto rescan; } - /* got root? 
*/ + /* record root node so further searches know we are done */ + iter->tnode = pn; + iter->index = 0; + return NULL; } -static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, - struct trie *t) +static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, + struct trie *t) { - struct tnode *n; + struct key_vector *n, *pn; if (!t) return NULL; - n = rcu_dereference(t->trie); + pn = t->kv; + n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; @@ -1955,7 +2079,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, iter->index = 0; iter->depth = 1; } else { - iter->tnode = NULL; + iter->tnode = pn; iter->index = 0; iter->depth = 0; } @@ -1965,7 +2089,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct tnode *n; + struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -1973,20 +2097,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { - struct leaf_info *li; + struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; - hlist_for_each_entry_rcu(li, &n->list, hlist) + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; - s->nullpointers += n->empty_children; + s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); @@ -2009,13 +2133,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct tnode) * stat->leaves; + bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); - bytes += sizeof(struct leaf_info) * stat->prefixes; + bytes += 
sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); - bytes += sizeof(struct tnode) * stat->tnodes; + bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) @@ -2030,7 +2154,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct tnode *) * pointers; + bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2084,7 +2208,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" " %Zd bytes, size of tnode: %Zd bytes.\n", - sizeof(struct tnode), sizeof(struct tnode)); + LEAF_SIZE, TNODE_SIZE(0)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; @@ -2123,7 +2247,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2135,7 +2259,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { - struct tnode *n; + struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2164,7 +2288,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct tnode *n; + struct key_vector *n; ++*pos; /* next node in same table */ @@ -2250,9 +2374,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct 
seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct tnode *n = v; + struct key_vector *n = v; - if (!node_parent_rcu(n)) + if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { @@ -2261,30 +2385,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, - n->full_children, n->empty_children); + tn_info(n)->full_children, + tn_info(n)->empty_children); } else { - struct leaf_info *li; __be32 val = htonl(n->key); + struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); - hlist_for_each_entry_rcu(li, &n->list, hlist) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - char buf1[32], buf2[32]; - - seq_indent(seq, iter->depth+1); - seq_printf(seq, " /%d %s %s", li->plen, - rtn_scope(buf1, sizeof(buf1), - fa->fa_info->fib_scope), - rtn_type(buf2, sizeof(buf2), - fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, " tos=%d", fa->fa_tos); - seq_putc(seq, '\n'); - } + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { + char buf1[32], buf2[32]; + + seq_indent(seq, iter->depth + 1); + seq_printf(seq, " /%zu %s %s", + KEYLENGTH - fa->fa_slen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_info->fib_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, " tos=%d", fa->fa_tos); + seq_putc(seq, '\n'); } } @@ -2314,31 +2436,47 @@ static const struct file_operations fib_trie_fops = { struct fib_route_iter { struct seq_net_private p; - struct trie *main_trie; + struct fib_table *main_tb; + struct key_vector *tnode; loff_t pos; t_key key; }; -static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) +static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, + loff_t pos) { - struct tnode *l = NULL; - struct trie *t = iter->main_trie; + struct fib_table *tb = 
iter->main_tb; + struct key_vector *l, **tp = &iter->tnode; + struct trie *t; + t_key key; - /* use cache location of last found key */ - if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) + /* use cache location of next-to-find key */ + if (iter->pos > 0 && pos >= iter->pos) { pos -= iter->pos; - else { + key = iter->key; + } else { + t = (struct trie *)tb->tb_data; + iter->tnode = t->kv; iter->pos = 0; - l = trie_firstleaf(t); + key = 0; } - while (l && pos-- > 0) { + while ((l = leaf_walk_rcu(tp, key)) != NULL) { + key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + + if (pos-- <= 0) + break; + + l = NULL; + + /* handle unlikely case of a key wrap */ + if (!key) + break; } if (l) - iter->key = pos; /* remember it */ + iter->key = key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2350,37 +2488,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; + struct trie *t; rcu_read_lock(); + tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; - iter->main_trie = (struct trie *) tb->tb_data; - if (*pos == 0) - return SEQ_START_TOKEN; - else - return fib_route_get_idx(iter, *pos - 1); + iter->main_tb = tb; + + if (*pos != 0) + return fib_route_get_idx(iter, *pos); + + t = (struct trie *)tb->tb_data; + iter->tnode = t->kv; + iter->pos = 0; + iter->key = 0; + + return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; - struct tnode *l = v; + struct key_vector *l = NULL; + t_key key = iter->key; ++*pos; - if (v == SEQ_START_TOKEN) { - iter->pos = 0; - l = trie_firstleaf(iter->main_trie); - } else { + + /* only allow key of 0 for start of sequence */ + if ((v == SEQ_START_TOKEN) || key) + l = leaf_walk_rcu(&iter->tnode, key); + + if (l) { + iter->key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + } else { + iter->pos = 0; } - if 
(l) - iter->key = l->key; - else - iter->pos = 0; return l; } @@ -2412,8 +2559,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info */ static int fib_route_seq_show(struct seq_file *seq, void *v) { - struct tnode *l = v; - struct leaf_info *li; + struct fib_route_iter *iter = seq->private; + struct fib_table *tb = iter->main_tb; + struct fib_alias *fa; + struct key_vector *l = v; + __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2422,45 +2572,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - hlist_for_each_entry_rcu(li, &l->list, hlist) { - struct fib_alias *fa; - __be32 mask, prefix; + prefix = htonl(l->key); - mask = inet_make_mask(li->plen); - prefix = htonl(l->key); + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + const struct fib_info *fi = fa->fa_info; + __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - const struct fib_info *fi = fa->fa_info; - unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); + if ((fa->fa_type == RTN_BROADCAST) || + (fa->fa_type == RTN_MULTICAST)) + continue; - if (fa->fa_type == RTN_BROADCAST - || fa->fa_type == RTN_MULTICAST) - continue; + if (fa->tb_id != tb->tb_id) + continue; - seq_setwidth(seq, 127); - - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->nh_gw, flags, 0, 0, - fi->fib_priority, - mask, - (fi->fib_advmss ? - fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - prefix, 0, flags, 0, 0, 0, - mask, 0, 0, 0); - - seq_pad(seq, '\n'); - } + seq_setwidth(seq, 127); + + if (fi) + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? 
fi->fib_dev->name : "*", + prefix, + fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_priority, + mask, + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3); + else + seq_printf(seq, + "*\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u", + prefix, 0, flags, 0, 0, 0, + mask, 0, 0, 0); + + seq_pad(seq, '\n'); } return 0; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index ff069f6597ac..34968cd5c146 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -16,14 +16,12 @@ #include <uapi/linux/fou.h> #include <uapi/linux/genetlink.h> -static DEFINE_SPINLOCK(fou_lock); -static LIST_HEAD(fou_list); - struct fou { struct socket *sock; u8 protocol; u8 flags; - u16 port; + __be16 port; + u16 type; struct udp_offload udp_offloads; struct list_head list; }; @@ -37,6 +35,13 @@ struct fou_cfg { struct udp_port_cfg udp_config; }; +static unsigned int fou_net_id; + +struct fou_net { + struct list_head fou_list; + struct mutex fou_lock; +}; + static inline struct fou *fou_from_sock(struct sock *sk) { return sk->sk_user_data; @@ -387,20 +392,21 @@ out_unlock: return err; } -static int fou_add_to_port_list(struct fou *fou) +static int fou_add_to_port_list(struct net *net, struct fou *fou) { + struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; - spin_lock(&fou_lock); - list_for_each_entry(fout, &fou_list, list) { + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, &fn->fou_list, list) { if (fou->port == fout->port) { - spin_unlock(&fou_lock); + mutex_unlock(&fn->fou_lock); return -EALREADY; } } - list_add(&fou->list, &fou_list); - spin_unlock(&fou_lock); + list_add(&fou->list, &fn->fou_list); + mutex_unlock(&fn->fou_lock); return 0; } @@ -410,14 +416,10 @@ static void fou_release(struct fou *fou) struct socket *sock = fou->sock; struct sock *sk = sock->sk; - udp_del_offload(&fou->udp_offloads); - + if (sk->sk_family == AF_INET) + udp_del_offload(&fou->udp_offloads); list_del(&fou->list); - - /* Remove hooks into tunnel socket */ - 
sk->sk_user_data = NULL; - - sock_release(sock); + udp_tunnel_sock_release(sock); kfree(fou); } @@ -447,10 +449,10 @@ static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { - struct fou *fou = NULL; - int err; struct socket *sock = NULL; + struct fou *fou = NULL; struct sock *sk; + int err; /* Open UDP socket */ err = udp_sock_create(net, &cfg->udp_config, &sock); @@ -486,6 +488,8 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, goto error; } + fou->type = cfg->type; + udp_sk(sk)->encap_type = 1; udp_encap_enable(); @@ -502,7 +506,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, goto error; } - err = fou_add_to_port_list(fou); + err = fou_add_to_port_list(net, fou); if (err) goto error; @@ -514,27 +518,27 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, error: kfree(fou); if (sock) - sock_release(sock); + udp_tunnel_sock_release(sock); return err; } static int fou_destroy(struct net *net, struct fou_cfg *cfg) { - struct fou *fou; - u16 port = cfg->udp_config.local_udp_port; + struct fou_net *fn = net_generic(net, fou_net_id); + __be16 port = cfg->udp_config.local_udp_port; int err = -EINVAL; + struct fou *fou; - spin_lock(&fou_lock); - list_for_each_entry(fou, &fou_list, list) { + mutex_lock(&fn->fou_lock); + list_for_each_entry(fou, &fn->fou_list, list) { if (fou->port == port) { - udp_del_offload(&fou->udp_offloads); fou_release(fou); err = 0; break; } } - spin_unlock(&fou_lock); + mutex_unlock(&fn->fou_lock); return err; } @@ -573,7 +577,7 @@ static int parse_nl_config(struct genl_info *info, } if (info->attrs[FOU_ATTR_PORT]) { - u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); + __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } @@ -592,6 +596,7 @@ static int parse_nl_config(struct genl_info *info, static int fou_nl_cmd_add_port(struct sk_buff *skb, struct 
genl_info *info) { + struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; @@ -599,16 +604,119 @@ static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) if (err) return err; - return fou_create(&init_net, &cfg, NULL); + return fou_create(net, &cfg, NULL); } static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) { + struct net *net = genl_info_net(info); struct fou_cfg cfg; + int err; - parse_nl_config(info, &cfg); + err = parse_nl_config(info, &cfg); + if (err) + return err; - return fou_destroy(&init_net, &cfg); + return fou_destroy(net, &cfg); +} + +static int fou_fill_info(struct fou *fou, struct sk_buff *msg) +{ + if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || + nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || + nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || + nla_put_u8(msg, FOU_ATTR_TYPE, fou->type)) + return -1; + + if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) + if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) + return -1; + return 0; +} + +static int fou_dump_info(struct fou *fou, u32 portid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (fou_fill_info(fou, skb) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct fou_net *fn = net_generic(net, fou_net_id); + struct sk_buff *msg; + struct fou_cfg cfg; + struct fou *fout; + __be16 port; + int ret; + + ret = parse_nl_config(info, &cfg); + if (ret) + return ret; + port = cfg.udp_config.local_udp_port; + if (port == 0) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = -ESRCH; + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, 
&fn->fou_list, list) { + if (port == fout->port) { + ret = fou_dump_info(fout, info->snd_portid, + info->snd_seq, 0, msg, + info->genlhdr->cmd); + break; + } + } + mutex_unlock(&fn->fou_lock); + if (ret < 0) + goto out_free; + + return genlmsg_reply(msg, info); + +out_free: + nlmsg_free(msg); + return ret; +} + +static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct fou_net *fn = net_generic(net, fou_net_id); + struct fou *fout; + int idx = 0, ret; + + mutex_lock(&fn->fou_lock); + list_for_each_entry(fout, &fn->fou_list, list) { + if (idx++ < cb->args[0]) + continue; + ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, FOU_CMD_GET); + if (ret) + break; + } + mutex_unlock(&fn->fou_lock); + + cb->args[0] = idx; + return skb->len; } static const struct genl_ops fou_nl_ops[] = { @@ -624,6 +732,12 @@ static const struct genl_ops fou_nl_ops[] = { .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = FOU_CMD_GET, + .doit = fou_nl_cmd_get_port, + .dumpit = fou_nl_dump, + .policy = fou_nl_policy, + }, }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) @@ -771,12 +885,12 @@ EXPORT_SYMBOL(gue_build_header); #ifdef CONFIG_NET_FOU_IP_TUNNELS -static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = { +static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, }; -static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = { +static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, }; @@ -820,38 +934,63 @@ static void ip_tunnel_encap_del_fou_ops(void) #endif +static __net_init int fou_init_net(struct net *net) +{ + struct fou_net *fn = net_generic(net, fou_net_id); + + INIT_LIST_HEAD(&fn->fou_list); + mutex_init(&fn->fou_lock); + return 0; +} + +static __net_exit void fou_exit_net(struct net *net) +{ + 
struct fou_net *fn = net_generic(net, fou_net_id); + struct fou *fou, *next; + + /* Close all the FOU sockets */ + mutex_lock(&fn->fou_lock); + list_for_each_entry_safe(fou, next, &fn->fou_list, list) + fou_release(fou); + mutex_unlock(&fn->fou_lock); +} + +static struct pernet_operations fou_net_ops = { + .init = fou_init_net, + .exit = fou_exit_net, + .id = &fou_net_id, + .size = sizeof(struct fou_net), +}; + static int __init fou_init(void) { int ret; + ret = register_pernet_device(&fou_net_ops); + if (ret) + goto exit; + ret = genl_register_family_with_ops(&fou_nl_family, fou_nl_ops); - if (ret < 0) - goto exit; + goto unregister; ret = ip_tunnel_encap_add_fou_ops(); - if (ret < 0) - genl_unregister_family(&fou_nl_family); + if (ret == 0) + return 0; + genl_unregister_family(&fou_nl_family); +unregister: + unregister_pernet_device(&fou_net_ops); exit: return ret; } static void __exit fou_fini(void) { - struct fou *fou, *next; - ip_tunnel_encap_del_fou_ops(); - genl_unregister_family(&fou_nl_family); - - /* Close all the FOU sockets */ - - spin_lock(&fou_lock); - list_for_each_entry_safe(fou, next, &fou_list, list) - fou_release(fou); - spin_unlock(&fou_lock); + unregister_pernet_device(&fou_net_ops); } module_init(fou_init); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve_core.c index 5a4828ba05ad..311a4ba6950a 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve_core.c @@ -60,11 +60,6 @@ struct geneve_net { static int geneve_net_id; -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - static struct geneve_sock *geneve_find_sock(struct net *net, sa_family_t family, __be16 port) { @@ -113,10 +108,6 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, int min_headroom; int err; - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + GENEVE_BASE_HLEN + opt_len + 
sizeof(struct iphdr) + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); @@ -131,12 +122,16 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, if (unlikely(!skb)) return -ENOMEM; + skb = udp_tunnel_handle_offloads(skb, csum); + if (IS_ERR(skb)) + return PTR_ERR(skb); + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - return udp_tunnel_xmit_skb(rt, skb, src, dst, + return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, tos, ttl, df, src_port, dst_port, xnet, !csum); } @@ -196,7 +191,7 @@ static struct sk_buff **geneve_gro_receive(struct sk_buff **head, rcu_read_lock(); ptype = gro_find_receive_by_type(type); - if (ptype == NULL) { + if (!ptype) { flush = 1; goto out_unlock; } @@ -230,7 +225,7 @@ static int geneve_gro_complete(struct sk_buff *skb, int nhoff, rcu_read_lock(); ptype = gro_find_complete_by_type(type); - if (ptype != NULL) + if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); rcu_read_unlock(); @@ -435,7 +430,7 @@ static int __init geneve_init_module(void) if (rc) return rc; - pr_info("Geneve driver\n"); + pr_info("Geneve core logic\n"); return 0; } @@ -449,5 +444,4 @@ module_exit(geneve_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); -MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); -MODULE_ALIAS_RTNL_LINK("geneve"); +MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 51973ddc05a6..5aa46d4b44ef 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -149,7 +149,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, rcu_read_lock(); ptype = gro_find_receive_by_type(type); - if (ptype == NULL) + if (!ptype) goto out_unlock; grehlen = GRE_HEADER_SECTION; @@ -243,7 +243,7 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) 
rcu_read_lock(); ptype = gro_find_complete_by_type(type); - if (ptype != NULL) + if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); rcu_read_unlock(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5e564014a0b7..f5203fba6236 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -399,7 +399,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) return; sk = icmp_xmit_lock(net); - if (sk == NULL) + if (!sk) return; inet = inet_sk(sk); @@ -609,7 +609,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) skb_in->data, sizeof(_inner_type), &_inner_type); - if (itp == NULL) + if (!itp) goto out; /* @@ -627,7 +627,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) return; sk = icmp_xmit_lock(net); - if (sk == NULL) + if (!sk) goto out_free; /* diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 666cf364df86..651cdf648ec4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -97,6 +97,7 @@ #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> @@ -369,7 +370,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -691,7 +692,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); - if (skb == NULL) { + if (!skb) { ip_rt_put(rt); return -1; } @@ -713,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(skb, NULL); + 
ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; @@ -980,7 +981,7 @@ int igmp_rcv(struct sk_buff *skb) int len = skb->len; bool dropped = true; - if (in_dev == NULL) + if (!in_dev) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) @@ -1338,6 +1339,168 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? 
*/ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. 
+ * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. 
*/ @@ -1849,30 +2012,28 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) pmc->sfcount[MCAST_EXCLUDE] = 1; } - -/* - * Join a multicast group +/* Join a multicast group */ -int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) + +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { - int err; __be32 addr = imr->imr_multiaddr.s_addr; - struct ip_mc_socklist *iml = NULL, *i; + struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; + int err; + + ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { - iml = NULL; err = -ENODEV; goto done; } @@ -1889,7 +2050,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) if (count >= sysctl_igmp_max_memberships) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); - if (iml == NULL) + if (!iml) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); @@ -1900,7 +2061,6 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) ip_mc_inc_group(in_dev, addr); err = 0; done: - rtnl_unlock(); return err; } EXPORT_SYMBOL(ip_mc_join_group); @@ -1911,7 +2071,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); int err; - if (psf == NULL) { + if (!psf) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); @@ -1925,10 +2085,6 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, return err; } -/* - * Ask a socket to leave a group. 
- */ - int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); @@ -1940,7 +2096,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) u32 ifindex; int ret = -EADDRNOTAVAIL; - rtnl_lock(); + ASSERT_RTNL(); + in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { ret = -ENODEV; @@ -1964,14 +2121,13 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; ip_mc_dec_group(in_dev, group); - rtnl_unlock(); + /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: - rtnl_unlock(); return ret; } EXPORT_SYMBOL(ip_mc_leave_group); @@ -1993,7 +2149,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); + ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; @@ -2107,9 +2263,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: - rtnl_unlock(); if (leavegroup) - return ip_mc_leave_group(sk, &imr); + err = ip_mc_leave_group(sk, &imr); return err; } @@ -2131,7 +2286,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; - rtnl_lock(); + ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; @@ -2193,7 +2348,6 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) pmc->sfmode = msf->imsf_fmode; err = 0; done: - rtnl_unlock(); if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; @@ -2368,7 +2522,7 @@ void ip_mc_drop_socket(struct sock *sk) struct ip_mc_socklist *iml; struct net *net = sock_net(sk); - if (inet->mc_list == NULL) + if (!inet->mc_list) return; rtnl_lock(); @@ -2378,7 +2532,7 @@ void ip_mc_drop_socket(struct sock *sk) 
inet->mc_list = iml->next_rcu; in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); - if (in_dev != NULL) + if (in_dev) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); @@ -2595,13 +2749,13 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) for_each_netdev_rcu(net, state->dev) { struct in_device *idev; idev = __in_dev_get_rcu(state->dev); - if (unlikely(idev == NULL)) + if (unlikely(!idev)) continue; im = rcu_dereference(idev->mc_list); - if (likely(im != NULL)) { + if (likely(im)) { spin_lock_bh(&im->lock); psf = im->sources; - if (likely(psf != NULL)) { + if (likely(psf)) { state->im = im; state->idev = idev; break; @@ -2671,7 +2825,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); - if (likely(state->im != NULL)) { + if (likely(state->im)) { spin_unlock_bh(&state->im->lock); state->im = NULL; } @@ -2724,6 +2878,7 @@ static const struct file_operations igmp_mcf_seq_fops = { static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; + int err; pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); if (!pde) @@ -2732,8 +2887,18 @@ static int __net_init igmp_net_init(struct net *net) &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; + err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, + SOCK_DGRAM, 0, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", + err); + goto out_sock; + } + return 0; +out_sock: + remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: @@ -2744,6 +2909,7 @@ static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); + 
inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3e44b9b0b78e..60021d0d9326 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -23,6 +23,7 @@ #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> +#include <net/tcp.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -98,6 +99,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) struct net *net = sock_net(sk); int smallest_size = -1, smallest_rover; kuid_t uid = sock_i_uid(sk); + int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; local_bh_disable(); if (!snum) { @@ -105,6 +107,14 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); + if (attempt_half) { + int half = low + ((high - low) >> 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } remaining = (high - low) + 1; smallest_rover = rover = prandom_u32() % remaining + low; @@ -126,11 +136,6 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && - !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = smallest_rover; - goto tb_found; - } } if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { snum = rover; @@ -158,6 +163,11 @@ again: snum = smallest_rover; goto have_snum; } + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto again; + } goto fail; } /* OK, here is the one we will use. 
HEAD is @@ -294,8 +304,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct sock *newsk; struct request_sock *req; + struct sock *newsk; int error; lock_sock(sk); @@ -324,9 +334,11 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) newsk = req->sk; sk_acceptq_removed(sk); - if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { + if (sk->sk_protocol == IPPROTO_TCP && + tcp_rsk(req)->tfo_listener && + queue->fastopenq) { spin_lock_bh(&queue->fastopenq->lock); - if (tcp_rsk(req)->listener) { + if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken @@ -341,7 +353,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) out: release_sock(sk); if (req) - __reqsk_free(req); + reqsk_put(req); return newsk; out_err: newsk = NULL; @@ -400,18 +412,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { - struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options_rcu *opt = inet_rsk(req)->opt; - struct net *net = sock_net(sk); - int flags = inet_sk_flowi_flags(sk); + struct net *net = read_pnet(&ireq->ireq_net); + struct ip_options_rcu *opt = ireq->opt; + struct rtable *rt; - flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, - sk->sk_protocol, - flags, + sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + ireq->ir_loc_addr, ireq->ir_rmt_port, + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -433,9 +444,9 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; - struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; @@ -443,11 +454,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rcu_read_lock(); opt = rcu_dereference(newinet->inet_opt); - flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + ireq->ir_loc_addr, ireq->ir_rmt_port, + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -475,33 +487,37 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else -#define AF_INET_FAMILY(fam) 1 +#define AF_INET_FAMILY(fam) true #endif -struct request_sock *inet_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, - const __be16 rport, const __be32 raddr, +/* Note: this is temporary : + * req sock will no longer be in listener hash table +*/ +struct request_sock *inet_csk_search_req(struct sock *sk, + const __be16 rport, + const __be32 raddr, const __be32 laddr) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; + struct request_sock *req; + u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries); - for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries)]; - (req = *prev) != NULL; - prev = &req->dl_next) { + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); + for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); if (ireq->ir_rmt_port == rport && ireq->ir_rmt_addr == raddr && ireq->ir_loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { + atomic_inc(&req->rsk_refcnt); WARN_ON(req->sk); - *prevp = prev; break; } } + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return req; } @@ -557,23 +573,58 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -void inet_csk_reqsk_queue_prune(struct sock *parent, - 
const unsigned long interval, - const unsigned long timeout, - const unsigned long max_rto) +/* return true if req was found in the syn_table[] */ +static bool reqsk_queue_unlink(struct request_sock_queue *queue, + struct request_sock *req) +{ + struct listen_sock *lopt = queue->listen_opt; + struct request_sock **prev; + bool found = false; + + spin_lock(&queue->syn_wait_lock); + + for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; + prev = &(*prev)->dl_next) { + if (*prev == req) { + *prev = req->dl_next; + found = true; + break; + } + } + + spin_unlock(&queue->syn_wait_lock); + if (del_timer(&req->rsk_timer)) + reqsk_put(req); + return found; +} + +void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +{ + if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + reqsk_put(req); + } +} +EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); + +static void reqsk_timer_handler(unsigned long data) { - struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock *req = (struct request_sock *)data; + struct sock *sk_listener = req->rsk_listener; + struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; - unsigned long now = jiffies; - struct request_sock **reqp, *req; - int i, budget; + int qlen, expire = 0, resend = 0; + int max_retries, thresh; + u8 defer_accept; - if (lopt == NULL || lopt->qlen == 0) + if (sk_listener->sk_state != TCP_LISTEN || !lopt) { + reqsk_put(req); return; + } + max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + thresh = max_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. 
* If synack was not acknowledged for 1 second, it means @@ -591,67 +642,65 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - if (lopt->qlen>>(lopt->max_qlen_log-1)) { - int young = (lopt->qlen_young<<1); + qlen = listen_sock_qlen(lopt); + if (qlen >> (lopt->max_qlen_log - 1)) { + int young = listen_sock_young(lopt) << 1; while (thresh > 2) { - if (lopt->qlen < young) + if (qlen < young) break; thresh--; young <<= 1; } } + defer_accept = READ_ONCE(queue->rskq_defer_accept); + if (defer_accept) + max_retries = defer_accept; + syn_ack_recalc(req, thresh, max_retries, defer_accept, + &expire, &resend); + req->rsk_ops->syn_ack_timeout(req); + if (!expire && + (!resend || + !inet_rtx_syn_ack(sk_listener, req) || + inet_rsk(req)->acked)) { + unsigned long timeo; + + if (req->num_timeout++ == 0) + atomic_inc(&lopt->young_dec); + timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); + mod_timer_pinned(&req->rsk_timer, jiffies + timeo); + return; + } + inet_csk_reqsk_queue_drop(sk_listener, req); + reqsk_put(req); +} - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - - budget = 2 * (lopt->nr_table_entries / (timeout / interval)); - i = lopt->clock_hand; - - do { - reqp=&lopt->syn_table[i]; - while ((req = *reqp) != NULL) { - if (time_after_eq(now, req->expires)) { - int expire = 0, resend = 0; - - syn_ack_recalc(req, thresh, max_retries, - queue->rskq_defer_accept, - &expire, &resend); - req->rsk_ops->syn_ack_timeout(parent, req); - if (!expire && - (!resend || - !inet_rtx_syn_ack(parent, req) || - inet_rsk(req)->acked)) { - unsigned long timeo; - - if (req->num_timeout++ == 0) - lopt->qlen_young--; - timeo = min(timeout << req->num_timeout, - max_rto); - req->expires = now + timeo; - reqp = &req->dl_next; - continue; - } - - /* Drop this request */ - inet_csk_reqsk_queue_unlink(parent, req, reqp); - reqsk_queue_removed(queue, req); - 
reqsk_free(req); - continue; - } - reqp = &req->dl_next; - } +void reqsk_queue_hash_req(struct request_sock_queue *queue, + u32 hash, struct request_sock *req, + unsigned long timeout) +{ + struct listen_sock *lopt = queue->listen_opt; - i = (i + 1) & (lopt->nr_table_entries - 1); + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; - } while (--budget > 0); + /* before letting lookups find us, make sure all req fields + * are committed to memory and refcnt initialized. + */ + smp_wmb(); + atomic_set(&req->rsk_refcnt, 2); + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + req->rsk_hash = hash; - lopt->clock_hand = i; + spin_lock(&queue->syn_wait_lock); + req->dl_next = lopt->syn_table[hash]; + lopt->syn_table[hash] = req; + spin_unlock(&queue->syn_wait_lock); - if (lopt->qlen) - inet_csk_reset_keepalive_timer(parent, interval); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); +EXPORT_SYMBOL(reqsk_queue_hash_req); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -667,7 +716,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, { struct sock *newsk = sk_clone_lock(sk, priority); - if (newsk != NULL) { + if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); newsk->sk_state = TCP_SYN_RECV; @@ -679,6 +728,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newsk->sk_write_space = sk_stream_write_space; newsk->sk_mark = inet_rsk(req)->ir_mark; + atomic64_set(&newsk->sk_cookie, + atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; @@ -785,8 +836,6 @@ void inet_csk_listen_stop(struct sock *sk) struct request_sock *acc_req; struct request_sock *req; - inet_csk_delete_keepalive_timer(sk); - /* make all the listen_opt local to us */ acc_req = reqsk_queue_yank_acceptq(queue); @@ -816,9 +865,9 @@ void inet_csk_listen_stop(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); 
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != tcp_rsk(req)->listener); + BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is @@ -827,7 +876,6 @@ void inet_csk_listen_stop(struct sock *sk) * tcp_v4_destroy_sock(). */ tcp_sk(child)->fastopen_rsk = NULL; - sock_put(sk); } inet_csk_destroy_sock(child); @@ -836,9 +884,9 @@ void inet_csk_listen_stop(struct sock *sk) sock_put(child); sk_acceptq_removed(sk); - __reqsk_free(req); + reqsk_put(req); } - if (queue->fastopenq != NULL) { + if (queue->fastopenq) { /* Free all the reqs queued in rskq_rst_head. */ spin_lock_bh(&queue->fastopenq->lock); acc_req = queue->fastopenq->rskq_rst_head; @@ -846,7 +894,7 @@ void inet_csk_listen_stop(struct sock *sk) spin_unlock_bh(&queue->fastopenq->lock); while ((req = acc_req) != NULL) { acc_req = req->dl_next; - __reqsk_free(req); + reqsk_put(req); } } WARN_ON(sk->sk_ack_backlog); @@ -870,7 +918,7 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_af_ops->compat_getsockopt != NULL) + if (icsk->icsk_af_ops->compat_getsockopt) return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname, optval, optlen); return icsk->icsk_af_ops->getsockopt(sk, level, optname, @@ -883,7 +931,7 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_af_ops->compat_setsockopt != NULL) + if (icsk->icsk_af_ops->compat_setsockopt) return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname, optval, optlen); return icsk->icsk_af_ops->setsockopt(sk, level, optname, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 592aff37366b..c3b1f3a0f4cf 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c 
@@ -38,16 +38,12 @@ static const struct inet_diag_handler **inet_diag_table; struct inet_diag_entry { - __be32 *saddr; - __be32 *daddr; + const __be32 *saddr; + const __be32 *daddr; u16 sport; u16 dport; u16 family; u16 userlocks; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */ - struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */ -#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -65,12 +61,35 @@ static const struct inet_diag_handler *inet_diag_lock_handler(int proto) return inet_diag_table[proto]; } -static inline void inet_diag_unlock_handler( - const struct inet_diag_handler *handler) +static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { mutex_unlock(&inet_diag_table_mutex); } +static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) +{ + r->idiag_family = sk->sk_family; + + r->id.idiag_sport = htons(sk->sk_num); + r->id.idiag_dport = sk->sk_dport; + r->id.idiag_if = sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + } else +#endif + { + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + + r->id.idiag_src[0] = sk->sk_rcv_saddr; + r->id.idiag_dst[0] = sk->sk_daddr; + } +} + static size_t inet_sk_attr_size(void) { return nla_total_size(sizeof(struct tcp_info)) @@ -86,21 +105,22 @@ static size_t inet_sk_attr_size(void) } int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + struct sk_buff *skb, const struct inet_diag_req_v2 *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u16 nlmsg_flags, + 
const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); + const struct tcp_congestion_ops *ca_ops; + const struct inet_diag_handler *handler; + int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; - const struct inet_diag_handler *handler; - int ext = req->idiag_ext; handler = inet_diag_table[req->sdiag_protocol]; - BUG_ON(handler == NULL); + BUG_ON(!handler); nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -108,25 +128,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, return -EMSGSIZE; r = nlmsg_data(nlh); - BUG_ON(sk->sk_state == TCP_TIME_WAIT); + BUG_ON(!sk_fullsock(sk)); - r->idiag_family = sk->sk_family; + inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; r->idiag_timer = 0; r->idiag_retrans = 0; - r->id.idiag_if = sk->sk_bound_dev_if; - sock_diag_save_cookie(sk, r->id.idiag_cookie); - - r->id.idiag_sport = inet->inet_sport; - r->id.idiag_dport = inet->inet_dport; - - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - - r->id.idiag_src[0] = inet->inet_rcv_saddr; - r->id.idiag_dst[0] = inet->inet_daddr; - if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -139,14 +147,14 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { - - *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; - if (ext & (1 << (INET_DIAG_TCLASS - 1))) if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; + + if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && + nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) + goto errout; } #endif @@ -169,7 +177,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (sock_diag_put_meminfo(sk, skb, 
INET_DIAG_SKMEMINFO)) goto errout; - if (icsk == NULL) { + if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; } @@ -196,25 +204,42 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } #undef EXPIRES_IN_MS - if (ext & (1 << (INET_DIAG_INFO - 1))) { + if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve(skb, INET_DIAG_INFO, - sizeof(struct tcp_info)); + handler->idiag_info_size); if (!attr) goto errout; info = nla_data(attr); } - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) - if (nla_put_string(skb, INET_DIAG_CONG, - icsk->icsk_ca_ops->name) < 0) + if (ext & (1 << (INET_DIAG_CONG - 1))) { + int err = 0; + + rcu_read_lock(); + ca_ops = READ_ONCE(icsk->icsk_ca_ops); + if (ca_ops) + err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name); + rcu_read_unlock(); + if (err < 0) goto errout; + } handler->idiag_get_info(sk, r, info); - if (sk->sk_state < TCP_TIME_WAIT && - icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) - icsk->icsk_ca_ops->get_info(sk, ext, skb); + if (sk->sk_state < TCP_TIME_WAIT) { + union tcp_cc_info info; + size_t sz = 0; + int attr; + + rcu_read_lock(); + ca_ops = READ_ONCE(icsk->icsk_ca_ops); + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ext, &attr, &info); + rcu_read_unlock(); + if (sz && nla_put(skb, attr, sz, &info) < 0) + goto errout; + } out: nlmsg_end(skb, nlh); @@ -227,23 +252,25 @@ errout: EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_csk_diag_fill(struct sock *sk, - struct sk_buff *skb, struct inet_diag_req_v2 *req, + struct sk_buff *skb, + const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - return inet_sk_diag_fill(sk, inet_csk(sk), - skb, req, user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, + user_ns, portid, seq, nlmsg_flags, unlh); } -static int inet_twsk_diag_fill(struct inet_timewait_sock 
*tw, - struct sk_buff *skb, struct inet_diag_req_v2 *req, +static int inet_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - s32 tmo; + struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; + long tmo; nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -253,25 +280,13 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_ttd - inet_tw_time_stamp(); + tmo = tw->tw_timer.expires - jiffies; if (tmo < 0) tmo = 0; - r->idiag_family = tw->tw_family; + inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; - r->id.idiag_if = tw->tw_bound_dev_if; - sock_diag_save_cookie(tw, r->id.idiag_cookie); - - r->id.idiag_sport = tw->tw_sport; - r->id.idiag_dport = tw->tw_dport; - - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - - r->id.idiag_src[0] = tw->tw_rcv_saddr; - r->id.idiag_dst[0] = tw->tw_daddr; - r->idiag_state = tw->tw_substate; r->idiag_timer = 3; r->idiag_expires = jiffies_to_msecs(tmo); @@ -279,61 +294,91 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr; - } -#endif + + nlmsg_end(skb, nlh); + return 0; +} + +static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + 
r->idiag_timer = 1; + r->idiag_retrans = inet_reqsk(sk)->num_retrans; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + + tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; + r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; nlmsg_end(skb, nlh); return 0; } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct inet_diag_req_v2 *r, + const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, + return inet_twsk_diag_fill(sk, skb, portid, seq, nlmsg_flags, unlh); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return inet_req_diag_fill(sk, skb, portid, seq, + nlmsg_flags, unlh); + return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); } -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) +int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, + struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) { - int err; - struct sock *sk; - struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + struct sock *sk; + int err; err = -EINVAL; - if (req->sdiag_family == AF_INET) { + if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); - } #if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { + else if (req->sdiag_family == AF_INET6) sk = inet6_lookup(net, hashinfo, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if); - } #endif - else { + else goto 
out_nosk; - } err = -ENOENT; - if (sk == NULL) + if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); @@ -371,7 +416,7 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; int err; @@ -412,9 +457,8 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) return 1; } - static int inet_diag_bc_run(const struct nlattr *_bc, - const struct inet_diag_entry *entry) + const struct inet_diag_entry *entry) { const void *bc = nla_data(_bc); int len = nla_len(_bc); @@ -446,10 +490,10 @@ static int inet_diag_bc_run(const struct nlattr *_bc, break; case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: { - struct inet_diag_hostcond *cond; - __be32 *addr; + const struct inet_diag_hostcond *cond; + const __be32 *addr; - cond = (struct inet_diag_hostcond *)(op + 1); + cond = (const struct inet_diag_hostcond *)(op + 1); if (cond->port != -1 && cond->port != (op->code == INET_DIAG_BC_S_COND ? 
entry->sport : entry->dport)) { @@ -498,29 +542,36 @@ static int inet_diag_bc_run(const struct nlattr *_bc, return len == 0; } +/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV) + */ +static void entry_fill_addrs(struct inet_diag_entry *entry, + const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; + entry->daddr = sk->sk_v6_daddr.s6_addr32; + } else +#endif + { + entry->saddr = &sk->sk_rcv_saddr; + entry->daddr = &sk->sk_daddr; + } +} + int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) { - struct inet_diag_entry entry; struct inet_sock *inet = inet_sk(sk); + struct inet_diag_entry entry; - if (bc == NULL) + if (!bc) return 1; entry.family = sk->sk_family; -#if IS_ENABLED(CONFIG_IPV6) - if (entry.family == AF_INET6) { - - entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32; - entry.daddr = sk->sk_v6_daddr.s6_addr32; - } else -#endif - { - entry.saddr = &inet->inet_rcv_saddr; - entry.daddr = &inet->inet_daddr; - } + entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); - entry.userlocks = sk->sk_userlocks; + entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); } @@ -547,8 +598,8 @@ static int valid_cc(const void *bc, int len, int cc) static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) { - int addr_len; struct inet_diag_hostcond *cond; + int addr_len; /* Check hostcond space. */ *min_len += sizeof(struct inet_diag_hostcond); @@ -582,8 +633,8 @@ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, } /* Validate a port comparison operator. */ -static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, - int len, int *min_len) +static bool valid_port_comparison(const struct inet_diag_bc_op *op, + int len, int *min_len) { /* Port comparisons put the port in a follow-on inet_diag_bc_op. 
*/ *min_len += sizeof(struct inet_diag_bc_op); @@ -598,10 +649,9 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) int len = bytecode_len; while (len > 0) { - const struct inet_diag_bc_op *op = bc; int min_len = sizeof(struct inet_diag_bc_op); + const struct inet_diag_bc_op *op = bc; -//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: @@ -642,7 +692,7 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, + const struct inet_diag_req_v2 *r, const struct nlattr *bc) { if (!inet_diag_bc_sk(bc, sk)) @@ -654,139 +704,42 @@ static int inet_csk_diag_dump(struct sock *sk, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } -static int inet_twsk_diag_dump(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - struct inet_diag_req_v2 *r, - const struct nlattr *bc) +static void twsk_build_assert(void) { - struct inet_timewait_sock *tw = inet_twsk(sk); + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); - if (bc != NULL) { - struct inet_diag_entry entry; + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); - entry.family = tw->tw_family; -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == AF_INET6) { - entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32; - entry.daddr = tw->tw_v6_daddr.s6_addr32; - } else -#endif - { - entry.saddr = &tw->tw_rcv_saddr; - entry.daddr = &tw->tw_daddr; - } - entry.sport = tw->tw_num; - entry.dport = ntohs(tw->tw_dport); - entry.userlocks = 0; + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); - if (!inet_diag_bc_run(bc, &entry)) - return 0; - } + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + 
offsetof(struct inet_sock, inet_rcv_saddr)); - return inet_twsk_diag_fill(tw, skb, r, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); -} - -/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses - * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. - */ -static inline void inet_diag_req_addrs(const struct sock *sk, - const struct request_sock *req, - struct inet_diag_entry *entry) -{ - struct inet_request_sock *ireq = inet_rsk(req); + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - if (req->rsk_ops->family == AF_INET6) { - entry->saddr = ireq->ir_v6_loc_addr.s6_addr32; - entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32; - } else if (req->rsk_ops->family == AF_INET) { - ipv6_addr_set_v4mapped(ireq->ir_loc_addr, - &entry->saddr_storage); - ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, - &entry->daddr_storage); - entry->saddr = entry->saddr_storage.s6_addr32; - entry->daddr = entry->daddr_storage.s6_addr32; - } - } else -#endif - { - entry->saddr = &ireq->ir_loc_addr; - entry->daddr = &ireq->ir_rmt_addr; - } -} - -static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, - const struct nlmsghdr *unlh) -{ - const struct inet_request_sock *ireq = inet_rsk(req); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - NLM_F_MULTI); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - r->idiag_family = sk->sk_family; - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = req->num_retrans; - - r->id.idiag_if = sk->sk_bound_dev_if; - sock_diag_save_cookie(req, r->id.idiag_cookie); - - tmo = req->expires - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = 
inet->inet_sport; - r->id.idiag_dport = ireq->ir_rmt_port; - - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); - r->id.idiag_src[0] = ireq->ir_loc_addr; - r->id.idiag_dst[0] = ireq->ir_rmt_addr; - - r->idiag_expires = jiffies_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); - r->idiag_inode = 0; -#if IS_ENABLED(CONFIG_IPV6) - if (r->idiag_family == AF_INET6) { - struct inet_diag_entry entry; - inet_diag_req_addrs(sk, req, &entry); - memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); - memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); - } + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); #endif - - nlmsg_end(skb, nlh); - return 0; } static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, + const struct inet_diag_req_v2 *r, const struct nlattr *bc) { - struct inet_diag_entry entry; struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt; struct inet_sock *inet = inet_sk(sk); - int j, s_j; - int reqnum, s_reqnum; + struct inet_diag_entry entry; + int j, s_j, reqnum, s_reqnum; + struct listen_sock *lopt; int err = 0; s_j = cb->args[3]; @@ -797,13 +750,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !lopt->qlen) + if (!lopt || !listen_sock_qlen(lopt)) goto out; - if (bc != NULL) { + if (bc) { entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -822,17 +775,18 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, 
struct sock *sk, continue; if (bc) { - inet_diag_req_addrs(sk, req, &entry); + /* Note: entry.sport and entry.userlocks are already set */ + entry_fill_addrs(&entry, req_to_sk(req)); entry.dport = ntohs(ireq->ir_rmt_port); if (!inet_diag_bc_run(bc, &entry)) continue; } - err = inet_diag_fill_req(skb, sk, req, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh); + err = inet_req_diag_fill(req_to_sk(req), skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->nlh); if (err < 0) { cb->args[3] = j + 1; cb->args[4] = reqnum; @@ -844,17 +798,17 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return err; } void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, - struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int i, num; - int s_i, s_num; struct net *net = sock_net(skb->sk); + int i, num, s_i, s_num; s_i = cb->args[1]; s_num = num = cb->args[2]; @@ -864,9 +818,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { - struct sock *sk; - struct hlist_nulls_node *node; struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; @@ -883,7 +837,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, } if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) + sk->sk_family != r->sdiag_family) goto next_listen; if (r->id.idiag_sport != inet->inet_sport && @@ -931,8 +885,8 @@ skip_listen_ht: for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = 
inet_ehash_lockp(hashinfo, i); - struct sock *sk; struct hlist_nulls_node *node; + struct sock *sk; num = 0; @@ -944,8 +898,7 @@ skip_listen_ht: spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { - int res; - int state; + int state, res; if (!net_eq(sock_net(sk), net)) continue; @@ -964,10 +917,16 @@ skip_listen_ht: if (r->id.idiag_dport != sk->sk_dport && r->id.idiag_dport) goto next_normal; - if (sk->sk_state == TCP_TIME_WAIT) - res = inet_twsk_diag_dump(sk, skb, cb, r, bc); - else - res = inet_csk_diag_dump(sk, skb, cb, r, bc); + twsk_build_assert(); + + if (!inet_diag_bc_sk(bc, sk)) + goto next_normal; + + res = sk_diag_fill(sk, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh); if (res < 0) { spin_unlock_bh(lock); goto done; @@ -988,7 +947,8 @@ out: EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, + struct nlattr *bc) { const struct inet_diag_handler *handler; int err = 0; @@ -1005,8 +965,8 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct nlattr *bc = NULL; int hdrlen = sizeof(struct inet_diag_req_v2); + struct nlattr *bc = NULL; if (nlmsg_attrlen(cb->nlh, hdrlen)) bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); @@ -1014,7 +974,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); } -static inline int inet_diag_type2proto(int type) +static int inet_diag_type2proto(int type) { switch (type) { case TCPDIAG_GETSOCK: @@ -1026,12 +986,13 @@ static inline int inet_diag_type2proto(int type) } } -static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_diag_dump_compat(struct 
sk_buff *skb, + struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); + int hdrlen = sizeof(struct inet_diag_req); struct inet_diag_req_v2 req; struct nlattr *bc = NULL; - int hdrlen = sizeof(struct inet_diag_req); req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); @@ -1046,7 +1007,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, - const struct nlmsghdr *nlh) + const struct nlmsghdr *nlh) { struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; @@ -1075,7 +1036,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - if (attr == NULL || + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op) || inet_diag_bc_audit(nla_data(attr), nla_len(attr))) return -EINVAL; @@ -1102,9 +1063,10 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; + attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - if (attr == NULL || + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op) || inet_diag_bc_audit(nla_data(attr), nla_len(attr))) return -EINVAL; @@ -1120,14 +1082,62 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) return inet_diag_get_exact(skb, h, nlmsg_data(h)); } +static +int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) +{ + const struct inet_diag_handler *handler; + struct nlmsghdr *nlh; + struct nlattr *attr; + struct inet_diag_msg *r; + void *info = NULL; + int err = 0; + + nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); + if (!nlh) + return -ENOMEM; + + r = nlmsg_data(nlh); + memset(r, 0, sizeof(*r)); + inet_diag_msg_common_fill(r, sk); + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == 
SOCK_STREAM) + r->id.idiag_sport = inet_sk(sk)->inet_sport; + r->idiag_state = sk->sk_state; + + if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { + nlmsg_cancel(skb, nlh); + return err; + } + + handler = inet_diag_lock_handler(sk->sk_protocol); + if (IS_ERR(handler)) { + inet_diag_unlock_handler(handler); + nlmsg_cancel(skb, nlh); + return PTR_ERR(handler); + } + + attr = handler->idiag_info_size + ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + : NULL; + if (attr) + info = nla_data(attr); + + handler->idiag_get_info(sk, r, info); + inet_diag_unlock_handler(handler); + + nlmsg_end(skb, nlh); + return 0; +} + static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, .dump = inet_diag_handler_dump, + .get_info = inet_diag_handler_get_info, }; int inet_diag_register(const struct inet_diag_handler *h) @@ -1140,7 +1150,7 @@ int inet_diag_register(const struct inet_diag_handler *h) mutex_lock(&inet_diag_table_mutex); err = -EEXIST; - if (inet_diag_table[type] == NULL) { + if (!inet_diag_table[type]) { inet_diag_table[type] = h; err = 0; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e7920352646a..d0a7c0319e3d 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -131,34 +131,22 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) unsigned int evicted = 0; HLIST_HEAD(expired); -evict_again: spin_lock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &hb->chain, list) { if (!inet_fragq_should_evict(fq)) continue; - if (!del_timer(&fq->timer)) { - /* q expiring right now thus increment its refcount so - * it won't be freed under us and wait until the timer - * has finished executing then destroy it - */ - atomic_inc(&fq->refcnt); - spin_unlock(&hb->chain_lock); - del_timer_sync(&fq->timer); - 
inet_frag_put(fq, f); - goto evict_again; - } + if (!del_timer(&fq->timer)) + continue; - fq->flags |= INET_FRAG_EVICTED; - hlist_del(&fq->list); - hlist_add_head(&fq->list, &expired); + hlist_add_head(&fq->list_evictor, &expired); ++evicted; } spin_unlock(&hb->chain_lock); - hlist_for_each_entry_safe(fq, n, &expired, list) + hlist_for_each_entry_safe(fq, n, &expired, list_evictor) f->frag_expire((unsigned long) fq); return evicted; @@ -240,18 +228,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) int i; nf->low_thresh = 0; - local_bh_disable(); evict_again: + local_bh_disable(); seq = read_seqbegin(&f->rnd_seqlock); for (i = 0; i < INETFRAGS_HASHSZ ; i++) inet_evict_bucket(f, &f->hash[i]); - if (read_seqretry(&f->rnd_seqlock, seq)) - goto evict_again; - local_bh_enable(); + cond_resched(); + + if (read_seqretry(&f->rnd_seqlock, seq) || + percpu_counter_sum(&nf->mem)) + goto evict_again; percpu_counter_destroy(&nf->mem); } @@ -284,8 +274,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) struct inet_frag_bucket *hb; hb = get_frag_bucket_locked(fq, f); - if (!(fq->flags & INET_FRAG_EVICTED)) - hlist_del(&fq->list); + hlist_del(&fq->list); + fq->flags |= INET_FRAG_COMPLETE; spin_unlock(&hb->chain_lock); } @@ -297,7 +287,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) if (!(fq->flags & INET_FRAG_COMPLETE)) { fq_unlink(fq, f); atomic_dec(&fq->refcnt); - fq->flags |= INET_FRAG_COMPLETE; } } EXPORT_SYMBOL(inet_frag_kill); @@ -330,11 +319,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) fp = xp; } sum = sum_truesize + f->qsize; - sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); kmem_cache_free(f->frags_cachep, q); + + sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); @@ -385,12 +375,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); - if (q == NULL) 
+ if (!q) return NULL; q->net = nf; f->constructor(q, arg); - add_frag_mem_limit(q, f->qsize); + add_frag_mem_limit(nf, f->qsize); setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); @@ -406,7 +396,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *q; q = inet_frag_alloc(nf, f, arg); - if (q == NULL) + if (!q) return NULL; return inet_frag_intern(nf, q, f, arg); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9111a4e22155..0cb9165421d4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -18,15 +18,16 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> +#include <linux/vmalloc.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> -static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +static u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -36,17 +37,21 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, inet_ehash_secret + net_hash_mix(net)); } - -static unsigned int inet_sk_ehashfn(const struct sock *sk) +/* This function handles inet_sock, but also timewait and request sockets + * for IPv4/IPv6. 
+ */ +u32 sk_ehashfn(const struct sock *sk) { - const struct inet_sock *inet = inet_sk(sk); - const __be32 laddr = inet->inet_rcv_saddr; - const __u16 lport = inet->inet_num; - const __be32 faddr = inet->inet_daddr; - const __be16 fport = inet->inet_dport; - struct net *net = sock_net(sk); - - return inet_ehashfn(net, laddr, lport, faddr, fport); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + return inet6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); +#endif + return inet_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); } /* @@ -60,8 +65,8 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); - if (tb != NULL) { - write_pnet(&tb->ib_net, hold_net(net)); + if (tb) { + write_pnet(&tb->ib_net, net); tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -79,7 +84,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); - release_net(ib_net(tb)); kmem_cache_free(cachep, tb); } } @@ -87,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; @@ -108,8 +108,6 @@ static void __inet_put_port(struct sock *sk) struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; - atomic_dec(&hashinfo->bsockets); - spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); @@ -263,11 +261,19 @@ void sock_gen_put(struct sock *sk) if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); + 
else if (sk->sk_state == TCP_NEW_SYN_RECV) + reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); +void sock_edemux(struct sk_buff *skb) +{ + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); + struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, @@ -377,7 +383,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); } @@ -388,9 +394,10 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet_sk_port_offset(const struct sock *sk) +static u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); @@ -400,13 +407,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; - spinlock_t *lock; struct inet_ehash_bucket *head; + spinlock_t *lock; int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); - sk->sk_hash = inet_sk_ehashfn(sk); + sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); @@ -423,15 +430,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -static void __inet_hash(struct sock *sk) +int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) { - __inet_hash_nolisten(sk, NULL); - return; - } + if (sk->sk_state != TCP_LISTEN) + return __inet_hash_nolisten(sk, tw); WARN_ON(!sk_unhashed(sk)); ilb = 
&hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -440,13 +445,15 @@ static void __inet_hash(struct sock *sk) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); + return 0; } +EXPORT_SYMBOL(__inet_hash); void inet_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk); + __inet_hash(sk, NULL); local_bh_enable(); } } @@ -477,8 +484,7 @@ EXPORT_SYMBOL_GPL(inet_unhash); int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **), - int (*hash)(struct sock *sk, struct inet_timewait_sock *twp)) + struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; const unsigned short snum = inet_sk(sk)->inet_num; @@ -497,8 +503,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; + /* By starting with offset being an even number, + * we tend to leave about 50% of ports for other uses, + * like bind(0). 
+ */ + offset &= ~1; + local_bh_disable(); - for (i = 1; i <= remaining; i++) { + for (i = 0; i < remaining; i++) { port = low + (i + offset) % remaining; if (inet_is_local_reserved_port(net, port)) continue; @@ -542,20 +554,20 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return -EADDRNOTAVAIL; ok: - hint += i; + hint += (i + 2) & ~1; /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += hash(sk, tw); + twrefcnt += __inet_hash_nolisten(sk, tw); } if (tw) twrefcnt += inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); if (tw) { - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); while (twrefcnt) { twrefcnt--; inet_twsk_put(tw); @@ -570,7 +582,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - hash(sk, NULL); + __inet_hash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { @@ -589,8 +601,12 @@ out: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), - __inet_check_established, __inet_hash_nolisten); + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, + __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); @@ -598,7 +614,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h) { int i; - atomic_set(&h->bsockets, 0); for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, @@ -606,3 +621,32 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); + +int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int locksz = sizeof(spinlock_t); + unsigned int i, nblocks = 1; + + if (locksz != 0) { 
+ /* allocate 2 cache lines or at least one spinlock per cpu */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); + nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + + /* no more locks than number of hash buckets */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + hashinfo->ehash_locks = kmalloc_array(nblocks, locksz, + GFP_KERNEL | __GFP_NOWARN); + if (!hashinfo->ehash_locks) + hashinfo->ehash_locks = vmalloc(nblocks * locksz); + + if (!hashinfo->ehash_locks) + return -ENOMEM; + + for (i = 0; i < nblocks; i++) + spin_lock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = nblocks - 1; + return 0; +} +EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 6d592f8555fb..2ffbd16b79e0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, } /* Must be called with locally disabled BHs. */ -static void __inet_twsk_kill(struct inet_timewait_sock *tw, - struct inet_hashinfo *hashinfo) +static void inet_twsk_kill(struct inet_timewait_sock *tw) { + struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; struct inet_bind_hashbucket *bhead; int refcnt; /* Unlink from established hashes. 
*/ @@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); atomic_sub(refcnt, &tw->tw_refcnt); + atomic_dec(&tw->tw_dr->tw_count); + inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) @@ -98,7 +100,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw) #ifdef SOCK_REFCNT_DEBUG pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif - release_net(twsk_net(tw)); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -169,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +static void tw_timer_handler(unsigned long data) { - struct inet_timewait_sock *tw = - kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, - GFP_ATOMIC); - if (tw != NULL) { + struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + + if (tw->tw_kill) + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); + else + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); + inet_twsk_kill(tw); +} + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + struct inet_timewait_death_row *dr, + const int state) +{ + struct inet_timewait_sock *tw; + + if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets) + return NULL; + + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + GFP_ATOMIC); + if (tw) { const struct inet_sock *inet = inet_sk(sk); kmemcheck_annotate_bitfield(tw, flags); + tw->tw_dr = dr; /* Give us an identity. 
*/ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; @@ -195,14 +214,16 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_ipv6only = 0; tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; - twsk_net_set(tw, hold_net(sock_net(sk))); + atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); + twsk_net_set(tw, sock_net(sk)); + setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this * timewait socket. */ atomic_set(&tw->tw_refcnt, 0); - inet_twsk_dead_node_init(tw); + __module_get(tw->tw_prot->owner); } @@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat } EXPORT_SYMBOL_GPL(inet_twsk_alloc); -/* Returns non-zero if quota exceeded. */ -static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, - const int slot) -{ - struct inet_timewait_sock *tw; - unsigned int killed; - int ret; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - killed = 0; - ret = 0; -rescan: - inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { - __inet_twsk_del_dead_node(tw); - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); -#ifdef CONFIG_NET_NS - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); -#endif - inet_twsk_put(tw); - killed++; - spin_lock(&twdr->death_lock); - if (killed > INET_TWDR_TWKILL_QUOTA) { - ret = 1; - break; - } - - /* While we dropped twdr->death_lock, another cpu may have - * killed off the next TW bucket in the list, therefore - * do a fresh re-read of the hlist head node with the - * lock reacquired. 
We still use the hlist traversal - * macro in order to get the prefetches. - */ - goto rescan; - } - - twdr->tw_count -= killed; -#ifndef CONFIG_NET_NS - NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); -#endif - return ret; -} - -void inet_twdr_hangman(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - unsigned int need_timer; - - twdr = (struct inet_timewait_death_row *)data; - spin_lock(&twdr->death_lock); - - if (twdr->tw_count == 0) - goto out; - - need_timer = 0; - if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { - twdr->thread_slots |= (1 << twdr->slot); - schedule_work(&twdr->twkill_work); - need_timer = 1; - } else { - /* We purged the entire slot, anything left? */ - if (twdr->tw_count) - need_timer = 1; - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); - } - if (need_timer) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); -out: - spin_unlock(&twdr->death_lock); -} -EXPORT_SYMBOL_GPL(inet_twdr_hangman); - -void inet_twdr_twkill_work(struct work_struct *work) -{ - struct inet_timewait_death_row *twdr = - container_of(work, struct inet_timewait_death_row, twkill_work); - int i; - - BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > - (sizeof(twdr->thread_slots) * 8)); - - while (twdr->thread_slots) { - spin_lock_bh(&twdr->death_lock); - for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { - if (!(twdr->thread_slots & (1 << i))) - continue; - - while (inet_twdr_do_twkill_work(twdr, i) != 0) { - if (need_resched()) { - spin_unlock_bh(&twdr->death_lock); - schedule(); - spin_lock_bh(&twdr->death_lock); - } - } - - twdr->thread_slots &= ~(1 << i); - } - spin_unlock_bh(&twdr->death_lock); - } -} -EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); - /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. 
*/ -void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr) +void inet_twsk_deschedule(struct inet_timewait_sock *tw) { - spin_lock(&twdr->death_lock); - if (inet_twsk_del_dead_node(tw)) { - inet_twsk_put(tw); - if (--twdr->tw_count == 0) - del_timer(&twdr->tw_timer); - } - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); + if (del_timer_sync(&tw->tw_timer)) + inet_twsk_kill(tw); } EXPORT_SYMBOL(inet_twsk_deschedule); -void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo, const int timewait_len) +void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) { - struct hlist_head *list; - int slot; - /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. @@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ - slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - spin_lock(&twdr->death_lock); - - /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) - twdr->tw_count--; - else + tw->tw_kill = timeo <= 4*HZ; + if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { atomic_inc(&tw->tw_refcnt); - - if (slot >= INET_TWDR_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= timewait_len) { - slot = INET_TWDR_TWKILL_SLOTS - 1; - } else { - slot = DIV_ROUND_UP(timeo, twdr->period); - if (slot >= INET_TWDR_TWKILL_SLOTS) - slot = INET_TWDR_TWKILL_SLOTS - 1; - } - tw->tw_ttd = inet_tw_time_stamp() + timeo; - slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); - list = &twdr->cells[slot]; - } else { - tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK); - - if (twdr->twcal_hand < 0) { - twdr->twcal_hand = 0; - twdr->twcal_jiffie = jiffies; - twdr->twcal_timer.expires = twdr->twcal_jiffie + - (slot << INET_TWDR_RECYCLE_TICK); - 
add_timer(&twdr->twcal_timer); - } else { - if (time_after(twdr->twcal_timer.expires, - jiffies + (slot << INET_TWDR_RECYCLE_TICK))) - mod_timer(&twdr->twcal_timer, - jiffies + (slot << INET_TWDR_RECYCLE_TICK)); - slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - list = &twdr->twcal_row[slot]; + atomic_inc(&tw->tw_dr->tw_count); } - - hlist_add_head(&tw->tw_death_node, list); - - if (twdr->tw_count++ == 0) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); - spin_unlock(&twdr->death_lock); } EXPORT_SYMBOL_GPL(inet_twsk_schedule); -void inet_twdr_twcal_tick(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - twdr = (struct inet_timewait_death_row *)data; - - spin_lock(&twdr->death_lock); - if (twdr->twcal_hand < 0) - goto out; - - slot = twdr->twcal_hand; - j = twdr->twcal_jiffie; - - for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { - if (time_before_eq(j, now)) { - struct hlist_node *safe; - struct inet_timewait_sock *tw; - - inet_twsk_for_each_inmate_safe(tw, safe, - &twdr->twcal_row[slot]) { - __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, twdr->hashinfo); -#ifdef CONFIG_NET_NS - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); -#endif - inet_twsk_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - twdr->twcal_jiffie = j; - twdr->twcal_hand = slot; - } - - if (!hlist_empty(&twdr->twcal_row[slot])) { - mod_timer(&twdr->twcal_timer, j); - goto out; - } - } - j += 1 << INET_TWDR_RECYCLE_TICK; - slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - twdr->twcal_hand = -1; - -out: - if ((twdr->tw_count -= killed) == 0) - del_timer(&twdr->tw_timer); -#ifndef CONFIG_NET_NS - NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); -#endif - spin_unlock(&twdr->death_lock); -} -EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); - void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row 
*twdr, int family) { @@ -487,6 +289,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; restart_rcu: + cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { @@ -508,7 +311,7 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw, twdr); + inet_twsk_deschedule(tw); local_bh_enable(); inet_twsk_put(tw); goto restart_rcu; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index d9bc28ac5d1b..2d3aa408fbdc 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,17 +39,21 @@ #include <net/route.h> #include <net/xfrm.h> -static bool ip_may_fragment(const struct sk_buff *skb) -{ - return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || - skb->ignore_df; -} - static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; + if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)) + return false; + + /* original fragment exceeds mtu and DF is set */ + if (unlikely(IPCB(skb)->frag_max_size > mtu)) + return true; + + if (skb->ignore_df) + return false; + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) return false; @@ -57,7 +61,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) } -static int ip_forward_finish(struct sk_buff *skb) +static int ip_forward_finish(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -68,7 +72,7 @@ static int ip_forward_finish(struct sk_buff *skb) ip_forward_options(skb); skb_sender_cpu_clear(skb); - return dst_output(skb); + return dst_output_sk(sk, skb); } int ip_forward(struct sk_buff *skb) @@ -82,6 +86,9 @@ int ip_forward(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; + if (unlikely(skb->sk)) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -111,7 +118,7 @@ int ip_forward(struct sk_buff *skb) 
IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { + if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); @@ -136,8 +143,8 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, - rt->dst.dev, ip_forward_finish); + return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, + skb->dev, rt->dst.dev, ip_forward_finish); sr_failed: /* diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 145a50c4d566..921138f6c97c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -75,6 +75,7 @@ struct ipq { __be16 id; u8 protocol; u8 ecn; /* RFC3168 support */ + u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; @@ -173,6 +174,15 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } +static bool frag_expire_skip_icmp(u32 user) +{ + return user == IP_DEFRAG_AF_PACKET || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, + __IP_DEFRAG_CONNTRACK_IN_END) || + ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP_DEFRAG_CONNTRACK_BRIDGE_IN); +} + /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -192,7 +202,7 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if (!(qp->q.flags & INET_FRAG_EVICTED)) { + if (!inet_frag_evicting(&qp->q)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; @@ -217,10 +227,8 @@ static void ip_expire(unsigned long arg) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. 
*/ - if (qp->user == IP_DEFRAG_AF_PACKET || - ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && - (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && - (skb_rtable(head)->rt_type != RTN_LOCAL))) + if (frag_expire_skip_icmp(qp->user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out_rcu_unlock; /* Send an ICMP "Fragment Reassembly Timeout" message. */ @@ -301,7 +309,7 @@ static int ip_frag_reinit(struct ipq *qp) kfree_skb(fp); fp = xp; } while (fp); - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; @@ -319,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct sk_buff *prev, *next; struct net_device *dev; + unsigned int fragsize; int flags, offset; int ihl, end; int err = -ENOENT; @@ -342,7 +351,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ - end = offset + skb->len - ihl; + end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /* Is this the final fragment? 
*/ @@ -372,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) goto err; err = -ENOMEM; - if (pskb_pull(skb, ihl) == NULL) + if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto err; err = pskb_trim_rcsum(skb, end - offset); @@ -446,7 +455,7 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - sub_frag_mem_limit(&qp->q, free_it->truesize); + sub_frag_mem_limit(qp->q.net, free_it->truesize); kfree_skb(free_it); } } @@ -470,13 +479,18 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - add_frag_mem_limit(&qp->q, skb->truesize); + add_frag_mem_limit(qp->q.net, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; + fragsize = skb->len + ihl; + + if (fragsize > qp->q.max_size) + qp->q.max_size = fragsize; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len + ihl > qp->q.max_size) - qp->q.max_size = skb->len + ihl; + fragsize > qp->max_df_size) + qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { @@ -537,7 +551,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, qp->q.fragments = head; } - WARN_ON(head == NULL); + WARN_ON(!head); WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram. 
*/ @@ -559,7 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct sk_buff *clone; int i, plen = 0; - if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) goto out_nomem; clone->next = head->next; head->next = clone; @@ -572,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(&qp->q, clone->truesize); + add_frag_mem_limit(qp->q.net, clone->truesize); } skb_push(head, head->data - skb_network_header(head)); @@ -600,18 +615,34 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } fp = next; } - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(qp->q.net, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = qp->q.max_size; + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(head); - /* max_size != 0 implies at least one fragment had IP_DF set */ - iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); iph->tos |= ecn; + + /* When we set IP_DF on a refragmented skb we must also force a + * call to ip_fragment to avoid forwarding a DF-skb of size s while + * original sender only sent fragments of size f (where f < s). + * + * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest + * frag seen to avoid sending tiny DF-fragments in case skb was built + * from one very small df-fragment and one large non-df frag. 
+ */ + if (qp->max_df_size == qp->q.max_size) { + IPCB(head)->flags |= IPSKB_FRAG_PMTU; + iph->frag_off = htons(IP_DF); + } else { + iph->frag_off = 0; + } + + ip_send_check(iph); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; @@ -638,7 +669,8 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ - if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { + qp = ip_find(net, ip_hdr(skb), user); + if (qp) { int ret; spin_lock(&qp->q.lock); @@ -754,7 +786,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table = ip4_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); - if (table == NULL) + if (!table) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; @@ -770,7 +802,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) } hdr = register_net_sysctl(net, "net/ipv4", table); - if (hdr == NULL) + if (!hdr) goto err_reg; net->ipv4.frags_hdr = hdr; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6207275fc749..5fd706473c73 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -182,7 +182,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info, t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); - if (t == NULL) + if (!t) return PACKET_REJECT; if (t->parms.iph.daddr == 0 || @@ -423,7 +423,7 @@ static int ipgre_open(struct net_device *dev) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); - if (__in_dev_get_rtnl(dev) == NULL) + if (!__in_dev_get_rtnl(dev)) return -EADDRNOTAVAIL; t->mlink = dev->ifindex; ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); @@ -456,6 +456,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = 
ip_tunnel_get_iflink, }; #define GRE_FEATURES (NETIF_F_SG | \ @@ -621,10 +622,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); @@ -686,6 +687,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; static void ipgre_tap_setup(struct net_device *dev) @@ -776,8 +778,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || - nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || - nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || + nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || + nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3d4da2c16b6a..2db4c8773c1b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -187,7 +187,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct sk_buff *skb) +static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); @@ -203,7 +203,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb) raw = raw_local_deliver(skb, protocol); ipprot = 
rcu_dereference(inet_protos[protocol]); - if (ipprot != NULL) { + if (ipprot) { int ret; if (!ipprot->no_policy) { @@ -253,7 +253,8 @@ int ip_local_deliver(struct sk_buff *skb) return 0; } - return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb, + skb->dev, NULL, ip_local_deliver_finish); } @@ -309,12 +310,12 @@ drop: int sysctl_ip_early_demux __read_mostly = 1; EXPORT_SYMBOL(sysctl_ip_early_demux); -static int ip_rcv_finish(struct sk_buff *skb) +static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) { const struct net_protocol *ipprot; int protocol = iph->protocol; @@ -387,7 +388,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto out; } @@ -450,7 +452,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. 
*/ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, + dev, NULL, ip_rcv_finish); csum_error: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 5b3d91be2db0..bd246792360b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -264,7 +264,7 @@ int ip_options_compile(struct net *net, unsigned char *iph; int optlen, l; - if (skb != NULL) { + if (skb) { rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a7aea2048a0d..6bf89a6312bc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,6 +83,10 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct sock *, struct sk_buff *)); + /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { @@ -91,14 +95,19 @@ void ip_send_check(struct iphdr *iph) } EXPORT_SYMBOL(ip_send_check); -int __ip_local_out(struct sk_buff *skb) +static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); - return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, - skb_dst(skb)->dev, dst_output); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL, + skb_dst(skb)->dev, dst_output_sk); +} + +int __ip_local_out(struct sk_buff *skb) +{ + return __ip_local_out_sk(skb->sk, skb); } int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) @@ -148,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? 
opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(skb, sk); + ip_select_ident(sock_net(sk), skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -163,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); -static inline int ip_finish_output2(struct sk_buff *skb) +static int ip_finish_output2(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; @@ -182,7 +191,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); - if (skb2 == NULL) { + if (!skb2) { kfree_skb(skb); return -ENOMEM; } @@ -211,7 +220,8 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output_gso(struct sk_buff *skb) +static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb, + unsigned int mtu) { netdev_features_t features; struct sk_buff *segs; @@ -219,8 +229,8 @@ static int ip_finish_output_gso(struct sk_buff *skb) /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) - return ip_finish_output2(skb); + skb_gso_network_seglen(skb) <= mtu) + return ip_finish_output2(sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. 
* @@ -243,7 +253,7 @@ static int ip_finish_output_gso(struct sk_buff *skb) int err; segs->next = NULL; - err = ip_fragment(segs, ip_finish_output2); + err = ip_fragment(sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; @@ -253,22 +263,25 @@ static int ip_finish_output_gso(struct sk_buff *skb) return ret; } -static int ip_finish_output(struct sk_buff *skb) +static int ip_finish_output(struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb_dst(skb)->xfrm != NULL) { + if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); + return dst_output_sk(sk, skb); } #endif + mtu = ip_skb_dst_mtu(skb); if (skb_is_gso(skb)) - return ip_finish_output_gso(skb); + return ip_finish_output_gso(sk, skb, mtu); - if (skb->len > ip_skb_dst_mtu(skb)) - return ip_fragment(skb, ip_finish_output2); + if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + return ip_fragment(sk, skb, mtu, ip_finish_output2); - return ip_finish_output2(skb); + return ip_finish_output2(sk, skb); } int ip_mc_output(struct sock *sk, struct sk_buff *skb) @@ -307,7 +320,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, - newskb, NULL, newskb->dev, + sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } @@ -322,11 +335,11 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); } - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & 
IPSKB_REROUTED)); } @@ -340,7 +353,8 @@ int ip_output(struct sock *sk, struct sk_buff *skb) skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, + NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -376,12 +390,12 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; rt = skb_rtable(skb); - if (rt != NULL) + if (rt) goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); - if (rt == NULL) { + if (!rt) { __be32 daddr; /* Use correct destination address if we have options. */ @@ -430,7 +444,8 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + ip_select_ident_segs(sock_net(sk), skb, sk, + skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? 
*/ skb->priority = sk->sk_priority; @@ -448,7 +463,6 @@ no_route: } EXPORT_SYMBOL(ip_queue_xmit); - static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; @@ -472,6 +486,31 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +static int ip_fragment(struct sock *sk, struct sk_buff *skb, + unsigned int mtu, + int (*output)(struct sock *, struct sk_buff *)) +{ + struct iphdr *iph = ip_hdr(skb); + + if ((iph->frag_off & htons(IP_DF)) == 0) + return ip_do_fragment(sk, skb, output); + + if (unlikely(!skb->ignore_df || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + kfree_skb(skb); + return -EMSGSIZE; + } + + return ip_do_fragment(sk, skb, output); +} + /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus @@ -479,7 +518,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. 
*/ -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +int ip_do_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) { struct iphdr *iph; int ptr; @@ -500,15 +540,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(skb); - if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || - (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > mtu))) { - IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - kfree_skb(skb); - return -EMSGSIZE; - } + if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) + mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. @@ -516,10 +549,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) - mtu -= nf_bridge_mtu_reduction(skb); -#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: @@ -586,13 +615,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ip_options_fragment(frag); offset += skb->len - hlen; iph->frag_off = htons(offset>>3); - if (frag->next != NULL) + if (frag->next) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } - err = output(skb); + err = output(sk, skb); if (!err) IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); @@ -636,10 +665,7 @@ slow_path: left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ - /* for bridged IP traffic encapsulated inside f.e. a vlan header, - * we need to make room for the encapsulating header - */ - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); + ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* * Fragment the datagram. 
@@ -707,6 +733,9 @@ slow_path: iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); + if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) + iph->frag_off |= htons(IP_DF); + /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE @@ -732,7 +761,7 @@ slow_path: ip_send_check(iph); - err = output(skb2); + err = output(sk, skb2); if (err) goto fail; @@ -747,7 +776,7 @@ fail: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } -EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) @@ -792,12 +821,13 @@ static inline int ip_ufo_append_data(struct sock *sk, * device, so create one single skb packet containing complete * udp datagram */ - if ((skb = skb_peek_tail(queue)) == NULL) { + skb = skb_peek_tail(queue); + if (!skb) { skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); - if (skb == NULL) + if (!skb) return err; /* reserve space for Hardware header */ @@ -814,7 +844,6 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->csum = 0; - __skb_queue_tail(queue, skb); } else if (skb_is_gso(skb)) { goto append; @@ -963,10 +992,10 @@ alloc_new_skb: skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation); - if (unlikely(skb == NULL)) + if (unlikely(!skb)) err = -ENOBUFS; } - if (skb == NULL) + if (!skb) goto error; /* @@ -1090,10 +1119,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, */ opt = ipc->opt; if (opt) { - if (cork->opt == NULL) { + if (!cork->opt) { cork->opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); - if (unlikely(cork->opt == NULL)) + if (unlikely(!cork->opt)) return -ENOBUFS; } memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); @@ -1200,7 +1229,8 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct 
page *page, return -EMSGSIZE; } - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + skb = skb_peek_tail(&sk->sk_write_queue); + if (!skb) return -EINVAL; cork->length += size; @@ -1211,13 +1241,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } - while (size > 0) { - int i; - - if (skb_is_gso(skb)) + if (skb_is_gso(skb)) { len = size; - else { + } else { /* Check if the remaining data fits into current packet. */ len = mtu - skb->len; @@ -1269,15 +1296,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, continue; } - i = skb_shinfo(skb)->nr_frags; if (len > size) len = size; - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); - } else if (i < MAX_SKB_FRAGS) { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, len); - } else { + + if (skb_append_pagefrags(skb, page, offset, len)) { err = -EMSGSIZE; goto error; } @@ -1331,7 +1353,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk, __be16 df = 0; __u8 ttl; - if ((skb = __skb_dequeue(queue)) == NULL) + skb = __skb_dequeue(queue); + if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); @@ -1382,7 +1405,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); - ip_select_ident(skb, sk); + ip_select_ident(net, skb, sk); if (opt) { iph->ihl += opt->optlen>>2; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5cd99271d3a6..c3c359ad66e3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -351,7 +351,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (new_ra == NULL) { + if (!new_ra) { spin_unlock_bh(&ip_ra_lock); return -ENOBUFS; } @@ -387,7 +387,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, skb_network_header(skb); serr->port = port; - if (skb_pull(skb, payload - skb->data) != NULL) { + if 
(skb_pull(skb, payload - skb->data)) { skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; @@ -432,6 +432,15 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. + */ +static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: @@ -482,7 +491,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) err = -EAGAIN; skb = sock_dequeue_err_skb(sk); - if (skb == NULL) + if (!skb) goto out; copied = skb->len; @@ -498,7 +507,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -536,12 +545,34 @@ out: * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. 
*/ +static bool setsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_MSFILTER: + case IP_UNBLOCK_SOURCE: + case MCAST_BLOCK_SOURCE: + case MCAST_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_UNBLOCK_SOURCE: + return true; + } + return false; +} static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); int val = 0, err; + bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { case IP_PKTINFO: @@ -560,6 +591,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: + case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: @@ -584,6 +616,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, return ip_mroute_setsockopt(sk, optname, optval, optlen); err = 0; + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -708,6 +742,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->nodefrag = val ? 1 : 0; break; + case IP_BIND_ADDRESS_NO_PORT: + inet->bind_address_no_port = val ? 
1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1118,10 +1155,14 @@ mc_msf_out: break; } release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return err; e_inval: release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return -EINVAL; } @@ -1296,6 +1337,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_NODEFRAG: val = inet->nodefrag; break; + case IP_BIND_ADDRESS_NO_PORT: + val = inet->bind_address_no_port; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 2cd08280c77b..626d9e56a6bd 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -389,7 +389,6 @@ static int ip_tunnel_bind_dev(struct net_device *dev) hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = tdev->mtu; } - dev->iflink = tunnel->parms.link; dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); @@ -587,7 +586,8 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, EXPORT_SYMBOL(ip_tunnel_encap); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, - struct rtable *rt, __be16 df) + struct rtable *rt, __be16 df, + const struct iphdr *inner_iph) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; @@ -604,7 +604,8 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && - (df & htons(IP_DF)) && mtu < pkt_size) { + (inner_iph->frag_off & htons(IP_DF)) && + mtu < pkt_size) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; @@ -655,7 +656,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (dst == 0) { /* NBMA tunnel */ - if (skb_dst(skb) == NULL) { + if (!skb_dst(skb)) { dev->stats.tx_fifo_errors++; goto tx_error; } @@ -673,7 +674,7 @@ void 
ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); - if (neigh == NULL) + if (!neigh) goto tx_error; addr6 = (const struct in6_addr *)&neigh->primary_key; @@ -738,7 +739,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) { + if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) { ip_rt_put(rt); goto tx_error; } @@ -783,7 +784,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, return; } - err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -844,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (t == NULL) + if (!t) t = netdev_priv(dev); } memcpy(p, &t->parms, sizeof(*p)); @@ -877,7 +878,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (t) { if (t->dev != dev) { err = -EEXIST; break; @@ -915,7 +916,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (dev == itn->fb_tunnel_dev) { err = -ENOENT; t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (t == NULL) + if (!t) goto done; err = -EPERM; if (t == netdev_priv(itn->fb_tunnel_dev)) @@ -980,6 +981,14 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev) } EXPORT_SYMBOL(ip_tunnel_get_link_net); +int ip_tunnel_get_iflink(const struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + return tunnel->parms.link; +} +EXPORT_SYMBOL(ip_tunnel_get_iflink); + int ip_tunnel_init_net(struct net 
*net, int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 88c386cf7d85..6a51a71a6c67 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -74,7 +74,8 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(dev_net(rt->dst.dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) @@ -97,7 +98,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) return -ENOMEM; eh = (struct ethhdr *)skb->data; - if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) + if (likely(eth_proto_is_802_3(eh->h_proto))) skb->protocol = eh->h_proto; else skb->protocol = htons(ETH_P_802_2); @@ -164,6 +165,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, { int i; + netdev_stats_to_stats64(tot, &dev->stats); + for_each_possible_cpu(i) { const struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, i); @@ -184,22 +187,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->tx_bytes += tx_bytes; } - tot->multicast = dev->stats.multicast; - - tot->rx_crc_errors = dev->stats.rx_crc_errors; - tot->rx_fifo_errors = dev->stats.rx_fifo_errors; - tot->rx_length_errors = dev->stats.rx_length_errors; - tot->rx_frame_errors = dev->stats.rx_frame_errors; - tot->rx_errors = dev->stats.rx_errors; - - tot->tx_fifo_errors = dev->stats.tx_fifo_errors; - tot->tx_carrier_errors = dev->stats.tx_carrier_errors; - tot->tx_dropped = dev->stats.tx_dropped; - tot->tx_aborted_errors = dev->stats.tx_aborted_errors; - tot->tx_errors = dev->stats.tx_errors; - - tot->collisions = dev->stats.collisions; - return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 
94efe148181c..0c152087ca15 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -60,12 +60,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); - if (tunnel != NULL) { + if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - skb->mark = be32_to_cpu(tunnel->parms.i_key); return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -91,6 +90,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + u32 orig_mark = skb->mark; + int ret; if (!tunnel) return 1; @@ -107,7 +108,11 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(tunnel->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); @@ -216,8 +221,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(tunnel->parms.o_key); - switch (skb->protocol) { case htons(ETH_P_IP): xfrm_decode_session(skb, &fl, AF_INET); @@ -233,6 +236,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); + return vti_xmit(skb, dev, &fl); } @@ -341,6 +347,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_do_ioctl = vti_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; static void vti_tunnel_setup(struct net_device 
*dev) @@ -361,7 +368,6 @@ static int vti_tunnel_init(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; - dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; netif_keep_dst(dev); @@ -456,10 +462,10 @@ static void vti_netlink_parms(struct nlattr *data[], parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); } @@ -505,8 +511,8 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u32(skb, IFLA_VTI_LINK, p->link); nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); - nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); - nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); + nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr); + nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr); return 0; } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index c0855d50a3fa..d97f4f2787f5 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -63,7 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) struct xfrm_state *t; t = xfrm_state_alloc(net); - if (t == NULL) + if (!t) goto out; t->id.proto = IPPROTO_IPIP; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b26376ef87f6..8e7328c6a390 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -504,7 +504,8 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (!net_eq(dev_net(dev), &init_net)) goto drop; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, sizeof(struct arphdr))) @@ -958,7 +959,8 @@ static int __init 
ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 915d215a7d14..254238daf58b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -144,7 +144,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) err = -ENOENT; t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->daddr, iph->saddr, 0); - if (t == NULL) + if (!t) goto out; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { @@ -251,7 +251,8 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EINVAL; } - p.i_key = p.o_key = p.i_flags = p.o_flags = 0; + p.i_key = p.o_key = 0; + p.i_flags = p.o_flags = 0; if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); @@ -272,6 +273,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; #define IPIP_FEATURES (NETIF_F_SG | \ @@ -286,7 +288,6 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; - dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; netif_keep_dst(dev); @@ -325,10 +326,10 @@ static void ipip_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) { parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); @@ -450,8 +451,8 @@ static int ipip_fill_info(struct sk_buff *skb, const 
struct net_device *dev) struct ip_tunnel_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || - nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || - nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fe54eba6d00d..3a2c0162c3ba 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -73,9 +73,7 @@ struct mr_table { struct list_head list; -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; @@ -191,7 +189,7 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, } mrt = ipmr_get_table(rule->fr_net, rule->table); - if (mrt == NULL) + if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; @@ -255,7 +253,7 @@ static int __net_init ipmr_rules_init(struct net *net) INIT_LIST_HEAD(&net->ipv4.mr_tables); mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); - if (mrt == NULL) { + if (!mrt) { err = -ENOMEM; goto err1; } @@ -323,11 +321,11 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) unsigned int i; mrt = ipmr_get_table(net, id); - if (mrt != NULL) + if (mrt) return mrt; mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (mrt == NULL) + if (!mrt) return NULL; write_pnet(&mrt->net, net); mrt->id = id; @@ -429,7 +427,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); - if (in_dev == NULL) + if (!in_dev) goto failure; ipv4_devconf_setall(in_dev); @@ -480,8 +478,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +static int reg_vif_get_iflink(const struct net_device *dev) +{ + return 
0; +} + static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, + .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) @@ -507,7 +511,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); - if (dev == NULL) + if (!dev) return NULL; dev_net_set(dev, net); @@ -516,7 +520,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) free_netdev(dev); return NULL; } - dev->iflink = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -764,7 +767,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); - if (dev && __in_dev_get_rtnl(dev) == NULL) { + if (dev && !__in_dev_get_rtnl(dev)) { dev_put(dev); return -EADDRNOTAVAIL; } @@ -808,7 +811,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->pkt_out = 0; v->link = dev->ifindex; if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) - v->link = dev->iflink; + v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -1010,7 +1013,7 @@ static int ipmr_cache_report(struct mr_table *mrt, rcu_read_lock(); mroute_sk = rcu_dereference(mrt->mroute_sk); - if (mroute_sk == NULL) { + if (!mroute_sk) { rcu_read_unlock(); kfree_skb(skb); return -EINVAL; @@ -1163,7 +1166,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, return -EINVAL; c = ipmr_cache_alloc(); - if (c == NULL) + if (!c) return -ENOMEM; c->mfc_origin = mfc->mfcc_origin.s_addr; @@ -1285,7 +1288,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -EOPNOTSUPP; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return -ENOENT; if (optname != MRT_INIT) { @@ -1448,7 +1451,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int return -EOPNOTSUPP; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return -ENOENT; if (optname != MRT_VERSION && @@ -1494,7 +1497,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return -ENOENT; switch (cmd) { @@ -1568,7 +1571,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return -ENOENT; switch (cmd) { @@ -1649,7 +1652,8 @@ static struct notifier_block ip_mr_notifier = { * important for multicast video. */ -static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) +static void ip_encap(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr) { struct iphdr *iph; const struct iphdr *old_iph = ip_hdr(skb); @@ -1668,14 +1672,14 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); nf_reset(skb); } -static inline int ipmr_forward_finish(struct sk_buff *skb) +static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -1685,7 +1689,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); - return dst_output(skb); + return dst_output_sk(sk, skb); } /* @@ -1702,7 +1706,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct flowi4 
fl4; int encap = 0; - if (vif->dev == NULL) + if (!vif->dev) goto out_free; #ifdef CONFIG_IP_PIMSM @@ -1765,7 +1769,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * What do we do with netfilter? -- RR */ if (vif->flags & VIFF_TUNNEL) { - ip_encap(skb, vif->local, vif->remote); + ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ vif->dev->stats.tx_packets++; vif->dev->stats.tx_bytes += skb->len; @@ -1784,7 +1788,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev, + NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb, + skb->dev, dev, ipmr_forward_finish); return; @@ -1993,7 +1998,7 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); - if (cache == NULL) { + if (!cache) { int vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) @@ -2004,13 +2009,13 @@ int ip_mr_input(struct sk_buff *skb) /* * No usable cache entry */ - if (cache == NULL) { + if (!cache) { int vif; if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ip_local_deliver(skb); - if (skb2 == NULL) + if (!skb2) return -ENOBUFS; skb = skb2; } @@ -2069,7 +2074,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; read_unlock(&mrt_lock); - if (reg_dev == NULL) + if (!reg_dev) return 1; skb->mac_header = skb->network_header; @@ -2199,18 +2204,18 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, int err; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return -ENOENT; rcu_read_lock(); cache = ipmr_cache_find(mrt, saddr, daddr); - if (cache == NULL && skb->dev) { + if (!cache && skb->dev) { int vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) 
cache = ipmr_cache_find_any(mrt, daddr, vif); } - if (cache == NULL) { + if (!cache) { struct sk_buff *skb2; struct iphdr *iph; struct net_device *dev; @@ -2268,7 +2273,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); @@ -2287,8 +2292,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; - if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || - nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) + if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || + nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; err = __ipmr_fill_mroute(mrt, skb, c, rtm); /* do not break the dump if cache is unresolved */ @@ -2333,7 +2338,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); @@ -2448,7 +2453,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) struct mr_table *mrt; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; @@ -2567,7 +2572,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) struct mr_table *mrt; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); - if (mrt == NULL) + if (!mrt) return ERR_PTR(-ENOENT); it->mrt = mrt; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 7ebd6e37875c..61eafc9b4545 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -94,7 +94,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb, { struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (entry->hook == NF_INET_LOCAL_OUT) { + if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = 
ip_hdr(skb); rt_info->tos = iph->tos; @@ -109,7 +109,7 @@ static int nf_ip_reroute(struct sk_buff *skb, { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (entry->hook == NF_INET_LOCAL_OUT) { + if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos && @@ -197,11 +197,4 @@ static int __init ipv4_netfilter_init(void) { return nf_register_afinfo(&nf_ip_afinfo); } - -static void __exit ipv4_netfilter_fini(void) -{ - nf_unregister_afinfo(&nf_ip_afinfo); -} - -module_init(ipv4_netfilter_init); -module_exit(ipv4_netfilter_fini); +subsys_initcall(ipv4_netfilter_init); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 59f883d9cadf..2199a5db25e6 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,24 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config NF_LOG_ARP - tristate "ARP packet logging" - default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON - -config NF_LOG_IPV4 - tristate "IPv4 packet logging" - default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON +if NF_TABLES config NF_TABLES_IPV4 - depends on NF_TABLES tristate "IPv4 nf_tables support" help This option enables the IPv4 support for nf_tables. +if NF_TABLES_IPV4 + config NFT_CHAIN_ROUTE_IPV4 - depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables route chain support" help This option enables the "route" chain for IPv4 in nf_tables. This @@ -61,22 +53,34 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. -config NF_REJECT_IPV4 - tristate "IPv4 packet rejection" - default m if NETFILTER_ADVANCED=n - config NFT_REJECT_IPV4 - depends on NF_TABLES_IPV4 select NF_REJECT_IPV4 default NFT_REJECT tristate +endif # NF_TABLES_IPV4 + config NF_TABLES_ARP - depends on NF_TABLES tristate "ARP nf_tables support" help This option enables the ARP support for nf_tables. 
+endif # NF_TABLES + +config NF_LOG_ARP + tristate "ARP packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_LOG_IPV4 + tristate "IPv4 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_REJECT_IPV4 + tristate "IPv4 packet rejection" + default m if NETFILTER_ADVANCED=n + config NF_NAT_IPV4 tristate "IPv4 NAT" depends on NF_CONNTRACK_IPV4 @@ -191,7 +195,8 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP_NF_MANGLE || IP_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f95b6f93814b..92305a1a021a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -248,16 +248,16 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) unsigned int arpt_do_table(struct sk_buff *skb, unsigned int hook, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; - struct arpt_entry *e, *back; + struct arpt_entry *e, **jumpstack; const char *indev, *outdev; - void *table_base; + const void *table_base; + unsigned int cpu, stackidx = 0; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; @@ -265,24 +265,25 @@ unsigned int arpt_do_table(struct sk_buff *skb, if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; - indev = in ? in->name : nulldevname; - outdev = out ? out->name : nulldevname; + indev = state->in ? state->in->name : nulldevname; + outdev = state->out ? 
state->out->name : nulldevname; local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; + cpu = smp_processor_id(); /* * Ensure we load private-> members after we've fetched the base * pointer. */ smp_read_barrier_depends(); - table_base = private->entries[smp_processor_id()]; + table_base = private->entries; + jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; e = get_entry(table_base, private->hook_entry[hook]); - back = get_entry(table_base, private->underflow[hook]); - acpar.in = in; - acpar.out = out; + acpar.in = state->in; + acpar.out = state->out; acpar.hooknum = hook; acpar.family = NFPROTO_ARP; acpar.hotdrop = false; @@ -290,13 +291,15 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { const struct xt_entry_target *t; + struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -311,18 +314,23 @@ unsigned int arpt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - e = back; - back = get_entry(table_base, back->comefrom); + if (stackidx == 0) { + e = get_entry(table_base, + private->underflow[hook]); + } else { + e = jumpstack[--stackidx]; + e = arpt_next_entry(e); + } continue; } if (table_base + v != arpt_next_entry(e)) { - /* Save old back ptr in next entry */ - struct arpt_entry *next = arpt_next_entry(e); - next->comefrom = (void *)back - table_base; - /* set back pointer to next entry */ - back = next; + if (stackidx >= private->stacksize) { + verdict = NF_DROP; + break; + } + jumpstack[stackidx++] = e; } e = get_entry(table_base, v); @@ -522,6 +530,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if 
(IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); @@ -539,6 +551,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) err: module_put(t->u.kernel.target->me); out: + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -615,6 +629,7 @@ static inline void cleanup_entry(struct arpt_entry *e) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -703,12 +718,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -723,14 +732,16 @@ static void get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -775,7 +786,7 @@ static int copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; /* ... then copy entire thing ... 
*/ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; @@ -864,16 +875,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1038,7 +1049,7 @@ static int __do_replace(struct net *net, const char *name, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter); @@ -1076,14 +1087,16 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1113,7 +1126,7 @@ static int do_replace(struct net *net, const void __user *user, static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ 
-1123,7 +1136,6 @@ static int do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct arpt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1179,12 +1191,13 @@ static int do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; + addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1394,7 +1407,7 @@ static int translate_compat_table(const char *name, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1414,9 +1427,17 @@ static int translate_compat_table(const char *name, i = 0; xt_entry_foreach(iter1, entry1, newinfo->size) { + iter1->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(iter1->counters.pcnt)) { + ret = -ENOMEM; + break; + } + ret = check_target(iter1, name); - if (ret != 0) + if (ret != 0) { + xt_percpu_counter_free(iter1->counters.pcnt); break; + } ++i; if (strcmp(arpt_get_target(iter1)->u.user.name, XT_ERROR_TARGET) == 0) @@ -1446,11 +1467,6 @@ static int translate_compat_table(const char *name, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1500,14 +1516,16 @@ 
static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1604,7 +1622,6 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; unsigned int i = 0; struct arpt_entry *iter; @@ -1612,11 +1629,9 @@ static int compat_copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy on our node/cpu */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -1785,8 +1800,7 @@ struct xt_table *arpt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); @@ -1817,7 +1831,7 @@ void arpt_unregister_table(struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter); if (private->number > private->initial_entries) diff --git a/net/ipv4/netfilter/arptable_filter.c 
b/net/ipv4/netfilter/arptable_filter.c index 802ddecb30b8..93876d03120c 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -28,12 +28,11 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c */ static unsigned int arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - const struct net *net = dev_net((in != NULL) ? in : out); + const struct net *net = dev_net(state->in ? state->in : state->out); - return arpt_do_table(skb, ops->hooknum, in, out, + return arpt_do_table(skb, ops->hooknum, state, net->ipv4.arptable_filter); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cf5e82f39d3b..6c72fbb7b49e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ipt_entry *e) { - const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? 
in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; @@ -288,8 +286,7 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) unsigned int ipt_do_table(struct sk_buff *skb, unsigned int hook, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); @@ -306,8 +303,8 @@ ipt_do_table(struct sk_buff *skb, /* Initialization */ ip = ip_hdr(skb); - indev = in ? in->name : nulldevname; - outdev = out ? out->name : nulldevname; + indev = state->in ? state->in->name : nulldevname; + outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask @@ -317,8 +314,8 @@ ipt_do_table(struct sk_buff *skb, acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; - acpar.in = in; - acpar.out = out; + acpar.in = state->in; + acpar.out = state->out; acpar.family = NFPROTO_IPV4; acpar.hooknum = hook; @@ -332,7 +329,7 @@ ipt_do_table(struct sk_buff *skb, * pointer. 
*/ smp_read_barrier_depends(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -346,6 +343,7 @@ ipt_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); if (!ip_packet_match(ip, indev, outdev, @@ -362,7 +360,8 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -370,7 +369,7 @@ ipt_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, in, out, + trace_packet(skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? 
*/ @@ -666,6 +665,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -692,6 +695,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, ret = check_target(e, net, name); if (ret) goto err; + return 0; err: module_put(t->u.kernel.target->me); @@ -701,6 +705,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -785,6 +792,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -867,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -888,14 +890,16 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -940,11 +944,7 @@ copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... 
- * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1052,16 +1052,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1182,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_old_entry; struct ipt_entry *iter; ret = 0; @@ -1225,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1263,14 +1261,16 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - /* choose the copy that is on 
our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1301,7 +1301,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1311,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; struct ipt_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1367,12 +1366,12 @@ do_add_counters(struct net *net, const void __user *user, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); - loc_cpu_entry = private->entries[curcpu]; addend = xt_write_recseq_begin(); - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); @@ -1442,7 +1441,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1511,8 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ip, e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ip, &off); if (ret != 0) goto release_matches; ++j; @@ -1608,6 +1605,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) unsigned int j; int ret = 0; + e->counters.pcnt = xt_percpu_counter_alloc(); + if 
(IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -1632,6 +1633,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -1716,7 +1720,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1768,11 +1772,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1810,14 +1809,16 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1888,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void *loc_cpu_entry; unsigned int i = 0; struct ipt_entry *iter; @@ -1896,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... 
- * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2078,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2110,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e90f83a3415b..45cb16a6a4a3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) struct clusterip_config *config; int ret; + if (par->nft_compat) { + pr_err("cannot use CLUSTERIP target from nftables compat\n"); + return -EOPNOTSUPP; + } + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { @@ -418,6 +423,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); + + if (!par->net->xt.clusterip_deprecated_warning) { + 
pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " + "use xt_cluster instead\n"); + par->net->xt.clusterip_deprecated_warning = true; + } + return ret; } @@ -497,14 +509,12 @@ static void arp_print(struct arp_payload *payload) static unsigned int arp_mangle(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; - struct net *net = dev_net(in ? in : out); + struct net *net = dev_net(state->in ? state->in : state->out); /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || @@ -529,10 +539,10 @@ arp_mangle(const struct nf_hook_ops *ops, * addresses on different interfacs. However, in the CLUSTERIP case * this wouldn't work, since we didn't subscribe the mcast group on * other interfaces */ - if (c->dev != out) { + if (c->dev != state->out) { pr_debug("not mangling arp reply on different " "interface: cip'%s'-skb'%s'\n", - c->dev->name, out->name); + c->dev->name, state->out->name); clusterip_config_put(c); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 8f48f5517e33..87907d4bd259 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -34,31 +34,32 @@ static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_reject_info *reject = par->targinfo; + int hook = par->hooknum; switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - nf_send_unreach(skb, ICMP_NET_UNREACH); + nf_send_unreach(skb, ICMP_NET_UNREACH, hook); break; case IPT_ICMP_HOST_UNREACHABLE: - nf_send_unreach(skb, ICMP_HOST_UNREACH); + nf_send_unreach(skb, ICMP_HOST_UNREACH, hook); break; case IPT_ICMP_PROT_UNREACHABLE: - nf_send_unreach(skb, ICMP_PROT_UNREACH); + nf_send_unreach(skb, 
ICMP_PROT_UNREACH, hook); break; case IPT_ICMP_PORT_UNREACHABLE: - nf_send_unreach(skb, ICMP_PORT_UNREACH); + nf_send_unreach(skb, ICMP_PORT_UNREACH, hook); break; case IPT_ICMP_NET_PROHIBITED: - nf_send_unreach(skb, ICMP_NET_ANO); + nf_send_unreach(skb, ICMP_NET_ANO, hook); break; case IPT_ICMP_HOST_PROHIBITED: - nf_send_unreach(skb, ICMP_HOST_ANO); + nf_send_unreach(skb, ICMP_HOST_ANO, hook); break; case IPT_ICMP_ADMIN_PROHIBITED: - nf_send_unreach(skb, ICMP_PKT_FILTERED); + nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(skb, par->hooknum); + nf_send_reset(skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index a313c3fbeb46..fe8cc183411e 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,7 @@ #include <net/netfilter/nf_conntrack_synproxy.h> static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; @@ -220,7 +220,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; @@ -300,11 +300,9 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? 
: nhs->out)); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 4bfaedf9b34e..8618fd150c96 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, struct net *net = dev_net(dev); int ret __maybe_unused; - if (fib_lookup(net, fl4, &res)) + if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (res.type != RTN_UNICAST) { diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index e08a74a243a8..a0f3beca52d2 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -34,8 +34,7 @@ static const struct xt_table packet_filter = { static unsigned int iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { const struct net *net; @@ -45,9 +44,8 @@ iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, ops->hooknum, in, out, - net->ipv4.iptable_filter); + net = dev_net(state->in ? 
state->in : state->out); + return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 6a5079c34bb3..62cbb8c5f4a8 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -37,8 +37,9 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) +ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) { + struct net_device *out = state->out; unsigned int ret; const struct iphdr *iph; u_int8_t tos; @@ -58,7 +59,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, + ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { @@ -81,18 +82,16 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) static unsigned int iptable_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { if (ops->hooknum == NF_INET_LOCAL_OUT) - return ipt_mangle_out(skb, out); + return ipt_mangle_out(skb, state); if (ops->hooknum == NF_INET_POST_ROUTING) - return ipt_do_table(skb, ops->hooknum, in, out, - dev_net(out)->ipv4.iptable_mangle); + return ipt_do_table(skb, ops->hooknum, state, + dev_net(state->out)->ipv4.iptable_mangle); /* PREROUTING/INPUT/FORWARD: */ - return ipt_do_table(skb, ops->hooknum, in, out, - dev_net(in)->ipv4.iptable_mangle); + return ipt_do_table(skb, ops->hooknum, state, + dev_net(state->in)->ipv4.iptable_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_nat.c 
b/net/ipv4/netfilter/iptable_nat.c index 6b67d7e9a75d..0d4d9cdf98a4 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -30,49 +30,40 @@ static const struct xt_table nf_nat_ipv4_table = { static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table); + return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table); } static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain); + return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain); } static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain); + return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain); } static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain); + return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain); } static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, in, out, 
iptable_nat_do_chain); + return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index b2f7e8f98316..0356e6da4bb7 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -21,8 +21,7 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { const struct net *net; @@ -32,8 +31,8 @@ iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, /* root is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw); + net = dev_net(state->in ? state->in : state->out); + return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c86647ed2078..4bce3980ccd9 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -38,9 +38,7 @@ static const struct xt_table security_table = { static unsigned int iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { const struct net *net; @@ -50,8 +48,8 @@ iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - net = dev_net((in != NULL) ? in : out); - return ipt_do_table(skb, ops->hooknum, in, out, + net = dev_net(state->in ? 
state->in : state->out); + return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_security); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5c61328b7704..30ad9554b5e9 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -94,9 +94,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, static unsigned int ipv4_helper(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -123,9 +121,7 @@ static unsigned int ipv4_helper(const struct nf_hook_ops *ops, static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -149,24 +145,20 @@ out: static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb); } static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { /* root is playing with raw sockets. 
*/ if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb); + return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb); } /* Connection tracking may drop packets, but never alters them, so @@ -322,8 +314,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) + if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || + nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) goto nla_put_failure; return 0; @@ -342,8 +334,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]); + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); return 0; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index a460a87e14f8..f0dfe92a00d6 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -300,7 +300,9 @@ static int exp_seq_show(struct seq_file *s, void *v) __nf_ct_l3proto_find(exp->tuple.src.l3num), __nf_ct_l4proto_find(exp->tuple.src.l3num, exp->tuple.dst.protonum)); - return seq_putc(s, '\n'); + seq_putc(s, '\n'); + + return 0; } static const struct seq_operations exp_seq_ops = { diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 7e5ca6f2d0cd..c88b7d434718 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -63,9 +63,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, 
static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(skb->sk); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index d059182c1466..e7ad950cf9ef 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -10,8 +10,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> @@ -27,7 +29,7 @@ static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 5, + .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_MASK, }, }, diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 75101980eeee..076aadda0473 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -5,8 +5,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> @@ -26,7 +28,7 @@ static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 5, + .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_MASK, }, }, diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index fc37711e11f3..e59cc05c09e9 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -256,11 +256,10 @@ EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); unsigned int nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { struct nf_conn *ct; @@ -309,7 +308,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, in, out, ct); + ret = do_chain(ops, skb, state, ct); if (ret != NF_ACCEPT) return ret; @@ -323,7 +322,8 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, + state->out)) goto oif_changed; } break; @@ -332,7 +332,7 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) goto oif_changed; } @@ -346,17 +346,16 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); unsigned int nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; - ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); @@ -367,11 +366,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); unsigned int nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { #ifdef CONFIG_XFRM @@ -386,7 +384,7 @@ nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ 
-410,11 +408,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); unsigned int nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { const struct nf_conn *ct; @@ -427,7 +424,7 @@ nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + ret = nf_nat_ipv4_fn(ops, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 536da7bc598a..3262e41ff76f 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -13,6 +13,7 @@ #include <net/dst.h> #include <net/netfilter/ipv4/nf_reject.h> #include <linux/netfilter_ipv4.h> +#include <linux/netfilter_bridge.h> #include <net/netfilter/ipv4/nf_reject.h> const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, @@ -43,7 +44,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - __be16 protocol, int ttl) + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -146,7 +147,8 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) */ if (oldskb->nf_bridge) { struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; + + nskb->dev = nf_bridge_get_physindev(oldskb); niph->tot_len = htons(nskb->len); ip_send_check(niph); if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), @@ -164,4 +166,27 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) } 
EXPORT_SYMBOL_GPL(nf_send_reset); +void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) +{ + struct iphdr *iph = ip_hdr(skb_in); + u8 proto; + + if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET)) + return; + + if (skb_csum_unnecessary(skb_in)) { + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); + return; + } + + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + proto = iph->protocol; + else + proto = 0; + + if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0) + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); +} +EXPORT_SYMBOL_GPL(nf_send_unreach); + MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 19412a4063fb..8412268bbad1 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -17,13 +17,11 @@ static unsigned int nft_do_chain_arp(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, ops, skb, in, out); + nft_set_pktinfo(&pkt, ops, skb, state); return nft_do_chain(&pkt, ops); } diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 6820c8c40842..aa180d3a69a5 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -20,22 +20,18 @@ static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + nft_set_pktinfo_ipv4(&pkt, ops, skb, state); return nft_do_chain(&pkt, ops); } static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int 
(*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { if (unlikely(skb->len < sizeof(struct iphdr) || ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { @@ -45,7 +41,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv4(ops, skb, in, out, okfn); + return nft_do_chain_ipv4(ops, skb, state); } struct nft_af_info nft_af_ipv4 __read_mostly = { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index df547bf50078..bf5c30ae14e4 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -28,51 +28,42 @@ static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + nft_set_pktinfo_ipv4(&pkt, ops, skb, state); return nft_do_chain(&pkt, ops); } static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain); } static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain); } static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_out(ops, skb, in, out, 
nft_nat_do_chain); + return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain); } static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 125b66766c0a..e335b0afdaf3 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -23,9 +23,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { unsigned int ret; struct nft_pktinfo pkt; @@ -39,7 +37,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + nft_set_pktinfo_ipv4(&pkt, ops, skb, state); mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 665de06561cd..40e414c4ca56 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -17,20 +17,17 @@ #include <net/netfilter/ipv4/nf_nat_masquerade.h> static void nft_masq_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); struct nf_nat_range range; - unsigned int verdict; memset(&range, 0, sizeof(range)); range.flags = priv->flags; - verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, - &range, pkt->out); - - 
data[NFT_REG_VERDICT].verdict = verdict; + regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + &range, pkt->out); } static struct nft_expr_type nft_masq_ipv4_type; diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index 6ecfce63201a..d8d795df9c13 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -18,26 +18,25 @@ #include <net/netfilter/nft_redir.h> static void nft_redir_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_redir *priv = nft_expr_priv(expr); struct nf_nat_ipv4_multi_range_compat mr; - unsigned int verdict; memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { mr.range[0].min.all = - *(__be16 *)&data[priv->sreg_proto_min].data[0]; + *(__be16 *)&regs->data[priv->sreg_proto_min]; mr.range[0].max.all = - *(__be16 *)&data[priv->sreg_proto_max].data[0]; + *(__be16 *)&regs->data[priv->sreg_proto_max]; mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } mr.range[0].flags |= priv->flags; - verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum); - data[NFT_REG_VERDICT].verdict = verdict; + regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, + pkt->ops->hooknum); } static struct nft_expr_type nft_redir_ipv4_type; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index d729542bd1b7..b07e58b51158 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -20,21 +20,24 @@ #include <net/netfilter/nft_reject.h> static void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code); + nf_send_unreach(pkt->skb, priv->icmp_code, + pkt->ops->hooknum); break;
case NFT_REJECT_TCP_RST: nf_send_reset(pkt->skb, pkt->ops->hooknum); break; + default: + break; } - data[NFT_REG_VERDICT].verdict = NF_DROP; + regs->verdict.code = NF_DROP; } static struct nft_expr_type nft_reject_ipv4_type; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 208d5439e59b..05ff44b758df 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -64,11 +64,11 @@ EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; -static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) +static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { - int res = (num + net_hash_mix(net)) & mask; + u32 res = (num + net_hash_mix(net)) & mask; - pr_debug("hash(%d) = %d\n", num, res); + pr_debug("hash(%u) = %u\n", num, res); return res; } EXPORT_SYMBOL_GPL(ping_hash); @@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk) if (sk_hashed(sk)) { write_lock_bh(&ping_table.lock); hlist_nulls_del(&sk->sk_nulls_node); + sk_nulls_node_init(&sk->sk_nulls_node); sock_put(sk); isk->inet_num = 0; isk->inet_sport = 0; @@ -516,7 +517,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) ntohs(icmph->un.echo.sequence)); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); - if (sk == NULL) { + if (!sk) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ } @@ -692,8 +693,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, } EXPORT_SYMBOL_GPL(ping_common_sendmsg); -static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; @@ -849,8 +849,8 @@ do_confirm: goto out; } -int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); 
int family = sk->sk_family; @@ -972,7 +972,7 @@ bool ping_rcv(struct sk_buff *skb) skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); - if (sk != NULL) { + if (sk) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index d8953ef0770c..da5d483e236a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - tcp_death_row.tw_count, sockets, + atomic_read(&tcp_death_row.tw_count), sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), @@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), + SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), + SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f027a708b7e0..561cd4b8fc6e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -46,7 +46,6 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> -#include <linux/aio.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -293,7 +292,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_lock(&raw_v4_hashinfo.lock); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); - if (raw_sk != NULL) { + if (raw_sk) { iph = (const struct iphdr *)skb->data; net = dev_net(skb->dev); @@ -363,7 +362,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, 
flags & MSG_DONTWAIT, &err); - if (skb == NULL) + if (!skb) goto error; skb_reserve(skb, hlen); @@ -404,7 +403,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -412,8 +411,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, - rt->dst.dev, dst_output); + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, + NULL, rt->dst.dev, dst_output_sk); if (err > 0) err = net_xmit_errno(err); if (err) @@ -481,8 +480,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } -static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; @@ -709,8 +707,8 @@ out: return ret; * we return it, otherwise we block. 
*/ -static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; @@ -873,7 +871,7 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) + if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ad5064362c5c..e681b852ced1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -152,7 +152,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static struct dst_ops ipv4_dst_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, @@ -458,12 +457,9 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, } #define IP_IDENTS_SZ 2048u -struct ip_ident_bucket { - atomic_t id; - u32 stamp32; -}; -static struct ip_ident_bucket *ip_idents __read_mostly; +static atomic_t *ip_idents __read_mostly; +static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. 
This makes hard for an attacker @@ -471,19 +467,20 @@ static struct ip_ident_bucket *ip_idents __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(bucket->stamp32); + u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; + atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(*p_tstamp); u32 now = (u32)jiffies; u32 delta = 0; - if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); - return atomic_add_return(segs + delta, &bucket->id) - segs; + return atomic_add_return(segs + delta, p_id) - segs; } EXPORT_SYMBOL(ip_idents_reserve); -void __ip_select_ident(struct iphdr *iph, int segs) +void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { static u32 ip_idents_hashrnd __read_mostly; u32 hash, id; @@ -492,7 +489,7 @@ void __ip_select_ident(struct iphdr *iph, int segs) hash = jhash_3words((__force u32)iph->daddr, (__force u32)iph->saddr, - iph->protocol, + iph->protocol ^ net_hash_mix(net), ip_idents_hashrnd); id = ip_idents_reserve(hash, segs); iph->id = htons(id); @@ -750,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { - if (fib_lookup(net, fl4, &res) == 0) { + if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, @@ -903,6 +900,10 @@ static int ip_error(struct sk_buff *skb) bool send; int code; + /* IP on this device is disabled. 
*/ + if (!in_dev) + goto out; + net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { @@ -963,10 +964,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (dst_metric_locked(dst, RTAX_MTU)) return; - if (dst->dev->mtu < mtu) - return; - - if (rt->rt_pmtu && rt->rt_pmtu < mtu) + if (ipv4_mtu(dst) < mtu) return; if (mtu < ip_rt_min_pmtu) @@ -977,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, @@ -1057,7 +1055,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); rt = (struct rtable *)odst; - if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { + if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1188,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) fl4.flowi4_mark = skb->mark; rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, @@ -1451,7 +1449,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Primary sanity checks. */ - if (in_dev == NULL) + if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || @@ -1554,7 +1552,7 @@ static int __mkroute_input(struct sk_buff *skb, /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); - if (out_dev == NULL) { + if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). 
Please report.\n"); return -EINVAL; } @@ -1592,7 +1590,7 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe != NULL) + if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); @@ -1718,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.daddr = daddr; fl4.saddr = saddr; - err = fib_lookup(net, &fl4, &res); + err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; @@ -2055,7 +2053,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); - if (dev_out == NULL) + if (!dev_out) goto out; /* Special hack: user can direct multicasts @@ -2088,7 +2086,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); - if (dev_out == NULL) + if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. 
*/ @@ -2097,7 +2095,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto out; } if (ipv4_is_local_multicast(fl4->daddr) || - ipv4_is_lbcast(fl4->daddr)) { + ipv4_is_lbcast(fl4->daddr) || + fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); @@ -2124,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res)) { + if (fib_lookup(net, fl4, &res, 0)) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2177,7 +2176,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) if (!res.prefixlen && res.table->tb_num_default > 1 && res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(&res); + fib_select_default(fl4, &res); if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, res); @@ -2225,7 +2224,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_blackhole_dst_check, .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, @@ -2301,7 +2299,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); @@ -2321,11 +2319,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; - if (nla_put_be32(skb, RTA_DST, dst)) + if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; - if (nla_put_be32(skb, RTA_SRC, src)) + if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && @@ -2338,11 +2336,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, #endif if (!rt_is_input_route(rt) && fl4->saddr != src) { - if 
(nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) + if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway && - nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; expires = rt->dst.expires; @@ -2423,7 +2421,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) rtm = nlmsg_data(nlh); skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (skb == NULL) { + if (!skb) { err = -ENOBUFS; goto errout; } @@ -2438,8 +2436,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) ip_hdr(skb)->protocol = IPPROTO_ICMP; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; - dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; + src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; + dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? 
nla_get_u32(tb[RTA_MARK]) : 0; @@ -2454,7 +2452,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) struct net_device *dev; dev = __dev_get_by_index(net, iif); - if (dev == NULL) { + if (!dev) { err = -ENODEV; goto errout_free; } @@ -2653,7 +2651,7 @@ static __net_init int sysctl_route_net_init(struct net *net) tbl = ipv4_route_flush_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); - if (tbl == NULL) + if (!tbl) goto err_dup; /* Don't export sysctls to unprivileged users */ @@ -2663,7 +2661,7 @@ static __net_init int sysctl_route_net_init(struct net *net) tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); - if (net->ipv4.route_hdr == NULL) + if (!net->ipv4.route_hdr) goto err_reg; return 0; @@ -2743,6 +2741,10 @@ int __init ip_rt_init(void) prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); + if (!ip_tstamps) + panic("IP: failed to allocate ip_tstamps\n"); + for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 45fe60c5238e..d70b1f603692 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -219,22 +219,23 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) + if (child) { + atomic_set(&req->rsk_refcnt, 1); inet_csk_reqsk_queue_add(sk, req, child); - else + } else { reqsk_free(req); - + } return 
child; } - +EXPORT_SYMBOL(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -325,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ if (!req) goto out; @@ -336,8 +337,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->ir_mark = inet_request_mark(sk, skb); ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -345,7 +346,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = tcp_opt.saw_tstamp ? 
tcp_opt.rcv_tsecr : 0; - treq->listener = NULL; + treq->tfo_listener = false; + + ireq->ir_iif = sk->sk_bound_dev_if; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) @@ -357,7 +360,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; } - req->expires = 0UL; req->num_retrans = 0; /* @@ -389,7 +391,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); - ret = get_cookie_sock(sk, skb, req, &rt->dst); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d151539da8e6..433231ccfb17 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -41,11 +41,19 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { + bool same_parity = !((range[0] ^ range[1]) & 1); + write_seqlock(&net->ipv4.ip_local_ports.lock); + if (same_parity && !net->ipv4.ip_local_ports.warned) { + net->ipv4.ip_local_ports.warned = true; + pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); + } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; write_sequnlock(&net->ipv4.ip_local_ports.lock); @@ -522,7 +530,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_wmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = 
&min_sndbuf, }, { .procname = "tcp_notsent_lowat", @@ -537,7 +545,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_tcp_rmem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "tcp_app_win", @@ -702,7 +710,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &gso_max_segs, }, { @@ -750,7 +758,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_rcvbuf, }, { .procname = "udp_wmem_min", @@ -758,7 +766,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one + .extra1 = &min_sndbuf, }, { } }; @@ -821,6 +829,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_ecn_fallback", + .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, @@ -883,6 +898,20 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_probe_threshold", + .data = &init_net.ipv4.sysctl_tcp_probe_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_probe_interval", + .data = &init_net.ipv4.sysctl_tcp_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -895,7 +924,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) int i; table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); - if (table == NULL) + if (!table) goto err_alloc; /* Update the variables to 
point into the current struct net */ @@ -904,7 +933,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); - if (net->ipv4.ipv4_hdr == NULL) + if (!net->ipv4.ipv4_hdr) goto err_reg; net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); @@ -942,7 +971,7 @@ static __init int sysctl_ipv4_init(void) struct ctl_table_header *hdr; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); - if (hdr == NULL) + if (!hdr) return -ENOMEM; if (register_pernet_subsys(&ipv4_sysctl_ops)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 995a2259bcfc..45534a5ab430 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -252,6 +252,7 @@ #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> +#include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> @@ -401,6 +402,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + u64_stats_init(&tp->syncp); tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -496,7 +498,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected or passive Fast Open socket? */ if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { + (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -520,8 +522,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Race breaker. If space is freed after * wspace test but before the flags are set, - * IO signal will be lost. + * IO signal will be lost. Memory barrier + * pairs with the input side. 
*/ + smp_mb__after_atomic(); if (sk_stream_is_writeable(sk)) mask |= POLLOUT | POLLWRNORM; } @@ -691,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; @@ -775,7 +780,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, ret = -EAGAIN; break; } - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -805,16 +810,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; /* The TCP header must be at least 32-bit aligned. 
*/ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_scheduled; + + if (force_schedule) { + mem_scheduled = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_scheduled = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_scheduled)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes @@ -904,7 +921,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -983,6 +1001,9 @@ do_error: if (copied) goto out; out_err: + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); return sk_stream_error(sk, flags, err); } @@ -1028,7 +1049,7 @@ static inline int select_size(const struct sock *sk, bool sg) void tcp_free_fastopen_req(struct tcp_sock *tp) { - if (tp->fastopen_req != NULL) { + if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } @@ -1042,12 +1063,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) return -EOPNOTSUPP; - if (tp->fastopen_req != NULL) + if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); - if (unlikely(tp->fastopen_req == NULL)) + if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; @@ -1060,8 +1081,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return err; } -int tcp_sendmsg(struct kiocb *iocb, struct sock 
*sk, struct msghdr *msg, - size_t size) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1120,7 +1140,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (iov_iter_count(&msg->msg_iter)) { + while (msg_data_left(msg)) { int copy = 0; int max = size_goal; @@ -1141,7 +1161,8 @@ new_segment: skb = sk_stream_alloc_skb(sk, select_size(sk, sg), - sk->sk_allocation); + sk->sk_allocation, + skb_queue_empty(&sk->sk_write_queue)); if (!skb) goto wait_for_memory; @@ -1164,8 +1185,8 @@ new_segment: } /* Try to append data to the end of skb. */ - if (copy > iov_iter_count(&msg->msg_iter)) - copy = iov_iter_count(&msg->msg_iter); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); /* Where to copy to? */ if (skb_availroom(skb) > 0) { @@ -1222,7 +1243,7 @@ new_segment: tcp_skb_pcount_set(skb, 0); copied += copy; - if (!iov_iter_count(&msg->msg_iter)) { + if (!msg_data_left(msg)) { tcp_tx_timestamp(sk, skb); goto out; } @@ -1272,6 +1293,9 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); release_sock(sk); return err; } @@ -1539,8 +1563,8 @@ EXPORT_SYMBOL(tcp_read_sock); * Probably, code can be easily improved even more. 
*/ -int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; @@ -1551,7 +1575,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - struct sk_buff *skb; + struct sk_buff *skb, *last; u32 urg_hole = 0; if (unlikely(flags & MSG_ERRQUEUE)) @@ -1611,7 +1635,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Next get a buffer. */ + last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; /* Now that we have two receive queues this * shouldn't happen. */ @@ -1730,8 +1756,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Do not sleep, just process backlog. 
*/ release_sock(sk); lock_sock(sk); - } else - sk_wait_data(sk, &timeo); + } else { + sk_wait_data(sk, &timeo, last); + } if (user_recv) { int chunk; @@ -1914,18 +1941,19 @@ EXPORT_SYMBOL_GPL(tcp_set_state); static const unsigned char new_state[16] = { /* current state: new state: action: */ - /* (Invalid) */ TCP_CLOSE, - /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_SYN_SENT */ TCP_CLOSE, - /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, - /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, - /* TCP_TIME_WAIT */ TCP_CLOSE, - /* TCP_CLOSE */ TCP_CLOSE, - /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, - /* TCP_LAST_ACK */ TCP_LAST_ACK, - /* TCP_LISTEN */ TCP_CLOSE, - /* TCP_CLOSING */ TCP_CLOSING, + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) @@ -2138,7 +2166,7 @@ adjudge_to_death: * aborted (e.g., closed with unread data) before 3WHS * finishes. 
*/ - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } @@ -2479,6 +2507,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, icsk->icsk_syn_retries = val; break; + case TCP_SAVE_SYN: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->save_syn = val; + break; + case TCP_LINGER2: if (val < 0) tp->linger2 = -1; @@ -2541,10 +2576,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | - TCPF_LISTEN))) + TCPF_LISTEN))) { + tcp_fastopen_init_key_once(true); + err = fastopen_init_queue(sk, val); - else + } else { err = -EINVAL; + } break; case TCP_TIMESTAMP: if (!tp->repair) @@ -2590,13 +2628,17 @@ EXPORT_SYMBOL(compat_tcp_setsockopt); #endif /* Return information about state of tcp endpoint in API format. */ -void tcp_get_info(const struct sock *sk, struct tcp_info *info) +void tcp_get_info(struct sock *sk, struct tcp_info *info) { - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + unsigned int start; + u32 rate; memset(info, 0, sizeof(*info)); + if (sk->sk_type != SOCK_STREAM) + return; info->tcpi_state = sk->sk_state; info->tcpi_ca_state = icsk->icsk_ca_state; @@ -2655,10 +2697,19 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; - info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? - sk->sk_pacing_rate : ~0ULL; - info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? - sk->sk_max_pacing_rate : ~0ULL; + rate = READ_ONCE(sk->sk_pacing_rate); + info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL; + + rate = READ_ONCE(sk->sk_max_pacing_rate); + info->tcpi_max_pacing_rate = rate != ~0U ? 
rate : ~0ULL; + + do { + start = u64_stats_fetch_begin_irq(&tp->syncp); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); + info->tcpi_segs_out = tp->segs_out; + info->tcpi_segs_in = tp->segs_in; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2730,6 +2781,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (get_user(len, optlen)) + return -EFAULT; + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; @@ -2776,7 +2847,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - if (icsk->icsk_accept_queue.fastopenq != NULL) + if (icsk->icsk_accept_queue.fastopenq) val = icsk->icsk_accept_queue.fastopenq->max_qlen; else val = 0; @@ -2788,6 +2859,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_SAVE_SYN: + val = tp->save_syn; + break; + case TCP_SAVED_SYN: { + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + if (tp->saved_syn) { + if (len < tp->saved_syn[0]) { + if (put_user(tp->saved_syn[0], optlen)) { + release_sock(sk); + return -EFAULT; + } + release_sock(sk); + return -EINVAL; + } + len = tp->saved_syn[0]; + if (put_user(len, optlen)) { + release_sock(sk); + return -EFAULT; + } + if (copy_to_user(optval, tp->saved_syn + 1, len)) { + release_sock(sk); + return -EFAULT; + } + tcp_saved_syn_free(tp); + release_sock(sk); + } else { + release_sock(sk); + len = 0; + if (put_user(len, optlen)) + return -EFAULT; + } + 
return 0; + } default: return -ENOPROTOOPT; } @@ -2960,7 +3067,7 @@ void tcp_done(struct sock *sk) tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); sk->sk_shutdown = SHUTDOWN_MASK; @@ -2992,21 +3099,21 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void) { - struct sk_buff *skb = NULL; unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c new file mode 100644 index 000000000000..8c6fd3d5e40f --- /dev/null +++ b/net/ipv4/tcp_cdg.c @@ -0,0 +1,433 @@ +/* + * CAIA Delay-Gradient (CDG) congestion control + * + * This implementation is based on the paper: + * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using + * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. + * + * Scavenger traffic (Less-than-Best-Effort) should disable coexistence + * heuristics using parameters use_shadow=0 and use_ineff=0. + * + * Parameters window, backoff_beta, and backoff_factor are crucial for + * throughput and delay. Future work is needed to determine better defaults, + * and to provide guidelines for use in different environments/contexts. + * + * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. 
+ * Parameter window is only configurable when loading tcp_cdg as a module. + * + * Notable differences from paper/FreeBSD: + * o Using Hybrid Slow start and Proportional Rate Reduction. + * o Add toggle for shadow window mechanism. Suggested by David Hayes. + * o Add toggle for non-congestion loss tolerance. + * o Scaling parameter G is changed to a backoff factor; + * conversion is given by: backoff_factor = 1000/(G * window). + * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. + * o More accurate e^-x. + */ +#include <linux/kernel.h> +#include <linux/random.h> +#include <linux/module.h> +#include <net/tcp.h> + +#define HYSTART_ACK_TRAIN 1 +#define HYSTART_DELAY 2 + +static int window __read_mostly = 8; +static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ +static unsigned int backoff_factor __read_mostly = 42; +static unsigned int hystart_detect __read_mostly = 3; +static unsigned int use_ineff __read_mostly = 5; +static bool use_shadow __read_mostly = true; +static bool use_tolerance __read_mostly; + +module_param(window, int, 0444); +MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); +module_param(backoff_beta, uint, 0644); +MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); +module_param(backoff_factor, uint, 0644); +MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); +module_param(hystart_detect, uint, 0644); +MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " + "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); +module_param(use_ineff, uint, 0644); +MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); +module_param(use_shadow, bool, 0644); +MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); +module_param(use_tolerance, bool, 0644); +MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); + +struct minmax { + union { + struct { + s32 min; + s32 max; + }; + u64 v64; + }; +}; + +enum cdg_state { + 
CDG_UNKNOWN = 0, + CDG_NONFULL = 1, + CDG_FULL = 2, + CDG_BACKOFF = 3, +}; + +struct cdg { + struct minmax rtt; + struct minmax rtt_prev; + struct minmax *gradients; + struct minmax gsum; + bool gfilled; + u8 tail; + u8 state; + u8 delack; + u32 rtt_seq; + u32 undo_cwnd; + u32 shadow_wnd; + u16 backoff_cnt; + u16 sample_cnt; + s32 delay_min; + u32 last_ack; + u32 round_start; +}; + +/** + * nexp_u32 - negative base-e exponential + * @ux: x in units of micro + * + * Returns exp(ux * -1e-6) * U32_MAX. + */ +static u32 __pure nexp_u32(u32 ux) +{ + static const u16 v[] = { + /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ + 65535, + 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, + 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, + }; + u32 msb = ux >> 8; + u32 res; + int i; + + /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ + if (msb > U16_MAX) + return 0; + + /* Scale first eight bits linearly: */ + res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); + + /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ + for (i = 1; msb; i++, msb >>= 1) { + u32 y = v[i & -(msb & 1)] + U32_C(1); + + res = ((u64)res * y) >> 16; + } + + return res; +} + +/* Based on the HyStart algorithm (by Ha et al.) that is implemented in + * tcp_cubic. Differences/experimental changes: + * o Using Hayes' delayed ACK filter. + * o Using a usec clock for the ACK train. + * o Reset ACK train when application limited. + * o Invoked at any cwnd (i.e. also when cwnd < 16). + * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). 
+ */ +static void tcp_cdg_hystart_update(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); + if (ca->delay_min == 0) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { + u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); + + if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { + ca->last_ack = now_us; + ca->round_start = now_us; + } else if (before(now_us, ca->last_ack + 3000)) { + u32 base_owd = max(ca->delay_min / 2U, 125U); + + ca->last_ack = now_us; + if (after(now_us, ca->round_start + base_owd)) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + return; + } + } + } + + if (hystart_detect & HYSTART_DELAY) { + if (ca->sample_cnt < 8) { + ca->sample_cnt++; + } else { + s32 thresh = max(ca->delay_min + ca->delay_min / 8U, + 125U); + + if (ca->rtt.min > thresh) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } + } + } +} + +static s32 tcp_cdg_grad(struct cdg *ca) +{ + s32 gmin = ca->rtt.min - ca->rtt_prev.min; + s32 gmax = ca->rtt.max - ca->rtt_prev.max; + s32 grad; + + if (ca->gradients) { + ca->gsum.min += gmin - ca->gradients[ca->tail].min; + ca->gsum.max += gmax - ca->gradients[ca->tail].max; + ca->gradients[ca->tail].min = gmin; + ca->gradients[ca->tail].max = gmax; + ca->tail = (ca->tail + 1) & (window - 1); + gmin = ca->gsum.min; + gmax = ca->gsum.max; + } + + /* We keep sums to ignore gradients during cwnd reductions; + * the paper's smoothed gradients otherwise simplify to: + * (rtt_latest - rtt_oldest) / window. + * + * We also drop division by window here. + */ + grad = gmin > 0 ? 
gmin : gmax; + + /* Extrapolate missing values in gradient window: */ + if (!ca->gfilled) { + if (!ca->gradients && window > 1) + grad *= window; /* Memory allocation failed. */ + else if (ca->tail == 0) + ca->gfilled = true; /* nothing left to extrapolate from now on */ + else + grad = (grad * window) / (int)ca->tail; /* scale up from the tail slots written so far */ + } + + /* Backoff was effectual: */ + if (gmin <= -32 || gmax <= -32) + ca->backoff_cnt = 0; + + if (use_tolerance) { + /* Reduce small variations to zero: */ + gmin = DIV_ROUND_CLOSEST(gmin, 64); + gmax = DIV_ROUND_CLOSEST(gmax, 64); + + if (gmin > 0 && gmax <= 0) + ca->state = CDG_FULL; + else if ((gmin > 0 && gmax > 0) || gmax < 0) + ca->state = CDG_NONFULL; /* any other combination leaves state unchanged */ + } + return grad; +} +/* Decide at random, weighted by grad, whether to back off. On backoff the + * socket is put into CWR through tcp_enter_cwr() and true is returned. + */ +static bool tcp_cdg_backoff(struct sock *sk, u32 grad) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + return false; /* nexp_u32() decreases as its argument grows */ + + if (use_ineff) { + ca->backoff_cnt++; + if (ca->backoff_cnt > use_ineff) + return false; /* more than use_ineff backoffs without an effectual one */ + } + + ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); + ca->state = CDG_BACKOFF; /* read by tcp_cdg_ssthresh() */ + tcp_enter_cwr(sk); + return true; +} + +/* Not called in CWR or Recovery state.
*/ +static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_cwnd; + u32 incr; + + if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + tcp_cdg_hystart_update(sk); + + if (after(ack, ca->rtt_seq) && ca->rtt.v64) { /* the interval is over and rtt is non-zero */ + s32 grad = 0; + + if (ca->rtt_prev.v64) + grad = tcp_cdg_grad(ca); /* needs a previous interval to compare against */ + ca->rtt_seq = tp->snd_nxt; /* the next interval ends once this is ACKed */ + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + ca->last_ack = 0; /* makes HyStart restart its ACK train */ + ca->sample_cnt = 0; + + if (grad > 0 && tcp_cdg_backoff(sk, grad)) + return; + } + + if (!tcp_is_cwnd_limited(sk)) { + ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); + return; + } + + prior_snd_cwnd = tp->snd_cwnd; + tcp_reno_cong_avoid(sk, ack, acked); + + incr = tp->snd_cwnd - prior_snd_cwnd; + ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); /* max() keeps the old value if the u32 sum wraps */ +} +/* Fold one RTT sample (rtt_us) into the current interval's minimum and maximum. */ +static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (rtt_us <= 0) + return; /* no usable sample */ + + /* A heuristic for filtering delayed ACKs, adapted from: + * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support + * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. + */ + if (tp->sacked_out == 0) { + if (num_acked == 1 && ca->delack) { + /* A delayed ACK is only used for the minimum if it is + * provenly lower than an existing non-zero minimum.
+ */ + ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->delack--; + return; + } else if (num_acked > 1 && ca->delack < 5) { + ca->delack++; + } + } + + ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); + ca->rtt.max = max(ca->rtt.max, rtt_us); +} +/* Record cwnd for a later undo and return the new slow start threshold. */ +static u32 tcp_cdg_ssthresh(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->undo_cwnd = tp->snd_cwnd; + + if (ca->state == CDG_BACKOFF) + return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); /* backoff_beta/1024 of cwnd, at least 2 */ + + if (ca->state == CDG_NONFULL && use_tolerance) + return tp->snd_cwnd; /* no reduction */ + + ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); + if (use_shadow) + return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); + return max(2U, tp->snd_cwnd >> 1); +} +/* Return the larger of the current cwnd and the one saved by tcp_cdg_ssthresh(). */ +static u32 tcp_cdg_undo_cwnd(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); +} +/* React to a cwnd restart and to the completion of CWR; other events are ignored. */ +static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct minmax *gradients; + + switch (ev) { + case CA_EVENT_CWND_RESTART: + gradients = ca->gradients; /* keep the allocation across the reset below */ + if (gradients) + memset(gradients, 0, window * sizeof(gradients[0])); + memset(ca, 0, sizeof(*ca)); + + ca->gradients = gradients; + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; + break; + case CA_EVENT_COMPLETE_CWR: + ca->state = CDG_UNKNOWN; + ca->rtt_seq = tp->snd_nxt; + ca->rtt_prev = ca->rtt; + ca->rtt.v64 = 0; + break; + default: + break; + } +} +/* Set up the private state; the gradients ring is only allocated for window > 1. */ +static void tcp_cdg_init(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* The private area may still hold the previous algorithm's bytes when + * congestion control is switched on a live socket. Start from zero so + * that a stale tail cannot index outside gradients and a stale + * gradients pointer never reaches kfree(). + */ + memset(ca, 0, sizeof(*ca)); + + /* We silently fall back to window = 1 if allocation fails.
*/ + if (window > 1) + ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), + GFP_NOWAIT | __GFP_NOWARN); + ca->rtt_seq = tp->snd_nxt; + ca->shadow_wnd = tp->snd_cwnd; +} +/* Free the gradients ring. */ +static void tcp_cdg_release(struct sock *sk) +{ + struct cdg *ca = inet_csk_ca(sk); + + kfree(ca->gradients); + ca->gradients = NULL; /* a repeated release must not free twice */ +} + +struct tcp_congestion_ops tcp_cdg __read_mostly = { + .cong_avoid = tcp_cdg_cong_avoid, + .cwnd_event = tcp_cdg_cwnd_event, + .pkts_acked = tcp_cdg_acked, + .undo_cwnd = tcp_cdg_undo_cwnd, + .ssthresh = tcp_cdg_ssthresh, + .release = tcp_cdg_release, + .init = tcp_cdg_init, + .owner = THIS_MODULE, + .name = "cdg", +}; +/* Validate backoff_beta and window, then register the algorithm; a + * registration failure (e.g. -EEXIST) is returned instead of being dropped. + */ +static int __init tcp_cdg_register(void) +{ + if (backoff_beta > 1024 || window < 1 || window > 256) + return -ERANGE; + if (!is_power_of_2(window)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_cdg); +} + +static void __exit tcp_cdg_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_cdg); +} + +module_init(tcp_cdg_register); +module_exit(tcp_cdg_unregister); +MODULE_AUTHOR("Kenneth Klette Jonassen"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP CDG"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62856e185a93..84be008c945c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -83,7 +83,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); - pr_info("%s registered\n", ca->name); + pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); @@ -187,6 +187,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; + icsk->icsk_ca_setsockopt = 1; if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) rcu_read_lock(); ca = __tcp_ca_find_autoload(name); /* No change
asking for existing value */ - if (ca == icsk->icsk_ca_ops) + if (ca == icsk->icsk_ca_ops) { + icsk->icsk_ca_setsockopt = 1; goto out; + } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index b504371af742..7092a61c4dc8 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -204,20 +204,26 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - /* For avoiding denominator == 1. */ - if (ca->acked_bytes_total == 0) - ca->acked_bytes_total = 1; + u64 bytes_ecn = ca->acked_bytes_ecn; + u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ - ca->dctcp_alpha = ca->dctcp_alpha - - (ca->dctcp_alpha >> dctcp_shift_g) + - (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / - ca->acked_bytes_total; - if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) - /* Clamp dctcp_alpha to max. */ - ca->dctcp_alpha = DCTCP_MAX_ALPHA; + alpha -= alpha >> dctcp_shift_g; + if (bytes_ecn) { + /* If dctcp_shift_g == 1, a 32bit value would overflow + * after 8 Mbytes. + */ + bytes_ecn <<= (10 - dctcp_shift_g); + do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + } + /* dctcp_alpha can be read from dctcp_get_info() without + * synchro, so we ask compiler to not use dctcp_alpha + * as a temporary variable in prior operations. 
+ */ + WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } @@ -277,7 +283,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) } } -static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); @@ -286,19 +293,19 @@ static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcp_dctcp_info info; - - memset(&info, 0, sizeof(info)); + memset(info, 0, sizeof(struct tcp_dctcp_info)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { - info.dctcp_enabled = 1; - info.dctcp_ce_state = (u16) ca->ce_state; - info.dctcp_alpha = ca->dctcp_alpha; - info.dctcp_ab_ecn = ca->acked_bytes_ecn; - info.dctcp_ab_tot = ca->acked_bytes_total; + info->dctcp.dctcp_enabled = 1; + info->dctcp.dctcp_ce_state = (u16) ca->ce_state; + info->dctcp.dctcp_alpha = ca->dctcp_alpha; + info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; + info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; } - nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + *attr = INET_DIAG_DCTCPINFO; + return sizeof(struct tcp_dctcp_info); /* report only the bytes zeroed and filled above, not the whole union */ } + return 0; } static struct tcp_congestion_ops dctcp __read_mostly = { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0d73f9ddb55b..479f34946177 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -19,28 +19,29 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { - const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; if (sk->sk_state == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; - } else { + } else if (sk->sk_type == SOCK_STREAM) { + const struct tcp_sock *tp = tcp_sk(sk); + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } - if
(info != NULL) + if (info) tcp_get_info(sk, info); } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, struct nlattr *bc) { inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); } static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); } @@ -50,6 +51,7 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init tcp_diag_init(void) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ea82fd492c1b..f9c0fb84e435 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -78,8 +78,6 @@ static bool __tcp_fastopen_cookie_gen(const void *path, struct tcp_fastopen_context *ctx; bool ok = false; - tcp_fastopen_init_key_once(true); - rcu_read_lock(); ctx = rcu_dereference(tcp_fastopen_ctx); if (ctx) { @@ -141,7 +139,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, req->sk = NULL; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) + if (!child) return false; spin_lock(&queue->fastopenq->lock); @@ -155,12 +153,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp = tcp_sk(child); tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; + tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. 
@@ -174,6 +167,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); + atomic_set(&req->rsk_refcnt, 1); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); @@ -210,6 +204,11 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ + tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; } @@ -218,10 +217,9 @@ static bool tcp_fastopen_create_child(struct sock *sk, sk->sk_data_ready(sk); bh_unlock_sock(child); sock_put(child); - WARN_ON(req->sk == NULL); + WARN_ON(!req->sk); return true; } -EXPORT_SYMBOL(tcp_fastopen_create_child); static bool tcp_fastopen_queue_check(struct sock *sk) { @@ -238,14 +236,14 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * temporarily vs a server not supporting Fast Open at all. 
*/ fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; - if (fastopenq == NULL || fastopenq->max_qlen == 0) + if (!fastopenq || fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; - if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { spin_unlock(&fastopenq->lock); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); @@ -254,7 +252,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk) fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); - reqsk_free(req1); + reqsk_put(req1); } return true; } @@ -308,6 +306,7 @@ fastopen: } else if (foc->len > 0) /* Client presents an invalid cookie */ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + valid_foc.exp = foc->exp; *foc = valid_foc; return false; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 1d5a30a90adf..f71002e4db0b 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -300,26 +300,27 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) } /* Extract info for Tcp socket info provided via netlink. 
*/ -static void tcp_illinois_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct illinois *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rttcnt = ca->cnt_rtt, - .tcpv_minrtt = ca->base_rtt, - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->cnt_rtt; + info->vegas.tcpv_minrtt = ca->base_rtt; + info->vegas.tcpv_rtt = 0; - if (info.tcpv_rttcnt > 0) { + if (info->vegas.tcpv_rttcnt > 0) { u64 t = ca->sum_rtt; - do_div(t, info.tcpv_rttcnt); - info.tcpv_rtt = t; + do_div(t, info->vegas.tcpv_rttcnt); + info->vegas.tcpv_rtt = t; } - nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } + return 0; } static struct tcp_congestion_ops tcp_illinois __read_mostly = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f501ac048366..728f5b3d3c64 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { + !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. 
Increase window, if skb with such overhead @@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !sk_under_memory_pressure(sk) && + !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -866,7 +866,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This must be called before lost_out is incremented */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { - if ((tp->retransmit_skb_hint == NULL) || + if (!tp->retransmit_skb_hint || before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; @@ -1130,7 +1130,12 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sacktag_state { int reord; int fack_count; - long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + /* Timestamps for earliest and latest never-retransmitted segment + * that was SACKed. RTO needs the earliest RTT to stay conservative, + * but congestion control should still get an accurate delay signal. + */ + struct skb_mstamp first_sackt; + struct skb_mstamp last_sackt; int flag; }; @@ -1233,14 +1238,9 @@ static u8 tcp_sacktag_one(struct sock *sk, state->reord); if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; - /* Pick the earliest sequence sacked for RTT */ - if (state->rtt_us < 0) { - struct skb_mstamp now; - - skb_mstamp_get(&now); - state->rtt_us = skb_mstamp_us_delta(&now, - xmit_time); - } + if (state->first_sackt.v64 == 0) + state->first_sackt = *xmit_time; + state->last_sackt = *xmit_time; } if (sacked & TCPCB_LOST) { @@ -1256,7 +1256,7 @@ static u8 tcp_sacktag_one(struct sock *sk, fack_count += pcount; /* Lost marker hint past SACKed? 
Tweak RFC3517 cnt */ - if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && + if (!tcp_is_fack(tp) && tp->lost_skb_hint && before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; @@ -1316,16 +1316,12 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, * code can come after this skb later on it's better to keep * setting gso_size to something. */ - if (!skb_shinfo(prev)->gso_size) { - skb_shinfo(prev)->gso_size = mss; - skb_shinfo(prev)->gso_type = sk->sk_gso_type; - } + if (!TCP_SKB_CB(prev)->tcp_gso_size) + TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (tcp_skb_pcount(skb) <= 1) { - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - } + if (tcp_skb_pcount(skb) <= 1) + TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1535,7 +1531,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; - if ((next_dup != NULL) && + if (next_dup && before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { in_sack = tcp_match_skb_to_sack(sk, skb, next_dup->start_seq, @@ -1551,7 +1547,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, if (in_sack <= 0) { tmp = tcp_shift_skb_data(sk, skb, state, start_seq, end_seq, dup_sack); - if (tmp != NULL) { + if (tmp) { if (tmp != skb) { skb = tmp; continue; @@ -1614,7 +1610,7 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct tcp_sacktag_state *state, u32 skip_to_seq) { - if (next_dup == NULL) + if (!next_dup) return skb; if (before(next_dup->start_seq, skip_to_seq)) { @@ -1634,7 +1630,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - 
u32 prior_snd_una, long *sack_rtt_us) + u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1642,7 +1638,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; - struct tcp_sacktag_state state; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; @@ -1650,9 +1645,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, int i, j; int first_sack_index; - state.flag = 0; - state.reord = tp->packets_out; - state.rtt_us = -1L; + state->flag = 0; + state->reord = tp->packets_out; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1663,7 +1657,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) - state.flag |= FLAG_DSACKING_ACK; + state->flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1728,7 +1722,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } skb = tcp_write_queue_head(sk); - state.fack_count = 0; + state->fack_count = 0; i = 0; if (!tp->sacked_out) { @@ -1762,10 +1756,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, /* Head todo? 
*/ if (before(start_seq, cache->start_seq)) { - skb = tcp_sacktag_skip(skb, sk, &state, + skb = tcp_sacktag_skip(skb, sk, state, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, - &state, + state, start_seq, cache->start_seq, dup_sack); @@ -1776,21 +1770,21 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, - &state, + state, cache->end_seq); /* ...tail remains todo... */ if (tcp_highest_sack_seq(tp) == cache->end_seq) { /* ...but better entrypoint exists! */ skb = tcp_highest_sack(sk); - if (skb == NULL) + if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; cache++; goto walk; } - skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq); + skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); /* Check overlap against next cached too (past this one already) */ cache++; continue; @@ -1798,14 +1792,14 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, if (!before(start_seq, tcp_highest_sack_seq(tp))) { skb = tcp_highest_sack(sk); - if (skb == NULL) + if (!skb) break; - state.fack_count = tp->fackets_out; + state->fack_count = tp->fackets_out; } - skb = tcp_sacktag_skip(skb, sk, &state, start_seq); + skb = tcp_sacktag_skip(skb, sk, state, start_seq); walk: - skb = tcp_sacktag_walk(skb, sk, next_dup, &state, + skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, end_seq, dup_sack); advance_sp: @@ -1820,14 +1814,12 @@ advance_sp: for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; - tcp_mark_lost_retrans(sk); - - tcp_verify_left_out(tp); - - if ((state.reord < tp->fackets_out) && + if ((state->reord < tp->fackets_out) && ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) - tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); + tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); + tcp_mark_lost_retrans(sk); + tcp_verify_left_out(tp); out: #if 
FASTRETRANS_DEBUG > 0 @@ -1836,8 +1828,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt_us = state.rtt_us; - return state.flag; + return state->flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than @@ -1926,14 +1917,13 @@ void tcp_enter_loss(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - bool new_recovery = false; + bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { - new_recovery = true; tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); @@ -2257,7 +2247,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) (oldcnt >= packets)) break; - mss = skb_shinfo(skb)->gso_size; + mss = tcp_skb_mss(skb); err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss, GFP_ATOMIC); if (err < 0) @@ -2557,6 +2547,7 @@ void tcp_enter_cwr(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_CWR); } } +EXPORT_SYMBOL(tcp_enter_cwr); static void tcp_try_keep_open(struct sock *sk) { @@ -2700,16 +2691,21 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); + if ((flag & FLAG_SND_UNA_ADVANCED) && + tcp_try_undo_loss(sk, false)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ /* Step 3.b. A timeout is spurious if not all data are * lost, i.e., never-retransmitted data are (s)acked. 
*/ - if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, true)) return; - if (after(tp->snd_nxt, tp->high_seq) && - (flag & FLAG_DATA_SACKED || is_dupack)) { - tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ + if (after(tp->snd_nxt, tp->high_seq)) { + if (flag & FLAG_DATA_SACKED || is_dupack) + tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; __tcp_push_pending_frames(sk, tcp_current_mss(sk), @@ -2734,8 +2730,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - if (tcp_try_undo_loss(sk, false)) - return; tcp_xmit_retransmit_queue(sk); } @@ -3054,7 +3048,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, long sack_rtt_us) + u32 prior_snd_una, + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt, now; @@ -3062,8 +3057,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; - long ca_seq_rtt_us = -1L; + long sack_rtt_us = -1L; long seq_rtt_us = -1L; + long ca_rtt_us = -1L; struct sk_buff *skb; u32 pkts_acked = 0; bool rtt_update; @@ -3099,17 +3095,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; - } else { + } else if (!(sacked & TCPCB_SACKED_ACKED)) { last_ackt = skb->skb_mstamp; WARN_ON_ONCE(last_ackt.v64 == 0); if (!first_ackt.v64) first_ackt = last_ackt; - if (!(sacked & TCPCB_SACKED_ACKED)) { - reord = min(pkts_acked, reord); - if (!after(scb->end_seq, tp->high_seq)) - flag |= 
FLAG_ORIG_SACK_ACKED; - } + reord = min(pkts_acked, reord); + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) @@ -3154,15 +3148,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, skb_mstamp_get(&now); if (likely(first_ackt.v64)) { seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + if (sack->first_sackt.v64) { + sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); } rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { - const struct tcp_congestion_ops *ca_ops - = inet_csk(sk)->icsk_ca_ops; - tcp_rearm_rto(sk); if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { @@ -3185,11 +3180,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent @@ -3199,6 +3189,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); WARN_ON((int)tp->lost_out < 0); @@ -3239,7 +3232,7 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! 
*/ } else { - unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); @@ -3282,6 +3275,28 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } +/* If we update tp->snd_una, also update tp->bytes_acked */ +static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) +{ + u32 delta = ack - tp->snd_una; + + u64_stats_update_begin(&tp->syncp); + tp->bytes_acked += delta; + u64_stats_update_end(&tp->syncp); + tp->snd_una = ack; +} + +/* If we update tp->rcv_nxt, also update tp->bytes_received */ +static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) +{ + u32 delta = seq - tp->rcv_nxt; + + u64_stats_update_begin(&tp->syncp); + tp->bytes_received += delta; + u64_stats_update_end(&tp->syncp); + tp->rcv_nxt = seq; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -3317,11 +3332,41 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } } - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); return flag; } +/* Return true if we're currently rate-limiting out-of-window ACKs and + * thus shouldn't send a dupack right now. We rate-limit dupacks in + * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS + * attacks that send repeated SYNs or ACKs for the same connection. To + * do this, we do not send a duplicate SYNACK or ACK if the remote + * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. + */ +bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, + int mib_idx, u32 *last_oow_ack_time) +{ + /* Data packets without SYNs are not likely part of an ACK loop. 
*/ + if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && + !tcp_hdr(skb)->syn) + goto not_rate_limited; + + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + +not_rate_limited: + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) { @@ -3415,6 +3460,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sacktag_state sack_state; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3423,7 +3469,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - long sack_rtt_us = -1L; + + sack_state.first_sackt.v64 = 0; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3469,7 +3516,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * Note, we use the fact that SND.UNA>=SND.WL2. 
*/ tcp_update_wl(tp, ack_seq); - tp->snd_una = ack; + tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); @@ -3487,7 +3534,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; @@ -3512,7 +3559,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, - sack_rtt_us); + &sack_state); acked -= tp->packets_out; /* Advance cwnd if state allows */ @@ -3564,7 +3611,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt_us); + &sack_state); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -3573,6 +3620,23 @@ old_ack: return 0; } +static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, + bool syn, struct tcp_fastopen_cookie *foc, + bool exp_opt) +{ + /* Valid only in SYN or SYN-ACK with an even length. */ + if (!foc || !syn || len < 0 || (len & 1)) + return; + + if (len >= TCP_FASTOPEN_COOKIE_MIN && + len <= TCP_FASTOPEN_COOKIE_MAX) + memcpy(foc->val, cookie, len); + else if (len != 0) + len = -1; + foc->len = len; + foc->exp = exp_opt; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -3662,21 +3726,22 @@ void tcp_parse_options(const struct sk_buff *skb, */ break; #endif + case TCPOPT_FASTOPEN: + tcp_parse_fastopen_option( + opsize - TCPOLEN_FASTOPEN_BASE, + ptr, th->syn, foc, false); + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a - * 16 bits magic number. 
It's valid only in - * SYN or SYN-ACK with an even size. + * 16 bits magic number. */ - if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || - get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || - foc == NULL || !th->syn || (opsize & 1)) - break; - foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; - if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && - foc->len <= TCP_FASTOPEN_COOKIE_MAX) - memcpy(foc->val, ptr + 2, foc->len); - else if (foc->len != 0) - foc->len = -1; + if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && + get_unaligned_be16(ptr) == + TCPOPT_FASTOPEN_MAGIC) + tcp_parse_fastopen_option(opsize - + TCPOLEN_EXP_FASTOPEN_BASE, + ptr + 2, th->syn, foc, true); break; } @@ -4190,7 +4255,7 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4358,7 +4423,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 
1 : 0; - tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); @@ -4445,13 +4510,15 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: - if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4640,7 +4707,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk) struct sk_buff *head; u32 start, end; - if (skb == NULL) + if (!skb) return; start = TCP_SKB_CB(skb)->seq; @@ -4719,7 +4786,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (sk_under_memory_pressure(sk)) + else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4763,7 +4830,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) return false; /* If we are under soft global TCP memory pressure, do not expand. 
*/ @@ -4799,6 +4866,8 @@ static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); + /* pairs with tcp_poll() */ + smp_mb__after_atomic(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) tcp_new_space(sk); @@ -5095,7 +5164,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(sk->sk_rx_dst == NULL)) + if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. @@ -5197,7 +5266,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } @@ -5292,7 +5361,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); - if (skb != NULL) { + if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } @@ -5330,8 +5399,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; - u16 mss = tp->rx_opt.mss_clamp; - bool syn_drop; + u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; + bool syn_drop = false; if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; @@ -5343,16 +5412,25 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, mss = opt.mss_clamp; } - if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */ + if (!tp->syn_fastopen) { + /* Ignore an unsolicited cookie */ cookie->len = -1; + } else if (tp->total_retrans) { + /* SYN timed out and the SYN-ACK neither has a cookie nor + * acknowledges data. 
Presumably the remote received only + * the retransmitted (regular) SYNs: either the original + * SYN-data or the corresponding SYN-ACK was dropped. + */ + syn_drop = (cookie->len < 0 && data); + } else if (cookie->len < 0 && !tp->syn_data) { + /* We requested a cookie but didn't get it. If we did not use + * the (old) exp opt format then try so next time (try_exp=1). + * Otherwise we go back to use the RFC7413 opt (try_exp=2). + */ + try_exp = tp->syn_fastopen_exp ? 2 : 1; + } - /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably - * the remote receives only the retransmitted (regular) SYNs: either - * the original SYN-data or the corresponding SYN-ACK is lost. - */ - syn_drop = (cookie->len <= 0 && data && tp->total_retrans); - - tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ tcp_for_write_queue_from(data, sk) { @@ -5661,11 +5739,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } req = tp->fastopen_rsk; - if (req != NULL) { + if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (tcp_check_req(sk, skb, req, NULL, true) == NULL) + if (!tcp_check_req(sk, skb, req, true)) goto discard; } @@ -5751,7 +5829,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * ACK we have received, this would have acknowledged * our SYNACK so stop the SYNACK timer. */ - if (req != NULL) { + if (req) { /* Return RST if ack_seq is invalid. 
* Note that RFC793 only says to generate a * DUPACK for it but for TCP Fast Open it seems @@ -5913,6 +5991,97 @@ static void tcp_ecn_create_request(struct request_sock *req, inet_rsk(req)->ecn_ok = 1; } +static void tcp_openreq_init(struct request_sock *req, + const struct tcp_options_received *rx_opt, + struct sk_buff *skb, const struct sock *sk) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->cookie_ts = 0; + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->last_oow_ack_time = 0; + req->mss = rx_opt->mss_clamp; + req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; + ireq->tstamp_ok = rx_opt->tstamp_ok; + ireq->sack_ok = rx_opt->sack_ok; + ireq->snd_wscale = rx_opt->snd_wscale; + ireq->wscale_ok = rx_opt->wscale_ok; + ireq->acked = 0; + ireq->ecn_ok = 0; + ireq->ir_rmt_port = tcp_hdr(skb)->source; + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); + ireq->ir_mark = inet_request_mark(sk, skb); +} + +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + kmemcheck_annotate_bitfield(ireq, flags); + ireq->opt = NULL; + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + } + + return req; +} +EXPORT_SYMBOL(inet_reqsk_alloc); + +/* + * Return true if a syncookie should be sent + */ +static bool tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto) +{ + const char *msg = "Dropping request"; + bool want_cookie = false; + struct listen_sock *lopt; + +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + msg = "Sending cookies"; + want_cookie = true; + 
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + } else +#endif + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); + + lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; + if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { + lopt->synflood_warned = 1; + pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", + proto, ntohs(tcp_hdr(skb)->dest), msg); + } + return want_cookie; +} + +static void tcp_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + u32 *copy; + + copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); + if (copy) { + copy[0] = len; + memcpy(&copy[1], skb_network_header(skb), len); + req->saved_syn = copy; + } + } +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -5950,7 +6119,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops); + req = inet_reqsk_alloc(rsk_ops, sk); if (!req) goto drop; @@ -5967,6 +6136,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); + /* Note: tcp_v6_init_req() might override ir_iif for link locals */ + inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; + af_ops->init_req(req, sk, skb); if (security_inet_conn_request(sk, skb, req)) @@ -6039,9 +6211,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (err || want_cookie) goto drop_and_free; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } + tcp_reqsk_record_syn(sk, req, skb); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f1756ee02207..d7d4c2b79cf2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -122,7 +122,7 @@ int tcp_twsk_unique(struct sock *sk,
struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (twp == NULL || (sysctl_tcp_tw_reuse && + (!twp || (sysctl_tcp_tw_reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) @@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; - inet->inet_rcv_saddr = inet->inet_saddr; + sk_rcv_saddr_set(sk, inet->inet_saddr); if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ @@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; - inet->inet_daddr = daddr; + sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) @@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) dst->ops->redirect(dst, sk, skb); } + +/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ +void tcp_req_err(struct sock *sk, u32 seq) +{ + struct request_sock *req = inet_reqsk(sk); + struct net *net = sock_net(sk); + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + WARN_ON(req->sk); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + reqsk_put(req); + } else { + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + } +} +EXPORT_SYMBOL(tcp_req_err); + /* * This routine is called by the ICMP module when it gets some * sort of error condition. 
If err < 0 then the socket should @@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) int err; struct net *net = dev_net(icmp_skb->dev); - sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, - iph->saddr, th->source, inet_iif(icmp_skb)); + sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, + th->dest, iph->saddr, ntohs(th->source), + inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; @@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_twsk_put(inet_twsk(sk)); return; } + seq = ntohl(th->seq); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_err(sk, seq); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - seq = ntohl(th->seq); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -458,42 +489,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req, **prev; - case TCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - req = inet_csk_search_req(sk, &prev, th->dest, - iph->daddr, iph->saddr); - if (!req) - goto out; - - /* ICMPs are not backlogged, hence we cannot get - an established socket here. - */ - WARN_ON(req->sk); - - if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(sk, req, prev); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - goto out; - case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. 
If a fast open socket is * is already accepted it is treated as a connected one below. */ - if (fastopen && fastopen->sk == NULL) + if (fastopen && !fastopen->sk) break; if (!sock_owned_by_user(sk)) { @@ -647,7 +648,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (!key) goto release_sk1; - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb); + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) goto release_sk1; } else { @@ -855,35 +856,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -/* - * Return true if a syncookie should be sent - */ -bool tcp_syn_flood_action(struct sock *sk, - const struct sk_buff *skb, - const char *proto) -{ - const char *msg = "Dropping request"; - bool want_cookie = false; - struct listen_sock *lopt; - -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { - msg = "Sending cookies"; - want_cookie = true; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); - } else -#endif - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; - pr_info("%s: Possible SYN flooding on port %d. %s. 
Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); - } - return want_cookie; -} -EXPORT_SYMBOL(tcp_syn_flood_action); #ifdef CONFIG_TCP_MD5SIG /* @@ -897,10 +869,10 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, const union tcp_md5_addr *addr, int family) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); - struct tcp_md5sig_info *md5sig; + const struct tcp_md5sig_info *md5sig; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, @@ -923,24 +895,15 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, EXPORT_SYMBOL(tcp_md5_do_lookup); struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, - struct sock *addr_sk) + const struct sock *addr_sk) { - union tcp_md5_addr *addr; + const union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr; + addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); -static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, - struct request_sock *req) -{ - union tcp_md5_addr *addr; - - addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr; - return tcp_md5_do_lookup(sk, addr, AF_INET); -} - /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) @@ -1101,8 +1064,8 @@ clear_hash_noput: return 1; } -int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, - const struct sock *sk, const struct request_sock *req, +int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; @@ -1110,12 +1073,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, const struct tcphdr *th = 
tcp_hdr(skb); __be32 saddr, daddr; - if (sk) { - saddr = inet_sk(sk)->inet_saddr; - daddr = inet_sk(sk)->inet_daddr; - } else if (req) { - saddr = inet_rsk(req)->ir_loc_addr; - daddr = inet_rsk(req)->ir_rmt_addr; + if (sk) { /* valid for establish/request sockets */ + saddr = sk->sk_rcv_saddr; + daddr = sk->sk_daddr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; @@ -1152,8 +1112,9 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static bool __tcp_v4_inbound_md5_hash(struct sock *sk, - const struct sk_buff *skb) +/* Called with rcu_read_lock() */ +static bool tcp_v4_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1193,7 +1154,7 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk, */ genhash = tcp_v4_md5_hash_skb(newhash, hash_expected, - NULL, NULL, skb); + NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", @@ -1205,28 +1166,16 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk, } return false; } - -static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) -{ - bool ret; - - rcu_read_lock(); - ret = __tcp_v4_inbound_md5_hash(sk, skb); - rcu_read_unlock(); - - return ret; -} - #endif -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); } @@ -1259,7 +1208,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { static const struct 
tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG - .md5_lookup = tcp_v4_reqsk_md5_lookup, + .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif .init_req = tcp_v4_init_req, @@ -1318,8 +1267,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); @@ -1356,7 +1305,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Copy over the MD5 key from the original socket */ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, AF_INET); - if (key != NULL) { + if (key) { /* * We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1391,15 +1340,18 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); + struct request_sock *req; struct sock *nsk; - struct request_sock **prev; - /* Find possible connection requests. 
*/ - struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, - iph->saddr, iph->daddr); - if (req) - return tcp_check_req(sk, skb, req, prev, false); + + req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); + if (req) { + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) + reqsk_put(req); + return nsk; + } nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -1439,7 +1391,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - dst->ops->check(dst, 0) == NULL) { + !dst->ops->check(dst, 0)) { dst_release(dst); sk->sk_rx_dst = NULL; } @@ -1448,7 +1400,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1517,7 +1469,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (sk) { skb->sk = sk; skb->destructor = sock_edemux; - if (sk->sk_state != TCP_TIME_WAIT) { + if (sk_fullsock(sk)) { struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) @@ -1674,6 +1626,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1694,7 +1647,7 @@ no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; - if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: @@ -1718,10 +1671,6 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff << 2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1734,7 +1683,7 @@ do_time_wait: iph->daddr, th->dest, 
inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process; @@ -1846,10 +1795,11 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - BUG_ON(tp->fastopen_rsk != NULL); + BUG_ON(tp->fastopen_rsk); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); + tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); sock_release_memcg(sk); @@ -1904,13 +1854,13 @@ get_req: } sk = sk_nulls_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_nulls_next(sk); } get_sk: @@ -1922,7 +1872,7 @@ get_sk: goto out; } icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); @@ -1931,7 +1881,7 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2150,7 +2100,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_OPENREQ: if (v) { struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) @@ -2204,17 +2154,17 @@ void tcp_proc_unregister(struct net 
*net, struct tcp_seq_afinfo *afinfo) } EXPORT_SYMBOL(tcp_proc_unregister); -static void get_openreq4(const struct sock *sk, const struct request_sock *req, +static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i, kuid_t uid) { const struct inet_request_sock *ireq = inet_rsk(req); - long delta = req->expires - jiffies; + long delta = req->rsk_timer.expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, ireq->ir_loc_addr, - ntohs(inet_sk(sk)->inet_sport), + ireq->ir_num, ireq->ir_rmt_addr, ntohs(ireq->ir_rmt_port), TCP_SYN_RECV, @@ -2225,7 +2175,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, from_kuid_munged(seq_user_ns(f), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ - atomic_read(&sk->sk_refcnt), + 0, req); } @@ -2291,9 +2241,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i) { + long delta = tw->tw_timer.expires - jiffies; __be32 dest, src; __u16 destp, srcp; - s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; @@ -2332,7 +2282,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid); + get_openreq4(v, seq, st->num, st->uid); break; } out: @@ -2458,10 +2408,15 @@ static int __net_init tcp_sk_init(struct net *net) goto fail; *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } + net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn_fallback = 1; + net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; - return 0; + net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; + net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 
e5f41bd5ec1b..a51d63a43e33 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -28,7 +28,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s struct tcp_fastopen_metrics { u16 mss; - u16 syn_loss:10; /* Recurring Fast Open SYN losses */ + u16 syn_loss:10, /* Recurring Fast Open SYN losses */ + try_exp:2; /* Request w/ exp. option (once) */ unsigned long last_syn_loss; /* Last Fast Open SYN loss */ struct tcp_fastopen_cookie cookie; }; @@ -40,6 +41,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; + possible_net_t tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; @@ -52,6 +54,11 @@ struct tcp_metrics_block { struct rcu_head rcu_head; }; +static inline struct net *tm_net(struct tcp_metrics_block *tm) +{ + return read_pnet(&tm->tcpm_net); +} + static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { @@ -74,23 +81,20 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - const struct in6_addr *a6, *b6; - if (a->family != b->family) return false; if (a->family == AF_INET) return a->addr.a4 == b->addr.a4; - - a6 = (const struct in6_addr *) &a->addr.a6[0]; - b6 = (const struct in6_addr *) &b->addr.a6[0]; - - return ipv6_addr_equal(a6, b6); + return ipv6_addr_equal(&a->addr.in6, &b->addr.in6); } struct tcpm_hash_bucket { struct tcp_metrics_block __rcu *chain; }; +static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; +static unsigned int tcp_metrics_hash_log __read_mostly; + static DEFINE_SPINLOCK(tcp_metrics_lock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, @@ -128,6 +132,8 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, if (fastopen_clear) { tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.try_exp = 0; + tm->tcpm_fastopen.cookie.exp 
= false; tm->tcpm_fastopen.cookie.len = 0; } } @@ -143,6 +149,9 @@ static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst #define TCP_METRICS_RECLAIM_DEPTH 5 #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL +#define deref_locked(p) \ + rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock)) + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, @@ -171,9 +180,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; - oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); - for (tm = rcu_dereference(oldest->tcpm_next); tm; - tm = rcu_dereference(tm->tcpm_next)) { + oldest = deref_locked(tcp_metrics_hash[hash].chain); + for (tm = deref_locked(oldest->tcpm_next); tm; + tm = deref_locked(tm->tcpm_next)) { if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) oldest = tm; } @@ -183,14 +192,15 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } + write_pnet(&tm->tcpm_net, net); tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { - tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; - rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); + tm->tcpm_next = tcp_metrics_hash[hash].chain; + rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm); } out_unlock: @@ -214,10 +224,11 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s struct tcp_metrics_block *tm; int depth = 0; - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && - addr_same(&tm->tcpm_daddr, daddr)) + addr_same(&tm->tcpm_daddr, daddr) && + net_eq(tm_net(tm), net)) break; depth++; } @@ -242,8 +253,8 @@ static struct 
tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; - *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr; + daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr; hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -252,12 +263,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, } net = dev_net(dst->dev); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr)) + addr_same(&tm->tcpm_daddr, &daddr) && + net_eq(tm_net(tm), net)) break; } tcpm_check_stamp(tm, dst); @@ -288,9 +301,9 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock hash = (__force unsigned int) daddr.addr.a4; } else { saddr.family = AF_INET6; - *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; + saddr.addr.in6 = tw->tw_v6_rcv_saddr; daddr.family = AF_INET6; - *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; + daddr.addr.in6 = tw->tw_v6_daddr; hash = ipv6_addr_hash(&tw->tw_v6_daddr); } } @@ -299,12 +312,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock return NULL; net = twsk_net(tw); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, 
&daddr)) + addr_same(&tm->tcpm_daddr, &daddr) && + net_eq(tm_net(tm), net)) break; } return tm; @@ -336,9 +351,9 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, hash = (__force unsigned int) daddr.addr.a4; } else { saddr.family = AF_INET6; - *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; + saddr.addr.in6 = sk->sk_v6_rcv_saddr; daddr.family = AF_INET6; - *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; + daddr.addr.in6 = sk->sk_v6_daddr; hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } @@ -347,7 +362,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, return NULL; net = dev_net(dst->dev); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) @@ -492,7 +508,7 @@ void tcp_init_metrics(struct sock *sk) struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ - if (dst == NULL) + if (!dst) goto reset; dst_confirm(dst); @@ -700,6 +716,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, if (tfom->mss) *mss = tfom->mss; *cookie = tfom->cookie; + if (cookie->len <= 0 && tfom->try_exp == 1) + cookie->exp = true; *syn_loss = tfom->syn_loss; *last_syn_loss = *syn_loss ? 
tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); @@ -708,7 +726,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, } void tcp_fastopen_cache_set(struct sock *sk, u16 mss, - struct tcp_fastopen_cookie *cookie, bool syn_lost) + struct tcp_fastopen_cookie *cookie, bool syn_lost, + u16 try_exp) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; @@ -725,6 +744,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, tfom->mss = mss; if (cookie && cookie->len > 0) tfom->cookie = *cookie; + else if (try_exp > tfom->try_exp && + tfom->cookie.len <= 0 && !tfom->cookie.exp) + tfom->try_exp = try_exp; if (syn_lost) { ++tfom->syn_loss; tfom->last_syn_loss = jiffies; @@ -773,19 +795,19 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, switch (tm->tcpm_daddr.family) { case AF_INET: - if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_daddr.addr.a4) < 0) + if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, + tm->tcpm_daddr.addr.a4) < 0) goto nla_put_failure; - if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, - tm->tcpm_saddr.addr.a4) < 0) + if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, + tm->tcpm_saddr.addr.a4) < 0) goto nla_put_failure; break; case AF_INET6: - if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, - tm->tcpm_daddr.addr.a6) < 0) + if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, + &tm->tcpm_daddr.addr.in6) < 0) goto nla_put_failure; - if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, - tm->tcpm_saddr.addr.a6) < 0) + if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, + &tm->tcpm_saddr.addr.in6) < 0) goto nla_put_failure; break; default: @@ -898,17 +920,19 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; for 
(row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; - struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; + struct tcpm_hash_bucket *hb = tcp_metrics_hash + row; rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; tm = rcu_dereference(tm->tcpm_next), col++) { + if (!net_eq(tm_net(tm), net)) + continue; if (col < s_col) continue; if (tcp_metrics_dump_info(skb, cb, tm) < 0) { @@ -933,7 +957,7 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, a = info->attrs[v4]; if (a) { addr->family = AF_INET; - addr->addr.a4 = nla_get_be32(a); + addr->addr.a4 = nla_get_in_addr(a); if (hash) *hash = (__force unsigned int) addr->addr.a4; return 0; @@ -943,9 +967,9 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; addr->family = AF_INET6; - memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); + addr->addr.in6 = nla_get_in6_addr(a); if (hash) - *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + *hash = ipv6_addr_hash(&addr->addr.in6); return 0; } return optional ? 
1 : -EAFNOSUPPORT; @@ -994,13 +1018,15 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) if (!reply) goto nla_put_failure; - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && - (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + (!src || addr_same(&tm->tcpm_saddr, &saddr)) && + net_eq(tm_net(tm), net)) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -1020,34 +1046,27 @@ out_free: return ret; } -#define deref_locked_genl(p) \ - rcu_dereference_protected(p, lockdep_genl_is_held() && \ - lockdep_is_held(&tcp_metrics_lock)) - -#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) - -static int tcp_metrics_flush_all(struct net *net) +static void tcp_metrics_flush_all(struct net *net) { - unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; - struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; + unsigned int max_rows = 1U << tcp_metrics_hash_log; + struct tcpm_hash_bucket *hb = tcp_metrics_hash; struct tcp_metrics_block *tm; unsigned int row; for (row = 0; row < max_rows; row++, hb++) { + struct tcp_metrics_block __rcu **pp; spin_lock_bh(&tcp_metrics_lock); - tm = deref_locked_genl(hb->chain); - if (tm) - hb->chain = NULL; - spin_unlock_bh(&tcp_metrics_lock); - while (tm) { - struct tcp_metrics_block *next; - - next = deref_genl(tm->tcpm_next); - kfree_rcu(tm, rcu_head); - tm = next; + pp = &hb->chain; + for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { + if (net_eq(tm_net(tm), net)) { + *pp = tm->tcpm_next; + kfree_rcu(tm, rcu_head); + } else { + pp = &tm->tcpm_next; + } } + spin_unlock_bh(&tcp_metrics_lock); } - return 0; } static int tcp_metrics_nl_cmd_del(struct 
sk_buff *skb, struct genl_info *info) @@ -1064,19 +1083,23 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; - if (ret > 0) - return tcp_metrics_flush_all(net); + if (ret > 0) { + tcp_metrics_flush_all(net); + return 0; + } ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - hb = net->ipv4.tcp_metrics_hash + hash; + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); + hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); - for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { + for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && - (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + (!src || addr_same(&tm->tcpm_saddr, &saddr)) && + net_eq(tm_net(tm), net)) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); found = true; @@ -1126,6 +1149,9 @@ static int __net_init tcp_net_metrics_init(struct net *net) size_t size; unsigned int slots; + if (!net_eq(net, &init_net)) + return 0; + slots = tcpmhash_entries; if (!slots) { if (totalram_pages >= 128 * 1024) @@ -1134,14 +1160,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = 8 * 1024; } - net->ipv4.tcp_metrics_hash_log = order_base_2(slots); - size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; + tcp_metrics_hash_log = order_base_2(slots); + size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!net->ipv4.tcp_metrics_hash) - net->ipv4.tcp_metrics_hash = vzalloc(size); + tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!tcp_metrics_hash) + tcp_metrics_hash = vzalloc(size); - if (!net->ipv4.tcp_metrics_hash) + if (!tcp_metrics_hash) return -ENOMEM; return 0; @@ -1149,19 +1175,7 @@ static int __net_init 
tcp_net_metrics_init(struct net *net) static void __net_exit tcp_net_metrics_exit(struct net *net) { - unsigned int i; - - for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { - struct tcp_metrics_block *tm, *next; - - tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); - while (tm) { - next = rcu_dereference_protected(tm->tcpm_next, 1); - kfree(tm); - tm = next; - } - } - kvfree(net->ipv4.tcp_metrics_hash); + tcp_metrics_flush_all(net); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { @@ -1175,16 +1189,10 @@ void __init tcp_metrics_init(void) ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) - goto cleanup; + panic("Could not allocate the tcp_metrics hash table\n"); + ret = genl_register_family_with_ops(&tcp_metrics_nl_family, tcp_metrics_nl_ops); if (ret < 0) - goto cleanup_subsys; - return; - -cleanup_subsys: - unregister_pernet_subsys(&tcp_net_metrics_ops); - -cleanup: - return; + panic("Could not register tcp_metrics generic netlink\n"); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dd11ac7798c6..4bc00cb79e60 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, - .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), .hashinfo = &tcp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&tcp_death_row), - .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&tcp_death_row), }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || 
TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_RST; } @@ -174,11 +163,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -211,13 +198,12 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -267,8 +253,7 @@ kill: * Do not reschedule in the last case. 
*/ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -283,18 +268,17 @@ EXPORT_SYMBOL(tcp_timewait_state_process); */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw; bool recycle_ok = false; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &tcp_death_row, state); - if (tw != NULL) { + if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); struct inet_sock *inet = inet_sk(sk); @@ -316,7 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; - tw->tw_flowlabel = np->flow_label >> 12; + tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif @@ -332,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) struct tcp_md5sig_key *key; tcptw->tw_md5_key = NULL; key = tp->af_specific->md5_lookup(sk, sk); - if (key != NULL) { + if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()) BUG(); @@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - inet_twsk_schedule(tw, &tcp_death_row, timeo, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -437,7 +420,10 @@ void 
tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_unlock(); } - if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) + /* If no valid choice made yet, assign current system default ca. */ + if (!ca_got_dst && + (!icsk->icsk_ca_setsockopt || + !try_module_get(icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); @@ -454,7 +440,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); - if (newsk != NULL) { + if (newsk) { const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); @@ -465,6 +451,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 0; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -553,6 +540,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; + newtp->saved_syn = req->saved_syn; + req->saved_syn = NULL; + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; @@ -572,7 +562,6 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev, bool fastopen) { struct tcp_options_received tmp_opt; @@ -629,9 +618,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && - !inet_rtx_syn_ack(sk, req)) - req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, - TCP_RTO_MAX) + jiffies; + !inet_rtx_syn_ack(sk, req)) { + unsigned long expires = jiffies; + + expires += min(TCP_TIMEOUT_INIT << req->num_timeout, + TCP_RTO_MAX); + if (!fastopen) + 
mod_timer_pending(&req->rsk_timer, expires); + else + req->rsk_timer.expires = expires; + } return NULL; } @@ -763,13 +759,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) + if (!child) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req, prev); - inet_csk_reqsk_queue_removed(sk, req); - + inet_csk_reqsk_queue_drop(sk, req); inet_csk_reqsk_queue_add(sk, req, child); + /* Warning: caller must not call reqsk_put(req); + * child stole last reference on it. + */ return child; listen_overflow: @@ -791,7 +788,7 @@ embryonic_reset: tcp_reset(sk); } if (!fastopen) { - inet_csk_reqsk_queue_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); } return NULL; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9d7930ba8e0f..9864a2dbadce 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -29,8 +29,8 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, } } -struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, + netdev_features_t features) { if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); @@ -77,7 +77,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, oldlen = (u16)~skb->len; __skb_pull(skb, thlen); - mss = tcp_skb_mss(skb); + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -242,7 +242,7 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); - mss = tcp_skb_mss(p); + mss = skb_shinfo(p)->gso_size; flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1db253e36045..b1c218df2c85 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ 
-50,8 +50,8 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1; */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; -/* Default TSQ limit of two TSO segments */ -int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +/* Default TSQ limit of four TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 262144; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames @@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) } } +static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, struct sock *sk) @@ -393,8 +402,6 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; @@ -402,8 +409,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; - shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -518,17 +523,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + u8 *p = (u8 *)ptr; + u32 len; /* Fast Open option length */ + + if (foc->exp) { + len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | + TCPOPT_FASTOPEN_MAGIC); + p += TCPOLEN_EXP_FASTOPEN_BASE; + } else { + len = TCPOLEN_FASTOPEN_BASE + foc->len; + *p++ = TCPOPT_FASTOPEN; + 
*p++ = len; + } - *ptr++ = htonl((TCPOPT_EXP << 24) | - ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | - TCPOPT_FASTOPEN_MAGIC); - - memcpy(ptr, foc->val, foc->len); - if ((foc->len & 3) == 2) { - u8 *align = ((u8 *)ptr) + foc->len; - align[0] = align[1] = TCPOPT_NOP; + memcpy(p, foc->val, foc->len); + if ((len & 3) == 2) { + p[foc->len] = TCPOPT_NOP; + p[foc->len + 1] = TCPOPT_NOP; } - ptr += (foc->len + 3) >> 2; + ptr += (len + 3) >> 2; } } @@ -565,7 +579,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; - if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { + if (likely(sysctl_tcp_timestamps && !*md5)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; @@ -583,13 +597,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } if (fastopen && fastopen->cookie.len >= 0) { - u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; + u32 need = fastopen->cookie.len; + + need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : + TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; + tp->syn_fastopen_exp = fastopen->cookie.exp ? 
1 : 0; } } @@ -601,15 +619,14 @@ static unsigned int tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5, + const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; #ifdef CONFIG_TCP_MD5SIG - *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); - if (*md5) { + if (md5) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; @@ -620,8 +637,6 @@ static unsigned int tcp_synack_options(struct sock *sk, */ ireq->tstamp_ok &= !ireq->sack_ok; } -#else - *md5 = NULL; #endif /* We always send an MSS option. */ @@ -645,7 +660,10 @@ static unsigned int tcp_synack_options(struct sock *sk, remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (foc != NULL && foc->len >= 0) { - u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; + u32 need = foc->len; + + need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : + TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; @@ -981,6 +999,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); + skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) tcp_ecn_send(sk, skb, tcp_header_size); @@ -989,7 +1008,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (md5) { sk_nocaps_add(sk, NETIF_F_GSO_MASK); tp->af_specific->calc_md5_hash(opts.hash_location, - md5, sk, NULL, skb); + md5, sk, skb); } #endif @@ -1005,8 +1024,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + tp->segs_out += tcp_skb_pcount(skb); + /* OK, its time to fill 
skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; @@ -1043,25 +1064,17 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - - /* Make sure we own this skb before messing gso_size/gso_segs */ - WARN_ON_ONCE(skb_cloned(skb)); - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { /* Avoid the costly divide in the normal * non-TSO case. */ tcp_skb_pcount_set(skb, 1); - shinfo->gso_size = 0; - shinfo->gso_type = 0; + TCP_SKB_CB(skb)->tcp_gso_size = 0; } else { tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - shinfo->gso_size = mss_now; - shinfo->gso_type = sk->sk_gso_type; + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; } } @@ -1150,8 +1163,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, gfp); - if (buff == NULL) + buff = sk_stream_alloc_skb(sk, nsize, gfp, true); + if (!buff) return -ENOMEM; /* We'll just try again later. */ sk->sk_wmem_queued += buff->truesize; @@ -1193,8 +1206,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* If this packet has been sent out already, we must * adjust the various packet counters. 
@@ -1274,7 +1287,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } @@ -1354,6 +1367,8 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; + if (icsk->icsk_mtup.enabled) + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1604,13 +1619,12 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, * This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, - unsigned int mss_now) +static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); tso_segs = tcp_skb_pcount(skb); } return tso_segs; @@ -1665,7 +1679,7 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, const struct tcp_sock *tp = tcp_sk(sk); unsigned int cwnd_quota; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) return 0; @@ -1707,8 +1721,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp); - if (unlikely(buff == NULL)) + buff = sk_stream_alloc_skb(sk, 0, gfp, true); + if (unlikely(!buff)) return -ENOMEM; sk->sk_wmem_queued += buff->truesize; @@ -1734,8 +1748,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, tcp_fragment_tstamp(skb, 
buff); /* Fix up tso_factor for both original and new SKB. */ - tcp_set_skb_tso_segs(sk, skb, mss_now); - tcp_set_skb_tso_segs(sk, buff, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); + tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); @@ -1752,20 +1766,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, u32 max_segs) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - u32 send_win, cong_win, limit, in_flight; + u32 age, send_win, cong_win, limit, in_flight; + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + struct sk_buff *head; int win_divisor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; - if (icsk->icsk_ca_state != TCP_CA_Open) + if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR))) goto send_now; - /* Defer for less than two clock ticks. */ - if (tp->tso_deferred && - (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) + /* Avoid bursty behavior by allowing defer + * only if the last write was recent. + */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1807,11 +1824,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - /* Ok, it looks like it is advisable to defer. - * Do not rearm the timer if already set to not break TCP ACK clocking. - */ - if (!tp->tso_deferred) - tp->tso_deferred = 1 | (jiffies << 1); + head = tcp_write_queue_head(sk); + skb_mstamp_get(&now); + age = skb_mstamp_us_delta(&now, &head->skb_mstamp); + /* If next ACK is likely to come too late (half srtt), do not defer */ + if (age < (tp->srtt_us >> 4)) + goto send_now; + + /* Ok, it looks like it is advisable to defer. 
*/ if (cong_win < send_win && cong_win < skb->len) *is_cwnd_limited = true; @@ -1819,10 +1839,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, return true; send_now: - tp->tso_deferred = 0; return false; } +static inline void tcp_mtu_check_reprobe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + u32 interval; + s32 delta; + + interval = net->ipv4.sysctl_tcp_probe_interval; + delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + if (unlikely(delta >= interval * HZ)) { + int mss = tcp_current_mss(sk); + + /* Update current search range */ + icsk->icsk_mtup.probe_size = 0; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + + /* Update probe time stamp */ + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + } +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -1837,11 +1881,13 @@ static int tcp_mtu_probe(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *nskb, *next; + struct net *net = sock_net(sk); int len; int probe_size; int size_needed; int copy; int mss_now; + int interval; /* Not currently probing/verifying, * not in recovery, @@ -1854,12 +1900,25 @@ static int tcp_mtu_probe(struct sock *sk) tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; - /* Very simple search strategy: just double the MSS. */ + /* Use binary search for probe_size between tcp_mss_base, + * and current mss_clamp. if (search_high - search_low) + * smaller than a threshold, backoff from probing. 
+ */ mss_now = tcp_current_mss(sk); - probe_size = 2 * tp->mss_cache; + probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; - if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { - /* TODO: set timer for probe_converge_event */ + interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; + /* When misfortune happens, we are reprobing actively, + * and then reprobe timer has expired. We stick with current + * probing process by not resetting search range to its orignal. + */ + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || + interval < net->ipv4.sysctl_tcp_probe_threshold) { + /* Check whether enough time has elaplased for + * another round of probing. + */ + tcp_mtu_check_reprobe(sk); return -1; } @@ -1881,7 +1940,8 @@ static int tcp_mtu_probe(struct sock *sk) } /* We're allowed to probe. Build it now. */ - if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) + nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); + if (!nskb) return -1; sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); @@ -1923,7 +1983,7 @@ static int tcp_mtu_probe(struct sock *sk) skb->len, 0); } else { __pskb_trim_head(skb, copy); - tcp_set_skb_tso_segs(sk, skb, mss_now); + tcp_set_skb_tso_segs(skb, mss_now); } TCP_SKB_CB(skb)->seq += copy; } @@ -1933,7 +1993,7 @@ static int tcp_mtu_probe(struct sock *sk) if (len >= probe_size) break; } - tcp_init_tso_segs(sk, nskb, nskb->len); + tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). 
@@ -1995,7 +2055,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, while ((skb = tcp_send_head(sk))) { unsigned int limit; - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); + tso_segs = tcp_init_tso_segs(skb, mss_now); BUG_ON(!tso_segs); if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { @@ -2017,7 +2077,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - if (tso_segs == 1 || !max_segs) { + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) @@ -2030,7 +2090,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } limit = mss_now; - if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp)) + if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, @@ -2179,7 +2239,7 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); int err = -1; - if (tcp_send_head(sk) != NULL) { + if (tcp_send_head(sk)) { err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); goto rearm_timer; } @@ -2331,7 +2391,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2549,11 +2609,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (unlikely(oldpcount > 1)) { if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - tcp_init_tso_segs(sk, skb, cur_mss); + tcp_init_tso_segs(skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } } + /* RFC3168, section 6.1.1.1. 
ECN fallback */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + tcp_retrans_try_collapse(sk, skb, cur_mss); /* Make a copy, if the first transmission SKB clone we made @@ -2689,7 +2753,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == tcp_send_head(sk)) break; /* we could do better than to assign each time */ - if (hole == NULL) + if (!hole) tp->retransmit_skb_hint = skb; /* Assume this retransmit will generate @@ -2713,7 +2777,7 @@ begin_fwd: if (!tcp_can_forward_retransmit(sk)) break; /* Backtrack if necessary to non-L'ed skb */ - if (hole != NULL) { + if (hole) { skb = hole; hole = NULL; } @@ -2721,7 +2785,7 @@ begin_fwd: goto begin_fwd; } else if (!(sacked & TCPCB_LOST)) { - if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) + if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; @@ -2751,39 +2815,67 @@ begin_fwd: } } -/* Send a fin. The caller locks the socket for us. This cannot be - * allowed to fail queueing a FIN frame under any circumstances. +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() + */ +void sk_forced_mem_schedule(struct sock *sk, int size) +{ + int amt, status; + + if (size <= sk->sk_forward_alloc) + return; + amt = sk_mem_pages(size); + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + sk_memory_allocated_add(sk, amt, &status); +} + +/* Send a FIN. The caller locks the socket for us. + * We should try to send a FIN packet really hard, but eventually give up. 
*/ void tcp_send_fin(struct sock *sk) { + struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_write_queue_tail(sk); - int mss_now; - /* Optimization, tack on the FIN if we have a queue of - * unsent frames. But be careful about outgoing SACKS - * and IP options. + /* Optimization, tack on the FIN if we have one skb in write queue and + * this skb was not yet sent, or we are under memory pressure. + * Note: in the latter case, FIN packet will be sent after a timeout, + * as TCP stack thinks it has already been transmitted. */ - mss_now = tcp_current_mss(sk); - - if (tcp_send_head(sk) != NULL) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; - TCP_SKB_CB(skb)->end_seq++; + if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { +coalesce: + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; + if (!tcp_send_head(sk)) { + /* This means tskb was already sent. + * Pretend we included the FIN on previous transmit. + * We need to set tp->snd_nxt to the value it would have + * if FIN had been sent. This is because retransmit path + * does not change tp->snd_nxt. + */ + tp->snd_nxt++; + return; + } } else { - /* Socket is locked, keep trying until memory is available. */ - for (;;) { - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); - if (skb) - break; - yield(); + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { + if (tskb) + goto coalesce; + return; } + skb_reserve(skb, MAX_TCP_HEADER); + sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). 
*/ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to @@ -2824,14 +2916,14 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_debug("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); - if (nskb == NULL) + if (!nskb) return -ENOMEM; tcp_unlink_write_queue(skb, sk); __skb_header_release(nskb); @@ -2866,7 +2958,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; struct sk_buff *skb; - struct tcp_md5sig_key *md5; + struct tcp_md5sig_key *md5 = NULL; int tcp_header_size; int mss; @@ -2879,7 +2971,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_reserve(skb, MAX_TCP_HEADER); skb_dst_set(skb, dst); - security_skb_owned_by(skb, sk); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) @@ -2892,7 +2983,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif skb_mstamp_get(&skb->skb_mstamp); - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + +#ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); + md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); +#endif + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, foc) + sizeof(*th); skb_push(skb, tcp_header_size); @@ -2923,12 +3019,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if 
needed */ - if (md5) { + if (md5) tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, - md5, NULL, req, skb); - } + md5, req_to_sk(req), skb); + rcu_read_unlock(); #endif + /* Do not fool tcpdump (if any), clean our debris */ + skb->tstamp.tv64 = 0; return skb; } EXPORT_SYMBOL(tcp_make_synack); @@ -2966,7 +3064,7 @@ static void tcp_connect_init(struct sock *sk) (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); #ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk) != NULL) + if (tp->af_specific->md5_lookup(sk, sk)) tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif @@ -3082,7 +3180,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; @@ -3148,7 +3246,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; @@ -3252,7 +3350,7 @@ void tcp_send_ack(struct sock *sk) * sock. */ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); - if (buff == NULL) { + if (!buff) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, @@ -3289,14 +3387,14 @@ EXPORT_SYMBOL_GPL(tcp_send_ack); * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ -static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. 
*/ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); - if (skb == NULL) + if (!skb) return -1; /* Reserve space for headers and set control bits. */ @@ -3307,6 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); + NET_INC_STATS_BH(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3314,12 +3413,12 @@ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; - tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ -int tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -3327,8 +3426,8 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return -1; - if ((skb = tcp_send_head(sk)) != NULL && - before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + skb = tcp_send_head(sk); + if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; @@ -3347,7 +3446,7 @@ int tcp_write_wakeup(struct sock *sk) if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); @@ -3356,8 +3455,8 @@ int tcp_write_wakeup(struct sock *sk) return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) - tcp_xmit_probe_skb(sk, 1); - return tcp_xmit_probe_skb(sk, 0); + tcp_xmit_probe_skb(sk, 1, mib); + return tcp_xmit_probe_skb(sk, 0, mib); } } @@ -3371,7 +3470,7 @@ void tcp_send_probe0(struct sock *sk) unsigned long probe_max; int err; - err = 
tcp_write_wakeup(sk); + err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || !tcp_send_head(sk)) { /* Cancel probe timer, if it is not required. */ @@ -3397,7 +3496,7 @@ void tcp_send_probe0(struct sock *sk) probe_max = TCP_RESOURCE_PROBE_INTERVAL; } inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - inet_csk_rto_backoff(icsk, probe_max), + tcp_probe0_when(sk, probe_max), TCP_RTO_MAX); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0732b787904e..5b752f58a900 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); @@ -166,7 +167,7 @@ static int tcp_write_timeout(struct sock *sk) if (icsk->icsk_retransmits) { dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true); + tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (tp->syn_data) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); @@ -246,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk) } out: - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) sk_mem_reclaim(sk); } @@ -326,7 +327,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; - req->rsk_ops->syn_ack_timeout(sk, req); + req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { tcp_write_err(sk); @@ -538,19 +539,11 @@ static void tcp_write_timer(unsigned long data) sock_put(sk); } -/* - * Timer for listening sockets - */ - -static void tcp_synack_timer(struct sock *sk) +void tcp_syn_ack_timeout(const struct request_sock *req) { - inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); -} + 
struct net *net = read_pnet(&inet_rsk(req)->ireq_net); -void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) -{ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); + NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); @@ -582,7 +575,7 @@ static void tcp_keepalive_timer (unsigned long data) } if (sk->sk_state == TCP_LISTEN) { - tcp_synack_timer(sk); + pr_err("Hmm... keepalive on a LISTEN ???\n"); goto out; } @@ -623,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data) tcp_write_err(sk); goto out; } - if (tcp_write_wakeup(sk) <= 0) { + if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a6afde666ab1..a6cea1d5e20d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -286,19 +286,21 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) } /* Extract info for Tcp socket info provided via netlink. 
*/ -void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = ca->doing_vegas_now, - .tcpv_rttcnt = ca->cntRTT, - .tcpv_rtt = ca->baseRTT, - .tcpv_minrtt = ca->minRTT, - }; - - nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + info->vegas.tcpv_enabled = ca->doing_vegas_now, + info->vegas.tcpv_rttcnt = ca->cntRTT, + info->vegas.tcpv_rtt = ca->baseRTT, + info->vegas.tcpv_minrtt = ca->minRTT, + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } + return 0; } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 0531b99d8637..ef9da5306c68 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); #endif /* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index bb63fba47d47..c10732e39837 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -256,20 +256,21 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) } /* Extract info for Tcp socket info provided via netlink. 
*/ -static void tcp_westwood_info(struct sock *sk, u32 ext, - struct sk_buff *skb) +static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rtt = jiffies_to_usecs(ca->rtt), - .tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = 0; + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), - nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } + return 0; } static struct tcp_congestion_ops tcp_westwood __read_mostly = { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 97ef1f8b7be8..83aa604f9273 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -90,6 +90,7 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> @@ -318,8 +319,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); } -static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, - unsigned int port) +static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, + unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } @@ -421,9 +422,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, return score; } -static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +static u32 udp_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 udp_ehash_secret __read_mostly; @@ -433,7 +434,6 @@ static unsigned int udp_ehashfn(struct 
net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } - /* called with read_rcu_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -633,7 +633,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); - if (sk == NULL) { + if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } @@ -873,8 +873,7 @@ out: } EXPORT_SYMBOL(udp_push_pending_frames); -int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); @@ -1012,7 +1011,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); - if (rt == NULL) { + if (!rt) { struct net *net = sock_net(sk); fl4 = &fl4_stack; @@ -1136,7 +1135,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, * sendpage interface can't pass. * This will succeed only when the socket is connected. */ - ret = udp_sendmsg(NULL, sk, &msg, 0); + ret = udp_sendmsg(sk, &msg, 0); if (ret < 0) return ret; } @@ -1172,7 +1171,6 @@ out: return ret; } - /** * first_packet_length - return length of first packet in receive queue * @sk: socket @@ -1254,8 +1252,8 @@ EXPORT_SYMBOL(udp_ioctl); * return it, otherwise we block. 
*/ -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); @@ -1348,15 +1346,12 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } - int udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -1523,7 +1518,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* if we're overly short, let UDP handle it */ encap_rcv = ACCESS_ONCE(up->encap_rcv); - if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { + if (skb->len > sizeof(struct udphdr) && encap_rcv) { int ret; /* Verify checksum before giving to encap */ @@ -1580,7 +1575,6 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); @@ -1610,7 +1604,6 @@ drop: return -1; } - static void flush_stack(struct sock **stack, unsigned int count, struct sk_buff *skb, unsigned int final) { @@ -1620,7 +1613,7 @@ static void flush_stack(struct sock **stack, unsigned int count, for (i = 0; i < count; i++) { sk = stack[i]; - if (likely(skb1 == NULL)) + if (likely(!skb1)) skb1 = (i == final) ? 
skb : skb_clone(skb, GFP_ATOMIC); if (!skb1) { @@ -1803,7 +1796,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - if (sk != NULL) { + if (sk) { int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) @@ -1968,6 +1961,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct sock *sk; struct dst_entry *dst; int dif = skb->dev->ifindex; + int ours; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) @@ -1977,14 +1971,24 @@ void udp_v4_early_demux(struct sk_buff *skb) uh = udp_hdr(skb); if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + if (!in_dev) + return; + + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - else if (skb->pkt_type == PACKET_HOST) + } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - else + } else { return; + } if (!sk) return; @@ -2525,6 +2529,16 @@ void __init udp_table_init(struct udp_table *table, const char *name) } } +u32 udp_flow_hashrnd(void) +{ + static u32 hashrnd __read_mostly; + + net_get_random_once(&hashrnd, sizeof(hashrnd)); + + return hashrnd; +} +EXPORT_SYMBOL(udp_flow_hashrnd); + void __init udp_init(void) { unsigned long limit; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 4a000f1dd757..6116604bf6e8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -18,8 +18,9 @@ #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct netlink_callback *cb, + const struct 
inet_diag_req_v2 *req, + struct nlattr *bc) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -31,7 +32,8 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) { int err = -EINVAL; struct sock *sk; @@ -56,7 +58,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, goto out_nosk; err = -ENOENT; - if (sk == NULL) + if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); @@ -90,8 +92,9 @@ out_nosk: return err; } -static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) +static void udp_dump(struct udp_table *table, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) { int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); @@ -144,13 +147,13 @@ done: } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, struct nlattr *bc) { udp_dump(&udp_table, skb, cb, r, bc); } static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { return udp_dump_one(&udp_table, in_skb, nlh, req); } @@ -167,16 +170,18 @@ static const struct inet_diag_handler udp_diag_handler = { .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, + .idiag_info_size = 0, }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, + struct nlattr *bc) { udp_dump(&udplite_table, skb, cb, r, bc); } static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct 
nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, in_skb, nlh, req); } @@ -186,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = { .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, + .idiag_info_size = 0, }; static int __init udp_diag_init(void) diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index f3c27899f62b..7e0fe4bdd967 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -21,8 +21,8 @@ int compat_udp_setsockopt(struct sock *sk, int level, int optname, int compat_udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); #endif -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len); +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4915d8284a86..f9386160cbee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -285,7 +285,7 @@ void udp_del_offload(struct udp_offload *uo) pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); unlock: spin_unlock(&udp_offload_lock); - if (uo_priv != NULL) + if (uo_priv) call_rcu(&uo_priv->rcu, udp_offload_free_routine); } EXPORT_SYMBOL(udp_del_offload); @@ -394,7 +394,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) break; } - if (uo_priv != NULL) { + if (uo_priv) { NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr), diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index c83b35485056..933ea903f7b8 100644 --- a/net/ipv4/udp_tunnel.c +++ 
b/net/ipv4/udp_tunnel.c @@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; @@ -75,7 +73,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); -int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, +int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck) @@ -92,7 +90,7 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, udp_set_csum(nocheck, skb, src, dst, skb->len); - return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP, + return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); @@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index aac6197b7a71..60b032f58ccc 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -22,9 +22,9 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) return xfrm4_extract_header(skb); } -static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) +static inline int 
xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb) { - if (skb_dst(skb) == NULL) { + if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, @@ -52,7 +52,8 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) iph->tot_len = htons(skb->len); ip_send_check(iph); - NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb, + skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 91771a7c802f..35feda676464 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -63,7 +63,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; - ip_select_ident(skb, NULL); + ip_select_ident(dev_net(dst->dev), skb, NULL); return 0; } diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index dab73813cb92..2878dbfffeb7 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm4_prepare_output); -int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -77,26 +77,26 @@ int xfrm4_output_finish(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; #endif - return xfrm_output(skb); + return xfrm_output(sk, skb); } -static int __xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; #ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); + return dst_output_sk(sk, skb); } #endif - return x->outer_mode->afinfo->output_finish(skb); + return x->outer_mode->afinfo->output_finish(sk, skb); } int 
xfrm4_output(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL, skb_dst(skb)->dev, __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6156f68a1e90..bff69746e05f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -232,7 +232,6 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm4_dst_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, @@ -299,7 +298,7 @@ static void __net_exit xfrm4_net_exit(struct net *net) { struct ctl_table *table; - if (net->ipv4.xfrm4_hdr == NULL) + if (!net->ipv4.xfrm4_hdr) return; table = net->ipv4.xfrm4_hdr->ctl_table_arg; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2e8c06108ab9..0f3f1999719a 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +obj-y += mcast_snoop.o endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b6030025f411..21c2c818df3b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -46,6 +46,7 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> +#include <linux/inet.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_addr.h> @@ -102,6 +103,9 @@ #define INFINITY_LIFE_TIME 0xFFFFFFFF +#define IPV6_MAX_STRLEN \ + sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") + static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; @@ -127,6 +131,9 @@ static void ipv6_regen_rndid(unsigned long data); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int 
ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_generate_stable_address(struct in6_addr *addr, + u8 dad_count, + const struct inet6_dev *idev); /* * Configured unicast address hash table @@ -202,6 +209,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_dad = 1, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, + .stable_secret = { + .initialized = false, + } }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -240,6 +250,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_dad = 1, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, + .stable_secret = { + .initialized = false, + }, }; /* Check if a valid qdisc is available */ @@ -321,7 +334,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) return ERR_PTR(-EINVAL); ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); - if (ndev == NULL) + if (!ndev) return ERR_PTR(err); rwlock_init(&ndev->lock); @@ -333,7 +346,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); - if (ndev->nd_parms == NULL) { + if (!ndev->nd_parms) { kfree(ndev); return ERR_PTR(err); } @@ -468,7 +481,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; ncm = nlmsg_data(nlh); @@ -506,7 +519,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, int err = -ENOBUFS; skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, @@ -561,10 +574,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, break; default: dev = __dev_get_by_index(net, ifindex); - if (dev == NULL) + if (!dev) goto errout; in6_dev = __in6_dev_get(dev); - if (in6_dev == NULL) + if (!in6_dev) goto 
errout; devconf = &in6_dev->cnf; break; @@ -572,7 +585,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, err = -ENOBUFS; skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet6_netconf_fill_devconf(skb, ifindex, devconf, @@ -841,7 +854,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); - if (ifa == NULL) { + if (!ifa) { ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; @@ -860,7 +873,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->peer_addr = *peer_addr; spin_lock_init(&ifa->lock); - spin_lock_init(&ifa->state_lock); INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work); INIT_HLIST_NODE(&ifa->addr_lst); ifa->scope = scope; @@ -1003,10 +1015,10 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) ASSERT_RTNL(); - spin_lock_bh(&ifp->state_lock); + spin_lock_bh(&ifp->lock); state = ifp->state; ifp->state = INET6_IFADDR_STATE_DEAD; - spin_unlock_bh(&ifp->state_lock); + spin_unlock_bh(&ifp->lock); if (state == INET6_IFADDR_STATE_DEAD) goto out; @@ -1546,7 +1558,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, : ifp->flags; if (ipv6_addr_equal(&ifp->addr, addr) && !(ifp_flags&banned_flags) && - (dev == NULL || ifp->idev->dev == dev || + (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { rcu_read_unlock_bh(); return 1; @@ -1568,7 +1580,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { - if (dev == NULL || ifp->idev->dev == dev) + if (!dev || ifp->idev->dev == dev) return true; } } @@ -1637,7 +1649,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { - if (dev == 
NULL || ifp->idev->dev == dev || + if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { result = ifp; in6_ifa_hold(ifp); @@ -1686,19 +1698,21 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) { int err = -ENOENT; - spin_lock_bh(&ifp->state_lock); + spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DAD) { ifp->state = INET6_IFADDR_STATE_POSTDAD; err = 0; } - spin_unlock_bh(&ifp->state_lock); + spin_unlock_bh(&ifp->lock); return err; } void addrconf_dad_failure(struct inet6_ifaddr *ifp) { + struct in6_addr addr; struct inet6_dev *idev = ifp->idev; + struct net *net = dev_net(ifp->idev->dev); if (addrconf_dad_end(ifp)) { in6_ifa_put(ifp); @@ -1708,9 +1722,57 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n", ifp->idev->dev->name, &ifp->addr); - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { - struct in6_addr addr; + spin_lock_bh(&ifp->lock); + + if (ifp->flags & IFA_F_STABLE_PRIVACY) { + int scope = ifp->scope; + u32 flags = ifp->flags; + struct in6_addr new_addr; + struct inet6_ifaddr *ifp2; + u32 valid_lft, preferred_lft; + int pfxlen = ifp->prefix_len; + int retries = ifp->stable_privacy_retry + 1; + + if (retries > net->ipv6.sysctl.idgen_retries) { + net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n", + ifp->idev->dev->name); + goto errdad; + } + + new_addr = ifp->addr; + if (ipv6_generate_stable_address(&new_addr, retries, + idev)) + goto errdad; + + valid_lft = ifp->valid_lft; + preferred_lft = ifp->prefered_lft; + + spin_unlock_bh(&ifp->lock); + + if (idev->cnf.max_addresses && + ipv6_count_addresses(idev) >= + idev->cnf.max_addresses) + goto lock_errdad; + + net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n", + ifp->idev->dev->name); + + ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen, + scope, flags, valid_lft, + preferred_lft); + if 
(IS_ERR(ifp2)) + goto lock_errdad; + + spin_lock_bh(&ifp2->lock); + ifp2->stable_privacy_retry = retries; + ifp2->state = INET6_IFADDR_STATE_PREDAD; + spin_unlock_bh(&ifp2->lock); + addrconf_mod_dad_work(ifp2, net->ipv6.sysctl.idgen_delay); + in6_ifa_put(ifp2); +lock_errdad: + spin_lock_bh(&ifp->lock); + } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { addr.s6_addr32[0] = htonl(0xfe800000); addr.s6_addr32[1] = 0; @@ -1724,10 +1786,10 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) } } - spin_lock_bh(&ifp->state_lock); +errdad: /* transition from _POSTDAD to _ERRDAD */ ifp->state = INET6_IFADDR_STATE_ERRDAD; - spin_unlock_bh(&ifp->state_lock); + spin_unlock_bh(&ifp->lock); addrconf_mod_dad_work(ifp, 0); } @@ -2052,13 +2114,15 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, struct fib6_table *table; table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX); - if (table == NULL) + if (!table) return NULL; read_lock_bh(&table->tb6_lock); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); if (!fn) goto out; + + noflags |= RTF_CACHE; for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (rt->dst.dev->ifindex != dev->ifindex) continue; @@ -2186,6 +2250,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) __u32 valid_lft; __u32 prefered_lft; int addr_type; + u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); @@ -2215,7 +2280,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev = in6_dev_get(dev); - if (in6_dev == NULL) { + if (!in6_dev) { net_dbg_ratelimited("addrconf: device %s not configured\n", dev->name); return; @@ -2292,6 +2357,12 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev->token.s6_addr + 8, 8); read_unlock_bh(&in6_dev->lock); tokenized = true; + } else if (in6_dev->addr_gen_mode == + IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !ipv6_generate_stable_address(&addr, 0, + 
in6_dev)) { + addr_flags |= IFA_F_STABLE_PRIVACY; + goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); @@ -2308,9 +2379,8 @@ ok: ifp = ipv6_get_ifaddr(net, &addr, dev, 1); - if (ifp == NULL && valid_lft) { + if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; - u32 addr_flags = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && @@ -2350,7 +2420,7 @@ ok: u32 stored_lft; /* update lifetime (RFC2462 5.5.3 e) */ - spin_lock(&ifp->lock); + spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; @@ -2380,12 +2450,12 @@ ok: ifp->tstamp = now; flags = ifp->flags; ifp->flags &= ~IFA_F_DEPRECATED; - spin_unlock(&ifp->lock); + spin_unlock_bh(&ifp->lock); if (!(flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); } else - spin_unlock(&ifp->lock); + spin_unlock_bh(&ifp->lock); manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, create, now); @@ -2418,7 +2488,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) dev = __dev_get_by_index(net, ireq.ifr6_ifindex); err = -ENODEV; - if (dev == NULL) + if (!dev) goto err_exit; #if IS_ENABLED(CONFIG_IPV6_SIT) @@ -2464,6 +2534,23 @@ err_exit: return err; } +static int ipv6_mc_config(struct sock *sk, bool join, + const struct in6_addr *addr, int ifindex) +{ + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = ipv6_sock_mc_join(sk, ifindex, addr); + else + ret = ipv6_sock_mc_drop(sk, ifindex, addr); + release_sock(sk); + + return ret; +} + /* * Manual configuration of address on an interface */ @@ -2476,10 +2563,10 @@ static int inet6_addr_add(struct net *net, int ifindex, struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; + unsigned long timeout; + clock_t expires; int scope; u32 flags; - clock_t expires; - unsigned long timeout; ASSERT_RTNL(); @@ -2501,6 +2588,14 @@ static 
int inet6_addr_add(struct net *net, int ifindex, if (IS_ERR(idev)) return PTR_ERR(idev); + if (ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, + true, pfx, ifindex); + + if (ret < 0) + return ret; + } + scope = ipv6_addr_scope(pfx); timeout = addrconf_timeout_fixup(valid_lft, HZ); @@ -2542,6 +2637,9 @@ static int inet6_addr_add(struct net *net, int ifindex, in6_ifa_put(ifp); addrconf_verify_rtnl(); return 0; + } else if (ifa_flags & IFA_F_MCAUTOJOIN) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, + false, pfx, ifindex); } return PTR_ERR(ifp); @@ -2562,7 +2660,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, return -ENODEV; idev = __in6_dev_get(dev); - if (idev == NULL) + if (!idev) return -ENXIO; read_lock_bh(&idev->lock); @@ -2578,6 +2676,10 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, jiffies); ipv6_del_addr(ifp); addrconf_verify_rtnl(); + if (ipv6_addr_is_multicast(pfx)) { + ipv6_mc_config(net->ipv6.mc_autojoin_sk, + false, pfx, dev->ifindex); + } return 0; } } @@ -2710,7 +2812,7 @@ static void init_loopback(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (idev == NULL) { + if (!idev) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -2757,10 +2859,11 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) +static void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; - u32 addr_flags = IFA_F_PERMANENT; + u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (idev->cnf.optimistic_dad && @@ -2768,7 +2871,6 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr addr_flags |= IFA_F_OPTIMISTIC; #endif - ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { @@ -2778,18 
+2880,103 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static bool ipv6_reserved_interfaceid(struct in6_addr address) +{ + if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0) + return true; + + if (address.s6_addr32[2] == htonl(0x02005eff) && + ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000))) + return true; + + if (address.s6_addr32[2] == htonl(0xfdffffff) && + ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80))) + return true; + + return false; +} + +static int ipv6_generate_stable_address(struct in6_addr *address, + u8 dad_count, + const struct inet6_dev *idev) +{ + static DEFINE_SPINLOCK(lock); + static __u32 digest[SHA_DIGEST_WORDS]; + static __u32 workspace[SHA_WORKSPACE_WORDS]; + + static union { + char __data[SHA_MESSAGE_BYTES]; + struct { + struct in6_addr secret; + __be32 prefix[2]; + unsigned char hwaddr[MAX_ADDR_LEN]; + u8 dad_count; + } __packed; + } data; + + struct in6_addr secret; + struct in6_addr temp; + struct net *net = dev_net(idev->dev); + + BUILD_BUG_ON(sizeof(data.__data) != sizeof(data)); + + if (idev->cnf.stable_secret.initialized) + secret = idev->cnf.stable_secret.secret; + else if (net->ipv6.devconf_dflt->stable_secret.initialized) + secret = net->ipv6.devconf_dflt->stable_secret.secret; + else + return -1; + +retry: + spin_lock_bh(&lock); + + sha_init(digest); + memset(&data, 0, sizeof(data)); + memset(workspace, 0, sizeof(workspace)); + memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); + data.prefix[0] = address->s6_addr32[0]; + data.prefix[1] = address->s6_addr32[1]; + data.secret = secret; + data.dad_count = dad_count; + + sha_transform(digest, data.__data, workspace); + + temp = *address; + temp.s6_addr32[2] = (__force __be32)digest[0]; + temp.s6_addr32[3] = (__force __be32)digest[1]; + + spin_unlock_bh(&lock); + + if (ipv6_reserved_interfaceid(temp)) { + dad_count++; + if (dad_count > dev_net(idev->dev)->ipv6.sysctl.idgen_retries) + return 
-1; + goto retry; + } + + *address = temp; + return 0; +} + static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { - if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { - struct in6_addr addr; + struct in6_addr addr; - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { + if (!ipv6_generate_stable_address(&addr, 0, idev)) + addrconf_add_linklocal(idev, &addr, + IFA_F_STABLE_PRIVACY); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { /* addrconf_add_linklocal also adds a prefix_route and we * only need to care about prefix routes if ipv6_generate_eui64 * couldn't generate one. */ if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); } @@ -2834,7 +3021,7 @@ static void addrconf_sit_config(struct net_device *dev) */ idev = ipv6_find_idev(dev); - if (idev == NULL) { + if (!idev) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -2859,7 +3046,7 @@ static void addrconf_gre_config(struct net_device *dev) ASSERT_RTNL(); idev = ipv6_find_idev(dev); - if (idev == NULL) { + if (!idev) { pr_debug("%s: add_dev failed\n", __func__); return; } @@ -3056,7 +3243,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) neigh_ifdown(&nd_tbl, dev); idev = __in6_dev_get(dev); - if (idev == NULL) + if (!idev) return -ENODEV; /* @@ -3127,10 +3314,10 @@ restart: write_unlock_bh(&idev->lock); - spin_lock_bh(&ifa->state_lock); + spin_lock_bh(&ifa->lock); state = ifa->state; ifa->state = INET6_IFADDR_STATE_DEAD; - spin_unlock_bh(&ifa->state_lock); + spin_unlock_bh(&ifa->lock); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -3288,12 +3475,12 @@ static void 
addrconf_dad_start(struct inet6_ifaddr *ifp) { bool begin_dad = false; - spin_lock_bh(&ifp->state_lock); + spin_lock_bh(&ifp->lock); if (ifp->state != INET6_IFADDR_STATE_DEAD) { ifp->state = INET6_IFADDR_STATE_PREDAD; begin_dad = true; } - spin_unlock_bh(&ifp->state_lock); + spin_unlock_bh(&ifp->lock); if (begin_dad) addrconf_mod_dad_work(ifp, 0); @@ -3315,7 +3502,7 @@ static void addrconf_dad_work(struct work_struct *w) rtnl_lock(); - spin_lock_bh(&ifp->state_lock); + spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { action = DAD_BEGIN; ifp->state = INET6_IFADDR_STATE_DAD; @@ -3323,7 +3510,7 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; } - spin_unlock_bh(&ifp->state_lock); + spin_unlock_bh(&ifp->lock); if (action == DAD_BEGIN) { addrconf_dad_begin(ifp); @@ -3811,7 +3998,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) ifm = nlmsg_data(nlh); pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); - if (pfx == NULL) + if (!pfx) return -EINVAL; ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; @@ -3923,7 +4110,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifm = nlmsg_data(nlh); pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); - if (pfx == NULL) + if (!pfx) return -EINVAL; if (tb[IFA_CACHEINFO]) { @@ -3938,17 +4125,17 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) } dev = __dev_get_by_index(net, ifm->ifa_index); - if (dev == NULL) + if (!dev) return -ENODEV; ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; /* We ignore other flags so far. 
*/ ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | - IFA_F_NOPREFIXROUTE; + IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN; ifa = ipv6_get_ifaddr(net, pfx, dev, 1); - if (ifa == NULL) { + if (!ifa) { /* * It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. @@ -4023,7 +4210,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, u32 preferred, valid; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), @@ -4052,11 +4239,11 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, } if (!ipv6_addr_any(&ifa->peer_addr)) { - if (nla_put(skb, IFA_LOCAL, 16, &ifa->addr) < 0 || - nla_put(skb, IFA_ADDRESS, 16, &ifa->peer_addr) < 0) + if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 || + nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0) goto error; } else - if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0) + if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0) goto error; if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) @@ -4084,11 +4271,11 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); - if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0 || + if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { nlmsg_cancel(skb, nlh); @@ -4110,11 +4297,11 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); - if (nlh == 
NULL) + if (!nlh) return -EMSGSIZE; put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); - if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0 || + if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { nlmsg_cancel(skb, nlh); @@ -4283,7 +4470,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) goto errout; addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (addr == NULL) { + if (!addr) { err = -EINVAL; goto errout; } @@ -4326,7 +4513,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) int err = -ENOBUFS; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); @@ -4398,6 +4585,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + /* we omit DEVCONF_STABLE_SECRET for now */ } static inline size_t inet6_ifla6_size(void) @@ -4478,24 +4666,24 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); - if (nla == NULL) + if (!nla) goto nla_put_failure; ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); /* XXX - MC not implemented */ nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (nla == NULL) + if (!nla) goto nla_put_failure; snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (nla == NULL) + if (!nla) goto nla_put_failure; snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); nla 
= nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); - if (nla == NULL) + if (!nla) goto nla_put_failure; if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) @@ -4541,7 +4729,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) ASSERT_RTNL(); - if (token == NULL) + if (!token) return -EINVAL; if (ipv6_addr_any(token)) return -EINVAL; @@ -4632,8 +4820,15 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); if (mode != IN6_ADDR_GEN_MODE_EUI64 && - mode != IN6_ADDR_GEN_MODE_NONE) + mode != IN6_ADDR_GEN_MODE_NONE && + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY) + return -EINVAL; + + if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + !idev->cnf.stable_secret.initialized && + !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized) return -EINVAL; + idev->addr_gen_mode = mode; err = 0; } @@ -4650,7 +4845,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, void *protoinfo; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; hdr = nlmsg_data(nlh); @@ -4665,11 +4860,11 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || - (dev->ifindex != dev->iflink && - nla_put_u32(skb, IFLA_LINK, dev->iflink))) + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; protoinfo = nla_nest_start(skb, IFLA_PROTINFO); - if (protoinfo == NULL) + if (!protoinfo) goto nla_put_failure; if (inet6_fill_ifla6_attrs(skb, idev) < 0) @@ -4730,7 +4925,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) int err = -ENOBUFS; skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0); @@ 
-4763,7 +4958,7 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, struct prefix_cacheinfo ci; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*pmsg), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; pmsg = nlmsg_data(nlh); @@ -4802,7 +4997,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, int err = -ENOBUFS; skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0); @@ -5042,6 +5237,74 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, return ret; } +static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int err; + struct in6_addr addr; + char str[IPV6_MAX_STRLEN]; + struct ctl_table lctl = *ctl; + struct net *net = ctl->extra2; + struct ipv6_stable_secret *secret = ctl->data; + + if (&net->ipv6.devconf_all->stable_secret == ctl->data) + return -EIO; + + lctl.maxlen = IPV6_MAX_STRLEN; + lctl.data = str; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (!write && !secret->initialized) { + err = -EIO; + goto out; + } + + if (!write) { + err = snprintf(str, sizeof(str), "%pI6", + &secret->secret); + if (err >= sizeof(str)) { + err = -EIO; + goto out; + } + } + + err = proc_dostring(&lctl, write, buffer, lenp, ppos); + if (err || !write) + goto out; + + if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) { + err = -EIO; + goto out; + } + + secret->initialized = true; + secret->secret = addr; + + if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) { + struct net_device *dev; + + for_each_netdev(net, dev) { + struct inet6_dev *idev = __in6_dev_get(dev); + + if (idev) { + idev->addr_gen_mode = + IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + } + } + } else { + struct inet6_dev *idev = ctl->extra1; + + idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + } + +out: + rtnl_unlock(); + + return err; +} static 
struct addrconf_sysctl_table { @@ -5315,6 +5578,13 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec, }, { + .procname = "stable_secret", + .data = &ipv6_devconf.stable_secret, + .maxlen = IPV6_MAX_STRLEN, + .mode = 0600, + .proc_handler = addrconf_sysctl_stable_secret, + }, + { /* sentinel */ } }, @@ -5328,7 +5598,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); - if (t == NULL) + if (!t) goto out; for (i = 0; t->addrconf_vars[i].data; i++) { @@ -5340,7 +5610,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars); - if (t->sysctl_header == NULL) + if (!t->sysctl_header) goto free; p->sysctl = t; @@ -5356,7 +5626,7 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) { struct addrconf_sysctl_table *t; - if (p->sysctl == NULL) + if (!p->sysctl) return; t = p->sysctl; @@ -5399,17 +5669,20 @@ static int __net_init addrconf_init_net(struct net *net) struct ipv6_devconf *all, *dflt; all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL); - if (all == NULL) + if (!all) goto err_alloc_all; dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); - if (dflt == NULL) + if (!dflt) goto err_alloc_dflt; /* these will be inherited by all namespaces */ dflt->autoconf = ipv6_defaults.autoconf; dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; + dflt->stable_secret.initialized = false; + all->stable_secret.initialized = false; + net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 98cc4cd570e2..ca09bf49ac68 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -133,6 +133,14 @@ static void snmp6_free_dev(struct inet6_dev *idev) 
free_percpu(idev->stats.ipv6); } +static void in6_dev_finish_destroy_rcu(struct rcu_head *head) +{ + struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); + + snmp6_free_dev(idev); + kfree(idev); +} + /* Nobody refers to this device, we may destroy it. */ void in6_dev_finish_destroy(struct inet6_dev *idev) @@ -140,7 +148,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) struct net_device *dev = idev->dev; WARN_ON(!list_empty(&idev->addr_list)); - WARN_ON(idev->mc_list != NULL); + WARN_ON(idev->mc_list); WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG @@ -151,7 +159,6 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) pr_warn("Freeing alive inet6 device %p\n", idev); return; } - snmp6_free_dev(idev); - kfree_rcu(idev, rcu); + call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); } EXPORT_SYMBOL(in6_dev_finish_destroy); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index e43e79d0a612..882124ebb438 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -29,9 +29,7 @@ * Policy Table */ struct ip6addrlbl_entry { -#ifdef CONFIG_NET_NS - struct net *lbl_net; -#endif + possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -129,9 +127,6 @@ static const __net_initconst struct ip6addrlbl_init_table /* Object management */ static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) { -#ifdef CONFIG_NET_NS - release_net(p->lbl_net); -#endif kfree(p); } @@ -240,9 +235,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); -#ifdef CONFIG_NET_NS - newp->lbl_net = hold_net(net); -#endif + write_pnet(&newp->lbl_net, net); atomic_set(&newp->refcnt, 1); return newp; } @@ -484,7 +477,7 @@ static int ip6addrlbl_fill(struct sk_buff *skb, ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq); - if (nla_put(skb, IFAL_ADDRESS, 16, &p->prefix) < 0 || + if (nla_put_in6_addr(skb, IFAL_ADDRESS, &p->prefix) < 0 || 
nla_put_u32(skb, IFAL_LABEL, p->label) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e8c4400f23e9..7de52b65173f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -164,11 +164,11 @@ lookup_protocol: answer_flags = answer->flags; rcu_read_unlock(); - WARN_ON(answer_prot->slab == NULL); + WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); - if (sk == NULL) + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); + if (!sk) goto out; sock_init_data(sock, sk); @@ -362,7 +362,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. */ - if (sk->sk_prot->get_port(sk, snum)) { + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { inet_reset_saddr(sk); err = -EADDRINUSE; goto out; @@ -391,7 +392,7 @@ int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; - if (sk == NULL) + if (!sk) return -EINVAL; /* Free mc lists */ @@ -413,11 +414,11 @@ void inet6_destroy_sock(struct sock *sk) /* Release rx options */ skb = xchg(&np->pktoptions, NULL); - if (skb != NULL) + if (skb) kfree_skb(skb); skb = xchg(&np->rxpmtu, NULL); - if (skb != NULL) + if (skb) kfree_skb(skb); /* Free flowlabels */ @@ -426,7 +427,7 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ opt = xchg(&np->opt, NULL); - if (opt != NULL) + if (opt) sock_kfree_s(sk, opt, opt->tot_len); } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -640,7 +641,7 @@ int inet6_sk_rebuild_header(struct sock *sk) dst = __sk_dst_check(sk, np->dst_cookie); - if (dst == NULL) { + if (!dst) { struct inet_sock *inet = inet_sk(sk); struct in6_addr *final_p, final; struct flowi6 fl6; @@ -766,6 +767,9 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = 
0; + net->ipv6.sysctl.idgen_retries = 3; + net->ipv6.sysctl.idgen_delay = 1 * HZ; + net->ipv6.sysctl.flowlabel_state_ranges = 1; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); @@ -824,7 +828,7 @@ static int __init inet6_init(void) struct list_head *r; int err = 0; - BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); + sock_skb_cb_check_size(sizeof(struct inet6_skb_parm)); /* Register the socket-side information for inet6_create. */ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index a6727add2624..ed7d4e3f9c10 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -681,7 +681,7 @@ static int ah6_init_state(struct xfrm_state *x) goto error; ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); - if (ahp == NULL) + if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index baf2742d1ec4..514ac259f543 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -60,6 +60,8 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) int ishost = !net->ipv6.devconf_all->forwarding; int err = 0; + ASSERT_RTNL(); + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (ipv6_addr_is_multicast(addr)) @@ -68,12 +70,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); - if (pac == NULL) + if (!pac) return -ENOMEM; pac->acl_next = NULL; pac->acl_addr = *addr; - rtnl_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -92,7 +93,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } else dev = __dev_get_by_index(net, ifindex); - if (dev == NULL) { + if (!dev) { err = -ENODEV; goto error; } @@ -130,7 +131,6 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } error: - rtnl_unlock(); if (pac) sock_kfree_s(sk, 
pac, sizeof(*pac)); return err; @@ -146,7 +146,8 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); - rtnl_lock(); + ASSERT_RTNL(); + prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && @@ -154,10 +155,8 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) break; prev_pac = pac; } - if (!pac) { - rtnl_unlock(); + if (!pac) return -ENOENT; - } if (prev_pac) prev_pac->acl_next = pac->acl_next; else @@ -166,7 +165,6 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); return 0; @@ -224,7 +222,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); - if (aca == NULL) + if (!aca) return NULL; aca->aca_addr = *addr; @@ -270,7 +268,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) goto out; } aca = aca_alloc(rt, addr); - if (aca == NULL) { + if (!aca) { ip6_rt_put(rt); err = -ENOMEM; goto out; @@ -339,7 +337,7 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); - if (idev == NULL) + if (!idev) return -ENODEV; return __ipv6_dev_ac_dec(idev, addr); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ace8daca5c83..b10a88986a98 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -40,7 +40,7 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct 
sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); @@ -56,7 +56,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (usin->sin6_family == AF_INET) { if (__ipv6_only_sock(sk)) return -EAFNOSUPPORT; - err = ip4_datagram_connect(sk, uaddr, addr_len); + err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } @@ -71,7 +71,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (!flowlabel) return -EINVAL; } } @@ -98,9 +98,9 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; - err = ip4_datagram_connect(sk, - (struct sockaddr *) &sin, - sizeof(sin)); + err = __ip4_datagram_connect(sk, + (struct sockaddr *) &sin, + sizeof(sin)); ipv4_connected: if (err) @@ -204,6 +204,16 @@ out: fl6_sock_release(flowlabel); return err; } + +int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + release_sock(sk); + return res; +} EXPORT_SYMBOL_GPL(ip6_datagram_connect); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, @@ -325,6 +335,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +/* For some errors we have valid addr_offset even with zero payload and + * zero port. Also, addr_offset should be supported if port is set. 
+ */ +static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) +{ + return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || + serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; +} + /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 @@ -373,7 +393,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) err = -EAGAIN; skb = sock_dequeue_err_skb(sk); - if (skb == NULL) + if (!skb) goto out; copied = skb->len; @@ -389,7 +409,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && serr->port) { + if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -463,7 +483,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, err = -EAGAIN; skb = xchg(&np->rxpmtu, NULL); - if (skb == NULL) + if (!skb) goto out; copied = skb->len; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index e48f2c7c5c59..060a60b2f8a6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -76,7 +76,7 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) len = ALIGN(len, crypto_tfm_ctx_alignment()); } - len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len += sizeof(struct aead_request) + crypto_aead_reqsize(aead); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; @@ -96,17 +96,6 @@ static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } -static inline struct aead_givcrypt_request *esp_tmp_givreq( - struct crypto_aead *aead, u8 *iv) -{ - struct aead_givcrypt_request *req; - - req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), - crypto_tfm_ctx_alignment()); - 
aead_givcrypt_set_tfm(req, aead); - return req; -} - static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) { struct aead_request *req; @@ -125,14 +114,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static inline struct scatterlist *esp_givreq_sg( - struct crypto_aead *aead, struct aead_givcrypt_request *req) -{ - return (void *)ALIGN((unsigned long)(req + 1) + - crypto_aead_reqsize(aead), - __alignof__(struct scatterlist)); -} - static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -141,32 +122,57 @@ static void esp_output_done(struct crypto_async_request *base, int err) xfrm_output_resume(skb, err); } +/* Move ESP header back into place. */ +static void esp_restore_header(struct sk_buff *skb, unsigned int offset) +{ + struct ip_esp_hdr *esph = (void *)(skb->data + offset); + void *tmp = ESP_SKB_CB(skb)->tmp; + __be32 *seqhi = esp_tmp_seqhi(tmp); + + esph->seq_no = esph->spi; + esph->spi = *seqhi; +} + +static void esp_output_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); +} + +static void esp_output_done_esn(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_output_restore_header(skb); + esp_output_done(base, err); +} + static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct aead_givcrypt_request *req; + struct aead_request *req; struct scatterlist *sg; - struct scatterlist *asg; struct sk_buff *trailer; void *tmp; int blksize; int clen; int alen; int plen; + int ivlen; int tfclen; int nfrags; int assoclen; - int sglists; int seqhilen; u8 *iv; u8 *tail; __be32 *seqhi; + __be64 seqno; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); tfclen = 0; if (x->tfcpad) { @@ 
-187,16 +193,14 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) nfrags = err; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) { err = -ENOMEM; goto error; @@ -204,9 +208,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); - req = esp_tmp_givreq(aead, iv); - asg = esp_givreq_sg(aead, req); - sg = asg + sglists; + req = esp_tmp_req(aead, iv); + sg = esp_req_sg(aead, req); /* Fill padding... */ tail = skb_tail_pointer(trailer); @@ -227,36 +230,53 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; - esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + aead_request_set_callback(req, 0, esp_output_done, skb); + + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * encryption. 
+ */ + if ((x->props.flags & XFRM_STATE_ESN)) { + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); + *seqhi = esph->spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + aead_request_set_callback(req, 0, esp_output_done_esn, skb); + } + + esph->spi = x->id.spi; + sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, - esph->enc_data + crypto_aead_ivsize(aead) - skb->data, - clen + alen); + (unsigned char *)esph - skb->data, + assoclen + ivlen + clen + alen); - if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); - - aead_givcrypt_set_callback(req, 0, esp_output_done, skb); - aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, assoclen); - aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output.low); + aead_request_set_crypt(req, sg, sg, ivlen + clen, iv); + aead_request_set_ad(req, assoclen); + + seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + memset(iv, 0, ivlen); + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; - err = crypto_aead_givencrypt(req); - if (err == -EINPROGRESS) + err = crypto_aead_encrypt(req); + + switch (err) { + case -EINPROGRESS: goto error; - if (err == -EBUSY) + case -EBUSY: err = NET_XMIT_DROP; + break; + + case 0: + if ((x->props.flags & XFRM_STATE_ESN)) + esp_output_restore_header(skb); + } kfree(tmp); @@ -317,25 +337,38 @@ static void esp_input_done(struct crypto_async_request *base, int err) xfrm_input_resume(skb, esp_input_done2(skb, err)); } +static void esp_input_restore_header(struct sk_buff *skb) +{ + esp_restore_header(skb, 0); + __skb_pull(skb, 4); +} + +static void esp_input_done_esn(struct 
crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + esp_input_restore_header(skb); + esp_input_done(base, err); +} + static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; - int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int ivlen = crypto_aead_ivsize(aead); + int elen = skb->len - sizeof(*esph) - ivlen; int nfrags; int assoclen; - int sglists; int seqhilen; int ret = 0; void *tmp; __be32 *seqhi; u8 *iv; struct scatterlist *sg; - struct scatterlist *asg; - if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) { + if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) { ret = -EINVAL; goto out; } @@ -354,16 +387,14 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) ret = -ENOMEM; assoclen = sizeof(*esph); - sglists = 1; seqhilen = 0; if (x->props.flags & XFRM_STATE_ESN) { - sglists += 2; seqhilen += sizeof(__be32); assoclen += seqhilen; } - tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + tmp = esp_alloc_tmp(aead, nfrags, seqhilen); if (!tmp) goto out; @@ -371,36 +402,39 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); - asg = esp_req_sg(aead, req); - sg = asg + sglists; + sg = esp_req_sg(aead, req); skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr *)skb->data; - /* Get ivec. This can be wrong, check against another impls. */ - iv = esph->enc_data; - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + aead_request_set_callback(req, 0, esp_input_done, skb); + /* For ESN we move the header forward by 4 bytes to + * accomodate the high bits. We will move it back after + * decryption. 
+ */ if ((x->props.flags & XFRM_STATE_ESN)) { - sg_init_table(asg, 3); - sg_set_buf(asg, &esph->spi, sizeof(__be32)); - *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; - sg_set_buf(asg + 1, seqhi, seqhilen); - sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); - } else - sg_init_one(asg, esph, sizeof(*esph)); + esph = (void *)skb_push(skb, 4); + *seqhi = esph->spi; + esph->spi = esph->seq_no; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi); + aead_request_set_callback(req, 0, esp_input_done_esn, skb); + } - aead_request_set_callback(req, 0, esp_input_done, skb); - aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, assoclen); + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + aead_request_set_crypt(req, sg, sg, elen + ivlen, iv); + aead_request_set_ad(req, assoclen); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) goto out; + if ((x->props.flags & XFRM_STATE_ESN)) + esp_input_restore_header(skb); + ret = esp_input_done2(skb, ret); out: @@ -460,10 +494,16 @@ static void esp6_destroy(struct xfrm_state *x) static int esp_init_aead(struct xfrm_state *x) { + char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = -ENAMETOOLONG; + if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", + x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -495,22 +535,26 @@ static int esp_init_authenc(struct xfrm_state *x) int err; err = -EINVAL; - if (x->ealg == NULL) + if (!x->ealg) goto error; err = -ENAMETOOLONG; if ((x->props.flags & XFRM_STATE_ESN)) { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authencesn(%s,%s)", + "%s%sauthencesn(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? 
")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } else { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, - "authenc(%s,%s)", + "%s%sauthenc(%s,%s)%s", + x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + x->ealg->alg_name, + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) goto error; } diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 8af3eb57f438..5c5d23e59da5 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -82,7 +82,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); - if (hp == NULL) + if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; @@ -91,7 +91,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, frag_off), sizeof(_frag_off), &_frag_off); - if (fp == NULL) + if (!fp) return -1; *frag_offp = *fp; @@ -218,7 +218,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); - if (hp == NULL) + if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { @@ -226,7 +226,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); - if (rh == NULL) + if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && @@ -245,7 +245,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, frag_off), sizeof(_frag_off), &_frag_off); - if (fp == NULL) + if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 70bc6abc0639..2367a16eae58 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -199,12 +199,10 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } if (frh->src_len) - nla_memcpy(&rule6->src.addr, tb[FRA_SRC], - 
sizeof(struct in6_addr)); + rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]); if (frh->dst_len) - nla_memcpy(&rule6->dst.addr, tb[FRA_DST], - sizeof(struct in6_addr)); + rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]); rule6->src.plen = frh->src_len; rule6->dst.plen = frh->dst_len; @@ -250,11 +248,9 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->tos = rule6->tclass; if ((rule6->dst.plen && - nla_put(skb, FRA_DST, sizeof(struct in6_addr), - &rule6->dst.addr)) || + nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && - nla_put(skb, FRA_SRC, sizeof(struct in6_addr), - &rule6->src.addr))) + nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr))) goto nla_put_failure; return 0; @@ -299,19 +295,16 @@ static int __net_init fib6_rules_net_init(struct net *net) ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); - net->ipv6.fib6_rules_ops = ops; - - err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0, - RT6_TABLE_LOCAL, 0); + err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0); if (err) goto out_fib6_rules_ops; - err = fib_default_rule_add(net->ipv6.fib6_rules_ops, - 0x7FFE, RT6_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0); if (err) goto out_fib6_rules_ops; + net->ipv6.fib6_rules_ops = ops; out: return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a5e95199585e..713d7434c911 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -160,8 +160,7 @@ static bool is_ineligible(const struct sk_buff *skb) tp = skb_header_pointer(skb, ptr+offsetof(struct icmp6hdr, icmp6_type), sizeof(_type), &_type); - if (tp == NULL || - !(*tp & ICMPV6_INFOMSG_MASK)) + if (!tp || !(*tp & ICMPV6_INFOMSG_MASK)) return true; } return false; @@ -208,7 +207,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct inet_peer *peer; peer = inet_getpeer_v6(net->ipv6.peers, - &rt->rt6i_dst.addr, 1); + &fl6->daddr, 1); res = inet_peer_xrlim_allow(peer, tmo); if 
(peer) inet_putpeer(peer); @@ -231,7 +230,7 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); - if (op == NULL) + if (!op) return true; return (*op & 0xC0) == 0x80; } @@ -244,7 +243,7 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, int err = 0; skb = skb_peek(&sk->sk_write_queue); - if (skb == NULL) + if (!skb) goto out; icmp6h = icmp6_hdr(skb); @@ -338,7 +337,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, * We won't send icmp if the destination is known * anycast. */ - if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + if (ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); @@ -479,7 +478,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); - if (sk == NULL) + if (!sk) return; sk->sk_mark = mark; np = inet6_sk(sk); @@ -565,7 +564,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && - ipv6_anycast_destination(skb))) + ipv6_anycast_destination(skb_dst(skb), saddr))) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); @@ -582,7 +581,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); - if (sk == NULL) + if (!sk) return; sk->sk_mark = mark; np = inet6_sk(sk); @@ -839,7 +838,7 @@ static int __net_init icmpv6_sk_init(struct net *net) net->ipv6.icmp_sk = kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); - if (net->ipv6.icmp_sk == NULL) + if (!net->ipv6.icmp_sk) return -ENOMEM; for_each_possible_cpu(i) { diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 29b32206e494..6927f3fb5597 100644 --- 
a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -112,22 +112,20 @@ static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, return c & (synq_hsize - 1); } -struct request_sock *inet6_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, +struct request_sock *inet6_csk_search_req(struct sock *sk, const __be16 rport, const struct in6_addr *raddr, const struct in6_addr *laddr, const int iif) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; + struct request_sock *req; + u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries); - for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, - lopt->hash_rnd, - lopt->nr_table_entries)]; - (req = *prev) != NULL; - prev = &req->dl_next) { + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); + for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); if (ireq->ir_rmt_port == rport && @@ -135,13 +133,14 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk, ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && (!ireq->ir_iif || ireq->ir_iif == iif)) { + atomic_inc(&req->rsk_refcnt); WARN_ON(req->sk != NULL); - *prevp = prev; - return req; + break; } } + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - return NULL; + return req; } EXPORT_SYMBOL_GPL(inet6_csk_search_req); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 051dffb49c90..b4fd96de97e6 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -23,11 +23,9 @@ #include <net/secure_seq.h> #include <net/ip.h> -static unsigned int inet6_ehashfn(struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const 
__be16 fport) +u32 inet6_ehashfn(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport) { static u32 inet6_ehash_secret __read_mostly; static u32 ipv6_hash_secret __read_mostly; @@ -44,54 +42,6 @@ static unsigned int inet6_ehashfn(struct net *net, inet6_ehash_secret + net_hash_mix(net)); } -static int inet6_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct in6_addr *laddr = &sk->sk_v6_rcv_saddr; - const struct in6_addr *faddr = &sk->sk_v6_daddr; - const __u16 lport = inet->inet_num; - const __be16 fport = inet->inet_dport; - struct net *net = sock_net(sk); - - return inet6_ehashfn(net, laddr, lport, faddr, fport); -} - -int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw) -{ - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - int twrefcnt = 0; - - WARN_ON(!sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - struct inet_listen_hashbucket *ilb; - - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - spin_lock(&ilb->lock); - __sk_nulls_add_node_rcu(sk, &ilb->head); - spin_unlock(&ilb->lock); - } else { - unsigned int hash; - struct hlist_nulls_head *list; - spinlock_t *lock; - - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - list = &inet_ehash_bucket(hashinfo, hash)->chain; - lock = inet_ehash_lockp(hashinfo, hash); - spin_lock(lock); - __sk_nulls_add_node_rcu(sk, list); - if (tw) { - WARN_ON(sk->sk_hash != tw->tw_hash); - twrefcnt = inet_twsk_unhash(tw); - } - spin_unlock(lock); - } - - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - return twrefcnt; -} -EXPORT_SYMBOL(__inet6_hash); - /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM @@ -296,7 +246,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... 
*/ - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); } @@ -307,7 +257,7 @@ not_unique: return -EADDRNOTAVAIL; } -static inline u32 inet6_sk_port_offset(const struct sock *sk) +static u32 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -319,7 +269,11 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), - __inet6_check_established, __inet6_hash); + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = inet6_sk_port_offset(sk); + return __inet_hash_connect(death_row, sk, port_offset, + __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 263ef4143bff..55d19861ab20 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -154,10 +154,32 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) +{ + int cpu; + + if (!non_pcpu_rt->rt6i_pcpu) + return; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_free(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } +} + static void rt6_release(struct rt6_info *rt) { - if (atomic_dec_and_test(&rt->rt6i_ref)) + if (atomic_dec_and_test(&rt->rt6i_ref)) { + rt6_free_pcpu(rt); dst_free(&rt->dst); + } } static void fib6_link_table(struct net *net, struct fib6_table *tb) @@ -693,6 +715,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, { struct rt6_info *iter = NULL; struct rt6_info **ins; + struct rt6_info **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -716,8 +739,13 @@ static int 
fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (iter->dst.dev == rt->dst.dev && @@ -732,6 +760,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt6_clean_expires(iter); else rt6_set_expires(iter, rt->dst.expires); + iter->rt6i_pmtu = rt->rt6i_pmtu; return -EEXIST; } /* If we have the same destination and the same metric, @@ -753,9 +782,17 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (iter->rt6i_metric > rt->rt6i_metric) break; +next_iter: ins = &iter->dst.rt6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = *ins; + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; @@ -815,6 +852,8 @@ add: } } else { + int nsiblings; + if (!found) { if (add) goto add; @@ -835,8 +874,27 @@ add: info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); rt6_release(iter); + + if (nsiblings) { + /* Replacing an ECMP route, remove all siblings */ + ins = &rt->dst.rt6_next; + iter = *ins; + while (iter) { + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->dst.rt6_next; + fib6_purge_rt(iter, fn, info->nl_net); + rt6_release(iter); + nsiblings--; + } else { + ins = &iter->dst.rt6_next; + } + iter = *ins; + } + WARN_ON(nsiblings != 0); + } } return 0; @@ -1206,7 +1264,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf != NULL); + WARN_ON(fn->leaf); children = 0; child = NULL; @@ -1361,7 +1419,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) 
#if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { - WARN_ON(fn != NULL); + WARN_ON(fn); return -ENOENT; } #endif diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index f45d6db50a45..1f9ebe3cbb4a 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -100,7 +100,6 @@ static void fl_free(struct ip6_flowlabel *fl) if (fl) { if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); - release_net(fl->fl_net); kfree(fl->opt); kfree_rcu(fl, rcu); } @@ -206,7 +205,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); - if (lfl == NULL) + if (!lfl) break; } } @@ -220,7 +219,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); - if (lfl != NULL) { + if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); return lfl; @@ -298,10 +297,10 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, { struct ipv6_txoptions *fl_opt = fl->opt; - if (fopt == NULL || fopt->opt_flen == 0) + if (!fopt || fopt->opt_flen == 0) return fl_opt; - if (fl_opt != NULL) { + if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; @@ -367,7 +366,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); - if (fl == NULL) + if (!fl) goto done; if (olen > 0) { @@ -377,7 +376,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); - if (fl->opt == NULL) + if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); @@ -403,7 +402,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } } - fl->fl_net = hold_net(net); + fl->fl_net = net; fl->expires = jiffies; err = 
fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) @@ -596,8 +595,12 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; + if (net->ipv6.sysctl.flowlabel_state_ranges && + (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) + return -ERANGE; + fl = fl_create(net, sk, &freq, optval, optlen, &err); - if (fl == NULL) + if (!fl) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); @@ -617,7 +620,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } rcu_read_unlock_bh(); - if (fl1 == NULL) + if (!fl1) fl1 = fl_lookup(net, freq.flr_label); if (fl1) { recheck: @@ -634,7 +637,7 @@ recheck: goto release; err = -ENOMEM; - if (sfl1 == NULL) + if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; @@ -654,7 +657,7 @@ release: goto done; err = -ENOMEM; - if (sfl1 == NULL) + if (!sfl1) goto done; err = mem_check(sk); @@ -662,7 +665,7 @@ release: goto done; fl1 = fl_intern(net, fl, freq.flr_label); - if (fl1 != NULL) + if (fl1) goto recheck; if (!freq.flr_label) { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index bc28b7d42a6d..a38d3ac0f18f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -223,7 +223,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, } } - if (cand != NULL) + if (cand) return cand; dev = ign->fb_tunnel_dev; @@ -395,7 +395,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, flags & GRE_KEY ? 
*(((__be32 *)p) + (grehlen / 4) - 1) : 0, p[1]); - if (t == NULL) + if (!t) return; switch (type) { @@ -760,7 +760,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); - ip6tunnel_xmit(skb, dev); + ip6tunnel_xmit(NULL, skb, dev); if (ndst) ip6_tnl_dst_store(tunnel, ndst); return 0; @@ -980,7 +980,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) &p->raddr, &p->laddr, p->link, strict); - if (rt == NULL) + if (!rt) return; if (rt->dst.dev) { @@ -1073,7 +1073,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); - if (t == NULL) + if (!t) t = netdev_priv(dev); } memset(&p, 0, sizeof(p)); @@ -1105,7 +1105,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (t) { if (t->dev != dev) { err = -EEXIST; break; @@ -1144,7 +1144,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, err = -ENOENT; ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); - if (t == NULL) + if (!t) goto done; err = -EPERM; if (t == netdev_priv(ign->fb_tunnel_dev)) @@ -1216,6 +1216,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_do_ioctl = ip6gre_tunnel_ioctl, .ndo_change_mtu = ip6gre_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, }; static void ip6gre_dev_free(struct net_device *dev) @@ -1238,7 +1239,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->flags |= IFF_NOARP; - dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); } @@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel; - int i; tunnel = 
netdev_priv(dev); @@ -1260,18 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; - for_each_possible_cpu(i) { - struct pcpu_sw_netstats *ip6gre_tunnel_stats; - ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); - u64_stats_init(&ip6gre_tunnel_stats->syncp); - } - - dev->iflink = tunnel->parms.link; - return 0; } @@ -1313,7 +1304,7 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) t = rtnl_dereference(ign->tunnels[prio][h]); - while (t != NULL) { + while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ @@ -1412,7 +1403,7 @@ static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) goto out; if (data[IFLA_GRE_REMOTE]) { - nla_memcpy(&daddr, data[IFLA_GRE_REMOTE], sizeof(struct in6_addr)); + daddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); if (ipv6_addr_any(&daddr)) return -EINVAL; } @@ -1446,10 +1437,10 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) - nla_memcpy(&parms->laddr, data[IFLA_GRE_LOCAL], sizeof(struct in6_addr)); + parms->laddr = nla_get_in6_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) - nla_memcpy(&parms->raddr, data[IFLA_GRE_REMOTE], sizeof(struct in6_addr)); + parms->raddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]); @@ -1480,8 +1471,6 @@ static int ip6gre_tap_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - dev->iflink = tunnel->parms.link; - return 0; } @@ -1493,6 +1482,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6gre_tunnel_change_mtu, 
.ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, }; static void ip6gre_tap_setup(struct net_device *dev) @@ -1503,7 +1493,6 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->destructor = ip6gre_dev_free; - dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; } @@ -1622,8 +1611,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || - nla_put(skb, IFLA_GRE_LOCAL, sizeof(struct in6_addr), &p->laddr) || - nla_put(skb, IFLA_GRE_REMOTE, sizeof(struct in6_addr), &p->raddr) || + nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || + nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || /*nla_put_u8(skb, IFLA_GRE_TOS, t->priority) ||*/ nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index aacdcb4dc762..57990c929cd8 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -46,8 +46,7 @@ #include <net/xfrm.h> #include <net/inet_ecn.h> - -int ip6_rcv_finish(struct sk_buff *skb) +int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb) { if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -183,7 +182,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. 
*/ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb, + dev, NULL, ip6_rcv_finish); err: IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); @@ -198,7 +198,7 @@ drop: */ -static int ip6_input_finish(struct sk_buff *skb) +static int ip6_input_finish(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); const struct inet6_protocol *ipprot; @@ -221,7 +221,7 @@ resubmit: raw = raw6_local_deliver(skb, nexthdr); ipprot = rcu_dereference(inet6_protos[nexthdr]); - if (ipprot != NULL) { + if (ipprot) { int ret; if (ipprot->flags & INET6_PROTO_FINAL) { @@ -277,7 +277,8 @@ discard: int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, NULL, skb, + skb->dev, NULL, ip6_input_finish); } @@ -330,10 +331,10 @@ int ip6_mc_input(struct sk_buff *skb) if (offset < 0) goto out; - if (!ipv6_is_mld(skb, nexthdr, offset)) - goto out; + if (ipv6_is_mld(skb, nexthdr, offset)) + deliver = true; - deliver = true; + goto out; } /* unknown RA - process it normally */ } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 46d452a56d3e..08b62047c67f 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -124,7 +124,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); fptr->frag_off = htons(offset); - if (skb->next != NULL) + if (skb->next) fptr->frag_off |= htons(IP6_MF); offset += (ntohs(ipv6h->payload_len) - sizeof(struct frag_hdr)); @@ -292,8 +292,6 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { .gso_segment = ipv6_gso_segment, - .gro_receive = ipv6_gro_receive, - .gro_complete = ipv6_gro_complete, }, }; diff --git 
a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 36cf0ab685a0..d5f7716662db 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,7 +56,7 @@ #include <net/checksum.h> #include <linux/mroute6.h> -static int ip6_finish_output2(struct sk_buff *skb) +static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; @@ -70,7 +70,7 @@ static int ip6_finish_output2(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && + if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_socket(dev_net(dev), skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, @@ -82,7 +82,7 @@ static int ip6_finish_output2(struct sk_buff *skb) */ if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, - newskb, NULL, newskb->dev, + sk, newskb, NULL, newskb->dev, dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { @@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst); + nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); @@ -122,14 +122,14 @@ static int ip6_finish_output2(struct sk_buff *skb) return -EINVAL; } -static int ip6_finish_output(struct sk_buff *skb) +static int ip6_finish_output(struct sock *sk, struct sk_buff *skb) { if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) - return ip6_fragment(skb, ip6_finish_output2); + return ip6_fragment(sk, skb, ip6_finish_output2); else - return ip6_finish_output2(skb); + return ip6_finish_output2(sk, skb); } int 
ip6_output(struct sock *sk, struct sk_buff *skb) @@ -143,7 +143,8 @@ int ip6_output(struct sock *sk, struct sk_buff *skb) return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb, + NULL, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } @@ -177,7 +178,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (skb_headroom(skb) < head_room) { struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (skb2 == NULL) { + if (!skb2) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); @@ -223,8 +224,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, - dst->dev, dst_output); + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, + NULL, dst->dev, dst_output_sk); } skb->dev = dst->dev; @@ -316,10 +317,10 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) return 0; } -static inline int ip6_forward_finish(struct sk_buff *skb) +static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb) { skb_sender_cpu_clear(skb); - return dst_output(skb); + return dst_output_sk(sk, skb); } static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) @@ -458,7 +459,7 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) @@ -511,7 +512,8 @@ int ip6_forward(struct sk_buff *skb) IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); - return 
NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb, + skb->dev, dst->dev, ip6_forward_finish); error: @@ -538,7 +540,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +int ip6_fragment(struct sock *sk, struct sk_buff *skb, + int (*output)(struct sock *, struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); @@ -548,7 +551,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) struct frag_hdr *fh; unsigned int mtu, hlen, left, len; int hroom, troom; - __be32 frag_id = 0; + __be32 frag_id; int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; struct net *net = dev_net(skb_dst(skb)->dev); @@ -561,18 +564,17 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. 
*/ - if (unlikely(!skb->ignore_df && skb->len > mtu) || - (IP6CB(skb)->frag_max_size && - IP6CB(skb)->frag_max_size > mtu)) { - if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + if (unlikely(!skb->ignore_df && skb->len > mtu)) + goto fail_toobig; - skb->dev = skb_dst(skb)->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); - return -EMSGSIZE; + if (IP6CB(skb)->frag_max_size) { + if (IP6CB(skb)->frag_max_size > mtu) + goto fail_toobig; + + /* don't send fragments larger than what we received */ + mtu = IP6CB(skb)->frag_max_size; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; } if (np && np->frag_size < mtu) { @@ -581,6 +583,9 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } mtu -= hlen + sizeof(struct frag_hdr); + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; @@ -629,11 +634,10 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); - ipv6_select_ident(fh, rt); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); - frag_id = fh->identification; + fh->identification = frag_id; first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); @@ -658,7 +662,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(offset); - if (frag->next != NULL) + if (frag->next) fh->frag_off |= htons(IP6_MF); fh->identification = frag_id; ipv6_hdr(frag)->payload_len = @@ -667,7 +671,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ip6_copy_metadata(frag, skb); } - err = output(skb); + err = output(sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 
IPSTATS_MIB_FRAGCREATES); @@ -775,11 +779,7 @@ slow_path: */ fh->nexthdr = nexthdr; fh->reserved = 0; - if (!frag_id) { - ipv6_select_ident(fh, rt); - frag_id = fh->identification; - } else - fh->identification = frag_id; + fh->identification = frag_id; /* * Copy a block of the IP datagram. @@ -800,7 +800,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - err = output(frag); + err = output(sk, frag); if (err) goto fail; @@ -812,6 +812,14 @@ slow_path: consume_skb(skb); return err; +fail_toobig: + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + err = -EMSGSIZE; + fail: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); @@ -824,7 +832,7 @@ static inline int ip6_rt_check(const struct rt6key *rt_key, const struct in6_addr *addr_cache) { return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && - (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); + (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); } static struct dst_entry *ip6_sk_dst_check(struct sock *sk, @@ -883,22 +891,45 @@ static int ip6_dst_lookup_tail(struct sock *sk, #endif int err; - if (*dst == NULL) - *dst = ip6_route_output(net, sk, fl6); - - err = (*dst)->error; - if (err) - goto out_err_release; + /* The correct way to handle this would be to do + * ip6_route_get_saddr, and then ip6_route_output; however, + * the route-specific preferred source forces the + * ip6_route_output call _before_ ip6_route_get_saddr. + * + * In source specific routing (no src=any default route), + * ip6_route_output will fail given src=any saddr, though, so + * that's why we try it again later. 
+ */ + if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct rt6_info *rt; + bool had_dst = *dst != NULL; - if (ipv6_addr_any(&fl6->saddr)) { - struct rt6_info *rt = (struct rt6_info *) *dst; + if (!had_dst) + *dst = ip6_route_output(net, sk, fl6); + rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; err = ip6_route_get_saddr(net, rt, &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); if (err) goto out_err_release; + + /* If we had an erroneous initial result, pretend it + * never existed and let the SA-enabled version take + * over. + */ + if (!had_dst && (*dst)->error) { + dst_release(*dst); + *dst = NULL; + } } + if (!*dst) + *dst = ip6_route_output(net, sk, fl6); + + err = (*dst)->error; + if (err) + goto out_err_release; + #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * Here if the dst entry we've looked up @@ -910,7 +941,8 @@ static int ip6_dst_lookup_tail(struct sock *sk, */ rt = (struct rt6_info *) *dst; rcu_read_lock_bh(); - n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, + rt6_nexthop(rt, &fl6->daddr)); err = n && !(n->nud_state & NUD_VALID) ? 
-EINVAL : 0; rcu_read_unlock_bh(); @@ -1034,11 +1066,10 @@ static inline int ip6_ufo_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, int transhdrlen, int mtu, unsigned int flags, - struct rt6_info *rt) + const struct flowi6 *fl6) { struct sk_buff *skb; - struct frag_hdr fhdr; int err; /* There is support for UDP large send offload by network @@ -1046,11 +1077,11 @@ static inline int ip6_ufo_append_data(struct sock *sk, * udp datagram */ skb = skb_peek_tail(queue); - if (skb == NULL) { + if (!skb) { skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); - if (skb == NULL) + if (!skb) return err; /* reserve space for Hardware header */ @@ -1080,8 +1111,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - sizeof(struct frag_hdr)) & ~7; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - ipv6_select_ident(&fhdr, rt); - skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk), + &fl6->daddr, + &fl6->saddr); append: return skb_append_datato_frags(sk, skb, getfrag, from, @@ -1108,7 +1140,7 @@ static void ip6_append_data_mtu(unsigned int *mtu, unsigned int orig_mtu) { if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { - if (skb == NULL) { + if (!skb) { /* first fragment, reserve header_len */ *mtu = orig_mtu - rt->dst.header_len; @@ -1140,7 +1172,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, return -EINVAL; v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation); - if (unlikely(v6_cork->opt == NULL)) + if (unlikely(!v6_cork->opt)) return -ENOBUFS; v6_cork->opt->tot_len = opt->tot_len; @@ -1274,8 +1306,10 @@ emsgsize: /* If this is the first and only packet and device * supports checksum offloading, let's use it. + * Use transhdrlen, same as IPv4, because partial + * sums only work when transhdrlen is set. 
*/ - if (!skb && sk->sk_protocol == IPPROTO_UDP && + if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && length + fragheaderlen < mtu && rt->dst.dev->features & NETIF_F_V6_CSUM && !exthdrlen) @@ -1304,7 +1338,7 @@ emsgsize: (sk->sk_type == SOCK_DGRAM)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, - transhdrlen, mtu, flags, rt); + transhdrlen, mtu, flags, fl6); if (err) goto error; return 0; @@ -1332,7 +1366,7 @@ alloc_new_skb: else fraggap = 0; /* update mtu and maxfraglen if necessary */ - if (skb == NULL || skb_prev == NULL) + if (!skb || !skb_prev) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, orig_mtu); @@ -1384,10 +1418,10 @@ alloc_new_skb: skb = sock_wmalloc(sk, alloclen + hh_len, 1, sk->sk_allocation); - if (unlikely(skb == NULL)) + if (unlikely(!skb)) err = -ENOBUFS; } - if (skb == NULL) + if (!skb) goto error; /* * Fill in the control structures @@ -1579,7 +1613,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, unsigned char proto = fl6->flowi6_proto; skb = __skb_dequeue(queue); - if (skb == NULL) + if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ddd94eca19b3..2e67b660118b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -64,12 +64,6 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); -#ifdef IP6_TNL_DEBUG -#define IP6_TNL_TRACE(x...) pr_debug("%s:" x "\n", __func__) -#else -#define IP6_TNL_TRACE(x...) 
do {;} while(0) -#endif - #define HASH_SIZE_SHIFT 5 #define HASH_SIZE (1 << HASH_SIZE_SHIFT) @@ -137,7 +131,7 @@ struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) struct dst_entry *dst = t->dst_cache; if (dst && dst->obsolete && - dst->ops->check(dst, t->dst_cookie) == NULL) { + !dst->ops->check(dst, t->dst_cookie)) { t->dst_cache = NULL; dst_release(dst); return NULL; @@ -157,7 +151,7 @@ EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + t->dst_cookie = rt6_get_cookie(rt); dst_release(t->dst_cache); t->dst_cache = dst; } @@ -331,7 +325,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); - if (dev == NULL) + if (!dev) goto failed; dev_net_set(dev, net); @@ -502,7 +496,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr); - if (t == NULL) + if (!t) goto out; tproto = ACCESS_ONCE(t->parms.proto); @@ -813,7 +807,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); - if (t != NULL) { + if (t) { struct pcpu_sw_netstats *tstats; tproto = ACCESS_ONCE(t->parms.proto); @@ -1106,7 +1100,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; - ip6tunnel_xmit(skb, dev); + ip6tunnel_xmit(NULL, skb, dev); if (ndst) ip6_tnl_dst_store(t, ndst); return 0; @@ -1270,8 +1264,6 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) else dev->flags &= ~IFF_POINTOPOINT; - dev->iflink = p->link; - if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); @@ -1280,7 
+1272,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) &p->raddr, &p->laddr, p->link, strict); - if (rt == NULL) + if (!rt) return; if (rt->dst.dev) { @@ -1523,6 +1515,13 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) return 0; } +int ip6_tnl_get_iflink(const struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + + return t->parms.link; +} +EXPORT_SYMBOL(ip6_tnl_get_iflink); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, @@ -1531,6 +1530,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats = ip6_get_stats, + .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1646,12 +1646,10 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) - nla_memcpy(&parms->laddr, data[IFLA_IPTUN_LOCAL], - sizeof(struct in6_addr)); + parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) - nla_memcpy(&parms->raddr, data[IFLA_IPTUN_REMOTE], - sizeof(struct in6_addr)); + parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); @@ -1745,10 +1743,8 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || - nla_put(skb, IFLA_IPTUN_LOCAL, sizeof(struct in6_addr), - &parm->laddr) || - nla_put(skb, IFLA_IPTUN_REMOTE, sizeof(struct in6_addr), - &parm->raddr) || + nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || + nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || @@ -1821,7 +1817,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net 
*net) for (h = 0; h < HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); - while (t != NULL) { + while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 32d9b268e7d8..e1a1136bda7c 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -19,12 +19,10 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); @@ -55,14 +53,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb, +int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, __be16 src_port, @@ -97,7 +96,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb, ip6h->daddr = *daddr; ip6h->saddr = *saddr; - ip6tunnel_xmit(skb, dev); + ip6tunnel_xmit(sk, skb, dev); return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5fb9e212eca8..0224c032dca5 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -218,7 +218,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p sprintf(name, "ip6_vti%%d"); dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); - if (dev == NULL) + if (!dev) goto failed; dev_net_set(dev, net); @@ -288,8 
+288,7 @@ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, static void vti6_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); @@ -305,7 +304,7 @@ static int vti6_rcv(struct sk_buff *skb) rcu_read_lock(); t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); - if (t != NULL) { + if (t) { if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { rcu_read_unlock(); goto discard; @@ -323,7 +322,6 @@ static int vti6_rcv(struct sk_buff *skb) } XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; - skb->mark = be32_to_cpu(t->parms.i_key); rcu_read_unlock(); @@ -343,6 +341,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) struct pcpu_sw_netstats *tstats; struct xfrm_state *x; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + u32 orig_mark = skb->mark; + int ret; if (!t) return 1; @@ -359,7 +359,11 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); family = x->inner_mode->afinfo->family; - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + skb->mark = be32_to_cpu(t->parms.i_key); + ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); + skb->mark = orig_mark; + + if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); @@ -431,6 +435,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct net_device *tdev; struct xfrm_state *x; int err = -1; + int mtu; if (!dst) goto tx_err_link_failure; @@ -464,6 +469,19 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; + mtu = dst_mtu(dst); + if (!skb->ignore_df && skb->len > mtu) { + skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + + if (skb->protocol == 
htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + + return -EMSGSIZE; + } + err = dst_output(skb); if (net_xmit_eval(err) == 0) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); @@ -496,7 +514,6 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) int ret; memset(&fl, 0, sizeof(fl)); - skb->mark = be32_to_cpu(t->parms.o_key); switch (skb->protocol) { case htons(ETH_P_IPV6): @@ -517,6 +534,9 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_err; } + /* override mark with tunnel output key */ + fl.flowi_mark = be32_to_cpu(t->parms.o_key); + ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; @@ -601,8 +621,6 @@ static void vti6_link_config(struct ip6_tnl *t) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; - - dev->iflink = p->link; } /** @@ -716,7 +734,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } else { memset(&p, 0, sizeof(p)); } - if (t == NULL) + if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) @@ -736,7 +754,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (t) { if (t->dev != dev) { err = -EEXIST; break; @@ -767,7 +785,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = -ENOENT; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); - if (t == NULL) + if (!t) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) @@ -808,6 +826,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_do_ioctl = vti6_ioctl, .ndo_change_mtu = vti6_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip6_tnl_get_iflink, }; /** @@ -897,12 +916,10 @@ static void vti6_netlink_parms(struct nlattr *data[], 
parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_LOCAL]) - nla_memcpy(&parms->laddr, data[IFLA_VTI_LOCAL], - sizeof(struct in6_addr)); + parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) - nla_memcpy(&parms->raddr, data[IFLA_VTI_REMOTE], - sizeof(struct in6_addr)); + parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); @@ -983,10 +1000,8 @@ static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) || - nla_put(skb, IFLA_VTI_LOCAL, sizeof(struct in6_addr), - &parm->laddr) || - nla_put(skb, IFLA_VTI_REMOTE, sizeof(struct in6_addr), - &parm->raddr) || + nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) || + nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) || nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key)) goto nla_put_failure; @@ -1027,7 +1042,7 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) for (h = 0; h < HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); - while (t != NULL) { + while (t) { unregister_netdevice_queue(t->dev, &list); t = rtnl_dereference(t->next); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 312e0ff47339..74ceb73c1c9a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -56,9 +56,7 @@ struct mr6_table { struct list_head list; -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; u32 id; struct sock *mroute6_sk; struct timer_list ipmr_expire_timer; @@ -175,7 +173,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, } mrt = ip6mr_get_table(rule->fr_net, rule->table); - if (mrt == NULL) + if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; @@ -239,7 +237,7 @@ static int __net_init ip6mr_rules_init(struct net *net) INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, 
RT6_TABLE_DFLT); - if (mrt == NULL) { + if (!mrt) { err = -ENOMEM; goto err1; } @@ -307,11 +305,11 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) unsigned int i; mrt = ip6mr_get_table(net, id); - if (mrt != NULL) + if (mrt) return mrt; mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (mrt == NULL) + if (!mrt) return NULL; mrt->id = id; write_pnet(&mrt->net, net); @@ -410,7 +408,7 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) struct mr6_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; @@ -494,7 +492,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) struct mr6_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return ERR_PTR(-ENOENT); it->mrt = mrt; @@ -667,7 +665,7 @@ static int pim6_rcv(struct sk_buff *skb) dev_hold(reg_dev); read_unlock(&mrt_lock); - if (reg_dev == NULL) + if (!reg_dev) goto drop; skb->mac_header = skb->network_header; @@ -720,8 +718,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } +static int reg_vif_get_iflink(const struct net_device *dev) +{ + return 0; +} + static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, + .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) @@ -745,7 +749,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); - if (dev == NULL) + if (!dev) return NULL; dev_net_set(dev, net); @@ -754,7 +758,6 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) free_netdev(dev); return NULL; } - dev->iflink = 0; if (dev_open(dev)) goto failure; @@ -994,7 +997,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, v->pkt_out = 0; v->link = dev->ifindex; if (v->flags & MIFF_REGISTER) - 
v->link = dev->iflink; + v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -1074,7 +1077,7 @@ skip: static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); - if (c == NULL) + if (!c) return NULL; c->mfc_un.res.minvif = MAXMIFS; return c; @@ -1083,7 +1086,7 @@ static struct mfc6_cache *ip6mr_cache_alloc(void) static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); - if (c == NULL) + if (!c) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10 * HZ; @@ -1200,7 +1203,7 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (mrt->mroute6_sk == NULL) { + if (!mrt->mroute6_sk) { kfree_skb(skb); return -EINVAL; } @@ -1495,7 +1498,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, return -EINVAL; c = ip6mr_cache_alloc(); - if (c == NULL) + if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; @@ -1665,7 +1668,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns struct mr6_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { @@ -1814,7 +1817,7 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, struct mr6_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return -ENOENT; switch (optname) { @@ -1861,7 +1864,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) struct mr6_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return -ENOENT; switch (cmd) { @@ -1935,7 +1938,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) struct mr6_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return -ENOENT; switch (cmd) { @@ -1983,13 +1986,13 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) } #endif -static inline int ip6mr_forward2_finish(struct sk_buff *skb) +static inline int ip6mr_forward2_finish(struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTOCTETS, skb->len); - return dst_output(skb); + return dst_output_sk(sk, skb); } /* @@ -2005,7 +2008,7 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, struct dst_entry *dst; struct flowi6 fl6; - if (vif->dev == NULL) + if (!vif->dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 @@ -2061,7 +2064,8 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb, + skb->dev, dev, ip6mr_forward2_finish); out_free: @@ -2194,7 +2198,7 @@ int ip6_mr_input(struct sk_buff *skb) read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); - if (cache == NULL) { + if (!cache) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) @@ -2206,7 +2210,7 @@ int ip6_mr_input(struct sk_buff *skb) /* * No usable cache entry */ - if (cache == NULL) { + if (!cache) { int vif; vif = ip6mr_find_vif(mrt, skb->dev); @@ -2245,13 +2249,13 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) return 
-EMSGSIZE; mp_attr = nla_nest_start(skb, RTA_MULTIPATH); - if (mp_attr == NULL) + if (!mp_attr) return -EMSGSIZE; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); - if (nhp == NULL) { + if (!nhp) { nla_nest_cancel(skb, mp_attr); return -EMSGSIZE; } @@ -2284,7 +2288,7 @@ int ip6mr_get_route(struct net *net, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (mrt == NULL) + if (!mrt) return -ENOENT; read_lock(&mrt_lock); @@ -2309,7 +2313,7 @@ int ip6mr_get_route(struct net *net, } dev = skb->dev; - if (dev == NULL || (vif = ip6mr_find_vif(mrt, dev)) < 0) { + if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { read_unlock(&mrt_lock); return -ENODEV; } @@ -2361,7 +2365,7 @@ static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); - if (nlh == NULL) + if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); @@ -2380,8 +2384,8 @@ static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; - if (nla_put(skb, RTA_SRC, 16, &c->mf6c_origin) || - nla_put(skb, RTA_DST, 16, &c->mf6c_mcastgrp)) + if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || + nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = __ip6mr_fill_mroute(mrt, skb, c, rtm); /* do not break the dump if cache is unresolved */ @@ -2426,7 +2430,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); - if (skb == NULL) + if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 8d766d9100cb..63e6956917c9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ 
-85,7 +85,7 @@ int ip6_ra_control(struct sock *sk, int sel) return 0; } } - if (new_ra == NULL) { + if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } @@ -117,6 +117,25 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, return opt; } +static bool setsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + case IPV6_JOIN_ANYCAST: + case IPV6_LEAVE_ANYCAST: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_MSFILTER: + return true; + } + return false; +} + static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -124,8 +143,9 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; + bool needs_rtnl = setsockopt_needs_rtnl(optname); - if (optval == NULL) + if (!optval) val = 0; else { if (optlen >= sizeof(int)) { @@ -140,6 +160,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -370,7 +392,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, */ if (optlen == 0) optval = NULL; - else if (optval == NULL) + else if (!optval) goto e_inval; else if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) @@ -421,7 +443,7 @@ sticky_done: if (optlen == 0) goto e_inval; - else if (optlen < sizeof(struct in6_pktinfo) || optval == NULL) + else if (optlen < sizeof(struct in6_pktinfo) || !optval) goto e_inval; if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) { @@ -460,7 +482,7 @@ sticky_done: opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; - if (opt == 
NULL) + if (!opt) break; memset(opt, 0, sizeof(*opt)); @@ -624,10 +646,10 @@ done: psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) retv = ipv6_sock_mc_join(sk, greq.gr_interface, - &psin6->sin6_addr); + &psin6->sin6_addr); else retv = ipv6_sock_mc_drop(sk, greq.gr_interface, - &psin6->sin6_addr); + &psin6->sin6_addr); break; } case MCAST_JOIN_SOURCE_GROUP: @@ -660,7 +682,7 @@ done: psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, - &psin6->sin6_addr); + &psin6->sin6_addr); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) break; @@ -837,11 +859,15 @@ pref_skip_coa: } release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return retv; e_inval: release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return -EINVAL; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ce107c8aab3..083b2927fc67 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -140,6 +140,8 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) struct net *net = sock_net(sk); int err; + ASSERT_RTNL(); + if (!ipv6_addr_is_multicast(addr)) return -EINVAL; @@ -155,13 +157,12 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); - if (mc_lst == NULL) + if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; - rtnl_lock(); if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, 0); @@ -172,8 +173,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } else dev = __dev_get_by_index(net, ifindex); - if (dev == NULL) { - rtnl_unlock(); + if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -190,7 +190,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { - rtnl_unlock(); sock_kfree_s(sk, mc_lst, 
sizeof(*mc_lst)); return err; } @@ -198,10 +197,9 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); - rtnl_unlock(); - return 0; } +EXPORT_SYMBOL(ipv6_sock_mc_join); /* * socket leave on multicast group @@ -213,10 +211,11 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) struct ipv6_mc_socklist __rcu **lnk; struct net *net = sock_net(sk); + ASSERT_RTNL(); + if (!ipv6_addr_is_multicast(addr)) return -EINVAL; - rtnl_lock(); for (lnk = &np->ipv6_mc_list; (mc_lst = rtnl_dereference(*lnk)) != NULL; lnk = &mc_lst->next) { @@ -227,7 +226,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) *lnk = mc_lst->next; dev = __dev_get_by_index(net, mc_lst->ifindex); - if (dev != NULL) { + if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); (void) ip6_mc_leave_src(sk, mc_lst, idev); @@ -235,17 +234,16 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - rtnl_unlock(); atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } - rtnl_unlock(); return -EADDRNOTAVAIL; } +EXPORT_SYMBOL(ipv6_sock_mc_drop); /* called with rcu_read_lock() */ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, @@ -438,7 +436,7 @@ done: read_unlock_bh(&idev->lock); rcu_read_unlock(); if (leavegroup) - return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); + err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; } @@ -825,7 +823,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, struct ifmcaddr6 *mc; mc = kzalloc(sizeof(*mc), GFP_ATOMIC); - if (mc == NULL) + if (!mc) return NULL; setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); @@ -862,7 +860,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) 
/* we need to take a reference on idev */ idev = in6_dev_get(dev); - if (idev == NULL) + if (!idev) return -EINVAL; write_lock_bh(&idev->lock); @@ -1330,7 +1328,7 @@ int igmp6_event_query(struct sk_buff *skb) return -EINVAL; idev = __in6_dev_get(skb->dev); - if (idev == NULL) + if (!idev) return 0; mld = (struct mld_msg *)icmp6_hdr(skb); @@ -1445,7 +1443,7 @@ int igmp6_event_report(struct sk_buff *skb) return -EINVAL; idev = __in6_dev_get(skb->dev); - if (idev == NULL) + if (!idev) return -ENODEV; /* @@ -1646,8 +1644,9 @@ static void mld_sendpack(struct sk_buff *skb) payload_len = skb->len; - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, - dst_output); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net->ipv6.igmp_sk, skb, NULL, skb->dev, + dst_output_sk); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); @@ -1964,7 +1963,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); - if (skb == NULL) { + if (!skb) { rcu_read_lock(); IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTDISCARDS); @@ -2009,8 +2008,8 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } skb_dst_set(skb, dst); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, - dst_output); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, + NULL, skb->dev, dst_output_sk); out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); @@ -2613,7 +2612,7 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr im = im->next; while (!im) { - if (likely(state->idev != NULL)) + if (likely(state->idev)) read_unlock_bh(&state->idev->lock); state->dev = next_net_device_rcu(state->dev); @@ -2659,7 +2658,7 @@ static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); - if (likely(state->idev != NULL)) { + if (likely(state->idev)) { 
read_unlock_bh(&state->idev->lock); state->idev = NULL; } @@ -2728,10 +2727,10 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) continue; read_lock_bh(&idev->lock); im = idev->mc_list; - if (likely(im != NULL)) { + if (likely(im)) { spin_lock_bh(&im->mca_lock); psf = im->mca_sources; - if (likely(psf != NULL)) { + if (likely(psf)) { state->im = im; state->idev = idev; break; @@ -2752,7 +2751,7 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s spin_unlock_bh(&state->im->mca_lock); state->im = state->im->next; while (!state->im) { - if (likely(state->idev != NULL)) + if (likely(state->idev)) read_unlock_bh(&state->idev->lock); state->dev = next_net_device_rcu(state->dev); @@ -2806,11 +2805,11 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); - if (likely(state->im != NULL)) { + if (likely(state->im)) { spin_unlock_bh(&state->im->mca_lock); state->im = NULL; } - if (likely(state->idev != NULL)) { + if (likely(state->idev)) { read_unlock_bh(&state->idev->lock); state->idev = NULL; } @@ -2907,20 +2906,32 @@ static int __net_init igmp6_net_init(struct net *net) inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; + err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n", + err); + goto out_sock_create; + } + err = igmp6_proc_init(net); if (err) - goto out_sock_create; -out: - return err; + goto out_sock_create_autojoin; + + return 0; +out_sock_create_autojoin: + inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); out_sock_create: inet_ctl_sock_destroy(net->ipv6.igmp_sk); - goto out; +out: + return err; } static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); + inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk); igmp6_proc_exit(net); } diff --git 
a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c new file mode 100644 index 000000000000..df8afe5ab31e --- /dev/null +++ b/net/ipv6/mcast_snoop.c @@ -0,0 +1,213 @@ +/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * + * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. + */ + +#include <linux/skbuff.h> +#include <net/ipv6.h> +#include <net/mld.h> +#include <net/addrconf.h> +#include <net/ip6_checksum.h> + +static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + int offset; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = skb_network_offset(skb) + sizeof(*ip6h); + offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + 
skb_set_transport_header(skb, offset); + + return 0; +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len += sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? */ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(&mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret; + + transport_len = 
ntohs(ipv6_hdr(skb)->payload_len); + transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ipv6_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ipv6_mc_check_mld_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an MLD packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. 
+ */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 14ecdaf06bf7..c53331cfed95 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -84,6 +84,7 @@ do { \ static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); +static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -117,7 +118,9 @@ static const struct neigh_ops ndisc_direct_ops = { struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), + .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, + .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, @@ -294,6 +297,11 @@ static u32 ndisc_hash(const void *pkey, return ndisc_hashfn(pkey, dev, hash_rnd); } +static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) +{ + return neigh_key_eq128(n, pkey); +} + static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; @@ -303,7 +311,7 @@ static int ndisc_constructor(struct neighbour *neigh) bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); - if (in6_dev == NULL) { + if (!in6_dev) { return -EINVAL; } @@ -348,7 +356,7 @@ static int pndisc_constructor(struct pneigh_entry *n) struct in6_addr maddr; struct net_device *dev = n->dev; - if (dev == NULL || __in6_dev_get(dev) == NULL) + if (!dev || !__in6_dev_get(dev)) return -EINVAL; 
addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); @@ -361,7 +369,7 @@ static void pndisc_destructor(struct pneigh_entry *n) struct in6_addr maddr; struct net_device *dev = n->dev; - if (dev == NULL || __in6_dev_get(dev) == NULL) + if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); @@ -455,8 +463,9 @@ static void ndisc_send_skb(struct sk_buff *skb, idev = __in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, - dst_output); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, + NULL, dst->dev, + dst_output_sk); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); @@ -552,7 +561,7 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, int optlen = 0; struct nd_msg *msg; - if (saddr == NULL) { + if (!saddr) { if (ipv6_get_lladdr(dev, &addr_buf, (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) return; @@ -1022,13 +1031,13 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr)); skb = nlmsg_new(msg_size, GFP_ATOMIC); - if (skb == NULL) { + if (!skb) { err = -ENOBUFS; goto errout; } nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); - if (nlh == NULL) { + if (!nlh) { goto nla_put_failure; } @@ -1041,8 +1050,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); - if (nla_put(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), - &ipv6_hdr(ra)->saddr)) + if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -1096,7 +1104,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) */ in6_dev = __in6_dev_get(skb->dev); - if (in6_dev == NULL) { + if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", 
skb->dev->name); return; @@ -1191,11 +1199,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", rt, lifetime, skb->dev->name); - if (rt == NULL && lifetime) { + if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); - if (rt == NULL) { + if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); @@ -1203,7 +1211,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); - if (neigh == NULL) { + if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); @@ -1498,7 +1506,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); @@ -1642,6 +1650,7 @@ int ndisc_rcv(struct sk_buff *skb) static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; @@ -1656,6 +1665,11 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, ndisc_send_unsol_na(dev); in6_dev_put(idev); break; + case NETDEV_CHANGE: + change_info = ptr; + if (change_info->flags_changed & IFF_NOARP) + neigh_changeaddr(&nd_tbl, dev); + break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 398377a9d018..b4de08a83e0b 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -84,7 +84,7 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, { 
struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (entry->hook == NF_INET_LOCAL_OUT) { + if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; @@ -98,7 +98,7 @@ static int nf_ip6_reroute(struct sk_buff *skb, { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (entry->hook == NF_INET_LOCAL_OUT) { + if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct ipv6hdr *iph = ipv6_hdr(skb); if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || @@ -191,6 +191,8 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, static const struct nf_ipv6_ops ipv6ops = { .chk_addr = ipv6_chk_addr, + .route_input = ip6_route_input, + .fragment = ip6_fragment }; static const struct nf_afinfo nf_ip6_afinfo = { diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index a069822936e6..b552cf0d6198 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -25,14 +25,16 @@ config NF_CONNTRACK_IPV6 To compile it as a module, choose M here. If unsure, say N. +if NF_TABLES + config NF_TABLES_IPV6 - depends on NF_TABLES tristate "IPv6 nf_tables support" help This option enables the IPv6 support for nf_tables. +if NF_TABLES_IPV6 + config NFT_CHAIN_ROUTE_IPV6 - depends on NF_TABLES_IPV6 tristate "IPv6 nf_tables route chain support" help This option enables the "route" chain for IPv6 in nf_tables. This @@ -40,16 +42,18 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. 
-config NF_REJECT_IPV6 - tristate "IPv6 packet rejection" - default m if NETFILTER_ADVANCED=n - config NFT_REJECT_IPV6 - depends on NF_TABLES_IPV6 select NF_REJECT_IPV6 default NFT_REJECT tristate +endif # NF_TABLES_IPV6 +endif # NF_TABLES + +config NF_REJECT_IPV6 + tristate "IPv6 packet rejection" + default m if NETFILTER_ADVANCED=n + config NF_LOG_IPV6 tristate "IPv6 packet logging" default m if NETFILTER_ADVANCED=n @@ -182,7 +186,8 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' - depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + depends on NETFILTER_ADVANCED + depends on IP6_NF_MANGLE || IP6_NF_RAW ---help--- This option allows you to match packets whose replies would go out via the interface the packet came in. diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index bb00c6f2a885..3c35ced39b42 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -9,7 +9,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> #include <linux/capability.h> #include <linux/in.h> #include <linux/skbuff.h> @@ -234,7 +237,7 @@ static struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 4, + .level = LOGLEVEL_WARNING, .logflags = NF_LOG_MASK, }, }, @@ -280,15 +283,13 @@ static void trace_packet(const struct sk_buff *skb, const struct xt_table_info *private, const struct ip6t_entry *e) { - const void *table_base; const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; struct net *net = dev_net(in ? 
in : out); - table_base = private->entries[smp_processor_id()]; - root = get_entry(table_base, private->hook_entry[hook]); + root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; @@ -314,8 +315,7 @@ ip6t_next_entry(const struct ip6t_entry *entry) unsigned int ip6t_do_table(struct sk_buff *skb, unsigned int hook, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); @@ -330,8 +330,8 @@ ip6t_do_table(struct sk_buff *skb, unsigned int addend; /* Initialization */ - indev = in ? in->name : nulldevname; - outdev = out ? out->name : nulldevname; + indev = state->in ? state->in->name : nulldevname; + outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask @@ -339,8 +339,8 @@ ip6t_do_table(struct sk_buff *skb, * rule is also a fragment-specific rule, non-fragments won't * match it. 
*/ acpar.hotdrop = false; - acpar.in = in; - acpar.out = out; + acpar.in = state->in; + acpar.out = state->out; acpar.family = NFPROTO_IPV6; acpar.hooknum = hook; @@ -355,7 +355,7 @@ ip6t_do_table(struct sk_buff *skb, */ smp_read_barrier_depends(); cpu = smp_processor_id(); - table_base = private->entries[cpu]; + table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; @@ -365,6 +365,7 @@ ip6t_do_table(struct sk_buff *skb, do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; + struct xt_counters *counter; IP_NF_ASSERT(e); acpar.thoff = 0; @@ -382,7 +383,8 @@ ip6t_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, skb->len, 1); + counter = xt_get_this_cpu_counter(&e->counters); + ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); IP_NF_ASSERT(t->u.kernel.target); @@ -390,7 +392,7 @@ ip6t_do_table(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, in, out, + trace_packet(skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? 
*/ @@ -677,6 +679,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, if (ret) return ret; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; + j = 0; mtpar.net = net; mtpar.table = name; @@ -712,6 +718,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -795,6 +804,8 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); + + xt_percpu_counter_free(e->counters.pcnt); } /* Checks and translates the user-supplied table segment (held in @@ -877,12 +888,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) { - if (newinfo->entries[i] && newinfo->entries[i] != entry0) - memcpy(newinfo->entries[i], entry0, newinfo->size); - } - return ret; } @@ -898,14 +903,16 @@ get_counters(const struct xt_table_info *t, seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; - xt_entry_foreach(iter, t->entries[cpu], t->size) { + xt_entry_foreach(iter, t->entries, t->size) { + struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; + tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); - bcnt = iter->counters.bcnt; - pcnt = iter->counters.pcnt; + bcnt = tmp->bcnt; + pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); @@ -950,11 +957,7 @@ copy_entries_to_user(unsigned int total_size, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... 
- * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1062,16 +1065,16 @@ static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; - void *loc_cpu_entry; + const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; - /* we dont care about newinfo->entries[] */ + /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; - loc_cpu_entry = info->entries[raw_smp_processor_id()]; + loc_cpu_entry = info->entries; xt_compat_init_offsets(AF_INET6, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); @@ -1192,7 +1195,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - const void *loc_cpu_old_entry; struct ip6t_entry *iter; ret = 0; @@ -1235,8 +1237,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); @@ -1273,14 +1274,16 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - /* choose the copy 
that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1311,7 +1314,7 @@ static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i, curcpu; + unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1321,7 +1324,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, struct xt_table *t; const struct xt_table_info *private; int ret = 0; - const void *loc_cpu_entry; struct ip6t_entry *iter; unsigned int addend; #ifdef CONFIG_COMPAT @@ -1369,7 +1371,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, goto free; } - local_bh_disable(); private = t->private; if (private->number != num_counters) { @@ -1378,16 +1379,15 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, } i = 0; - /* Choose the copy that is on our node */ - curcpu = smp_processor_id(); addend = xt_write_recseq_begin(); - loc_cpu_entry = private->entries[curcpu]; - xt_entry_foreach(iter, loc_cpu_entry, private->size) { - ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + xt_entry_foreach(iter, private->entries, private->size) { + struct xt_counters *tmp; + + tmp = xt_get_this_cpu_counter(&iter->counters); + ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); - unlock_up_free: local_bh_enable(); xt_table_unlock(t); @@ -1454,7 +1454,6 @@ static int compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, - unsigned int hookmask, int *size) { struct xt_match *match; @@ -1523,8 +1522,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { - ret = compat_find_calc_match(ematch, name, - &e->ipv6, 
e->comefrom, &off); + ret = compat_find_calc_match(ematch, name, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; @@ -1618,6 +1616,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + e->counters.pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(e->counters.pcnt)) + return -ENOMEM; j = 0; mtpar.net = net; mtpar.table = name; @@ -1642,6 +1643,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net, break; cleanup_match(ematch, net); } + + xt_percpu_counter_free(e->counters.pcnt); + return ret; } @@ -1726,7 +1730,7 @@ translate_compat_table(struct net *net, newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } - entry1 = newinfo->entries[raw_smp_processor_id()]; + entry1 = newinfo->entries; pos = entry1; size = total_size; xt_entry_foreach(iter0, entry0, total_size) { @@ -1778,11 +1782,6 @@ translate_compat_table(struct net *net, return ret; } - /* And one copy for every other CPU */ - for_each_possible_cpu(i) - if (newinfo->entries[i] && newinfo->entries[i] != entry1) - memcpy(newinfo->entries[i], entry1, newinfo->size); - *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); @@ -1820,14 +1819,16 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + if (tmp.num_counters == 0) + return -EINVAL; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; @@ -1898,7 +1899,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *pos; unsigned int size; int ret = 0; - const void 
*loc_cpu_entry; unsigned int i = 0; struct ip6t_entry *iter; @@ -1906,14 +1906,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, if (IS_ERR(counters)) return PTR_ERR(counters); - /* choose the copy that is on our node/cpu, ... - * This choice is lazy (because current thread is - * allowed to migrate to another cpu) - */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - xt_entry_foreach(iter, loc_cpu_entry, total_size) { + xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) @@ -2088,8 +2083,7 @@ struct xt_table *ip6t_register_table(struct net *net, goto out; } - /* choose the copy on our node/cpu, but dont care about preemption */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); @@ -2119,7 +2113,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table) private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 544b0a9da1b5..12331efd49cf 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -83,7 +83,8 @@ static int reject_tg6_check(const struct xt_tgchk_param *par) return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ - if (e->ipv6.proto != IPPROTO_TCP || + if (!(e->ipv6.flags & IP6T_F_PROTO) || + e->ipv6.proto != IPPROTO_TCP || (e->ipv6.invflags & XT_INV_PROTO)) { pr_info("TCP_RESET illegal for non-tcp\n"); return 
-EINVAL; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index a0d17270117c..6edb7b106de7 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -315,11 +315,9 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out)); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index ca7f6c128086..5c33d8abc077 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -33,13 +33,11 @@ static const struct xt_table packet_filter = { /* The work comes in here from netfilter.c. */ static unsigned int ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - const struct net *net = dev_net((in != NULL) ? in : out); + const struct net *net = dev_net(state->in ? 
state->in : state->out); - return ip6t_do_table(skb, ops->hooknum, in, out, - net->ipv6.ip6table_filter); + return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_filter); } static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 307bbb782d14..b551f5b79fe2 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -32,7 +32,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) +ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; struct in6_addr saddr, daddr; @@ -57,8 +57,8 @@ ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, - dev_net(out)->ipv6.ip6table_mangle); + ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, state, + dev_net(state->out)->ipv6.ip6table_mangle); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -77,17 +77,16 @@ ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) /* The work comes in here from netfilter.c. 
*/ static unsigned int ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { if (ops->hooknum == NF_INET_LOCAL_OUT) - return ip6t_mangle_out(skb, out); + return ip6t_mangle_out(skb, state); if (ops->hooknum == NF_INET_POST_ROUTING) - return ip6t_do_table(skb, ops->hooknum, in, out, - dev_net(out)->ipv6.ip6table_mangle); + return ip6t_do_table(skb, ops->hooknum, state, + dev_net(state->out)->ipv6.ip6table_mangle); /* INPUT/FORWARD */ - return ip6t_do_table(skb, ops->hooknum, in, out, - dev_net(in)->ipv6.ip6table_mangle); + return ip6t_do_table(skb, ops->hooknum, state, + dev_net(state->in)->ipv6.ip6table_mangle); } static struct nf_hook_ops *mangle_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index b0634ac996b7..c3a7f7af0ed4 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -32,49 +32,40 @@ static const struct xt_table nf_nat_ipv6_table = { static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - return ip6t_do_table(skb, ops->hooknum, in, out, net->ipv6.ip6table_nat); + return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_nat); } static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_fn(ops, skb, in, out, ip6table_nat_do_chain); + return nf_nat_ipv6_fn(ops, skb, state, ip6table_nat_do_chain); } static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct 
net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_in(ops, skb, in, out, ip6table_nat_do_chain); + return nf_nat_ipv6_in(ops, skb, state, ip6table_nat_do_chain); } static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_out(ops, skb, in, out, ip6table_nat_do_chain); + return nf_nat_ipv6_out(ops, skb, state, ip6table_nat_do_chain); } static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_local_fn(ops, skb, in, out, ip6table_nat_do_chain); + return nf_nat_ipv6_local_fn(ops, skb, state, ip6table_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 5274740acecc..0b33caad2b69 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -20,13 +20,11 @@ static const struct xt_table packet_raw = { /* The work comes in here from netfilter.c. */ static unsigned int ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - const struct net *net = dev_net((in != NULL) ? in : out); + const struct net *net = dev_net(state->in ? 
state->in : state->out); - return ip6t_do_table(skb, ops->hooknum, in, out, - net->ipv6.ip6table_raw); + return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_raw); } static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index ab3b0219ecfa..fcef83c25f7b 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -37,13 +37,11 @@ static const struct xt_table security_table = { static unsigned int ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - const struct net *net = dev_net((in != NULL) ? in : out); + const struct net *net = dev_net(state->in ? state->in : state->out); - return ip6t_do_table(skb, ops->hooknum, in, out, + return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_security); } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index b68d0e59c1f8..4ba0c34c627b 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -97,9 +97,7 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, static unsigned int ipv6_helper(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nf_conn *ct; const struct nf_conn_help *help; @@ -135,9 +133,7 @@ static unsigned int ipv6_helper(const struct nf_hook_ops *ops, static unsigned int ipv6_confirm(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ 
-171,25 +167,21 @@ out: static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_conntrack_in(dev_net(in), PF_INET6, ops->hooknum, skb); + return nf_conntrack_in(dev_net(state->in), PF_INET6, ops->hooknum, skb); } static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { /* root is playing with raw sockets. */ if (skb->len < sizeof(struct ipv6hdr)) { net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return nf_conntrack_in(dev_net(out), PF_INET6, ops->hooknum, skb); + return nf_conntrack_in(dev_net(state->out), PF_INET6, ops->hooknum, skb); } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { @@ -290,10 +282,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - if (nla_put(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, - &tuple->src.u3.ip6) || - nla_put(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, - &tuple->dst.u3.ip6)) + if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || + nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) goto nla_put_failure; return 0; @@ -312,10 +302,8 @@ static int ipv6_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) return -EINVAL; - memcpy(&t->src.u3.ip6, nla_data(tb[CTA_IP_V6_SRC]), - sizeof(u_int32_t) * 4); - memcpy(&t->dst.u3.ip6, nla_data(tb[CTA_IP_V6_DST]), - sizeof(u_int32_t) * 4); + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); return 0; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6f187c8d8a1b..6d02498172c1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -348,7 +348,7 @@ found: fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(&fq->q, skb->truesize); + add_frag_mem_limit(fq->q.net, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -430,7 +430,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - add_frag_mem_limit(&fq->q, clone->truesize); + add_frag_mem_limit(fq->q.net, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -454,7 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; } - sub_frag_mem_limit(&fq->q, head->truesize); + sub_frag_mem_limit(fq->q.net, head->truesize); head->ignore_df = 1; head->next = NULL; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e70382e4dfb5..a45db0b4785c 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -54,9 +54,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct sk_buff *reasm; @@ -77,9 +75,9 @@ static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, nf_ct_frag6_consume_orig(reasm); - NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, reasm, - (struct net_device *) in, (struct net_device *) out, - okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, state->sk, reasm, + state->in, state->out, + 
state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); return NF_STOLEN; } diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index ddf07e6f59d7..8dd869642f45 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -5,8 +5,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> @@ -27,7 +29,7 @@ static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 5, + .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_MASK, }, }, diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index c5812e1c1ffb..e76900e0aa92 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -263,11 +263,10 @@ EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { struct nf_conn *ct; @@ -318,7 +317,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(ops, skb, in, out, ct); + ret = do_chain(ops, skb, state, ct); if (ret != NF_ACCEPT) return ret; @@ -332,7 +331,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, pr_debug("Already setup manip %s for ct %p\n", maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) goto oif_changed; } break; @@ -341,7 +340,7 @@ nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out)) goto oif_changed; } @@ -355,17 +354,16 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); unsigned int nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { unsigned int ret; struct in6_addr daddr = ipv6_hdr(skb)->daddr; - ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -376,11 +374,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); unsigned int nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { #ifdef CONFIG_XFRM @@ -394,7 +391,7 @@ nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, if (skb->len < sizeof(struct ipv6hdr)) return NF_ACCEPT; - ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && !(IP6CB(skb)->flags & 
IP6SKB_XFRM_TRANSFORMED) && @@ -418,11 +415,10 @@ EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); unsigned int nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, + const struct nf_hook_state *state, unsigned int (*do_chain)(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct)) { const struct nf_conn *ct; @@ -434,7 +430,7 @@ nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, if (skb->len < sizeof(struct ipv6hdr)) return NF_ACCEPT; - ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + ret = nf_nat_ipv6_fn(ops, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index d05b36440e8b..94b4c6dfb400 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -13,6 +13,7 @@ #include <net/ip6_checksum.h> #include <net/netfilter/ipv6/nf_reject.h> #include <linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> #include <net/netfilter/ipv6/nf_reject.h> const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, @@ -65,7 +66,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - __be16 protocol, int hoplimit) + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -195,7 +196,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) */ if (oldskb->nf_bridge) { struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; + + nskb->dev = nf_bridge_get_physindev(oldskb); nskb->protocol = htons(ETH_P_IPV6); ip6h->payload_len = htons(sizeof(struct tcphdr)); if 
(dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), @@ -208,4 +210,39 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) } EXPORT_SYMBOL_GPL(nf_send_reset6); +static bool reject6_csum_ok(struct sk_buff *skb, int hook) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int thoff; + __be16 fo; + u8 proto; + + if (skb->csum_bad) + return false; + + if (skb_csum_unnecessary(skb)) + return true; + + proto = ip6h->nexthdr; + thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); + + if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) + return false; + + return nf_ip6_checksum(skb, hook, thoff, proto) == 0; +} + +void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, + unsigned char code, unsigned int hooknum) +{ + if (!reject6_csum_ok(skb_in, hooknum)) + return; + + if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) + skb_in->dev = net->loopback_dev; + + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); +} +EXPORT_SYMBOL_GPL(nf_send_unreach6); + MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 0d812b31277d..c8148ba76d1a 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -18,14 +18,12 @@ static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0) return NF_DROP; return nft_do_chain(&pkt, ops); @@ -33,9 +31,7 @@ static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const 
struct nf_hook_state *state) { if (unlikely(skb->len < sizeof(struct ipv6hdr))) { if (net_ratelimit()) @@ -44,7 +40,7 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, return NF_ACCEPT; } - return nft_do_chain_ipv6(ops, skb, in, out, okfn); + return nft_do_chain_ipv6(ops, skb, state); } struct nft_af_info nft_af_ipv6 __read_mostly = { diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 1c4b75dd425b..951bb458b7bd 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -26,51 +26,42 @@ static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, + const struct nf_hook_state *state, struct nf_conn *ct) { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); + nft_set_pktinfo_ipv6(&pkt, ops, skb, state); return nft_do_chain(&pkt, ops); } static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_fn(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv6_fn(ops, skb, state, nft_nat_do_chain); } static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_in(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv6_in(ops, skb, state, nft_nat_do_chain); } static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_out(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv6_out(ops, skb, state, 
nft_nat_do_chain); } static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { - return nf_nat_ipv6_local_fn(ops, skb, in, out, nft_nat_do_chain); + return nf_nat_ipv6_local_fn(ops, skb, state, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv6 = { diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 42031299585e..0dafdaac5e17 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -24,9 +24,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { unsigned int ret; struct nft_pktinfo pkt; @@ -35,7 +33,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, u32 mark, flowlabel; /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0) return NF_DROP; /* save source/dest address, mark, hoplimit, flowlabel, priority */ diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 529c119cbb14..cd1ac1637a05 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -18,19 +18,16 @@ #include <net/netfilter/ipv6/nf_nat_masquerade.h> static void nft_masq_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); struct nf_nat_range range; - unsigned int verdict; memset(&range, 0, sizeof(range)); range.flags = priv->flags; - verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); - - data[NFT_REG_VERDICT].verdict = 
verdict; + regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); } static struct nft_expr_type nft_masq_ipv6_type; diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index 11820b6b3613..effd393bd517 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -18,26 +18,25 @@ #include <net/netfilter/nf_nat_redirect.h> static void nft_redir_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_redir *priv = nft_expr_priv(expr); struct nf_nat_range range; - unsigned int verdict; memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { range.min_proto.all = - *(__be16 *)&data[priv->sreg_proto_min].data[0]; + *(__be16 *)&regs->data[priv->sreg_proto_min], range.max_proto.all = - *(__be16 *)&data[priv->sreg_proto_max].data[0]; + *(__be16 *)&regs->data[priv->sreg_proto_max], range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } range.flags |= priv->flags; - verdict = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->ops->hooknum); - data[NFT_REG_VERDICT].verdict = verdict; + regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, + pkt->ops->hooknum); } static struct nft_expr_type nft_redir_ipv6_type; diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index f73285924144..d0d1540ecf87 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -20,7 +20,7 @@ #include <net/netfilter/ipv6/nf_reject.h> static void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -34,9 +34,11 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, case NFT_REJECT_TCP_RST: nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); break; + default: + break; } - data[NFT_REG_VERDICT].verdict = NF_DROP; +
regs->verdict.code = NF_DROP; } static struct nft_expr_type nft_reject_ipv6_type; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 74581f706c4d..928a0fb0b744 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -8,14 +8,17 @@ #include <net/ip6_fib.h> #include <net/addrconf.h> #include <net/secure_seq.h> +#include <linux/netfilter.h> -static u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, - struct in6_addr *src) +static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, + const struct in6_addr *dst, + const struct in6_addr *src) { u32 hash, id; hash = __ipv6_addr_jhash(dst, hashrnd); hash = __ipv6_addr_jhash(src, hash); + hash ^= net_hash_mix(net); /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, * set the hight order instead thus minimizing possible future @@ -36,7 +39,7 @@ static u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, * * The network header must be set before calling this. */ -void ipv6_proxy_select_ident(struct sk_buff *skb) +void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) { static u32 ip6_proxy_idents_hashrnd __read_mostly; struct in6_addr buf[2]; @@ -53,22 +56,23 @@ void ipv6_proxy_select_ident(struct sk_buff *skb) net_get_random_once(&ip6_proxy_idents_hashrnd, sizeof(ip6_proxy_idents_hashrnd)); - id = __ipv6_select_ident(ip6_proxy_idents_hashrnd, + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, &addrs[1], &addrs[0]); skb_shinfo(skb)->ip6_frag_id = htonl(id); } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); -void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +__be32 ipv6_select_ident(struct net *net, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { static u32 ip6_idents_hashrnd __read_mostly; u32 id; net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - id = __ipv6_select_ident(ip6_idents_hashrnd, &rt->rt6i_dst.addr, - &rt->rt6i_src.addr); - fhdr->identification = htonl(id); + id = 
__ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); + return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); @@ -134,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst) EXPORT_SYMBOL(ip6_dst_hoplimit); #endif -int __ip6_local_out(struct sk_buff *skb) +static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) { int len; @@ -144,19 +148,30 @@ int __ip6_local_out(struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); - return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, - skb_dst(skb)->dev, dst_output); + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, + NULL, skb_dst(skb)->dev, dst_output_sk); +} + +int __ip6_local_out(struct sk_buff *skb) +{ + return __ip6_local_out_sk(skb->sk, skb); } EXPORT_SYMBOL_GPL(__ip6_local_out); -int ip6_local_out(struct sk_buff *skb) +int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; - err = __ip6_local_out(skb); + err = __ip6_local_out_sk(sk, skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } +EXPORT_SYMBOL_GPL(ip6_local_out_sk); + +int ip6_local_out(struct sk_buff *skb) +{ + return ip6_local_out_sk(skb->sk, skb); +} EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index a2dfff6ff227..263a5164a6f5 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -77,8 +77,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } -int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dae7f1a1e464..ca4700cb26c4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -32,7 +32,7 @@ #include <linux/netfilter_ipv6.h> #include <linux/skbuff.h> #include <linux/compat.h> -#include <asm/uaccess.h> 
+#include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/net_namespace.h> @@ -172,7 +172,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) read_lock(&raw_v6_hashinfo.lock); sk = sk_head(&raw_v6_hashinfo.ht[hash]); - if (sk == NULL) + if (!sk) goto out; net = dev_net(skb->dev); @@ -367,7 +367,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, read_lock(&raw_v6_hashinfo.lock); sk = sk_head(&raw_v6_hashinfo.ht[hash]); - if (sk != NULL) { + if (sk) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; saddr = &ip6h->saddr; @@ -456,9 +456,8 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) * we return it, otherwise we block. */ -static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, - int noblock, int flags, int *addr_len) +static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); @@ -631,7 +630,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); - if (skb == NULL) + if (!skb) goto error; skb_reserve(skb, hlen); @@ -653,8 +652,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, goto error_fault; IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, - rt->dst.dev, dst_output); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb, + NULL, rt->dst.dev, dst_output_sk); if (err > 0) err = net_xmit_errno(err); if (err) @@ -730,8 +729,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } -static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, 
size_t len) +static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); @@ -791,7 +789,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (!flowlabel) return -EINVAL; } } @@ -833,13 +831,13 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (!flowlabel) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (opt == NULL) + if (!opt) opt = np->opt; if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); @@ -867,6 +865,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, fl6.flowi6_oif = np->ucast_oif; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + if (inet->hdrincl) + fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); @@ -1132,7 +1133,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) + if (skb) amount = skb_tail_pointer(skb) - skb_transport_header(skb); spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1326,13 +1327,7 @@ static struct inet_protosw rawv6_protosw = { int __init rawv6_init(void) { - int ret; - - ret = inet6_register_protosw(&rawv6_protosw); - if (ret) - goto out; -out: - return ret; + return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index d7d70e69973b..f1159bb76e0a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -144,7 +144,7 @@ void ip6_expire_frag_queue(struct 
net *net, struct frag_queue *fq, IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - if (fq->q.flags & INET_FRAG_EVICTED) + if (inet_frag_evicting(&fq->q)) goto out_rcu_unlock; IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); @@ -330,7 +330,7 @@ found: fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(&fq->q, skb->truesize); + add_frag_mem_limit(fq->q.net, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -430,7 +430,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC); - if (clone == NULL) + if (!clone) goto out_oom; clone->next = head->next; head->next = clone; @@ -443,7 +443,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - add_frag_mem_limit(&fq->q, clone->truesize); + add_frag_mem_limit(fq->q.net, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -481,7 +481,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } fp = next; } - sub_frag_mem_limit(&fq->q, sum_truesize); + sub_frag_mem_limit(fq->q.net, sum_truesize); head->next = NULL; head->dev = dev; @@ -552,7 +552,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); - if (fq != NULL) { + if (fq) { int ret; spin_lock(&fq->q.lock); @@ -632,7 +632,7 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table = ip6_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); - if (table == NULL) + if (!table) goto err_alloc; table[0].data = &net->ipv6.frags.high_thresh; @@ -648,7 +648,7 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) } hdr = register_net_sysctl(net, 
"net/ipv6", table); - if (hdr == NULL) + if (!hdr) goto err_reg; net->ipv6.sysctl.frags_hdr = hdr; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4688bd4d7f59..6090969937f8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -72,8 +72,7 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest); +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -92,6 +91,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO @@ -104,65 +104,82 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif -static void rt6_bind_peer(struct rt6_info *rt, int create) +struct uncached_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); + +static void rt6_uncached_list_add(struct rt6_info *rt) { - struct inet_peer_base *base; - struct inet_peer *peer; + struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - base = inetpeer_base_ptr(rt->_rt6i_peer); - if (!base) - return; + rt->dst.flags |= DST_NOCACHE; + rt->rt6i_uncached_list = ul; - peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); - if (peer) { - if (!rt6_set_peer(rt, peer)) - inet_putpeer(peer); + spin_lock_bh(&ul->lock); + list_add_tail(&rt->rt6i_uncached, &ul->head); + spin_unlock_bh(&ul->lock); +} + +static void rt6_uncached_list_del(struct rt6_info *rt) +{ + if 
(!list_empty(&rt->rt6i_uncached)) { + struct uncached_list *ul = rt->rt6i_uncached_list; + + spin_lock_bh(&ul->lock); + list_del(&rt->rt6i_uncached); + spin_unlock_bh(&ul->lock); } } -static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) +static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - if (rt6_has_peer(rt)) - return rt6_peer_ptr(rt); + struct net_device *loopback_dev = net->loopback_dev; + int cpu; + + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + struct rt6_info *rt; + + spin_lock_bh(&ul->lock); + list_for_each_entry(rt, &ul->head, rt6i_uncached) { + struct inet6_dev *rt_idev = rt->rt6i_idev; + struct net_device *rt_dev = rt->dst.dev; - rt6_bind_peer(rt, create); - return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL); + if (rt_idev && (rt_idev->dev == dev || !dev) && + rt_idev->dev != loopback_dev) { + rt->rt6i_idev = in6_dev_get(loopback_dev); + in6_dev_put(rt_idev); + } + + if (rt_dev && (rt_dev == dev || !dev) && + rt_dev != loopback_dev) { + rt->dst.dev = loopback_dev; + dev_hold(rt->dst.dev); + dev_put(rt_dev); + } + } + spin_unlock_bh(&ul->lock); + } } -static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return __rt6_get_peer(rt, 1); + return dst_metrics_write_ptr(rt->dst.from); } static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { - struct rt6_info *rt = (struct rt6_info *) dst; - struct inet_peer *peer; - u32 *p = NULL; + struct rt6_info *rt = (struct rt6_info *)dst; - if (!(rt->dst.flags & DST_HOST)) + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_cow_metrics(rt); + else if (rt->rt6i_flags & RTF_CACHE) + return NULL; + else return dst_cow_metrics_generic(dst, old); - - peer = rt6_get_peer_create(rt); - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; - - p = peer->metrics; - if (inet_metrics_new(peer) || - (old & 
DST_METRICS_FORCE_OVERWRITE)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); - - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; - } - } - return p; } static inline const void *choose_neigh_daddr(struct rt6_info *rt, @@ -194,7 +211,6 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, - .protocol = cpu_to_be16(ETH_P_IPV6), .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, @@ -236,7 +252,6 @@ static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, - .protocol = cpu_to_be16(ETH_P_IPV6), .destroy = ip6_dst_destroy, .check = ip6_dst_check, .mtu = ip6_blackhole_mtu, @@ -301,10 +316,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = { #endif /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags, - struct fib6_table *table) +static struct rt6_info *__ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); @@ -313,21 +328,51 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - rt6_init_peer(rt, table ? 
&table->tb6_peers : net->ipv6.peers); INIT_LIST_HEAD(&rt->rt6i_siblings); + INIT_LIST_HEAD(&rt->rt6i_uncached); } return rt; } +static struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) +{ + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table); + + if (rt) { + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); + if (rt->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **p; + + p = per_cpu_ptr(rt->rt6i_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } + } + + return rt; +} + static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct inet6_dev *idev = rt->rt6i_idev; struct dst_entry *from = dst->from; + struct inet6_dev *idev; - if (!(rt->dst.flags & DST_HOST)) - dst_destroy_metrics_generic(dst); + dst_destroy_metrics_generic(dst); + free_percpu(rt->rt6i_pcpu); + rt6_uncached_list_del(rt); + idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); @@ -335,11 +380,6 @@ static void ip6_dst_destroy(struct dst_entry *dst) dst->from = NULL; dst_release(from); - - if (rt6_has_peer(rt)) { - struct inet_peer *peer = rt6_peer_ptr(rt); - inet_putpeer(peer); - } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -654,15 +694,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match; + struct rt6_info *rt, *match, *cont; int mpri = -1; match = NULL; - for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + cont = NULL; + for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + + match = find_match(rt, oif, strict, &mpri, match, do_rr); + } + + for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + 
cont = rt; + break; + } + match = find_match(rt, oif, strict, &mpri, match, do_rr); - for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + } + + if (match || !cont) + return match; + + for (rt = cont; rt; rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -696,6 +754,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) return match ? match : net->ipv6.ip6_null_entry; } +static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); +} + #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) @@ -874,9 +937,9 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -884,15 +947,26 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, * Clone the route. 
*/ - rt = ip6_rt_copy(ort, daddr); + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; - if (rt) { + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, + 0, ort->rt6i_table); + + if (!rt) + return NULL; + + ip6_rt_copy_init(rt, ort); + rt->rt6i_flags |= RTF_CACHE; + rt->rt6i_metric = 0; + rt->dst.flags |= DST_HOST; + rt->rt6i_dst.addr = *daddr; + rt->rt6i_dst.plen = 128; + + if (!rt6_is_gw_or_nonexthop(ort)) { if (ort->rt6i_dst.plen != 128 && ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; - - rt->rt6i_flags |= RTF_CACHE; - #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; @@ -904,30 +978,65 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, - const struct in6_addr *daddr) +static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { - struct rt6_info *rt = ip6_rt_copy(ort, daddr); + struct rt6_info *pcpu_rt; - if (rt) - rt->rt6i_flags |= RTF_CACHE; - return rt; + pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags, + rt->rt6i_table); + + if (!pcpu_rt) + return NULL; + ip6_rt_copy_init(pcpu_rt, rt); + pcpu_rt->rt6i_protocol = rt->rt6i_protocol; + pcpu_rt->rt6i_flags |= RTF_PCPU; + return pcpu_rt; +} + +/* It should be called with read_lock_bh(&tb6_lock) acquired */ +static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, *prev, **p; + + p = this_cpu_ptr(rt->rt6i_pcpu); + pcpu_rt = *p; + + if (pcpu_rt) + goto done; + + pcpu_rt = ip6_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + struct net *net = dev_net(rt->dst.dev); + + pcpu_rt = net->ipv6.ip6_null_entry; + goto done; + } + + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + +done: + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + 
return pcpu_rt; } static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *nrt; + struct rt6_info *rt; int strict = 0; - int attempts = 3; - int err; strict |= flags & RT6_LOOKUP_F_IFACE; if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; -redo_fib6_lookup_lock: read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); @@ -946,51 +1055,52 @@ redo_rt6_select: strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; - } else { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - goto out2; } } - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - if (rt->rt6i_flags & RTF_CACHE) - goto out2; + if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) - nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST)) - nrt = rt6_alloc_clone(rt, &fl6->daddr); - else - goto out2; + rt6_dst_from_metrics_check(rt); + return rt; + } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && + !(rt->rt6i_flags & RTF_GATEWAY))) { + /* Create a RTF_CACHE clone which will not be + * owned by the fib6 tree. It is for the special case where + * the daddr in the skb during the neighbor look-up is different + * from the fl6->daddr used to look-up route here. + */ - ip6_rt_put(rt); - rt = nrt ? : net->ipv6.ip6_null_entry; + struct rt6_info *uncached_rt; - dst_hold(&rt->dst); - if (nrt) { - err = ip6_ins_rt(nrt); - if (!err) - goto out2; - } + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - if (--attempts <= 0) - goto out2; + uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); + dst_release(&rt->dst); - /* - * Race condition! 
In the gap, when table->tb6_lock was - * released someone could insert this route. Relookup. - */ - ip6_rt_put(rt); - goto redo_fib6_lookup_lock; + if (uncached_rt) + rt6_uncached_list_add(uncached_rt); + else + uncached_rt = net->ipv6.ip6_null_entry; -out2: - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_hold(&uncached_rt->dst); + return uncached_rt; - return rt; + } else { + /* Get a percpu copy */ + + struct rt6_info *pcpu_rt; + + rt->dst.lastuse = jiffies; + rt->dst.__use++; + pcpu_rt = rt6_get_pcpu_route(rt); + read_unlock_bh(&table->tb6_lock); + + return pcpu_rt; + } } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, @@ -1061,7 +1171,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new = &rt->dst; memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); - rt6_init_peer(rt, net->ipv6.peers); new->__use = 1; new->input = dst_discard; @@ -1095,6 +1204,33 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static void rt6_dst_from_metrics_check(struct rt6_info *rt) +{ + if (rt->dst.from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); +} + +static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +{ + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return &rt->dst; +} + +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +{ + if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + return &rt->dst; + else + return NULL; +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1105,13 +1241,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * 
into this function always. */ - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) - return NULL; - if (rt6_check_expired(rt)) - return NULL; + rt6_dst_from_metrics_check(rt); - return dst; + if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE)) + return rt6_dst_from_check(rt, cookie); + else + return rt6_check(rt, cookie); } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -1150,24 +1286,63 @@ static void ip6_link_failure(struct sk_buff *skb) } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) +static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) +{ + struct net *net = dev_net(rt->dst.dev); + + rt->rt6i_flags |= RTF_MODIFIED; + rt->rt6i_pmtu = mtu; + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); +} + +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, + const struct ipv6hdr *iph, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info *)dst; - dst_confirm(dst); - if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { - struct net *net = dev_net(dst->dev); + if (rt6->rt6i_flags & RTF_LOCAL) + return; - rt6->rt6i_flags |= RTF_MODIFIED; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + dst_confirm(dst); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu >= dst_mtu(dst)) + return; - dst_metric_set(dst, RTAX_MTU, mtu); - rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); + if (rt6->rt6i_flags & RTF_CACHE) { + rt6_do_update_pmtu(rt6, mtu); + } else { + const struct in6_addr *daddr, *saddr; + struct rt6_info *nrt6; + + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + return; + } + nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + if (nrt6) { + rt6_do_update_pmtu(nrt6, mtu); + + /* ip6_ins_rt(nrt6) will bump the + * rt6->rt6i_node->fn_sernum + * which will fail the next rt6_check() and + * invalidate the 
sk->sk_dst_cache. + */ + ip6_ins_rt(nrt6); + } } } +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); +} + void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark) { @@ -1184,7 +1359,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); @@ -1343,12 +1518,17 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + goto out; + mtu = IPV6_MIN_MTU; rcu_read_lock(); @@ -1478,7 +1658,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, int remaining; u32 *mp; - if (cfg->fc_mx == NULL) + if (!cfg->fc_mx) return 0; mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); @@ -1592,10 +1772,8 @@ int ip6_route_add(struct fib6_config *cfg) ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) { + if (rt->rt6i_dst.plen == 128) rt->dst.flags |= DST_HOST; - dst_metrics_set_force_overwrite(&rt->dst); - } #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1653,6 +1831,16 @@ int ip6_route_add(struct fib6_config *cfg) int gwa_type; gw_addr = &cfg->fc_gateway; + + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). 
rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + err = -EINVAL; + if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0)) + goto out; + rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); @@ -1666,7 +1854,6 @@ int ip6_route_add(struct fib6_config *cfg) (SIT, PtP, NBMA NOARP links) it is handy to allow some exceptions. --ANK */ - err = -EINVAL; if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; @@ -1787,6 +1974,9 @@ static int ip6_route_del(struct fib6_config *cfg) if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if ((rt->rt6i_flags & RTF_CACHE) && + !(cfg->fc_flags & RTF_CACHE)) + continue; if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -1896,7 +2086,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)) ); - nrt = ip6_rt_copy(rt, &msg->dest); + nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) goto out; @@ -1928,42 +2118,35 @@ out: * Misc support functions */ -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest) +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - struct net *net = dev_net(ort->dst.dev); - struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, - ort->rt6i_table); - - if (rt) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->dst.flags |= DST_HOST; + BUG_ON(from->dst.from); - rt->rt6i_dst.addr = *dest; - rt->rt6i_dst.plen = 128; - dst_copy_metrics(&rt->dst, &ort->dst); - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; - - if (ort->rt6i_flags & RTF_GATEWAY) - rt->rt6i_gateway = ort->rt6i_gateway; - else - rt->rt6i_gateway = *dest; - rt->rt6i_flags = ort->rt6i_flags; - rt6_set_from(rt, ort); - rt->rt6i_metric = 0; + rt->rt6i_flags &= ~RTF_EXPIRES; + 
dst_hold(&from->dst); + rt->dst.from = &from->dst; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; #ifdef CONFIG_IPV6_SUBTREES - memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); + rt->rt6i_src = ort->rt6i_src; #endif - memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); - rt->rt6i_table = ort->rt6i_table; - } - return rt; + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -2247,9 +2430,10 @@ int ip6_route_get_saddr(struct net *net, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); + struct inet6_dev *idev = + rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; int err = 0; - if (rt->rt6i_prefsrc.plen) + if (rt && rt->rt6i_prefsrc.plen) *saddr = rt->rt6i_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, @@ -2337,6 +2521,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev) fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { @@ -2374,11 +2559,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. 
*/ if (rt->dst.dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU) && - (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6))) { - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + !dst_metric_locked(&rt->dst, RTAX_MTU)) { + if (rt->rt6i_flags & RTF_CACHE) { + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) + rt->rt6i_pmtu = arg->mtu; + } else if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } } return 0; } @@ -2400,6 +2594,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, + [RTA_PREF] = { .type = NLA_U8 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2407,6 +2602,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, { struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + unsigned int pref; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); @@ -2433,12 +2629,15 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; + if (rtm->rtm_flags & RTM_F_CLONED) + cfg->fc_flags |= RTF_CACHE; + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); if (tb[RTA_GATEWAY]) { - nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16); + cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } @@ -2461,7 +2660,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, } if (tb[RTA_PREFSRC]) - nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); + cfg->fc_prefsrc = 
nla_get_in6_addr(tb[RTA_PREFSRC]); if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); @@ -2482,6 +2681,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); } + if (tb[RTA_PREF]) { + pref = nla_get_u8(tb[RTA_PREF]); + if (pref != ICMPV6_ROUTER_PREF_LOW && + pref != ICMPV6_ROUTER_PREF_HIGH) + pref = ICMPV6_ROUTER_PREF_MEDIUM; + cfg->fc_flags |= RTF_PREF(pref); + } + err = 0; errout: return err; @@ -2495,9 +2702,9 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add) int attrlen; int err = 0, last_err = 0; + remaining = cfg->fc_mp_len; beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - remaining = cfg->fc_mp_len; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { @@ -2511,7 +2718,7 @@ beginning: nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } @@ -2527,15 +2734,19 @@ beginning: * next hops that have been already added. */ add = 0; + remaining = cfg->fc_mp_len - remaining; goto beginning; } } /* Because each route is added like a single route we remove - * this flag after the first nexthop (if there is a collision, - * we have already fail to add the first nexthop: - * fib6_add_rt2node() has reject it). + * these flags after the first nexthop: if there is a collision, + * we have already failed to add the first nexthop: + * fib6_add_rt2node() has rejected it; when replacing, old + * nexthops have been replaced by first new, the rest should + * be added to it. 
*/ - cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL; + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | + NLM_F_REPLACE); rtnh = rtnh_next(rtnh, &remaining); } @@ -2585,7 +2796,8 @@ static inline size_t rt6_nlmsg_size(void) + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) - + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ + + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + + nla_total_size(1); /* RTA_PREF */ } static int rt6_fill_node(struct net *net, @@ -2594,6 +2806,7 @@ static int rt6_fill_node(struct net *net, int iif, int type, u32 portid, u32 seq, int prefix, int nowait, unsigned int flags) { + u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; long expires; @@ -2660,19 +2873,19 @@ static int rt6_fill_node(struct net *net, rtm->rtm_flags |= RTM_F_CLONED; if (dst) { - if (nla_put(skb, RTA_DST, 16, dst)) + if (nla_put_in6_addr(skb, RTA_DST, dst)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { - if (nla_put(skb, RTA_SRC, 16, src)) + if (nla_put_in6_addr(skb, RTA_SRC, src)) goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) goto nla_put_failure; #endif if (iif) { @@ -2696,22 +2909,25 @@ static int rt6_fill_node(struct net *net, } else if (dst) { struct in6_addr saddr_buf; if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && - nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } if (rt->rt6i_prefsrc.plen) { struct in6_addr saddr_buf; saddr_buf = rt->rt6i_prefsrc.addr; - if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + if (nla_put_in6_addr(skb, RTA_PREFSRC, 
&saddr_buf)) goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt6i_pmtu) + metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0) + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) goto nla_put_failure; } @@ -2726,6 +2942,9 @@ static int rt6_fill_node(struct net *net, if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -3199,6 +3418,7 @@ static struct notifier_block ip6_route_dev_notifier = { int __init ip6_route_init(void) { int ret; + int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = @@ -3258,6 +3478,13 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + + INIT_LIST_HEAD(&ul->head); + spin_lock_init(&ul->lock); + } + out: return ret; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e4cbd5798eba..ac35a28599be 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -118,7 +118,7 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, return t; } t = rcu_dereference(sitn->tunnels_wc[0]); - if ((t != NULL) && (t->dev->flags & IFF_UP)) + if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } @@ -251,7 +251,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ipip6_tunnel_setup); - if (dev == NULL) + if (!dev) return NULL; dev_net_set(dev, net); @@ -555,7 +555,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info) skb->dev, iph->daddr, iph->saddr); - if (t == NULL) + if (!t) goto out; if (type == ICMP_DEST_UNREACH && code 
== ICMP_FRAG_NEEDED) { @@ -671,7 +671,7 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); - if (tunnel != NULL) { + if (tunnel) { struct pcpu_sw_netstats *tstats; if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && @@ -733,7 +733,7 @@ static int ipip_rcv(struct sk_buff *skb) iph = ip_hdr(skb); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); - if (tunnel != NULL) { + if (tunnel) { if (tunnel->parms.iph.protocol != IPPROTO_IPIP && tunnel->parms.iph.protocol != 0) goto drop; @@ -838,7 +838,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - if (neigh == NULL) { + if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } @@ -867,7 +867,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - if (neigh == NULL) { + if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } @@ -983,7 +983,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, IPPROTO_IPV6); - err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, + err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -1076,7 +1076,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - dev->iflink = tunnel->parms.link; } static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) @@ -1158,7 +1157,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } t = ipip6_tunnel_locate(net, &p, 0); - if (t == NULL) + if (!t) t = netdev_priv(dev); } @@ -1206,7 +1205,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) t = 
ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (t) { if (t->dev != dev) { err = -EEXIST; break; @@ -1242,7 +1241,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) goto done; err = -ENOENT; t = ipip6_tunnel_locate(net, &p, 0); - if (t == NULL) + if (!t) goto done; err = -EPERM; if (t == netdev_priv(sitn->fb_tunnel_dev)) @@ -1336,6 +1335,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; static void ipip6_dev_free(struct net_device *dev) @@ -1366,7 +1366,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; netif_keep_dst(dev); - dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; dev->features |= SIT_FEATURES; @@ -1530,8 +1529,7 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], if (data[IFLA_IPTUN_6RD_PREFIX]) { ret = true; - nla_memcpy(&ip6rd->prefix, data[IFLA_IPTUN_6RD_PREFIX], - sizeof(struct in6_addr)); + ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { @@ -1683,8 +1681,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || - nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || - nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, @@ -1694,10 +1692,10 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) goto 
nla_put_failure; #ifdef CONFIG_IPV6_SIT_6RD - if (nla_put(skb, IFLA_IPTUN_6RD_PREFIX, sizeof(struct in6_addr), - &tunnel->ip6rd.prefix) || - nla_put_be32(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, - tunnel->ip6rd.relay_prefix) || + if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX, + &tunnel->ip6rd.prefix) || + nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, + tunnel->ip6rd.relay_prefix) || nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, tunnel->ip6rd.prefixlen) || nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, @@ -1795,7 +1793,7 @@ static void __net_exit sit_destroy_tunnels(struct net *net, struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); - while (t != NULL) { + while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7337fc7947e2..0909f4e0d53c 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -41,22 +41,6 @@ static __u16 const msstab[] = { 9000 - 60, }; -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct sock *child; - - child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) - inet_csk_reqsk_queue_add(sk, req, child); - else - reqsk_free(req); - - return child; -} - static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); @@ -189,13 +173,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp6_request_sock_ops); + req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk); if (!req) goto out; ireq = inet_rsk(req); treq = tcp_rsk(req); - treq->listener = NULL; + treq->tfo_listener = false; if (security_inet_conn_request(sk, skb, req)) goto out_free; @@ -220,7 +204,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_mark = inet_request_mark(sk, skb); - 
req->expires = 0UL; req->num_retrans = 0; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -264,7 +247,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); - ret = get_cookie_sock(sk, skb, req, dst); + ret = tcp_get_cookie_sock(sk, skb, req, dst); out: return ret; out_free: diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index c5c10fafcfe2..4e705add4f18 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -54,6 +54,27 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "idgen_retries", + .data = &init_net.ipv6.sysctl.idgen_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "idgen_delay", + .data = &init_net.ipv6.sysctl.idgen_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "flowlabel_state_ranges", + .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -93,6 +114,9 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels; ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect; + ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries; + ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; + ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) @@ -163,7 +187,7 @@ int ipv6_sysctl_register(void) int err = -ENOMEM; ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); - if (ip6_header == NULL) + if (!ip6_header) goto out; err = register_pernet_subsys(&ipv6_sysctl_net_ops); diff --git a/net/ipv6/tcp_ipv6.c 
b/net/ipv6/tcp_ipv6.c index 1f5e62229aaa..6748c4277aff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -99,21 +99,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; - } -} - -static void tcp_v6_hash(struct sock *sk) -{ - if (sk->sk_state != TCP_CLOSE) { - if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { - tcp_prot.hash(sk); - return; - } - local_bh_disable(); - __inet6_hash(sk, NULL); - local_bh_enable(); + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } } @@ -134,7 +120,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; - struct rt6_info *rt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -154,7 +139,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (!flowlabel) return -EINVAL; fl6_sock_release(flowlabel); } @@ -233,11 +218,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; - } else { - ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, - &sk->sk_v6_rcv_saddr); } + np->saddr = sk->sk_v6_rcv_saddr; return err; } @@ -263,7 +245,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, goto failure; } - if (saddr == NULL) { + if (!saddr) { saddr = &fl6.saddr; sk->sk_v6_rcv_saddr = *saddr; } @@ -275,10 +257,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(sk, dst, NULL, NULL); - rt = (struct rt6_info *) dst; if 
(tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && - ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr)) + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; @@ -340,18 +321,20 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + struct net *net = dev_net(skb->dev); + struct request_sock *fastopen; struct ipv6_pinfo *np; - struct sock *sk; - int err; struct tcp_sock *tp; - struct request_sock *fastopen; __u32 seq, snd_una; - struct net *net = dev_net(skb->dev); + struct sock *sk; + int err; - sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, - th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + sk = __inet6_lookup_established(net, &tcp_hashinfo, + &hdr->daddr, th->dest, + &hdr->saddr, ntohs(th->source), + skb->dev->ifindex); - if (sk == NULL) { + if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; @@ -361,6 +344,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, inet_twsk_put(inet_twsk(sk)); return; } + seq = ntohl(th->seq); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_err(sk, seq); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -375,7 +361,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } tp = tcp_sk(sk); - seq = ntohl(th->seq); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -419,37 +404,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Might be for an request_sock */ switch (sk->sk_state) { - struct request_sock *req, **prev; - case TCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); - if (!req) - goto out; - - /* ICMPs are not backlogged, hence we cannot get - * an established socket here. - */ - WARN_ON(req->sk != NULL); - - if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - inet_csk_reqsk_queue_drop(sk, req, prev); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - goto out; - case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * is already accepted it is treated as a connected one below. */ - if (fastopen && fastopen->sk == NULL) + if (fastopen && !fastopen->sk) break; if (!sock_owned_by_user(sk)) { @@ -497,7 +457,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; - if (np->repflow && (ireq->pktopts != NULL)) + if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); skb_set_queue_mapping(skb, queue_mapping); @@ -523,17 +483,11 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, } static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, - struct sock *addr_sk) + const struct sock *addr_sk) { return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); } -static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk, - struct request_sock *req) -{ - return tcp_v6_md5_do_lookup(sk, &inet_rsk(req)->ir_v6_rmt_addr); -} - static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, int optlen) { @@ -619,9 +573,9 @@ clear_hash_noput: return 
1; } -static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, +static int tcp_v6_md5_hash_skb(char *md5_hash, + const struct tcp_md5sig_key *key, const struct sock *sk, - const struct request_sock *req, const struct sk_buff *skb) { const struct in6_addr *saddr, *daddr; @@ -629,12 +583,9 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, struct hash_desc *desc; const struct tcphdr *th = tcp_hdr(skb); - if (sk) { - saddr = &inet6_sk(sk)->saddr; + if (sk) { /* valid for establish/request sockets */ + saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; - } else if (req) { - saddr = &inet_rsk(req)->ir_v6_loc_addr; - daddr = &inet_rsk(req)->ir_v6_rmt_addr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; @@ -670,8 +621,7 @@ clear_hash_noput: return 1; } -static int __tcp_v6_inbound_md5_hash(struct sock *sk, - const struct sk_buff *skb) +static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) { const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; @@ -685,44 +635,32 @@ static int __tcp_v6_inbound_md5_hash(struct sock *sk, /* We've parsed the options - do we have a hash? */ if (!hash_expected && !hash_location) - return 0; + return false; if (hash_expected && !hash_location) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); - return 1; + return true; } if (!hash_expected && hash_location) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - return 1; + return true; } /* check the signature */ genhash = tcp_v6_md5_hash_skb(newhash, hash_expected, - NULL, NULL, skb); + NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", genhash ? 
"failed" : "mismatch", &ip6h->saddr, ntohs(th->source), &ip6h->daddr, ntohs(th->dest)); - return 1; + return true; } - return 0; + return false; } - -static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) -{ - int ret; - - rcu_read_lock(); - ret = __tcp_v6_inbound_md5_hash(sk, skb); - rcu_read_unlock(); - - return ret; -} - #endif static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, @@ -734,8 +672,6 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - ireq->ir_iif = sk->sk_bound_dev_if; - /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) @@ -774,7 +710,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG - .md5_lookup = tcp_v6_reqsk_md5_lookup, + .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif .init_req = tcp_v6_init_req, @@ -811,7 +747,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq, buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); - if (buff == NULL) + if (!buff) return; skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); @@ -931,7 +867,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) if (!key) goto release_sk1; - genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, NULL, skb); + genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) goto release_sk1; } else { @@ -975,7 +911,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_time_stamp + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), - tw->tw_tclass, (tw->tw_flowlabel 
<< 12)); + tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel)); inet_twsk_put(tw); } @@ -997,17 +933,20 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct request_sock *req, **prev; const struct tcphdr *th = tcp_hdr(skb); + struct request_sock *req; struct sock *nsk; /* Find possible connection requests. */ - req = inet6_csk_search_req(sk, &prev, th->source, + req = inet6_csk_search_req(sk, th->source, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); - if (req) - return tcp_check_req(sk, skb, req, prev, false); - + if (req) { + nsk = tcp_check_req(sk, skb, req, false); + if (!nsk) + reqsk_put(req); + return nsk; + } nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -1067,7 +1006,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); - if (newsk == NULL) + if (!newsk) return NULL; newtcp6sk = (struct tcp6_sock *)newsk; @@ -1079,11 +1018,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr); - - ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - - newsk->sk_v6_rcv_saddr = newnp->saddr; + newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -1128,7 +1063,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } newsk = tcp_create_openreq_child(sk, req, skb); - if (newsk == NULL) + if (!newsk) goto out_nonewsk; /* @@ -1170,7 +1105,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; - if (ireq->pktopts != NULL) { + if (ireq->pktopts) { 
newnp->pktoptions = skb_clone(ireq->pktopts, sk_gfp_atomic(sk, GFP_ATOMIC)); consume_skb(ireq->pktopts); @@ -1215,7 +1150,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); - if (key != NULL) { + if (key) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get * memory, then we end up not copying the key @@ -1232,7 +1167,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_done(newsk); goto out; } - __inet6_hash(newsk, NULL); + __inet_hash(newsk, NULL); return newsk; @@ -1313,7 +1248,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { @@ -1483,6 +1418,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1504,7 +1440,7 @@ no_tcp_socket: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + if (tcp_checksum_complete(skb)) { csum_error: TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: @@ -1529,10 +1465,6 @@ do_time_wait: tcp_v6_fill_cb(skb, hdr, th); - if (skb->len < (th->doff<<2)) { - inet_twsk_put(inet_twsk(sk)); - goto bad_packet; - } if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; @@ -1547,9 +1479,9 @@ do_time_wait: &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif(skb)); - if (sk2 != NULL) { + if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); sk = sk2; tcp_v6_restore_cb(skb); @@ -1595,7 +1527,7 @@ static void 
tcp_v6_early_demux(struct sk_buff *skb) if (sk) { skb->sk = sk; skb->destructor = sock_edemux; - if (sk->sk_state != TCP_TIME_WAIT) { + if (sk_fullsock(sk)) { struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) @@ -1700,9 +1632,9 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - const struct sock *sk, struct request_sock *req, int i, kuid_t uid) + struct request_sock *req, int i, kuid_t uid) { - int ttd = req->expires - jiffies; + long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; @@ -1791,9 +1723,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { + long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; - s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; @@ -1838,7 +1770,7 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) get_tcp6_sock(seq, v, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); + get_openreq6(seq, v, st->num, st->uid); break; } out: @@ -1902,7 +1834,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = tcp_v6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index c1ab77105b4c..d883c9204c01 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -41,8 +41,8 @@ static int tcp6_gro_complete(struct sk_buff *skb, int thoff) return tcp_gro_complete(skb); } -struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, 
- netdev_features_t features) +static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, + netdev_features_t features) { struct tcphdr *th; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d048d46779fc..e51fc3eee6db 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -53,11 +53,11 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static unsigned int udp6_ehashfn(struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +static u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { static u32 udp6_ehash_secret __read_mostly; static u32 udp_ipv6_hash_secret __read_mostly; @@ -104,9 +104,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) return 0; } -static unsigned int udp6_portaddr_hash(struct net *net, - const struct in6_addr *addr6, - unsigned int port) +static u32 udp6_portaddr_hash(const struct net *net, + const struct in6_addr *addr6, + unsigned int port) { unsigned int hash, mix = net_hash_mix(net); @@ -120,7 +120,6 @@ static unsigned int udp6_portaddr_hash(struct net *net, return hash ^ port; } - int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = @@ -385,14 +384,12 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be } EXPORT_SYMBOL_GPL(udp6_lib_lookup); - /* * This should be easy, if there is something there we * return it, otherwise we block. 
*/ -int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, +int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -528,10 +525,8 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (noblock) - return -EAGAIN; - - /* starting over for a new packet */ + /* starting over for a new packet, but check if we need to yield */ + cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } @@ -551,7 +546,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), udptable); - if (sk == NULL) { + if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; @@ -649,7 +644,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* if we're overly short, let UDP handle it */ encap_rcv = ACCESS_ONCE(up->encap_rcv); - if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { + if (skb->len > sizeof(struct udphdr) && encap_rcv) { int ret; /* Verify checksum before giving to encap */ @@ -734,7 +729,9 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && + !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; if (!inet6_mc_check(sk, loc_addr, rmt_addr)) return false; @@ -750,7 +747,7 @@ static void flush_stack(struct sock **stack, unsigned int count, for (i = 0; i < count; i++) { sk = stack[i]; - if (likely(skb1 == NULL)) + if (likely(!skb1)) skb1 = (i == final) ? 
skb : skb_clone(skb, GFP_ATOMIC); if (!skb1) { atomic_inc(&sk->sk_drops); @@ -900,7 +897,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, * for sock caches... i'll skip this for now. */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - if (sk != NULL) { + if (sk) { int ret; if (!uh->check && !udp_sk(sk)->no_check6_rx) { @@ -1101,8 +1098,7 @@ out: return err; } -int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len) +int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; struct udp_sock *up = udp_sk(sk); @@ -1164,12 +1160,12 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, do_udp_sendmsg: if (__ipv6_only_sock(sk)) return -ENETUNREACH; - return udp_sendmsg(iocb, sk, msg, len); + return udp_sendmsg(sk, msg, len); } } if (up->pending == AF_INET) - return udp_sendmsg(iocb, sk, msg, len); + return udp_sendmsg(sk, msg, len); /* Rough check on arithmetic overflow, better check is made in ip6_append_data(). 
@@ -1209,7 +1205,7 @@ do_udp_sendmsg: fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (!flowlabel) return -EINVAL; } } @@ -1257,14 +1253,14 @@ do_udp_sendmsg: } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (flowlabel == NULL) + if (!flowlabel) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; connected = 0; } - if (opt == NULL) + if (!opt) opt = np->opt; if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); @@ -1557,7 +1553,6 @@ static struct inet_protosw udpv6_protosw = { .flags = INET_PROTOSW_PERMANENT, }; - int __init udpv6_init(void) { int ret; diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index c779c3c90b9d..0682c031ccdc 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -23,10 +23,9 @@ int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); #endif -int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len); -int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len); +int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len); int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index be2c0ba82c85..7441e1e63893 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -54,7 +54,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Set the IPv6 fragment id if not set yet */ if (!skb_shinfo(skb)->ip6_frag_id) - ipv6_proxy_select_ident(skb); + 
ipv6_proxy_select_ident(dev_net(skb->dev), skb); segs = NULL; goto out; @@ -113,7 +113,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr->nexthdr = nexthdr; fptr->reserved = 0; if (!skb_shinfo(skb)->ip6_frag_id) - ipv6_proxy_select_ident(skb); + ipv6_proxy_select_ident(dev_net(skb->dev), skb); fptr->identification = skb_shinfo(skb)->ip6_frag_id; /* Fragment the skb. ipv6 header and the remaining fields of the diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index f48fbe4d16f5..74bd17882a2f 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -42,7 +42,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); - NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb, + skb->dev, NULL, ip6_rcv_finish); return -1; } diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index 9949a356d62c..1e205c3253ac 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -95,8 +95,8 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); - ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; - ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; + ip6h->daddr = x->sel.daddr.in6; + ip6h->saddr = x->sel.saddr.in6; err = 0; out: return err; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 010f8bd2d577..09c76a7b474d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -120,7 +120,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm6_prepare_output); -int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); @@ -128,10 +128,10 @@ int xfrm6_output_finish(struct sk_buff *skb) 
IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; #endif - return xfrm_output(skb); + return xfrm_output(sk, skb); } -static int __xfrm6_output(struct sk_buff *skb) +static int __xfrm6_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; @@ -140,7 +140,7 @@ static int __xfrm6_output(struct sk_buff *skb) #ifdef CONFIG_NETFILTER if (!x) { IP6CB(skb)->flags |= IP6SKB_REROUTED; - return dst_output(skb); + return dst_output_sk(sk, skb); } #endif @@ -160,14 +160,15 @@ static int __xfrm6_output(struct sk_buff *skb) if (x->props.mode == XFRM_MODE_TUNNEL && ((skb->len > mtu && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); + return ip6_fragment(sk, skb, + x->outer_mode->afinfo->output_finish); } - return x->outer_mode->afinfo->output_finish(skb); + return x->outer_mode->afinfo->output_finish(sk, skb); } int xfrm6_output(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb, NULL, skb_dst(skb)->dev, __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 8d2d01b4800a..ed0583c1b9fc 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -61,9 +61,7 @@ static int xfrm6_get_saddr(struct net *net, return -EHOSTUNREACH; dev = ip6_dst_idev(dst)->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, - (struct in6_addr *)&daddr->a6, 0, - (struct in6_addr *)&saddr->a6); + ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; } @@ -73,20 +71,12 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } -static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) -{ - struct rt6_info *rt = (struct rt6_info *)xdst; - - rt6_init_peer(rt, net->ipv6.peers); -} - static int xfrm6_init_path(struct xfrm_dst *path, struct 
dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_node) - path->path_cookie = rt->rt6i_node->fn_sernum; + path->path_cookie = rt6_get_cookie(rt); } path->u.rt6.rt6i_nfheader_len = nfheader_len; @@ -108,16 +98,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return -ENODEV; } - rt6_transfer_peer(&xdst->u.rt6, rt); - /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->u.rt6.rt6i_metric = rt->rt6i_metric; xdst->u.rt6.rt6i_node = rt->rt6i_node; - if (rt->rt6i_node) - xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; @@ -257,10 +244,6 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); dst_destroy_metrics_generic(dst); - if (rt6_has_peer(&xdst->u.rt6)) { - struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6); - inet_putpeer(peer); - } xfrm_dst_destroy(xdst); } @@ -293,7 +276,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm6_dst_ops = { .family = AF_INET6, - .protocol = cpu_to_be16(ETH_P_IPV6), .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, @@ -311,7 +293,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, - .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, @@ -371,7 +352,7 @@ static void __net_exit xfrm6_net_exit(struct net *net) { struct ctl_table *table; - if (net->ipv6.sysctl.xfrm6_hdr == NULL) + if 
(!net->ipv6.sysctl.xfrm6_hdr) return; table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index f11ad1d95e0e..48d0dc89b58d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto); + sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern); if (!sk) goto out; @@ -1688,8 +1688,7 @@ out: return rc; } -static int ipx_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int ipx_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct ipx_sock *ipxs = ipx_sk(sk); @@ -1754,8 +1753,8 @@ out: } -static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct ipx_sock *ipxs = ipx_sk(sk); diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 568edc72d737..fae6822cc367 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1100,7 +1100,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, } /* Allocate networking socket */ - sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto); + sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern); if (sk == NULL) return -ENOMEM; @@ -1256,14 +1256,13 @@ static int irda_release(struct socket *sock) } /* - * Function irda_sendmsg (iocb, sock, msg, len) + * Function irda_sendmsg (sock, msg, len) * * Send message down to TinyTP. This function is used for both STREAM and * SEQPACK services. 
This is possible since it forces the client to * fragment the message if necessary */ -static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int irda_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct irda_sock *self; @@ -1348,13 +1347,13 @@ out: } /* - * Function irda_recvmsg_dgram (iocb, sock, msg, size, flags) + * Function irda_recvmsg_dgram (sock, msg, size, flags) * * Try to receive message and copy it to user. The frame is discarded * after being read, regardless of how much the user actually read */ -static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int irda_recvmsg_dgram(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); @@ -1398,10 +1397,10 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, } /* - * Function irda_recvmsg_stream (iocb, sock, msg, size, flags) + * Function irda_recvmsg_stream (sock, msg, size, flags) */ -static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int irda_recvmsg_stream(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); @@ -1515,14 +1514,14 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, } /* - * Function irda_sendmsg_dgram (iocb, sock, msg, len) + * Function irda_sendmsg_dgram (sock, msg, len) * * Send message down to TinyTP for the unreliable sequenced * packet service... 
* */ -static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int irda_sendmsg_dgram(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct irda_sock *self; @@ -1594,14 +1593,14 @@ out: } /* - * Function irda_sendmsg_ultra (iocb, sock, msg, len) + * Function irda_sendmsg_ultra (sock, msg, len) * * Send message down to IrLMP for the unreliable Ultra * packet service... */ #ifdef CONFIG_IRDA_ULTRA -static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int irda_sendmsg_ultra(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct irda_sock *self; diff --git a/net/irda/timer.c b/net/irda/timer.c index 0c4c115a5cab..f2280f73b057 100644 --- a/net/irda/timer.c +++ b/net/irda/timer.c @@ -60,8 +60,8 @@ void irlap_start_query_timer(struct irlap_cb *self, int S, int s) * to avoid messing with for incoming connections requests and * to accommodate devices that perform discovery slower than us. * Jean II */ - timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s) - + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT); + timeout = msecs_to_jiffies(sysctl_slot_timeout) * (S - s) + + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT; /* Set or re-set the timer. 
We reset the timer for each received * discovery query, which allow us to automatically adjust to diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 53d931172088..918151c11348 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -535,12 +535,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; } -static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) +static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; struct iucv_sock *iucv; - sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto); + sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern); if (!sk) return NULL; iucv = iucv_sk(sk); @@ -602,7 +602,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL); + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); if (!sk) return -ENOMEM; @@ -1026,8 +1026,8 @@ static int iucv_send_iprm(struct iucv_path *path, struct iucv_message *msg, (void *) prmdata, 8); } -static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); @@ -1315,8 +1315,8 @@ static void iucv_process_message_q(struct sock *sk) } } -static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -1723,7 +1723,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); if (!nsk) { err = 
pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1933,7 +1933,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || diff --git a/net/key/af_key.c b/net/key/af_key.c index f8ac939d52b4..b397f0aa9005 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); + sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) goto out; @@ -709,7 +709,7 @@ static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; - sin6->sin6_addr = *(struct in6_addr *)xaddr->a6; + sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } @@ -1190,6 +1190,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; + x->geniv = a->uinfo.encr.geniv; } } /* x->algo.flags = sa->sadb_sa_flags; */ @@ -3588,8 +3589,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } #endif -static int pfkey_sendmsg(struct kiocb *kiocb, - struct socket *sock, struct msghdr *msg, size_t len) +static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; @@ -3630,8 +3630,7 @@ out: return err ? 
: len; } -static int pfkey_recvmsg(struct kiocb *kiocb, - struct socket *sock, struct msghdr *msg, size_t len, +static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index a29a504492af..f6b090df3930 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1334,9 +1334,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) if (sock) inet_shutdown(sock, 2); } else { - if (sock) + if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sk); + sock_release(sock); + } } l2tp_tunnel_sock_put(sk); @@ -1399,13 +1400,11 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(AF_INET6, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); @@ -1429,13 +1428,11 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(AF_INET, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; @@ -1462,7 +1459,7 @@ out: *sockp = sock; if ((err < 0) && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); *sockp = NULL; } diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 781b3a226ba7..4b552873b556 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -74,7 +74,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) priv->dev = dev; eth_hw_addr_random(dev); - memset(&dev->broadcast[0], 0xff, 6); + eth_broadcast_addr(dev->broadcast); 
dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock; return 0; } diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 05dfc8aa36af..79649937ec71 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -385,7 +385,7 @@ drop: /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ -static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) +static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct sk_buff *skb; int rc; @@ -506,7 +506,7 @@ no_route: goto out; } -static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 8611f1b63141..d1ded3777815 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -480,8 +480,7 @@ out: /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. 
*/ -static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len) +static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); @@ -643,9 +642,8 @@ do_confirm: goto done; } -static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, - int flags, int *addr_len) +static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index b4e923f77954..9e13c2ff8789 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -205,9 +205,9 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info #endif if (info->attrs[L2TP_ATTR_IP_SADDR] && info->attrs[L2TP_ATTR_IP_DADDR]) { - cfg.local_ip.s_addr = nla_get_be32( + cfg.local_ip.s_addr = nla_get_in_addr( info->attrs[L2TP_ATTR_IP_SADDR]); - cfg.peer_ip.s_addr = nla_get_be32( + cfg.peer_ip.s_addr = nla_get_in_addr( info->attrs[L2TP_ATTR_IP_DADDR]); } else { ret = -EINVAL; @@ -376,15 +376,17 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla case L2TP_ENCAPTYPE_IP: #if IS_ENABLED(CONFIG_IPV6) if (np) { - if (nla_put(skb, L2TP_ATTR_IP6_SADDR, sizeof(np->saddr), - &np->saddr) || - nla_put(skb, L2TP_ATTR_IP6_DADDR, sizeof(sk->sk_v6_daddr), - &sk->sk_v6_daddr)) + if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, + &np->saddr) || + nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, + &sk->sk_v6_daddr)) goto nla_put_failure; } else #endif - if (nla_put_be32(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) || - nla_put_be32(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr)) + if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, + inet->inet_saddr) || + nla_put_in_addr(skb, 
L2TP_ATTR_IP_DADDR, + inet->inet_daddr)) goto nla_put_failure; break; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index cc7a828fc914..f56c9f69e9f2 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -185,9 +185,8 @@ static int pppol2tp_recv_payload_hook(struct sk_buff *skb) /* Receive message. This is the recvmsg for the PPPoL2TP socket. */ -static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, - int flags) +static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { int err; struct sk_buff *skb; @@ -295,7 +294,7 @@ static void pppol2tp_session_sock_put(struct l2tp_session *session) * when a user application does a sendmsg() on the session socket. L2TP and * PPP headers must be inserted into the user's data. */ -static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, +static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { static const unsigned char ppph[2] = { 0xff, 0x03 }; @@ -543,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) /* socket() handler. Initialize a new struct sock. 
*/ -static int pppol2tp_create(struct net *net, struct socket *sock) +static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 2c0b83ce43bd..8dab4e569571 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; - sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto); + sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); @@ -613,7 +613,7 @@ static int llc_wait_data(struct sock *sk, long timeo) if (signal_pending(current)) break; rc = 0; - if (sk_wait_data(sk, &timeo)) + if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; @@ -704,8 +704,8 @@ out: * Copy received data to the socket user. * Returns non-negative upon success, negative otherwise. */ -static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; @@ -802,7 +802,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, release_sock(sk); lock_sock(sk); } else - sk_wait_data(sk, &timeo); + sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", @@ -878,8 +878,7 @@ copy_uaddr: * Transmit data provided by the socket user. * Returns non-negative upon success, negative otherwise. 
*/ -static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 81a61fce3afb..3e821daf9dd4 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, - sk->sk_prot); + sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) @@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk) * Allocates a LLC sock and initializes it. Returns the new LLC sock * or %NULL if there's no memory available for one */ -struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot) +struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { - struct sock *sk = sk_alloc(net, family, priority, prot); + struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 64a012a0c6e5..086de496a4c1 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -302,6 +302,20 @@ config MAC80211_DEBUG_COUNTERS ---help--- Selecting this option causes mac80211 to keep additional and very verbose statistics about TX and RX handler use - and show them in debugfs. + as well as a few selected dot11 counters. These will be + exposed in debugfs. + + Note that some of the counters are not concurrency safe + and may thus not always be accurate. If unsure, say N. + +config MAC80211_STA_HASH_MAX_SIZE + int "Station hash table maximum size" if MAC80211_DEBUG_MENU + default 0 + ---help--- + Setting this option to a low value (e.g. 
4) allows testing the + hash table with collisions relatively deterministically (just + connect more stations than the number selected here.) + + If unsure, leave the default of 0. diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c index 7869bb40acaa..7663c28ba353 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aes_ccm.c @@ -11,9 +11,8 @@ #include <linux/kernel.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/err.h> -#include <crypto/aes.h> +#include <crypto/aead.h> #include <net/mac80211.h> #include "key.h" @@ -23,7 +22,7 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, u8 *data, size_t data_len, u8 *mic, size_t mic_len) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] @@ -32,15 +31,14 @@ void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, mic_len); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, mic_len); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, &pt, ct, data_len, b_0); + aead_request_set_crypt(aead_req, sg, sg, data_len, b_0); + aead_request_set_ad(aead_req, sg[0].length); crypto_aead_encrypt(aead_req); } @@ -49,7 +47,7 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, u8 *data, size_t data_len, u8 *mic, size_t mic_len) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] __aligned(__alignof__(struct aead_request)); @@ -60,15 +58,14 @@ int 
ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, mic_len); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, mic_len); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, ct, &pt, data_len + mic_len, b_0); + aead_request_set_crypt(aead_req, sg, sg, data_len + mic_len, b_0); + aead_request_set_ad(aead_req, sg[0].length); return crypto_aead_decrypt(aead_req); } @@ -85,11 +82,15 @@ struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[], return tfm; err = crypto_aead_setkey(tfm, key, key_len); - if (!err) - err = crypto_aead_setauthsize(tfm, mic_len); - if (!err) - return tfm; + if (err) + goto free_aead; + err = crypto_aead_setauthsize(tfm, mic_len); + if (err) + goto free_aead; + + return tfm; +free_aead: crypto_free_aead(tfm); return ERR_PTR(err); } diff --git a/net/mac80211/aes_gcm.c b/net/mac80211/aes_gcm.c index c2bf6698d738..3afe361fd27c 100644 --- a/net/mac80211/aes_gcm.c +++ b/net/mac80211/aes_gcm.c @@ -8,9 +8,8 @@ #include <linux/kernel.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/err.h> -#include <crypto/aes.h> +#include <crypto/aead.h> #include <net/mac80211.h> #include "key.h" @@ -19,7 +18,7 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, u8 *data, size_t data_len, u8 *mic) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] @@ -28,15 +27,14 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, 
data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, &pt, ct, data_len, j_0); + aead_request_set_crypt(aead_req, sg, sg, data_len, j_0); + aead_request_set_ad(aead_req, sg[0].length); crypto_aead_encrypt(aead_req); } @@ -44,7 +42,7 @@ void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, u8 *data, size_t data_len, u8 *mic) { - struct scatterlist assoc, pt, ct[2]; + struct scatterlist sg[3]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] __aligned(__alignof__(struct aead_request)); @@ -55,16 +53,15 @@ int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, memset(aead_req, 0, sizeof(aead_req_data)); - sg_init_one(&pt, data, data_len); - sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad)); - sg_init_table(ct, 2); - sg_set_buf(&ct[0], data, data_len); - sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN); + sg_init_table(sg, 3); + sg_set_buf(&sg[0], &aad[2], be16_to_cpup((__be16 *)aad)); + sg_set_buf(&sg[1], data, data_len); + sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, &assoc, assoc.length); - aead_request_set_crypt(aead_req, ct, &pt, + aead_request_set_crypt(aead_req, sg, sg, data_len + IEEE80211_GCMP_MIC_LEN, j_0); + aead_request_set_ad(aead_req, sg[0].length); return crypto_aead_decrypt(aead_req); } @@ -80,11 +77,15 @@ struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], return tfm; err = 
crypto_aead_setkey(tfm, key, key_len); - if (!err) - err = crypto_aead_setauthsize(tfm, IEEE80211_GCMP_MIC_LEN); - if (!err) - return tfm; + if (err) + goto free_aead; + err = crypto_aead_setauthsize(tfm, IEEE80211_GCMP_MIC_LEN); + if (err) + goto free_aead; + + return tfm; +free_aead: crypto_free_aead(tfm); return ERR_PTR(err); } diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c index 1c72edcb0083..3ddd927aaf30 100644 --- a/net/mac80211/aes_gmac.c +++ b/net/mac80211/aes_gmac.c @@ -9,8 +9,8 @@ #include <linux/kernel.h> #include <linux/types.h> -#include <linux/crypto.h> #include <linux/err.h> +#include <crypto/aead.h> #include <crypto/aes.h> #include <net/mac80211.h> @@ -24,7 +24,7 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, const u8 *data, size_t data_len, u8 *mic) { - struct scatterlist sg[3], ct[1]; + struct scatterlist sg[4]; char aead_req_data[sizeof(struct aead_request) + crypto_aead_reqsize(tfm)] __aligned(__alignof__(struct aead_request)); @@ -37,21 +37,19 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, memset(aead_req, 0, sizeof(aead_req_data)); memset(zero, 0, GMAC_MIC_LEN); - sg_init_table(sg, 3); + sg_init_table(sg, 4); sg_set_buf(&sg[0], aad, AAD_LEN); sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN); sg_set_buf(&sg[2], zero, GMAC_MIC_LEN); + sg_set_buf(&sg[3], mic, GMAC_MIC_LEN); memcpy(iv, nonce, GMAC_NONCE_LEN); memset(iv + GMAC_NONCE_LEN, 0, sizeof(iv) - GMAC_NONCE_LEN); iv[AES_BLOCK_SIZE - 1] = 0x01; - sg_init_table(ct, 1); - sg_set_buf(&ct[0], mic, GMAC_MIC_LEN); - aead_request_set_tfm(aead_req, tfm); - aead_request_set_assoc(aead_req, sg, AAD_LEN + data_len); - aead_request_set_crypt(aead_req, NULL, ct, 0, iv); + aead_request_set_crypt(aead_req, sg, sg, 0, iv); + aead_request_set_ad(aead_req, AAD_LEN + data_len); crypto_aead_encrypt(aead_req); @@ -70,9 +68,9 @@ struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[], err = crypto_aead_setkey(tfm, key, key_len); 
if (!err) - return tfm; - if (!err) err = crypto_aead_setauthsize(tfm, GMAC_MIC_LEN); + if (!err) + return tfm; crypto_free_aead(tfm); return ERR_PTR(err); diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 7702978a4c99..5c564a68fb50 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -238,6 +238,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + if (!sta->sta.ht_cap.ht_supported) { + ht_dbg(sta->sdata, + "STA %pM erroneously requests BA session on tid %d w/o QoS\n", + sta->sta.addr, tid); + /* send a response anyway, it's an error case if we get here */ + goto end_no_lock; + } + if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sta->sdata, "Suspend in progress - Denying ADDBA request (%pM tid %d)\n", diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index a360c15cc978..c8ba2e77737c 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -188,6 +188,43 @@ ieee80211_wake_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) __release(agg_queue); } +static void +ieee80211_agg_stop_txq(struct sta_info *sta, int tid) +{ + struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct txq_info *txqi; + + if (!txq) + return; + + txqi = to_txq_info(txq); + + /* Lock here to protect against further seqno updates on dequeue */ + spin_lock_bh(&txqi->queue.lock); + set_bit(IEEE80211_TXQ_STOP, &txqi->flags); + spin_unlock_bh(&txqi->queue.lock); +} + +static void +ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable) +{ + struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct txq_info *txqi; + + if (!txq) + return; + + txqi = to_txq_info(txq); + + if (enable) + set_bit(IEEE80211_TXQ_AMPDU, &txqi->flags); + else + clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags); + + clear_bit(IEEE80211_TXQ_STOP, &txqi->flags); + drv_wake_tx_queue(sta->sdata->local, txqi); +} + /* * splice packets from the STA's pending to the local pending, * requires a 
call to ieee80211_agg_splice_finish later @@ -247,6 +284,7 @@ static void ieee80211_remove_tid_tx(struct sta_info *sta, int tid) ieee80211_assign_tid_tx(sta, tid, NULL); ieee80211_agg_splice_finish(sta->sdata, tid); + ieee80211_agg_start_txq(sta, tid, false); kfree_rcu(tid_tx, rcu_head); } @@ -418,6 +456,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state); + ieee80211_agg_stop_txq(sta, tid); + /* * Make sure no packets are being processed. This ensures that * we have a valid starting sequence number and that in-flight @@ -440,6 +480,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) ieee80211_agg_splice_finish(sdata, tid); spin_unlock_bh(&sta->lock); + ieee80211_agg_start_txq(sta, tid, false); + kfree_rcu(tid_tx, rcu_head); return; } @@ -509,18 +551,21 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, struct tid_ampdu_tx *tid_tx; int ret = 0; + trace_api_start_tx_ba_session(pubsta, tid); + if (WARN(sta->reserved_tid == tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - trace_api_start_tx_ba_session(pubsta, tid); + if (!pubsta->ht_cap.ht_supported) + return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) return -EINVAL; if ((tid >= IEEE80211_NUM_TIDS) || - !(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) || - (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) + !ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) || + ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) return -EINVAL; ht_dbg(sdata, "Open BA session requested for %pM tid %u\n", @@ -666,6 +711,8 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local, ieee80211_agg_splice_finish(sta->sdata, tid); spin_unlock_bh(&sta->lock); + + ieee80211_agg_start_txq(sta, tid, true); } void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid) @@ -793,6 +840,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif 
*vif, u8 *ra, u8 tid) struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct tid_ampdu_tx *tid_tx; + bool send_delba = false; trace_api_stop_tx_ba_cb(sdata, ra, tid); @@ -824,13 +872,17 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) } if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop) - ieee80211_send_delba(sta->sdata, ra, tid, - WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); + send_delba = true; ieee80211_remove_tid_tx(sta, tid); unlock_sta: spin_unlock_bh(&sta->lock); + + if (send_delba) + ieee80211_send_delba(sdata, ra, tid, + WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); + mutex_unlock(&sta->ampdu_mlme.mtx); unlock: mutex_unlock(&local->sta_mtx); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index dd4ff36c557a..bf7023f6c327 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2,7 +2,7 @@ * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. 
*/ @@ -24,6 +24,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, const char *name, + unsigned char name_assign_type, enum nl80211_iftype type, u32 *flags, struct vif_params *params) @@ -33,7 +34,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata; int err; - err = ieee80211_if_add(local, name, &wdev, type, params); + err = ieee80211_if_add(local, name, name_assign_type, &wdev, type, params); if (err) return ERR_PTR(err); @@ -136,6 +137,9 @@ static int ieee80211_set_noack_map(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata->noack_map = noack_map; + + ieee80211_check_fast_xmit_iface(sdata); + return 0; } @@ -308,6 +312,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, u32 iv32; u16 iv16; int err = -ENOENT; + struct ieee80211_key_seq kseq = {}; sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -338,10 +343,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, iv32 = key->u.tkip.tx.iv32; iv16 = key->u.tkip.tx.iv16; - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) - drv_get_tkip_seq(sdata->local, - key->conf.hw_key_idx, - &iv32, &iv16); + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + iv32 = kseq.tkip.iv32; + iv16 = kseq.tkip.iv16; + } seq[0] = iv16 & 0xff; seq[1] = (iv16 >> 8) & 0xff; @@ -354,52 +361,44 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - params.seq = seq; - params.seq_len = 6; - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - 
seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - params.seq = seq; - params.seq_len = 6; - break; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; - params.seq = seq; - params.seq_len = 6; - break; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), aes_gmac)); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = atomic64_read(&key->u.gcmp.tx_pn); - seq[0] = pn64; - seq[1] = pn64 >> 8; - seq[2] = pn64 >> 16; - seq[3] = pn64 >> 24; - seq[4] = pn64 >> 32; - seq[5] = pn64 >> 40; + BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) != + offsetof(typeof(kseq), gcmp)); + + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) { + drv_get_key_seq(sdata->local, key, &kseq); + memcpy(seq, kseq.ccmp.pn, 6); + } else { + pn64 = atomic64_read(&key->conf.tx_pn); + seq[0] = pn64; + seq[1] = pn64 >> 8; + seq[2] = pn64 >> 16; + seq[3] = pn64 >> 24; + seq[4] = pn64 >> 32; + seq[5] = pn64 >> 40; + } params.seq = seq; params.seq_len = 6; break; + default: + if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + break; + if (WARN_ON(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) + break; + drv_get_key_seq(sdata->local, key, &kseq); + params.seq = kseq.hw.seq; + params.seq_len = kseq.hw.seq_len; + break; } params.key = key->conf.key; @@ -977,6 +976,14 @@ static int sta_apply_auth_flags(struct ieee80211_local *local, if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) && set & BIT(NL80211_STA_FLAG_ASSOCIATED) && !test_sta_flag(sta, WLAN_STA_ASSOC)) { + /* + * When peer becomes associated, init rate control as + * well. 
Some drivers require rate control initialized + * before drv_sta_state() is called. + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + rate_control_rate_init(sta); + ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (ret) return ret; @@ -1050,6 +1057,10 @@ static int sta_apply_parameters(struct ieee80211_local *local, } } + if (mask & BIT(NL80211_STA_FLAG_WME) && + local->hw.queues >= IEEE80211_NUM_ACS) + sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); + /* auth flags will be set later for TDLS stations */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { ret = sta_apply_auth_flags(local, sta, mask, set); @@ -1064,10 +1075,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } - if (mask & BIT(NL80211_STA_FLAG_WME)) - sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); - if (mask & BIT(NL80211_STA_FLAG_MFP)) { + sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP)); if (set & BIT(NL80211_STA_FLAG_MFP)) set_sta_flag(sta, WLAN_STA_MFP); else @@ -1361,6 +1370,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, } sta->sdata = vlansdata; + ieee80211_check_fast_xmit(sta); if (sta->sta_state == IEEE80211_STA_AUTHORIZED && prev_4addr != new_4addr) { @@ -1377,11 +1387,6 @@ static int ieee80211_change_station(struct wiphy *wiphy, if (err) goto out_err; - /* When peer becomes authorized, init rate control as well */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && - test_sta_flag(sta, WLAN_STA_AUTHORIZED)) - rate_control_rate_init(sta); - mutex_unlock(&local->sta_mtx); if ((sdata->vif.type == NL80211_IFTYPE_AP || @@ -1488,7 +1493,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, if (next_hop_sta) memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN); else - memset(next_hop, 0, ETH_ALEN); + eth_zero_addr(next_hop); memset(pinfo, 0, sizeof(*pinfo)); @@ -1758,7 +1763,7 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, /* our RSSI threshold implementation is supported only for * 
devices that report signal in dBm. */ - if (!(sdata->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)) + if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) return -ENOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } @@ -2093,10 +2098,14 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) int err; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { + ieee80211_check_fast_xmit_all(local); + err = drv_set_frag_threshold(local, wiphy->frag_threshold); - if (err) + if (err) { + ieee80211_check_fast_xmit_all(local); return err; + } } if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || @@ -2273,7 +2282,6 @@ int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata, { struct sta_info *sta; enum ieee80211_smps_mode old_req; - int i; if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP)) return -EINVAL; @@ -2297,52 +2305,44 @@ int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata, } ht_dbg(sdata, - "SMSP %d requested in AP mode, sending Action frame to %d stations\n", + "SMPS %d requested in AP mode, sending Action frame to %d stations\n", smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta)); mutex_lock(&sdata->local->sta_mtx); - for (i = 0; i < STA_HASH_SIZE; i++) { - for (sta = rcu_dereference_protected(sdata->local->sta_hash[i], - lockdep_is_held(&sdata->local->sta_mtx)); - sta; - sta = rcu_dereference_protected(sta->hnext, - lockdep_is_held(&sdata->local->sta_mtx))) { - /* - * Only stations associated to our AP and - * associated VLANs - */ - if (sta->sdata->bss != &sdata->u.ap) - continue; + list_for_each_entry(sta, &sdata->local->sta_list, list) { + /* + * Only stations associated to our AP and + * associated VLANs + */ + if (sta->sdata->bss != &sdata->u.ap) + continue; - /* This station doesn't support MIMO - skip it */ - if (sta_info_tx_streams(sta) == 1) - continue; + /* This station doesn't support MIMO - skip it */ + if (sta_info_tx_streams(sta) == 1) + continue; - /* - * Don't wake up a STA just to send the action frame - * 
unless we are getting more restrictive. - */ - if (test_sta_flag(sta, WLAN_STA_PS_STA) && - !ieee80211_smps_is_restrictive(sta->known_smps_mode, - smps_mode)) { - ht_dbg(sdata, - "Won't send SMPS to sleeping STA %pM\n", - sta->sta.addr); - continue; - } + /* + * Don't wake up a STA just to send the action frame + * unless we are getting more restrictive. + */ + if (test_sta_flag(sta, WLAN_STA_PS_STA) && + !ieee80211_smps_is_restrictive(sta->known_smps_mode, + smps_mode)) { + ht_dbg(sdata, "Won't send SMPS to sleeping STA %pM\n", + sta->sta.addr); + continue; + } - /* - * If the STA is not authorized, wait until it gets - * authorized and the action frame will be sent then. - */ - if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) - continue; + /* + * If the STA is not authorized, wait until it gets + * authorized and the action frame will be sent then. + */ + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; - ht_dbg(sdata, "Sending SMPS to %pM\n", sta->sta.addr); - ieee80211_send_smps_action(sdata, smps_mode, - sta->sta.addr, - sdata->vif.bss_conf.bssid); - } + ht_dbg(sdata, "Sending SMPS to %pM\n", sta->sta.addr); + ieee80211_send_smps_action(sdata, smps_mode, sta->sta.addr, + sdata->vif.bss_conf.bssid); } mutex_unlock(&sdata->local->sta_mtx); @@ -2407,7 +2407,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return -EOPNOTSUPP; if (enabled == sdata->u.mgd.powersave && @@ -2422,7 +2422,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, __ieee80211_request_smps_mgd(sdata, sdata->u.mgd.req_smps); sdata_unlock(sdata); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local, -1); @@ 
-2466,7 +2466,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { ret = drv_set_bitrate_mask(local, sdata, mask); if (ret) return ret; @@ -2498,52 +2498,36 @@ static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { - unsigned long j = jiffies; - unsigned long cur_roc_end = cur_roc->hw_start_time + - msecs_to_jiffies(cur_roc->duration); - struct ieee80211_roc_work *next_roc; - int new_dur; + unsigned long now = jiffies; + unsigned long remaining = cur_roc->hw_start_time + + msecs_to_jiffies(cur_roc->duration) - + now; if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun)) return false; - if (time_after(j + IEEE80211_ROC_MIN_LEFT, cur_roc_end)) + /* if it doesn't fit entirely, schedule a new one */ + if (new_roc->duration > jiffies_to_msecs(remaining)) return false; ieee80211_handle_roc_started(new_roc); - new_dur = new_roc->duration - jiffies_to_msecs(cur_roc_end - j); + /* add to dependents so we send the expired event properly */ + list_add_tail(&new_roc->list, &cur_roc->dependents); + return true; +} - /* cur_roc is long enough - add new_roc to the dependents list. */ - if (new_dur <= 0) { - list_add_tail(&new_roc->list, &cur_roc->dependents); - return true; - } +static u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->mtx); - new_roc->duration = new_dur; + local->roc_cookie_counter++; - /* - * if cur_roc was already coalesced before, we might - * want to extend the next roc instead of adding - * a new one. 
- */ - next_roc = list_entry(cur_roc->list.next, - struct ieee80211_roc_work, list); - if (&next_roc->list != &local->roc_list && - next_roc->chan == new_roc->chan && - next_roc->sdata == new_roc->sdata && - !WARN_ON(next_roc->started)) { - list_add_tail(&new_roc->list, &next_roc->dependents); - next_roc->duration = max(next_roc->duration, - new_roc->duration); - next_roc->type = max(next_roc->type, new_roc->type); - return true; - } - - /* add right after cur_roc */ - list_add(&new_roc->list, &cur_roc->list); + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(local->roc_cookie_counter == 0)) + local->roc_cookie_counter++; - return true; + return local->roc_cookie_counter; } static int ieee80211_start_roc_work(struct ieee80211_local *local, @@ -2583,7 +2567,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, roc->req_duration = duration; roc->frame = txskb; roc->type = type; - roc->mgmt_tx_cookie = (unsigned long)txskb; roc->sdata = sdata; INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); @@ -2593,17 +2576,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * or the SKB (for mgmt TX) */ if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } + roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { - *cookie = (unsigned long)txskb; + roc->mgmt_tx_cookie = *cookie; } /* if there's one pending or we're scanning, queue this one */ @@ -2655,17 +2631,9 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, * In the offloaded ROC case, if it hasn't begun, add * this new one to the dependent list to be handled * when the master one begins. 
If it has begun, - * check that there's still a minimum time left and - * if so, start this one, transmitting the frame, but - * add it to the list directly after this one with - * a reduced time so we'll ask the driver to execute - * it right after finishing the previous one, in the - * hope that it'll also be executed right afterwards, - * effectively extending the old one. - * If there's no minimum time left, just add it to the - * normal list. - * TODO: the ROC type is ignored here, assuming that it - * is better to immediately use the current ROC. + * check if it fits entirely within the existing one, + * in which case it will just be dependent as well. + * Otherwise, schedule it by itself. */ if (!tmp->hw_begun) { list_add_tail(&roc->list, &tmp->dependents); @@ -3284,13 +3252,43 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, return err; } +static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, + struct sk_buff *skb, u64 *cookie, + gfp_t gfp) +{ + unsigned long spin_flags; + struct sk_buff *ack_skb; + int id; + + ack_skb = skb_copy(skb, gfp); + if (!ack_skb) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&local->ack_status_lock, spin_flags); + id = idr_alloc(&local->ack_status_frames, ack_skb, + 1, 0x10000, GFP_ATOMIC); + spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); + + if (id < 0) { + kfree_skb(ack_skb); + return ERR_PTR(-ENOMEM); + } + + IEEE80211_SKB_CB(skb)->ack_frame_id = id; + + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + + return ack_skb; +} + static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; - struct sk_buff *skb; + struct sk_buff *skb, *ack_skb; struct sta_info *sta; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool 
need_offchan = false; @@ -3339,8 +3337,14 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (!sdata->u.mgd.associated) + sdata_lock(sdata); + if (!sdata->u.mgd.associated || + (params->offchan && params->wait && + local->ops->remain_on_channel && + memcmp(sdata->u.mgd.associated->bssid, + mgmt->bssid, ETH_ALEN))) need_offchan = true; + sdata_unlock(sdata); break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; @@ -3396,6 +3400,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Update CSA counters */ if (sdata->vif.csa_active && (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; @@ -3422,8 +3427,23 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, skb->dev = sdata->dev; + if (!params->dont_wait_for_ack) { + /* make a copy to preserve the frame contents + * in case of encryption. 
+ */ + ack_skb = ieee80211_make_ack_skb(local, skb, cookie, + GFP_KERNEL); + if (IS_ERR(ack_skb)) { + ret = PTR_ERR(ack_skb); + kfree_skb(skb); + goto out_unlock; + } + } else { + /* for cookie below */ + ack_skb = skb; + } + if (!need_offchan) { - *cookie = (unsigned long) skb; ieee80211_tx_skb(sdata, skb); ret = 0; goto out_unlock; @@ -3431,7 +3451,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; @@ -3516,7 +3536,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; - struct sk_buff *skb; + struct sk_buff *skb, *ack_skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; @@ -3524,20 +3544,24 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; enum ieee80211_band band; + int ret; + + /* the lock is needed to assign the cookie later */ + mutex_lock(&local->mtx); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return -EINVAL; + ret = -EINVAL; + goto unlock; } band = chanctx_conf->def.chan->band; sta = sta_info_get_bss(sdata, peer); if (sta) { qos = sta->sta.wme; } else { - rcu_read_unlock(); - return -ENOLINK; + ret = -ENOLINK; + goto unlock; } if (qos) { @@ -3553,8 +3577,8 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) { - rcu_read_unlock(); - return -ENOMEM; + ret = -ENOMEM; + goto unlock; } skb->dev = dev; @@ -3580,13 
+3604,23 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); + ack_skb = ieee80211_make_ack_skb(local, skb, cookie, GFP_ATOMIC); + if (IS_ERR(ack_skb)) { + kfree_skb(skb); + ret = PTR_ERR(ack_skb); + goto unlock; + } + local_bh_disable(); - ieee80211_xmit(sdata, skb); + ieee80211_xmit(sdata, sta, skb); local_bh_enable(); + + ret = 0; +unlock: rcu_read_unlock(); + mutex_unlock(&local->mtx); - *cookie = (unsigned long) skb; - return 0; + return ret; } static int ieee80211_cfg_get_channel(struct wiphy *wiphy, diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 5bcd4e5589d3..f01c18a3160e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -664,6 +664,8 @@ out: ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + ieee80211_check_fast_xmit_iface(sdata); + return ret; } @@ -1008,6 +1010,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + ieee80211_change_chanctx(local, new_ctx, chandef); + vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; vif_chsw[0].new_ctx = &new_ctx->conf; @@ -1030,6 +1034,8 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) if (sdata->vif.type == NL80211_IFTYPE_AP) __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + ieee80211_check_fast_xmit_iface(sdata); + if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); @@ -1079,6 +1085,8 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata) if (WARN_ON(!chandef)) return -EINVAL; + ieee80211_change_chanctx(local, new_ctx, chandef); + list_del(&sdata->reserved_chanctx_list); sdata->reserved_chanctx = NULL; @@ -1376,6 +1384,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + ieee80211_check_fast_xmit_iface(sdata); + sdata->radar_required = 
sdata->reserved_radar_required; if (sdata->vif.bss_conf.chandef.width != diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index eeb0bbd69d98..3ea8b7de9633 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -1,4 +1,3 @@ - /* * mac80211 debugfs for wireless PHYs * @@ -18,172 +17,6 @@ #define DEBUGFS_FORMAT_BUFFER_SIZE 100 -#define TX_LATENCY_BIN_DELIMTER_C ',' -#define TX_LATENCY_BIN_DELIMTER_S "," -#define TX_LATENCY_BINS_DISABLED "enable(bins disabled)\n" -#define TX_LATENCY_DISABLED "disable\n" - - -/* - * Display if Tx latency statistics & bins are enabled/disabled - */ -static ssize_t sta_tx_latency_stat_read(struct file *file, - char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct ieee80211_local *local = file->private_data; - struct ieee80211_tx_latency_bin_ranges *tx_latency; - char *buf; - int bufsz, i, ret; - int pos = 0; - - rcu_read_lock(); - - tx_latency = rcu_dereference(local->tx_latency); - - if (tx_latency && tx_latency->n_ranges) { - bufsz = tx_latency->n_ranges * 15; - buf = kzalloc(bufsz, GFP_ATOMIC); - if (!buf) - goto err; - - for (i = 0; i < tx_latency->n_ranges; i++) - pos += scnprintf(buf + pos, bufsz - pos, "%d,", - tx_latency->ranges[i]); - pos += scnprintf(buf + pos, bufsz - pos, "\n"); - } else if (tx_latency) { - bufsz = sizeof(TX_LATENCY_BINS_DISABLED) + 1; - buf = kzalloc(bufsz, GFP_ATOMIC); - if (!buf) - goto err; - - pos += scnprintf(buf + pos, bufsz - pos, "%s\n", - TX_LATENCY_BINS_DISABLED); - } else { - bufsz = sizeof(TX_LATENCY_DISABLED) + 1; - buf = kzalloc(bufsz, GFP_ATOMIC); - if (!buf) - goto err; - - pos += scnprintf(buf + pos, bufsz - pos, "%s\n", - TX_LATENCY_DISABLED); - } - - rcu_read_unlock(); - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, pos); - kfree(buf); - - return ret; -err: - rcu_read_unlock(); - return -ENOMEM; -} - -/* - * Receive input from user regarding Tx latency statistics - * The input should indicate if Tx latency statistics and bins are - * 
enabled/disabled. - * If bins are enabled input should indicate the amount of different bins and - * their ranges. Each bin will count how many Tx frames transmitted within the - * appropriate latency. - * Legal input is: - * a) "enable(bins disabled)" - to enable only general statistics - * b) "a,b,c,d,...z" - to enable general statistics and bins, where all are - * numbers and a < b < c < d.. < z - * c) "disable" - disable all statistics - * NOTE: must configure Tx latency statistics bins before stations connected. - */ - -static ssize_t sta_tx_latency_stat_write(struct file *file, - const char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct ieee80211_local *local = file->private_data; - char buf[128] = {}; - char *bins = buf; - char *token; - int buf_size, i, alloc_size; - int prev_bin = 0; - int n_ranges = 0; - int ret = count; - struct ieee80211_tx_latency_bin_ranges *tx_latency; - - if (sizeof(buf) <= count) - return -EINVAL; - buf_size = count; - if (copy_from_user(buf, userbuf, buf_size)) - return -EFAULT; - - mutex_lock(&local->sta_mtx); - - /* cannot change config once we have stations */ - if (local->num_sta) - goto unlock; - - tx_latency = - rcu_dereference_protected(local->tx_latency, - lockdep_is_held(&local->sta_mtx)); - - /* disable Tx statistics */ - if (!strcmp(buf, TX_LATENCY_DISABLED)) { - if (!tx_latency) - goto unlock; - RCU_INIT_POINTER(local->tx_latency, NULL); - synchronize_rcu(); - kfree(tx_latency); - goto unlock; - } - - /* Tx latency already enabled */ - if (tx_latency) - goto unlock; - - if (strcmp(TX_LATENCY_BINS_DISABLED, buf)) { - /* check how many bins and between what ranges user requested */ - token = buf; - while (*token != '\0') { - if (*token == TX_LATENCY_BIN_DELIMTER_C) - n_ranges++; - token++; - } - n_ranges++; - } - - alloc_size = sizeof(struct ieee80211_tx_latency_bin_ranges) + - n_ranges * sizeof(u32); - tx_latency = kzalloc(alloc_size, GFP_ATOMIC); - if (!tx_latency) { - ret = -ENOMEM; - goto unlock; - } - 
tx_latency->n_ranges = n_ranges; - for (i = 0; i < n_ranges; i++) { /* setting bin ranges */ - token = strsep(&bins, TX_LATENCY_BIN_DELIMTER_S); - sscanf(token, "%d", &tx_latency->ranges[i]); - /* bins values should be in ascending order */ - if (prev_bin >= tx_latency->ranges[i]) { - ret = -EINVAL; - kfree(tx_latency); - goto unlock; - } - prev_bin = tx_latency->ranges[i]; - } - rcu_assign_pointer(local->tx_latency, tx_latency); - -unlock: - mutex_unlock(&local->sta_mtx); - - return ret; -} - -static const struct file_operations stats_tx_latency_ops = { - .write = sta_tx_latency_stat_write, - .read = sta_tx_latency_stat_read, - .open = simple_open, - .llseek = generic_file_llseek, -}; - int mac80211_format_buffer(char __user *userbuf, size_t count, loff_t *ppos, char *fmt, ...) { @@ -258,62 +91,66 @@ static const struct file_operations reset_ops = { }; #endif +static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +#define FLAG(F) [IEEE80211_HW_##F] = #F + FLAG(HAS_RATE_CONTROL), + FLAG(RX_INCLUDES_FCS), + FLAG(HOST_BROADCAST_PS_BUFFERING), + FLAG(SIGNAL_UNSPEC), + FLAG(SIGNAL_DBM), + FLAG(NEED_DTIM_BEFORE_ASSOC), + FLAG(SPECTRUM_MGMT), + FLAG(AMPDU_AGGREGATION), + FLAG(SUPPORTS_PS), + FLAG(PS_NULLFUNC_STACK), + FLAG(SUPPORTS_DYNAMIC_PS), + FLAG(MFP_CAPABLE), + FLAG(WANT_MONITOR_VIF), + FLAG(NO_AUTO_VIF), + FLAG(SW_CRYPTO_CONTROL), + FLAG(SUPPORT_FAST_XMIT), + FLAG(REPORTS_TX_ACK_STATUS), + FLAG(CONNECTION_MONITOR), + FLAG(QUEUE_CONTROL), + FLAG(SUPPORTS_PER_STA_GTK), + FLAG(AP_LINK_PS), + FLAG(TX_AMPDU_SETUP_IN_HW), + FLAG(SUPPORTS_RC_TABLE), + FLAG(P2P_DEV_ADDR_FOR_INTF), + FLAG(TIMING_BEACON_ONLY), + FLAG(SUPPORTS_HT_CCK_RATES), + FLAG(CHANCTX_STA_CSA), + FLAG(SUPPORTS_CLONED_SKBS), + FLAG(SINGLE_SCAN_ON_ALL_BANDS), + + /* keep last for the build bug below */ + (void *)0x1 +#undef FLAG +}; + static ssize_t hwflags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; - int mxln 
= 500; + size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS; + char *buf = kzalloc(bufsz, GFP_KERNEL); + char *pos = buf, *end = buf + bufsz - 1; ssize_t rv; - char *buf = kzalloc(mxln, GFP_KERNEL); - int sf = 0; /* how many written so far */ + int i; if (!buf) - return 0; - - sf += scnprintf(buf, mxln - sf, "0x%x\n", local->hw.flags); - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) - sf += scnprintf(buf + sf, mxln - sf, "HAS_RATE_CONTROL\n"); - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) - sf += scnprintf(buf + sf, mxln - sf, "RX_INCLUDES_FCS\n"); - if (local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING) - sf += scnprintf(buf + sf, mxln - sf, - "HOST_BCAST_PS_BUFFERING\n"); - if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE) - sf += scnprintf(buf + sf, mxln - sf, - "2GHZ_SHORT_SLOT_INCAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE) - sf += scnprintf(buf + sf, mxln - sf, - "2GHZ_SHORT_PREAMBLE_INCAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) - sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n"); - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) - sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n"); - if (local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC) - sf += scnprintf(buf + sf, mxln - sf, - "NEED_DTIM_BEFORE_ASSOC\n"); - if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT) - sf += scnprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n"); - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) - sf += scnprintf(buf + sf, mxln - sf, "AMPDU_AGGREGATION\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PS\n"); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) - sf += scnprintf(buf + sf, mxln - sf, "PS_NULLFUNC_STACK\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); - if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) - sf += scnprintf(buf + sf, mxln - sf, 
"MFP_CAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) - sf += scnprintf(buf + sf, mxln - sf, - "REPORTS_TX_ACK_STATUS\n"); - if (local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) - sf += scnprintf(buf + sf, mxln - sf, "CONNECTION_MONITOR\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PER_STA_GTK\n"); - if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) - sf += scnprintf(buf + sf, mxln - sf, "AP_LINK_PS\n"); - if (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW) - sf += scnprintf(buf + sf, mxln - sf, "TX_AMPDU_SETUP_IN_HW\n"); + return -ENOMEM; + + /* fail compilation if somebody adds or removes + * a flag without updating the name array above + */ + BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + + for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { + if (test_bit(i, local->hw.flags)) + pos += scnprintf(pos, end - pos, "%s", + hw_flag_names[i]); + } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); @@ -385,8 +222,8 @@ static const struct file_operations stats_ ##name## _ops = { \ .llseek = generic_file_llseek, \ }; -#define DEBUGFS_STATS_ADD(name, field) \ - debugfs_create_u32(#name, 0400, statsd, (u32 *) &field); +#define DEBUGFS_STATS_ADD(name) \ + debugfs_create_u32(#name, 0400, statsd, &local->name); #define DEBUGFS_DEVSTATS_ADD(name) \ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); @@ -421,60 +258,34 @@ void debugfs_hw_add(struct ieee80211_local *local) if (!statsd) return; - DEBUGFS_STATS_ADD(transmitted_fragment_count, - local->dot11TransmittedFragmentCount); - DEBUGFS_STATS_ADD(multicast_transmitted_frame_count, - local->dot11MulticastTransmittedFrameCount); - DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount); - DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount); - DEBUGFS_STATS_ADD(multiple_retry_count, - local->dot11MultipleRetryCount); - DEBUGFS_STATS_ADD(frame_duplicate_count, - 
local->dot11FrameDuplicateCount); - DEBUGFS_STATS_ADD(received_fragment_count, - local->dot11ReceivedFragmentCount); - DEBUGFS_STATS_ADD(multicast_received_frame_count, - local->dot11MulticastReceivedFrameCount); - DEBUGFS_STATS_ADD(transmitted_frame_count, - local->dot11TransmittedFrameCount); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS - DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop); - DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued); - DEBUGFS_STATS_ADD(tx_handlers_drop_unencrypted, - local->tx_handlers_drop_unencrypted); - DEBUGFS_STATS_ADD(tx_handlers_drop_fragment, - local->tx_handlers_drop_fragment); - DEBUGFS_STATS_ADD(tx_handlers_drop_wep, - local->tx_handlers_drop_wep); - DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc, - local->tx_handlers_drop_not_assoc); - DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port, - local->tx_handlers_drop_unauth_port); - DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop); - DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued); - DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc, - local->rx_handlers_drop_nullfunc); - DEBUGFS_STATS_ADD(rx_handlers_drop_defrag, - local->rx_handlers_drop_defrag); - DEBUGFS_STATS_ADD(rx_handlers_drop_short, - local->rx_handlers_drop_short); - DEBUGFS_STATS_ADD(tx_expand_skb_head, - local->tx_expand_skb_head); - DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned, - local->tx_expand_skb_head_cloned); - DEBUGFS_STATS_ADD(rx_expand_skb_head, - local->rx_expand_skb_head); - DEBUGFS_STATS_ADD(rx_expand_skb_head2, - local->rx_expand_skb_head2); - DEBUGFS_STATS_ADD(rx_handlers_fragments, - local->rx_handlers_fragments); - DEBUGFS_STATS_ADD(tx_status_drop, - local->tx_status_drop); + DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); + DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); + DEBUGFS_STATS_ADD(dot11FailedCount); + DEBUGFS_STATS_ADD(dot11RetryCount); + DEBUGFS_STATS_ADD(dot11MultipleRetryCount); + DEBUGFS_STATS_ADD(dot11FrameDuplicateCount); + 
DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); + DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); + DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); + DEBUGFS_STATS_ADD(tx_handlers_drop); + DEBUGFS_STATS_ADD(tx_handlers_queued); + DEBUGFS_STATS_ADD(tx_handlers_drop_wep); + DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); + DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port); + DEBUGFS_STATS_ADD(rx_handlers_drop); + DEBUGFS_STATS_ADD(rx_handlers_queued); + DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); + DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); + DEBUGFS_STATS_ADD(rx_handlers_drop_short); + DEBUGFS_STATS_ADD(tx_expand_skb_head); + DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); + DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); + DEBUGFS_STATS_ADD(rx_handlers_fragments); + DEBUGFS_STATS_ADD(tx_status_drop); #endif DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount); DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount); - - DEBUGFS_DEVSTATS_ADD(tx_latency); } diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 71ac1b5f4da5..e82bf1e9d7a8 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -95,28 +95,13 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn = atomic64_read(&key->u.ccmp.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn = atomic64_read(&key->u.aes_cmac.tx_pn); - len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn = atomic64_read(&key->u.aes_gmac.tx_pn); - len = 
scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", - (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), - (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); - break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn = atomic64_read(&key->u.gcmp.tx_pn); + pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index c68896adfa96..c09c0131bfa2 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -177,7 +177,6 @@ static ssize_t ieee80211_if_write_##name(struct file *file, \ IEEE80211_IF_FILE_R(name) /* common attributes */ -IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC); IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ], HEX); IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ], @@ -562,7 +561,6 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, static void add_common_files(struct ieee80211_sub_if_data *sdata) { - DEBUGFS_ADD(drop_unencrypted); DEBUGFS_ADD(rc_rateidx_mask_2ghz); DEBUGFS_ADD(rc_rateidx_mask_5ghz); DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz); @@ -725,6 +723,7 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) debugfs_remove_recursive(sdata->vif.debugfs_dir); sdata->vif.debugfs_dir = NULL; + sdata->debugfs.subdir_stations = NULL; } void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 94c70091bbd7..06d52935036d 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -29,8 +29,6 @@ static ssize_t sta_ ##name## _read(struct file *file, \ format_string, sta->field); \ } #define STA_READ_D(name, field) STA_READ(name, field, "%d\n") -#define STA_READ_U(name, field) STA_READ(name, field, "%u\n") -#define STA_READ_S(name, 
field) STA_READ(name, field, "%s\n") #define STA_OPS(name) \ static const struct file_operations sta_ ##name## _ops = { \ @@ -39,13 +37,6 @@ static const struct file_operations sta_ ##name## _ops = { \ .llseek = generic_file_llseek, \ } -#define STA_OPS_W(name) \ -static const struct file_operations sta_ ##name## _ops = { \ - .write = sta_##name##_write, \ - .open = simple_open, \ - .llseek = generic_file_llseek, \ -} - #define STA_OPS_RW(name) \ static const struct file_operations sta_ ##name## _ops = { \ .read = sta_##name##_read, \ @@ -59,10 +50,7 @@ static const struct file_operations sta_ ##name## _ops = { \ STA_OPS(name) STA_FILE(aid, sta.aid, D); -STA_FILE(dev, sdata->name, S); -STA_FILE(last_signal, last_signal, D); STA_FILE(last_ack_signal, last_ack_signal, D); -STA_FILE(beacon_loss_count, beacon_loss_count, D); static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) @@ -108,40 +96,6 @@ static ssize_t sta_num_ps_buf_frames_read(struct file *file, } STA_OPS(num_ps_buf_frames); -static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - return mac80211_format_buffer(userbuf, count, ppos, "%d\n", - jiffies_to_msecs(jiffies - sta->last_rx)); -} -STA_OPS(inactive_ms); - - -static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct timespec uptime; - struct tm result; - long connected_time_secs; - char buf[100]; - int res; - ktime_get_ts(&uptime); - connected_time_secs = uptime.tv_sec - sta->last_connected; - time_to_tm(connected_time_secs, 0, &result); - result.tm_year -= 70; - result.tm_mday -= 1; - res = scnprintf(buf, sizeof(buf), - "years - %ld\nmonths - %d\ndays - %d\nclock - %d:%d:%d\n\n", - result.tm_year, result.tm_mon, result.tm_mday, - result.tm_hour, result.tm_min, result.tm_sec); - return 
simple_read_from_buffer(userbuf, count, ppos, buf, res); -} -STA_OPS(connected_time); - - - static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -366,162 +320,6 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, } STA_OPS(vht_capa); -static ssize_t sta_current_tx_rate_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct rate_info rinfo; - u16 rate; - sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo); - rate = cfg80211_calculate_bitrate(&rinfo); - - return mac80211_format_buffer(userbuf, count, ppos, - "%d.%d MBit/s\n", - rate/10, rate%10); -} -STA_OPS(current_tx_rate); - -static ssize_t sta_last_rx_rate_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct rate_info rinfo; - u16 rate; - - sta_set_rate_info_rx(sta, &rinfo); - - rate = cfg80211_calculate_bitrate(&rinfo); - - return mac80211_format_buffer(userbuf, count, ppos, - "%d.%d MBit/s\n", - rate/10, rate%10); -} -STA_OPS(last_rx_rate); - -static int -sta_tx_latency_stat_header(struct ieee80211_tx_latency_bin_ranges *tx_latency, - char *buf, int pos, int bufsz) -{ - int i; - int range_count = tx_latency->n_ranges; - u32 *bin_ranges = tx_latency->ranges; - - pos += scnprintf(buf + pos, bufsz - pos, - "Station\t\t\tTID\tMax\tAvg"); - if (range_count) { - pos += scnprintf(buf + pos, bufsz - pos, - "\t<=%d", bin_ranges[0]); - for (i = 0; i < range_count - 1; i++) - pos += scnprintf(buf + pos, bufsz - pos, "\t%d-%d", - bin_ranges[i], bin_ranges[i+1]); - pos += scnprintf(buf + pos, bufsz - pos, - "\t%d<", bin_ranges[range_count - 1]); - } - - pos += scnprintf(buf + pos, bufsz - pos, "\n"); - - return pos; -} - -static int -sta_tx_latency_stat_table(struct ieee80211_tx_latency_bin_ranges *tx_lat_range, - struct ieee80211_tx_latency_stat *tx_lat, - char *buf, int pos, int 
bufsz, int tid) -{ - u32 avg = 0; - int j; - int bin_count = tx_lat->bin_count; - - pos += scnprintf(buf + pos, bufsz - pos, "\t\t\t%d", tid); - /* make sure you don't divide in 0 */ - if (tx_lat->counter) - avg = tx_lat->sum / tx_lat->counter; - - pos += scnprintf(buf + pos, bufsz - pos, "\t%d\t%d", - tx_lat->max, avg); - - if (tx_lat_range->n_ranges && tx_lat->bins) - for (j = 0; j < bin_count; j++) - pos += scnprintf(buf + pos, bufsz - pos, - "\t%d", tx_lat->bins[j]); - pos += scnprintf(buf + pos, bufsz - pos, "\n"); - - return pos; -} - -/* - * Output Tx latency statistics station && restart all statistics information - */ -static ssize_t sta_tx_latency_stat_read(struct file *file, - char __user *userbuf, - size_t count, loff_t *ppos) -{ - struct sta_info *sta = file->private_data; - struct ieee80211_local *local = sta->local; - struct ieee80211_tx_latency_bin_ranges *tx_latency; - char *buf; - int bufsz, ret, i; - int pos = 0; - - bufsz = 20 * IEEE80211_NUM_TIDS * - sizeof(struct ieee80211_tx_latency_stat); - buf = kzalloc(bufsz, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rcu_read_lock(); - - tx_latency = rcu_dereference(local->tx_latency); - - if (!sta->tx_lat) { - pos += scnprintf(buf + pos, bufsz - pos, - "Tx latency statistics are not enabled\n"); - goto unlock; - } - - pos = sta_tx_latency_stat_header(tx_latency, buf, pos, bufsz); - - pos += scnprintf(buf + pos, bufsz - pos, "%pM\n", sta->sta.addr); - for (i = 0; i < IEEE80211_NUM_TIDS; i++) - pos = sta_tx_latency_stat_table(tx_latency, &sta->tx_lat[i], - buf, pos, bufsz, i); -unlock: - rcu_read_unlock(); - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, pos); - kfree(buf); - - return ret; -} -STA_OPS(tx_latency_stat); - -static ssize_t sta_tx_latency_stat_reset_write(struct file *file, - const char __user *userbuf, - size_t count, loff_t *ppos) -{ - u32 *bins; - int bin_count; - struct sta_info *sta = file->private_data; - int i; - - if (!sta->tx_lat) - return -EINVAL; - - for (i = 0; i < 
IEEE80211_NUM_TIDS; i++) { - bins = sta->tx_lat[i].bins; - bin_count = sta->tx_lat[i].bin_count; - - sta->tx_lat[i].max = 0; - sta->tx_lat[i].sum = 0; - sta->tx_lat[i].counter = 0; - - if (bin_count) - memset(bins, 0, bin_count * sizeof(u32)); - } - - return count; -} -STA_OPS_W(tx_latency_stat_reset); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -564,32 +362,15 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(flags); DEBUGFS_ADD(num_ps_buf_frames); - DEBUGFS_ADD(inactive_ms); - DEBUGFS_ADD(connected_time); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); - DEBUGFS_ADD(dev); - DEBUGFS_ADD(last_signal); - DEBUGFS_ADD(beacon_loss_count); DEBUGFS_ADD(ht_capa); DEBUGFS_ADD(vht_capa); DEBUGFS_ADD(last_ack_signal); - DEBUGFS_ADD(current_tx_rate); - DEBUGFS_ADD(last_rx_rate); - DEBUGFS_ADD(tx_latency_stat); - DEBUGFS_ADD(tx_latency_stat_reset); - - DEBUGFS_ADD_COUNTER(rx_packets, rx_packets); - DEBUGFS_ADD_COUNTER(tx_packets, tx_packets); - DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes); - DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes); + DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates); DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments); - DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped); - DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments); DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count); - DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed); - DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count); if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index fdeda17b8dd2..32a2e707e222 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -146,7 +146,7 @@ static inline int drv_add_interface(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF) && + !ieee80211_hw_check(&local->hw, 
WANT_MONITOR_VIF) && !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; @@ -417,12 +417,13 @@ static inline int drv_get_stats(struct ieee80211_local *local, return ret; } -static inline void drv_get_tkip_seq(struct ieee80211_local *local, - u8 hw_key_idx, u32 *iv32, u16 *iv16) +static inline void drv_get_key_seq(struct ieee80211_local *local, + struct ieee80211_key *key, + struct ieee80211_key_seq *seq) { - if (local->ops->get_tkip_seq) - local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16); - trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16); + if (local->ops->get_key_seq) + local->ops->get_key_seq(&local->hw, &key->conf, seq); + trace_drv_get_key_seq(local, &key->conf); } static inline int drv_set_frag_threshold(struct ieee80211_local *local, @@ -941,13 +942,13 @@ static inline void drv_set_rekey_data(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline void drv_rssi_callback(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - const enum ieee80211_rssi_event event) +static inline void drv_event_callback(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_event *event) { - trace_drv_rssi_callback(local, sdata, event); - if (local->ops->rssi_callback) - local->ops->rssi_callback(&local->hw, &sdata->vif, event); + trace_drv_event_callback(local, sdata, event); + if (local->ops->event_callback) + local->ops->event_callback(&local->hw, &sdata->vif, event); trace_drv_return_void(local); } @@ -1367,4 +1368,16 @@ drv_tdls_recv_channel_switch(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_wake_tx_queue(struct ieee80211_local *local, + struct txq_info *txq) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_wake_tx_queue(local, sdata, txq); + local->ops->wake_tx_queue(&local->hw, &txq->txq); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git 
a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 52bcea6ad9e8..188faab11c24 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -38,7 +38,7 @@ static void ieee80211_get_ringparam(struct net_device *dev, static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", - "tx_packets", "tx_bytes", "tx_fragments", + "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "beacon_loss", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", @@ -87,7 +87,6 @@ static void ieee80211_get_stats(struct net_device *dev, \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ - data[i++] += sta->tx_fragments; \ data[i++] += sta->tx_filtered_count; \ data[i++] += sta->tx_retry_failed; \ data[i++] += sta->tx_retry_count; \ diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index ff630be2ca75..7a76ce639d58 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -252,8 +252,6 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, break; } - if (bw != sta->sta.bandwidth) - changed = true; sta->sta.bandwidth = bw; sta->cur_max_bandwidth = diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index b606b53a49a7..7f72bc9bae2e 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -146,6 +146,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, csa_settings->chandef.chan->center_freq); presp->csa_counter_offsets[0] = (pos - presp->head); *pos++ = csa_settings->count; + presp->csa_current_counter = csa_settings->count; } /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */ @@ -188,6 +189,16 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, */ pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap, chandef, 0); + + /* add VHT capability and information IEs */ + if (chandef->width != NL80211_CHAN_WIDTH_20 && + chandef->width != NL80211_CHAN_WIDTH_40 && + 
sband->vht_cap.vht_supported) { + pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, + sband->vht_cap.cap); + pos = ieee80211_ie_build_vht_oper(pos, &sband->vht_cap, + chandef); + } } if (local->hw.queues >= IEEE80211_NUM_ACS) @@ -249,8 +260,6 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, if (presp) kfree_rcu(presp, rcu_head); - sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; - /* make a copy of the chandef, it could be modified below. */ chandef = *req_chandef; chan = chandef.chan; @@ -417,6 +426,11 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, NL80211_CHAN_WIDTH_20_NOHT); chandef.width = sdata->u.ibss.chandef.width; break; + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_160: + chandef = sdata->u.ibss.chandef; + chandef.chan = cbss->channel; + break; default: /* fall back to 20 MHz for unsupported modes */ cfg80211_chandef_create(&chandef, cbss->channel, @@ -470,22 +484,19 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, struct beacon_data *presp, *old_presp; struct cfg80211_bss *cbss; const struct cfg80211_bss_ies *ies; - u16 capability; + u16 capability = 0; u64 tsf; int ret = 0; sdata_assert_lock(sdata); - capability = WLAN_CAPABILITY_IBSS; - if (ifibss->privacy) - capability |= WLAN_CAPABILITY_PRIVACY; + capability = WLAN_CAPABILITY_PRIVACY; cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, - ifibss->ssid_len, WLAN_CAPABILITY_IBSS | - WLAN_CAPABILITY_PRIVACY, - capability); + ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, + IEEE80211_PRIVACY(ifibss->privacy)); if (WARN_ON(!cbss)) { ret = -EINVAL; @@ -525,23 +536,17 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct cfg80211_bss *cbss; int err, changed = 0; - u16 capability; sdata_assert_lock(sdata); /* update cfg80211 bss information with the new channel */ if 
(!is_zero_ether_addr(ifibss->bssid)) { - capability = WLAN_CAPABILITY_IBSS; - - if (ifibss->privacy) - capability |= WLAN_CAPABILITY_PRIVACY; - cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, - ifibss->ssid_len, WLAN_CAPABILITY_IBSS | - WLAN_CAPABILITY_PRIVACY, - capability); + ifibss->ssid_len, + IEEE80211_BSS_TYPE_IBSS, + IEEE80211_PRIVACY(ifibss->privacy)); /* XXX: should not really modify cfg80211 data */ if (cbss) { cbss->channel = sdata->csa_chandef.chan; @@ -682,19 +687,13 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; - u16 capability; if (!is_zero_ether_addr(ifibss->bssid)) { - capability = WLAN_CAPABILITY_IBSS; - - if (ifibss->privacy) - capability |= WLAN_CAPABILITY_PRIVACY; - cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->chandef.chan, ifibss->bssid, ifibss->ssid, - ifibss->ssid_len, WLAN_CAPABILITY_IBSS | - WLAN_CAPABILITY_PRIVACY, - capability); + ifibss->ssid_len, + IEEE80211_BSS_TYPE_IBSS, + IEEE80211_PRIVACY(ifibss->privacy)); if (cbss) { cfg80211_unlink_bss(local->hw.wiphy, cbss); @@ -980,110 +979,143 @@ static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata, mgmt->sa, sdata->u.ibss.bssid, NULL, 0, 0, 0); } -static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, size_t len, - struct ieee80211_rx_status *rx_status, - struct ieee802_11_elems *elems) +static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status, + struct ieee802_11_elems *elems, + struct ieee80211_channel *channel) { - struct ieee80211_local *local = sdata->local; - struct cfg80211_bss *cbss; - struct ieee80211_bss *bss; struct sta_info *sta; - struct ieee80211_channel *channel; - u64 beacon_timestamp, rx_timestamp; - u32 supp_rates = 0; enum ieee80211_band band = 
rx_status->band; enum nl80211_bss_scan_width scan_width; + struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool rates_updated = false; + u32 supp_rates = 0; - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); - if (!channel) + if (sdata->vif.type != NL80211_IFTYPE_ADHOC) return; - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && - ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid)) { + if (!ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid)) + return; - rcu_read_lock(); - sta = sta_info_get(sdata, mgmt->sa); - - if (elems->supp_rates) { - supp_rates = ieee80211_sta_get_rates(sdata, elems, - band, NULL); - if (sta) { - u32 prev_rates; - - prev_rates = sta->sta.supp_rates[band]; - /* make sure mandatory rates are always added */ - scan_width = NL80211_BSS_CHAN_WIDTH_20; - if (rx_status->flag & RX_FLAG_5MHZ) - scan_width = NL80211_BSS_CHAN_WIDTH_5; - if (rx_status->flag & RX_FLAG_10MHZ) - scan_width = NL80211_BSS_CHAN_WIDTH_10; - - sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(sband, - scan_width); - if (sta->sta.supp_rates[band] != prev_rates) { - ibss_dbg(sdata, - "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", - sta->sta.addr, prev_rates, - sta->sta.supp_rates[band]); - rates_updated = true; - } - } else { - rcu_read_unlock(); - sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid, - mgmt->sa, supp_rates); + rcu_read_lock(); + sta = sta_info_get(sdata, mgmt->sa); + + if (elems->supp_rates) { + supp_rates = ieee80211_sta_get_rates(sdata, elems, + band, NULL); + if (sta) { + u32 prev_rates; + + prev_rates = sta->sta.supp_rates[band]; + /* make sure mandatory rates are always added */ + scan_width = NL80211_BSS_CHAN_WIDTH_20; + if (rx_status->flag & RX_FLAG_5MHZ) + scan_width = NL80211_BSS_CHAN_WIDTH_5; + if (rx_status->flag & RX_FLAG_10MHZ) + scan_width = NL80211_BSS_CHAN_WIDTH_10; + + sta->sta.supp_rates[band] = supp_rates | + 
ieee80211_mandatory_rates(sband, scan_width); + if (sta->sta.supp_rates[band] != prev_rates) { + ibss_dbg(sdata, + "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n", + sta->sta.addr, prev_rates, + sta->sta.supp_rates[band]); + rates_updated = true; } + } else { + rcu_read_unlock(); + sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid, + mgmt->sa, supp_rates); } + } + + if (sta && !sta->sta.wme && + elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS) { + sta->sta.wme = true; + ieee80211_check_fast_xmit(sta); + } - if (sta && elems->wmm_info) - sta->sta.wme = true; - - if (sta && elems->ht_operation && elems->ht_cap_elem && - sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && - sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_5 && - sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_10) { - /* we both use HT */ - struct ieee80211_ht_cap htcap_ie; - struct cfg80211_chan_def chandef; - - ieee80211_ht_oper_to_chandef(channel, - elems->ht_operation, - &chandef); - - memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); - - /* - * fall back to HT20 if we don't use or use - * the other extension channel - */ - if (chandef.center_freq1 != - sdata->u.ibss.chandef.center_freq1) - htcap_ie.cap_info &= - cpu_to_le16(~IEEE80211_HT_CAP_SUP_WIDTH_20_40); - - rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap( - sdata, sband, &htcap_ie, sta); + if (sta && elems->ht_operation && elems->ht_cap_elem && + sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && + sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_5 && + sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_10) { + /* we both use HT */ + struct ieee80211_ht_cap htcap_ie; + struct cfg80211_chan_def chandef; + enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; + + ieee80211_ht_oper_to_chandef(channel, + elems->ht_operation, + &chandef); + + memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie)); + rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, + &htcap_ie, 
+ sta); + + if (elems->vht_operation && elems->vht_cap_elem && + sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20 && + sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_40) { + /* we both use VHT */ + struct ieee80211_vht_cap cap_ie; + struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; + + ieee80211_vht_oper_to_chandef(channel, + elems->vht_operation, + &chandef); + memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, + &cap_ie, sta); + if (memcmp(&cap, &sta->sta.vht_cap, sizeof(cap))) + rates_updated |= true; } - if (sta && rates_updated) { - u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED; - u8 rx_nss = sta->sta.rx_nss; + if (bw != sta->sta.bandwidth) + rates_updated |= true; - /* Force rx_nss recalculation */ - sta->sta.rx_nss = 0; - rate_control_rate_init(sta); - if (sta->sta.rx_nss != rx_nss) - changed |= IEEE80211_RC_NSS_CHANGED; + if (!cfg80211_chandef_compatible(&sdata->u.ibss.chandef, + &chandef)) + WARN_ON_ONCE(1); + } - drv_sta_rc_update(local, sdata, &sta->sta, changed); - } + if (sta && rates_updated) { + u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED; + u8 rx_nss = sta->sta.rx_nss; - rcu_read_unlock(); + /* Force rx_nss recalculation */ + sta->sta.rx_nss = 0; + rate_control_rate_init(sta); + if (sta->sta.rx_nss != rx_nss) + changed |= IEEE80211_RC_NSS_CHANGED; + + drv_sta_rc_update(local, sdata, &sta->sta, changed); } + rcu_read_unlock(); +} + +static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status, + struct ieee802_11_elems *elems) +{ + struct ieee80211_local *local = sdata->local; + struct cfg80211_bss *cbss; + struct ieee80211_bss *bss; + struct ieee80211_channel *channel; + u64 beacon_timestamp, rx_timestamp; + u32 supp_rates = 0; + enum ieee80211_band band = rx_status->band; + + channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + if (!channel) + return; + + 
ieee80211_update_sta_info(sdata, mgmt, len, rx_status, elems, channel); + bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, channel); if (!bss) @@ -1273,7 +1305,7 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata) scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len, - NULL, scan_width); + NULL, 0, scan_width); } static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) @@ -1304,14 +1336,82 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata) if (ifibss->privacy) capability |= WLAN_CAPABILITY_PRIVACY; - else - sdata->drop_unencrypted = 0; __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int, &ifibss->chandef, ifibss->basic_rates, capability, 0, true); } +static unsigned ibss_setup_channels(struct wiphy *wiphy, + struct ieee80211_channel **channels, + unsigned int channels_max, + u32 center_freq, u32 width) +{ + struct ieee80211_channel *chan = NULL; + unsigned int n_chan = 0; + u32 start_freq, end_freq, freq; + + if (width <= 20) { + start_freq = center_freq; + end_freq = center_freq; + } else { + start_freq = center_freq - width / 2 + 10; + end_freq = center_freq + width / 2 - 10; + } + + for (freq = start_freq; freq <= end_freq; freq += 20) { + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + continue; + if (n_chan >= channels_max) + return n_chan; + + channels[n_chan] = chan; + n_chan++; + } + + return n_chan; +} + +static unsigned int +ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + struct ieee80211_channel **channels, + unsigned int channels_max) +{ + unsigned int n_chan = 0; + u32 width, cf1, cf2 = 0; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_40: + width = 40; + break; + case NL80211_CHAN_WIDTH_80P80: + cf2 = chandef->center_freq2; + /* fall through */ + case NL80211_CHAN_WIDTH_80: + width = 80; + break; + 
case NL80211_CHAN_WIDTH_160: + width = 160; + break; + default: + width = 20; + break; + } + + cf1 = chandef->center_freq1; + + n_chan = ibss_setup_channels(wiphy, channels, channels_max, cf1, width); + + if (cf2) + n_chan += ibss_setup_channels(wiphy, &channels[n_chan], + channels_max - n_chan, cf2, + width); + + return n_chan; +} + /* * This function is called with state == IEEE80211_IBSS_MLME_SEARCH */ @@ -1325,7 +1425,6 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) const u8 *bssid = NULL; enum nl80211_bss_scan_width scan_width; int active_ibss; - u16 capability; sdata_assert_lock(sdata); @@ -1335,9 +1434,6 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) if (active_ibss) return; - capability = WLAN_CAPABILITY_IBSS; - if (ifibss->privacy) - capability |= WLAN_CAPABILITY_PRIVACY; if (ifibss->fixed_bssid) bssid = ifibss->bssid; if (ifibss->fixed_channel) @@ -1346,8 +1442,8 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) bssid = ifibss->bssid; cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid, ifibss->ssid, ifibss->ssid_len, - WLAN_CAPABILITY_IBSS | WLAN_CAPABILITY_PRIVACY, - capability); + IEEE80211_BSS_TYPE_IBSS, + IEEE80211_PRIVACY(ifibss->privacy)); if (cbss) { struct ieee80211_bss *bss; @@ -1381,11 +1477,18 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) /* Selected IBSS not found in current scan results - try to scan */ if (time_after(jiffies, ifibss->last_scan_completed + IEEE80211_SCAN_INTERVAL)) { + struct ieee80211_channel *channels[8]; + unsigned int num; + sdata_info(sdata, "Trigger new scan to find an IBSS to join\n"); + num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy, + &ifibss->chandef, + channels, + ARRAY_SIZE(channels)); scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef); ieee80211_request_ibss_scan(sdata, ifibss->ssid, - ifibss->ssid_len, chan, + ifibss->ssid_len, channels, num, scan_width); } else { int interval 
= IEEE80211_SCAN_INTERVAL; @@ -1742,7 +1845,7 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) ieee80211_ibss_disconnect(sdata); ifibss->ssid_len = 0; - memset(ifibss->bssid, 0, ETH_ALEN); + eth_zero_addr(ifibss->bssid); /* remove beacon */ kfree(sdata->u.ibss.ie); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8d53d65bd2ab..b12f61507f9f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -26,6 +26,7 @@ #include <linux/etherdevice.h> #include <linux/leds.h> #include <linux/idr.h> +#include <linux/rhashtable.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> @@ -180,8 +181,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; /** * enum ieee80211_packet_rx_flags - packet RX flags - * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed - * (incl. multicast frames) * @IEEE80211_RX_FRAGMENTED: fragmented frame * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed @@ -191,7 +190,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { - IEEE80211_RX_RA_MATCH = BIT(1), IEEE80211_RX_FRAGMENTED = BIT(2), IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), @@ -204,6 +202,8 @@ enum ieee80211_packet_rx_flags { * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). + * @IEEE80211_RX_REORDER_TIMER: this frame is released by the + * reorder buffer timeout timer, not the normal RX path * * These flags are used across handling multiple interfaces * for a single frame. 
@@ -211,6 +211,7 @@ enum ieee80211_packet_rx_flags { enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), + IEEE80211_RX_REORDER_TIMER = BIT(2), }; struct ieee80211_rx_data { @@ -324,12 +325,6 @@ struct mesh_preq_queue { u8 flags; }; -#if HZ/100 == 0 -#define IEEE80211_ROC_MIN_LEFT 1 -#else -#define IEEE80211_ROC_MIN_LEFT (HZ/100) -#endif - struct ieee80211_roc_work { struct list_head list; struct list_head dependents; @@ -724,7 +719,6 @@ struct ieee80211_if_mesh { * enum ieee80211_sub_if_data_flags - virtual interface flags * * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets - * @IEEE80211_SDATA_PROMISC: interface is promisc * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between * associated stations and deliver multicast frames both @@ -734,7 +728,6 @@ struct ieee80211_if_mesh { */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), - IEEE80211_SDATA_PROMISC = BIT(1), IEEE80211_SDATA_OPERATING_GMODE = BIT(2), IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), @@ -810,6 +803,19 @@ struct mac80211_qos_map { struct rcu_head rcu_head; }; +enum txq_info_flags { + IEEE80211_TXQ_STOP, + IEEE80211_TXQ_AMPDU, +}; + +struct txq_info { + struct sk_buff_head queue; + unsigned long flags; + + /* keep last! 
*/ + struct ieee80211_txq txq; +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -830,8 +836,6 @@ struct ieee80211_sub_if_data { unsigned long state; - int drop_unencrypted; - char name[IFNAMSIZ]; /* Fragment table for host-based reassembly */ @@ -854,6 +858,7 @@ struct ieee80211_sub_if_data { bool control_port_no_encrypt; int encrypt_headroom; + atomic_t txqs_len[IEEE80211_NUM_ACS]; struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS]; struct mac80211_qos_map __rcu *qos_map; @@ -1030,7 +1035,6 @@ enum queue_stop_reason { #ifdef CONFIG_MAC80211_LEDS struct tpt_led_trigger { - struct led_trigger trig; char name[32]; const struct ieee80211_tpt_blink *blink_table; unsigned int blink_table_len; @@ -1042,24 +1046,6 @@ struct tpt_led_trigger { }; #endif -/* - * struct ieee80211_tx_latency_bin_ranges - Tx latency statistics bins ranges - * - * Measuring Tx latency statistics. Counts how many Tx frames transmitted in a - * certain latency range (in Milliseconds). Each station that uses these - * ranges will have bins to count the amount of frames received in that range. - * The user can configure the ranges via debugfs. - * If ranges is NULL then Tx latency statistics bins are disabled for all - * stations. - * - * @n_ranges: number of ranges that are taken in account - * @ranges: the ranges that the user requested or NULL if disabled. - */ -struct ieee80211_tx_latency_bin_ranges { - int n_ranges; - u32 ranges[]; -}; - /** * mac80211 scan flags - currently active scan mode * @@ -1207,23 +1193,17 @@ struct ieee80211_local { spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; - struct sta_info __rcu *sta_hash[STA_HASH_SIZE]; + struct rhashtable sta_hash; struct timer_list sta_cleanup; int sta_generation; - /* - * Tx latency statistics parameters for all stations. - * Can enable via debugfs (NULL when disabled). 
- */ - struct ieee80211_tx_latency_bin_ranges __rcu *tx_latency; - struct sk_buff_head pending[IEEE80211_MAX_QUEUES]; struct tasklet_struct tx_pending_tasklet; atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES]; - /* number of interfaces with corresponding IFF_ flags */ - atomic_t iff_allmultis, iff_promiscs; + /* number of interfaces with allmulti RX */ + atomic_t iff_allmultis; struct rate_control_ref *rate_ctrl; @@ -1275,6 +1255,15 @@ struct ieee80211_local { struct list_head chanctx_list; struct mutex chanctx_mtx; +#ifdef CONFIG_MAC80211_LEDS + struct led_trigger tx_led, rx_led, assoc_led, radio_led; + struct led_trigger tpt_led; + atomic_t tx_led_active, rx_led_active, assoc_led_active; + atomic_t radio_led_active, tpt_led_active; + struct tpt_led_trigger *tpt_led_trigger; +#endif + +#ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* SNMP counters */ /* dot11CountersTable */ u32 dot11TransmittedFragmentCount; @@ -1287,19 +1276,9 @@ struct ieee80211_local { u32 dot11MulticastReceivedFrameCount; u32 dot11TransmittedFrameCount; -#ifdef CONFIG_MAC80211_LEDS - struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led; - struct tpt_led_trigger *tpt_led_trigger; - char tx_led_name[32], rx_led_name[32], - assoc_led_name[32], radio_led_name[32]; -#endif - -#ifdef CONFIG_MAC80211_DEBUG_COUNTERS /* TX/RX handler statistics */ unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; - unsigned int tx_handlers_drop_unencrypted; - unsigned int tx_handlers_drop_fragment; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; unsigned int tx_handlers_drop_unauth_port; @@ -1310,8 +1289,7 @@ struct ieee80211_local { unsigned int rx_handlers_drop_short; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; - unsigned int rx_expand_skb_head; - unsigned int rx_expand_skb_head2; + unsigned int rx_expand_skb_head_defrag; unsigned int rx_handlers_fragments; unsigned int tx_status_drop; #define I802_DEBUG_INC(c) (c)++ @@ -1476,6 +1454,10 @@ static 
inline struct ieee80211_local *hw_to_local( return container_of(hw, struct ieee80211_local, hw); } +static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq) +{ + return container_of(txq, struct txq_info, txq); +} static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) { @@ -1568,7 +1550,8 @@ int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata); void ieee80211_scan_work(struct work_struct *work); int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, - struct ieee80211_channel *chan, + struct ieee80211_channel **channels, + unsigned int n_channels, enum nl80211_bss_scan_width scan_width); int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req); @@ -1617,6 +1600,7 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, int ieee80211_iface_init(void); void ieee80211_iface_exit(void); int ieee80211_if_add(struct ieee80211_local *local, const char *name, + unsigned char name_assign_type, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params); int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, @@ -1657,6 +1641,11 @@ struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); +void ieee80211_check_fast_xmit(struct sta_info *sta); +void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); +void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); +void ieee80211_clear_fast_xmit(struct sta_info *sta); + /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap *ht_cap); @@ -1784,7 +1773,8 @@ void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int ke gfp_t gfp); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify); -void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff 
*skb); +void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, @@ -1929,6 +1919,9 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) return true; } +void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct txq_info *txq, int tid); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, @@ -1967,10 +1960,6 @@ int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata, void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata); -size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, - const u8 *ids, int n_ids, - const u8 *after_ric, int n_after_ric, - size_t offset); size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset); u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap); @@ -1979,6 +1968,8 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 prot_mode); u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); +u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, + const struct cfg80211_chan_def *chandef); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); @@ -1994,6 +1985,9 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); +void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan, + const struct ieee80211_vht_operation *oper, + struct 
cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 81a27516813e..553ac6dd4867 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -338,7 +338,7 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, if ((iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_P2P_GO && iftype != NL80211_IFTYPE_MESH_POINT) || - !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) { + !ieee80211_hw_check(&sdata->local->hw, QUEUE_CONTROL)) { sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE; return 0; } @@ -378,7 +378,7 @@ static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata) int i; for (i = 0; i < IEEE80211_NUM_ACS; i++) { - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE; else if (local->hw.queues >= IEEE80211_NUM_ACS) sdata->vif.hw_queue[i] = i; @@ -393,7 +393,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; int ret; - if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return 0; ASSERT_RTNL(); @@ -454,7 +454,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF)) + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return; ASSERT_RTNL(); @@ -522,6 +522,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; + + mutex_lock(&local->key_mtx); + sdata->crypto_tx_tailroom_needed_cnt += + master->crypto_tx_tailroom_needed_cnt; + mutex_unlock(&local->key_mtx); + break; } case NL80211_IFTYPE_AP: @@ -697,9 +703,6 @@ int 
ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_inc(&local->iff_allmultis); - if (sdata->flags & IEEE80211_SDATA_PROMISC) - atomic_inc(&local->iff_promiscs); - if (coming_up) local->open_count++; @@ -819,21 +822,20 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * This is relevant only in WDS mode, in all other modes we've - * already removed all stations when disconnecting or similar, - * so warn otherwise. + * In WDS mode a station must exist here and be flushed, for + * AP_VLANs stations may exist since there's nothing else that + * would have removed them, but in other modes there shouldn't + * be any stations. */ flushed = sta_info_flush(sdata); - WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || - (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)); + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) || + (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1))); - /* don't count this interface for promisc/allmulti while it is down */ + /* don't count this interface for allmulti while it is down */ if (sdata->flags & IEEE80211_SDATA_ALLMULTI) atomic_dec(&local->iff_allmultis); - if (sdata->flags & IEEE80211_SDATA_PROMISC) - atomic_dec(&local->iff_promiscs); - if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll--; local->fif_probe_req--; @@ -969,6 +971,13 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + if (sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(sdata->vif.txq); + + ieee80211_purge_tx_queue(&local->hw, &txqi->queue); + atomic_set(&sdata->txqs_len[txqi->txq.ac], 0); + } + if (local->open_count == 0) ieee80211_clear_tx_pending(local); @@ -1040,12 +1049,10 @@ 
static void ieee80211_set_multicast_list(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - int allmulti, promisc, sdata_allmulti, sdata_promisc; + int allmulti, sdata_allmulti; allmulti = !!(dev->flags & IFF_ALLMULTI); - promisc = !!(dev->flags & IFF_PROMISC); sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI); - sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC); if (allmulti != sdata_allmulti) { if (dev->flags & IFF_ALLMULTI) @@ -1055,13 +1062,6 @@ static void ieee80211_set_multicast_list(struct net_device *dev) sdata->flags ^= IEEE80211_SDATA_ALLMULTI; } - if (promisc != sdata_promisc) { - if (dev->flags & IFF_PROMISC) - atomic_inc(&local->iff_promiscs); - else - atomic_dec(&local->iff_promiscs); - sdata->flags ^= IEEE80211_SDATA_PROMISC; - } spin_lock_bh(&local->filter_lock); __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len); spin_unlock_bh(&local->filter_lock); @@ -1102,6 +1102,35 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev, return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } +static struct rtnl_link_stats64 * +ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats; + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + tstats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + stats->rx_packets += rx_packets; + stats->tx_packets += tx_packets; + stats->rx_bytes += rx_bytes; + stats->tx_bytes += tx_bytes; + } + + return stats; +} + static const struct net_device_ops ieee80211_dataif_ops = { .ndo_open = ieee80211_open, .ndo_stop = ieee80211_stop, 
@@ -1111,6 +1140,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { .ndo_change_mtu = ieee80211_change_mtu, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_netdev_select_queue, + .ndo_get_stats64 = ieee80211_get_stats64, }; static u16 ieee80211_monitor_select_queue(struct net_device *dev, @@ -1144,14 +1174,21 @@ static const struct net_device_ops ieee80211_monitorif_ops = { .ndo_change_mtu = ieee80211_change_mtu, .ndo_set_mac_address = ieee80211_change_mac, .ndo_select_queue = ieee80211_monitor_select_queue, + .ndo_get_stats64 = ieee80211_get_stats64, }; +static void ieee80211_if_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ieee80211_if_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->netdev_ops = &ieee80211_dataif_ops; - dev->destructor = free_netdev; + dev->destructor = ieee80211_if_free; } static void ieee80211_iface_work(struct work_struct *work) @@ -1508,7 +1545,6 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, } /* reset some values that shouldn't be kept across type changes */ - sdata->drop_unencrypted = 0; if (type == NL80211_IFTYPE_STATION) sdata->u.mgd.use_4addr = false; @@ -1550,7 +1586,7 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: - if (local->hw.flags & IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF) { + if (ieee80211_hw_check(&local->hw, P2P_DEV_ADDR_FOR_INTF)) { list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) continue; @@ -1649,11 +1685,13 @@ static void ieee80211_assign_perm_addr(struct ieee80211_local *local, } int ieee80211_if_add(struct ieee80211_local *local, const char *name, + unsigned char name_assign_type, struct wireless_dev **new_wdev, enum nl80211_iftype type, struct vif_params *params) { struct net_device *ndev = NULL; struct 
ieee80211_sub_if_data *sdata = NULL; + struct txq_info *txqi; int ret, i; int txqs = 1; @@ -1673,16 +1711,30 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ieee80211_assign_perm_addr(local, wdev->address, type); memcpy(sdata->vif.addr, wdev->address, ETH_ALEN); } else { + int size = ALIGN(sizeof(*sdata) + local->hw.vif_data_size, + sizeof(void *)); + int txq_size = 0; + + if (local->ops->wake_tx_queue) + txq_size += sizeof(struct txq_info) + + local->hw.txq_data_size; + if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; - ndev = alloc_netdev_mqs(sizeof(*sdata) + local->hw.vif_data_size, - name, NET_NAME_UNKNOWN, + ndev = alloc_netdev_mqs(size + txq_size, + name, name_assign_type, ieee80211_if_setup, txqs, 1); if (!ndev) return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); + ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!ndev->tstats) { + free_netdev(ndev); + return -ENOMEM; + } + ndev->needed_headroom = local->tx_headroom + 4*6 /* four MAC addresses */ + 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */ @@ -1711,6 +1763,11 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN); memcpy(sdata->name, ndev->name, IFNAMSIZ); + if (txq_size) { + txqi = netdev_priv(ndev) + size; + ieee80211_init_tx_queue(sdata, NULL, txqi, 0); + } + sdata->dev = ndev; } @@ -1806,10 +1863,6 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata) ieee80211_teardown_sdata(sdata); } -/* - * Remove all interfaces, may only be called at hardware unregistration - * time because it doesn't do RCU-safe list removals. 
- */ void ieee80211_remove_interfaces(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *tmp; @@ -1818,14 +1871,21 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) ASSERT_RTNL(); - /* - * Close all AP_VLAN interfaces first, as otherwise they - * might be closed while the AP interface they belong to - * is closed, causing unregister_netdevice_many() to crash. + /* Before destroying the interfaces, make sure they're all stopped so + * that the hardware is stopped. Otherwise, the driver might still be + * iterating the interfaces during the shutdown, e.g. from a worker + * or from RX processing or similar, and if it does so (using atomic + * iteration) while we're manipulating the list, the iteration will + * crash. + * + * After this, the hardware should be stopped and the driver should + * have stopped all of its activities, so that we can do RCU-unaware + * manipulations of the interface list below. */ - list_for_each_entry(sdata, &local->interfaces, list) - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - dev_close(sdata->dev); + cfg80211_shutdown_all_interfaces(local->hw.wiphy); + + WARN(local->open_count, "%s: open count remains %d\n", + wiphy_name(local->hw.wiphy), local->open_count); mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 0825d76edcfc..b22df3a79a41 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -58,6 +58,25 @@ static void assert_key_lock(struct ieee80211_local *local) lockdep_assert_held(&local->key_mtx); } +static void +update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) +{ + struct ieee80211_sub_if_data *vlan; + + if (sdata->vif.type != NL80211_IFTYPE_AP) + return; + + /* crypto_tx_tailroom_needed_cnt is protected by this */ + assert_key_lock(sdata->local); + + rcu_read_lock(); + + list_for_each_entry_rcu(vlan, &sdata->u.ap.vlans, u.vlan.list) + 
vlan->crypto_tx_tailroom_needed_cnt += delta; + + rcu_read_unlock(); +} + static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) { /* @@ -79,6 +98,10 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net */ + assert_key_lock(sdata->local); + + update_vlan_tailroom_need_count(sdata, 1); + if (!sdata->crypto_tx_tailroom_needed_cnt++) { /* * Flush all XMIT packets currently using HW encryption or no @@ -88,6 +111,17 @@ static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) } } +static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, + int delta) +{ + assert_key_lock(sdata->local); + + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta); + + update_vlan_tailroom_need_count(sdata, -delta); + sdata->crypto_tx_tailroom_needed_cnt -= delta; +} + static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; @@ -120,7 +154,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) * is supported; if not, return. 
*/ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && - !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)) + !ieee80211_hw_check(&key->local->hw, SUPPORTS_PER_STA_GTK)) goto out_unsupported; if (sta && !sta->uploaded) @@ -144,7 +178,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) - sdata->crypto_tx_tailroom_needed_cnt--; + decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)); @@ -174,7 +208,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (key->local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL) + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; return 0; default: @@ -229,6 +263,7 @@ static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, if (uni) { rcu_assign_pointer(sdata->default_unicast_key, key); + ieee80211_check_fast_xmit_iface(sdata); drv_set_default_unicast_key(sdata->local, sdata, idx); } @@ -298,6 +333,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); sta->ptk_idx = idx; + ieee80211_check_fast_xmit(sta); } else { rcu_assign_pointer(sta->gtk[idx], new); sta->gtk_idx = idx; @@ -483,15 +519,18 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, break; default: if (cs) { - size_t len = (seq_len > MAX_PN_LEN) ? 
- MAX_PN_LEN : seq_len; + if (seq_len && seq_len != cs->pn_len) { + kfree(key); + return ERR_PTR(-EINVAL); + } key->conf.iv_len = cs->hdr_len; key->conf.icv_len = cs->mic_len; for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) - for (j = 0; j < len; j++) + for (j = 0; j < seq_len; j++) key->u.gen.rx_pn[i][j] = - seq[len - j - 1]; + seq[seq_len - j - 1]; + key->flags |= KEY_FLAG_CIPHER_SCHEME; } } memcpy(key->conf.key, key_data, key_len); @@ -540,7 +579,7 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key, schedule_delayed_work(&sdata->dec_tailroom_needed_wk, HZ/2); } else { - sdata->crypto_tx_tailroom_needed_cnt--; + decrease_tailroom_need_count(sdata, 1); } } @@ -630,6 +669,7 @@ void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) { struct ieee80211_key *key; + struct ieee80211_sub_if_data *vlan; ASSERT_RTNL(); @@ -638,7 +678,14 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) mutex_lock(&sdata->local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt = 0; + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || + vlan->crypto_tx_tailroom_pending_dec); + } list_for_each_entry(key, &sdata->key_list, list) { increment_tailroom_need_count(sdata); @@ -648,6 +695,22 @@ void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->local->key_mtx); } +void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *vlan; + + mutex_lock(&sdata->local->key_mtx); + + sdata->crypto_tx_tailroom_needed_cnt = 0; + + if (sdata->vif.type == NL80211_IFTYPE_AP) { + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) + vlan->crypto_tx_tailroom_needed_cnt = 0; + } + + 
mutex_unlock(&sdata->local->key_mtx); +} + void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, @@ -687,8 +750,8 @@ static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, { struct ieee80211_key *key, *tmp; - sdata->crypto_tx_tailroom_needed_cnt -= - sdata->crypto_tx_tailroom_pending_dec; + decrease_tailroom_need_count(sdata, + sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; ieee80211_debugfs_key_remove_mgmt_default(sdata); @@ -708,6 +771,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *vlan; + struct ieee80211_sub_if_data *master; struct ieee80211_key *key, *tmp; LIST_HEAD(keys); @@ -727,8 +791,20 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, list_for_each_entry_safe(key, tmp, &keys, list) __ieee80211_key_destroy(key, false); - WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || - sdata->crypto_tx_tailroom_pending_dec); + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + if (sdata->bss) { + master = container_of(sdata->bss, + struct ieee80211_sub_if_data, + u.ap); + + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt != + master->crypto_tx_tailroom_needed_cnt); + } + } else { + WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || + sdata->crypto_tx_tailroom_pending_dec); + } + if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || @@ -792,8 +868,8 @@ void ieee80211_delayed_tailroom_dec(struct work_struct *wk) */ mutex_lock(&sdata->local->key_mtx); - sdata->crypto_tx_tailroom_needed_cnt -= - sdata->crypto_tx_tailroom_pending_dec; + decrease_tailroom_need_count(sdata, + sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; mutex_unlock(&sdata->local->key_mtx); } @@ -827,27 +903,19 @@ void 
ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = atomic64_read(&key->u.ccmp.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); - seq->ccmp.pn[5] = pn64; - seq->ccmp.pn[4] = pn64 >> 8; - seq->ccmp.pn[3] = pn64 >> 16; - seq->ccmp.pn[2] = pn64 >> 24; - seq->ccmp.pn[1] = pn64 >> 32; - seq->ccmp.pn[0] = pn64 >> 40; - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = atomic64_read(&key->u.aes_gmac.tx_pn); + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_gmac)); + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), gcmp)); + pn64 = atomic64_read(&key->conf.tx_pn); seq->ccmp.pn[5] = pn64; seq->ccmp.pn[4] = pn64 >> 8; seq->ccmp.pn[3] = pn64 >> 16; @@ -855,16 +923,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, seq->ccmp.pn[1] = pn64 >> 32; seq->ccmp.pn[0] = pn64 >> 40; break; - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = atomic64_read(&key->u.gcmp.tx_pn); - seq->gcmp.pn[5] = pn64; - seq->gcmp.pn[4] = pn64 >> 8; - seq->gcmp.pn[3] = pn64 >> 16; - seq->gcmp.pn[2] = pn64 >> 24; - seq->gcmp.pn[1] = pn64 >> 32; - seq->gcmp.pn[0] = pn64 >> 40; - break; default: WARN_ON(1); } @@ -939,43 +997,25 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - pn64 = (u64)seq->ccmp.pn[5] | - ((u64)seq->ccmp.pn[4] << 8) | - ((u64)seq->ccmp.pn[3] << 16) | - ((u64)seq->ccmp.pn[2] 
<< 24) | - ((u64)seq->ccmp.pn[1] << 32) | - ((u64)seq->ccmp.pn[0] << 40); - atomic64_set(&key->u.ccmp.tx_pn, pn64); - break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: - pn64 = (u64)seq->aes_cmac.pn[5] | - ((u64)seq->aes_cmac.pn[4] << 8) | - ((u64)seq->aes_cmac.pn[3] << 16) | - ((u64)seq->aes_cmac.pn[2] << 24) | - ((u64)seq->aes_cmac.pn[1] << 32) | - ((u64)seq->aes_cmac.pn[0] << 40); - atomic64_set(&key->u.aes_cmac.tx_pn, pn64); - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_cmac)); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: - pn64 = (u64)seq->aes_gmac.pn[5] | - ((u64)seq->aes_gmac.pn[4] << 8) | - ((u64)seq->aes_gmac.pn[3] << 16) | - ((u64)seq->aes_gmac.pn[2] << 24) | - ((u64)seq->aes_gmac.pn[1] << 32) | - ((u64)seq->aes_gmac.pn[0] << 40); - atomic64_set(&key->u.aes_gmac.tx_pn, pn64); - break; + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), aes_gmac)); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - pn64 = (u64)seq->gcmp.pn[5] | - ((u64)seq->gcmp.pn[4] << 8) | - ((u64)seq->gcmp.pn[3] << 16) | - ((u64)seq->gcmp.pn[2] << 24) | - ((u64)seq->gcmp.pn[1] << 32) | - ((u64)seq->gcmp.pn[0] << 40); - atomic64_set(&key->u.gcmp.tx_pn, pn64); + BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) != + offsetof(typeof(*seq), gcmp)); + pn64 = (u64)seq->ccmp.pn[5] | + ((u64)seq->ccmp.pn[4] << 8) | + ((u64)seq->ccmp.pn[3] << 16) | + ((u64)seq->ccmp.pn[2] << 24) | + ((u64)seq->ccmp.pn[1] << 32) | + ((u64)seq->ccmp.pn[0] << 40); + atomic64_set(&key->conf.tx_pn, pn64); break; default: WARN_ON(1); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index d57a9915494f..3f4f9eaac140 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -18,7 +18,6 @@ #define NUM_DEFAULT_KEYS 4 #define NUM_DEFAULT_MGMT_KEYS 2 -#define MAX_PN_LEN 16 struct ieee80211_local; struct ieee80211_sub_if_data; @@ -30,10 +29,12 @@ struct sta_info; * @KEY_FLAG_UPLOADED_TO_HARDWARE: 
Indicates that this key is present * in the hardware for TX crypto hardware acceleration. * @KEY_FLAG_TAINTED: Key is tainted and packets should be dropped. + * @KEY_FLAG_CIPHER_SCHEME: This key is for a hardware cipher scheme */ enum ieee80211_internal_key_flags { KEY_FLAG_UPLOADED_TO_HARDWARE = BIT(0), KEY_FLAG_TAINTED = BIT(1), + KEY_FLAG_CIPHER_SCHEME = BIT(2), }; enum ieee80211_internal_tkip_state { @@ -76,7 +77,6 @@ struct ieee80211_key { u32 mic_failures; } tkip; struct { - atomic64_t tx_pn; /* * Last received packet number. The first * IEEE80211_NUM_TIDS counters are used with Data @@ -88,21 +88,18 @@ struct ieee80211_key { u32 replays; /* dot11RSNAStatsCCMPReplays */ } ccmp; struct { - atomic64_t tx_pn; u8 rx_pn[IEEE80211_CMAC_PN_LEN]; struct crypto_cipher *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_cmac; struct { - atomic64_t tx_pn; u8 rx_pn[IEEE80211_GMAC_PN_LEN]; struct crypto_aead *tfm; u32 replays; /* dot11RSNAStatsCMACReplays */ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */ } aes_gmac; struct { - atomic64_t tx_pn; /* Last received packet number. 
The first * IEEE80211_NUM_TIDS counters are used with Data * frames and the last counter is used with Robust @@ -114,7 +111,7 @@ struct ieee80211_key { } gcmp; struct { /* generic cipher scheme */ - u8 rx_pn[IEEE80211_NUM_TIDS + 1][MAX_PN_LEN]; + u8 rx_pn[IEEE80211_NUM_TIDS + 1][IEEE80211_MAX_PN_LEN]; } gen; } u; @@ -159,6 +156,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta); void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata); +void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata); #define key_mtx_dereference(local, ref) \ rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx))) diff --git a/net/mac80211/led.c b/net/mac80211/led.c index e2b836446af3..0505845b7ab8 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -12,96 +12,175 @@ #include <linux/export.h> #include "led.h" -#define MAC80211_BLINK_DELAY 50 /* ms */ - -void ieee80211_led_rx(struct ieee80211_local *local) -{ - unsigned long led_delay = MAC80211_BLINK_DELAY; - if (unlikely(!local->rx_led)) - return; - led_trigger_blink_oneshot(local->rx_led, &led_delay, &led_delay, 0); -} - -void ieee80211_led_tx(struct ieee80211_local *local) -{ - unsigned long led_delay = MAC80211_BLINK_DELAY; - if (unlikely(!local->tx_led)) - return; - led_trigger_blink_oneshot(local->tx_led, &led_delay, &led_delay, 0); -} - void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { - if (unlikely(!local->assoc_led)) + if (!atomic_read(&local->assoc_led_active)) return; if (associated) - led_trigger_event(local->assoc_led, LED_FULL); + led_trigger_event(&local->assoc_led, LED_FULL); else - led_trigger_event(local->assoc_led, LED_OFF); + led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { - if (unlikely(!local->radio_led)) + if (!atomic_read(&local->radio_led_active)) return; if (enabled) - 
led_trigger_event(local->radio_led, LED_FULL); + led_trigger_event(&local->radio_led, LED_FULL); else - led_trigger_event(local->radio_led, LED_OFF); + led_trigger_event(&local->radio_led, LED_OFF); +} + +void ieee80211_alloc_led_names(struct ieee80211_local *local) +{ + local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", + wiphy_name(local->hw.wiphy)); + local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", + wiphy_name(local->hw.wiphy)); + local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", + wiphy_name(local->hw.wiphy)); + local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", + wiphy_name(local->hw.wiphy)); +} + +void ieee80211_free_led_names(struct ieee80211_local *local) +{ + kfree(local->rx_led.name); + kfree(local->tx_led.name); + kfree(local->assoc_led.name); + kfree(local->radio_led.name); +} + +static void ieee80211_tx_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tx_led); + + atomic_inc(&local->tx_led_active); +} + +static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tx_led); + + atomic_dec(&local->tx_led_active); +} + +static void ieee80211_rx_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + rx_led); + + atomic_inc(&local->rx_led_active); +} + +static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + rx_led); + + atomic_dec(&local->rx_led_active); +} + +static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + assoc_led); + + atomic_inc(&local->assoc_led_active); +} + +static void ieee80211_assoc_led_deactivate(struct led_classdev 
*led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + assoc_led); + + atomic_dec(&local->assoc_led_active); +} + +static void ieee80211_radio_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + radio_led); + + atomic_inc(&local->radio_led_active); +} + +static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + radio_led); + + atomic_dec(&local->radio_led_active); +} + +static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev) +{ + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tpt_led); + + atomic_inc(&local->tpt_led_active); } -void ieee80211_led_names(struct ieee80211_local *local) +static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { - snprintf(local->rx_led_name, sizeof(local->rx_led_name), - "%srx", wiphy_name(local->hw.wiphy)); - snprintf(local->tx_led_name, sizeof(local->tx_led_name), - "%stx", wiphy_name(local->hw.wiphy)); - snprintf(local->assoc_led_name, sizeof(local->assoc_led_name), - "%sassoc", wiphy_name(local->hw.wiphy)); - snprintf(local->radio_led_name, sizeof(local->radio_led_name), - "%sradio", wiphy_name(local->hw.wiphy)); + struct ieee80211_local *local = container_of(led_cdev->trigger, + struct ieee80211_local, + tpt_led); + + atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { - local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->rx_led) { - local->rx_led->name = local->rx_led_name; - if (led_trigger_register(local->rx_led)) { - kfree(local->rx_led); - local->rx_led = NULL; - } + atomic_set(&local->rx_led_active, 0); + local->rx_led.activate = ieee80211_rx_led_activate; + local->rx_led.deactivate = ieee80211_rx_led_deactivate; + if 
(local->rx_led.name && led_trigger_register(&local->rx_led)) { + kfree(local->rx_led.name); + local->rx_led.name = NULL; } - local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->tx_led) { - local->tx_led->name = local->tx_led_name; - if (led_trigger_register(local->tx_led)) { - kfree(local->tx_led); - local->tx_led = NULL; - } + atomic_set(&local->tx_led_active, 0); + local->tx_led.activate = ieee80211_tx_led_activate; + local->tx_led.deactivate = ieee80211_tx_led_deactivate; + if (local->tx_led.name && led_trigger_register(&local->tx_led)) { + kfree(local->tx_led.name); + local->tx_led.name = NULL; } - local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->assoc_led) { - local->assoc_led->name = local->assoc_led_name; - if (led_trigger_register(local->assoc_led)) { - kfree(local->assoc_led); - local->assoc_led = NULL; - } + atomic_set(&local->assoc_led_active, 0); + local->assoc_led.activate = ieee80211_assoc_led_activate; + local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; + if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { + kfree(local->assoc_led.name); + local->assoc_led.name = NULL; } - local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL); - if (local->radio_led) { - local->radio_led->name = local->radio_led_name; - if (led_trigger_register(local->radio_led)) { - kfree(local->radio_led); - local->radio_led = NULL; - } + atomic_set(&local->radio_led_active, 0); + local->radio_led.activate = ieee80211_radio_led_activate; + local->radio_led.deactivate = ieee80211_radio_led_deactivate; + if (local->radio_led.name && led_trigger_register(&local->radio_led)) { + kfree(local->radio_led.name); + local->radio_led.name = NULL; } + atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { - if (led_trigger_register(&local->tpt_led_trigger->trig)) { + local->tpt_led.activate = ieee80211_tpt_led_activate; + local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; 
+ if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } @@ -110,58 +189,50 @@ void ieee80211_led_init(struct ieee80211_local *local) void ieee80211_led_exit(struct ieee80211_local *local) { - if (local->radio_led) { - led_trigger_unregister(local->radio_led); - kfree(local->radio_led); - } - if (local->assoc_led) { - led_trigger_unregister(local->assoc_led); - kfree(local->assoc_led); - } - if (local->tx_led) { - led_trigger_unregister(local->tx_led); - kfree(local->tx_led); - } - if (local->rx_led) { - led_trigger_unregister(local->rx_led); - kfree(local->rx_led); - } + if (local->radio_led.name) + led_trigger_unregister(&local->radio_led); + if (local->assoc_led.name) + led_trigger_unregister(&local->assoc_led); + if (local->tx_led.name) + led_trigger_unregister(&local->tx_led); + if (local->rx_led.name) + led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { - led_trigger_unregister(&local->tpt_led_trigger->trig); + led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } -char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->radio_led_name; + return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); -char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->assoc_led_name; + return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); -char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) +const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->tx_led_name; + return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); -char *__ieee80211_get_rx_led_name(struct 
ieee80211_hw *hw) +const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); - return local->rx_led_name; + return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); @@ -205,16 +276,17 @@ static void tpt_trig_timer(unsigned long data) } } - read_lock(&tpt_trig->trig.leddev_list_lock); - list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + read_lock(&local->tpt_led.leddev_list_lock); + list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) led_blink_set(led_cdev, &on, &off); - read_unlock(&tpt_trig->trig.leddev_list_lock); + read_unlock(&local->tpt_led.leddev_list_lock); } -char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, - unsigned int flags, - const struct ieee80211_tpt_blink *blink_table, - unsigned int blink_table_len) +const char * +__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, + unsigned int flags, + const struct ieee80211_tpt_blink *blink_table, + unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; @@ -229,7 +301,7 @@ char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); - tpt_trig->trig.name = tpt_trig->name; + local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; @@ -269,10 +341,10 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); - read_lock(&tpt_trig->trig.leddev_list_lock); - list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) + read_lock(&local->tpt_led.leddev_list_lock); + list_for_each_entry(led_cdev, &local->tpt_led.led_cdevs, trig_list) led_set_brightness(led_cdev, LED_OFF); - read_unlock(&tpt_trig->trig.leddev_list_lock); + read_unlock(&local->tpt_led.leddev_list_lock); } void 
ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, diff --git a/net/mac80211/led.h b/net/mac80211/led.h index 89f4344f13b9..a7893a1ac98b 100644 --- a/net/mac80211/led.h +++ b/net/mac80211/led.h @@ -11,25 +11,42 @@ #include <linux/leds.h> #include "ieee80211_i.h" +#define MAC80211_BLINK_DELAY 50 /* ms */ + +static inline void ieee80211_led_rx(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_LEDS + unsigned long led_delay = MAC80211_BLINK_DELAY; + + if (!atomic_read(&local->rx_led_active)) + return; + led_trigger_blink_oneshot(&local->rx_led, &led_delay, &led_delay, 0); +#endif +} + +static inline void ieee80211_led_tx(struct ieee80211_local *local) +{ +#ifdef CONFIG_MAC80211_LEDS + unsigned long led_delay = MAC80211_BLINK_DELAY; + + if (!atomic_read(&local->tx_led_active)) + return; + led_trigger_blink_oneshot(&local->tx_led, &led_delay, &led_delay, 0); +#endif +} + #ifdef CONFIG_MAC80211_LEDS -void ieee80211_led_rx(struct ieee80211_local *local); -void ieee80211_led_tx(struct ieee80211_local *local); void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); -void ieee80211_led_names(struct ieee80211_local *local); +void ieee80211_alloc_led_names(struct ieee80211_local *local); +void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else -static inline void ieee80211_led_rx(struct ieee80211_local *local) -{ -} -static inline void ieee80211_led_tx(struct ieee80211_local *local) -{ -} static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { @@ -38,7 +55,10 @@ static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } -static inline void ieee80211_led_names(struct ieee80211_local 
*local) +static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) +{ +} +static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) @@ -58,7 +78,7 @@ static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (local->tpt_led_trigger && ieee80211_is_data(fc)) + if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } @@ -67,7 +87,7 @@ static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (local->tpt_led_trigger && ieee80211_is_data(fc)) + if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 5e09d354c5a5..3c63468b4dfb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -41,9 +41,6 @@ void ieee80211_configure_filter(struct ieee80211_local *local) unsigned int changed_flags; unsigned int new_flags = 0; - if (atomic_read(&local->iff_promiscs)) - new_flags |= FIF_PROMISC_IN_BSS; - if (atomic_read(&local->iff_allmultis)) new_flags |= FIF_ALLMULTI; @@ -249,6 +246,7 @@ static void ieee80211_restart_work(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, restart_work); + struct ieee80211_sub_if_data *sdata; /* wait for scan work complete */ flush_workqueue(local->workqueue); @@ -257,6 +255,8 @@ static void ieee80211_restart_work(struct work_struct *work) "%s called with hardware scan in progress\n", __func__); rtnl_lock(); + list_for_each_entry(sdata, &local->interfaces, list) + flush_delayed_work(&sdata->dec_tailroom_needed_wk); ieee80211_scan_cancel(local); ieee80211_reconfig(local); rtnl_unlock(); @@ -557,6 +557,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t 
priv_data_len, local = wiphy_priv(wiphy); + if (sta_info_init(local)) + goto err_free; + local->hw.wiphy = wiphy; local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN); @@ -629,8 +632,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, spin_lock_init(&local->ack_status_lock); idr_init(&local->ack_status_frames); - sta_info_init(local); - for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { skb_queue_head_init(&local->pending[i]); atomic_set(&local->agg_queue_stop[i], 0); @@ -645,11 +646,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); - ieee80211_led_names(local); + ieee80211_alloc_led_names(local); ieee80211_roc_setup(local); return &local->hw; + err_free: + wiphy_free(wiphy); + return NULL; } EXPORT_SYMBOL(ieee80211_alloc_hw_nm); @@ -657,7 +661,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) { bool have_wep = !(IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)); - bool have_mfp = local->hw.flags & IEEE80211_HW_MFP_CAPABLE; + bool have_mfp = ieee80211_hw_check(&local->hw, MFP_CAPABLE); int n_suites = 0, r = 0, w = 0; u32 *suites; static const u32 cipher_suites[] = { @@ -677,7 +681,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) WLAN_CIPHER_SUITE_BIP_GMAC_256, }; - if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL || + if (ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) || local->hw.wiphy->cipher_suites) { /* If the driver advertises, or doesn't support SW crypto, * we only need to remove WEP if necessary. 
@@ -767,8 +771,13 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_256; } - for (r = 0; r < local->hw.n_cipher_schemes; r++) + for (r = 0; r < local->hw.n_cipher_schemes; r++) { suites[w++] = cs[r].cipher; + if (WARN_ON(cs[r].pn_len > IEEE80211_MAX_PN_LEN)) { + kfree(suites); + return -EINVAL; + } + } } local->hw.wiphy->cipher_suites = suites; @@ -788,7 +797,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; - if (hw->flags & IEEE80211_HW_QUEUE_CONTROL && + if (ieee80211_hw_check(hw, QUEUE_CONTROL) && (local->hw.offchannel_tx_hw_queue == IEEE80211_INVAL_HW_QUEUE || local->hw.offchannel_tx_hw_queue >= local->hw.queues)) return -EINVAL; @@ -836,7 +845,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* Only HW csum features are currently compatible with mac80211 */ feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | - NETIF_F_HW_CSUM; + NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | + NETIF_F_GSO_SOFTWARE; if (WARN_ON(hw->netdev_features & ~feature_whitelist)) return -EINVAL; @@ -935,9 +945,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 supports control port protocol changing */ local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; - } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { + } else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) { local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC; if (hw->max_signal <= 0) { result = -EINVAL; @@ -991,7 +1001,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP; /* mac80211 supports eCSA, if the driver supports STA CSA at all */ - if (local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA) + if 
(ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) local->ext_capa[0] |= WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING; local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM; @@ -1035,6 +1045,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->dynamic_ps_forced_timeout = -1; + if (!local->hw.txq_ac_max_pending) + local->hw.txq_ac_max_pending = 64; + result = ieee80211_wep_init(local); if (result < 0) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", @@ -1056,8 +1069,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) && - !(hw->flags & IEEE80211_HW_NO_AUTO_VIF)) { - result = ieee80211_if_add(local, "wlan%d", NULL, + !ieee80211_hw_check(hw, NO_AUTO_VIF)) { + result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL, NL80211_IFTYPE_STATION, NULL); if (result) wiphy_warn(local->hw.wiphy, @@ -1173,7 +1186,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); - sta_info_stop(local); ieee80211_wep_free(local); ieee80211_led_exit(local); kfree(local->int_scan_req); @@ -1201,10 +1213,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) ieee80211_free_ack_frame, NULL); idr_destroy(&local->ack_status_frames); - kfree(rcu_access_pointer(local->tx_latency)); - sta_info_stop(local); + ieee80211_free_led_names(local); + wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 0c8b2a77d312..817098add1d6 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -520,7 +520,7 @@ int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc, } else { *fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ - memset(hdr->addr1, 0, ETH_ALEN); /* RA is resolved later */ + eth_zero_addr(hdr->addr1); /* RA is resolved later */ memcpy(hdr->addr2, 
meshsa, ETH_ALEN); memcpy(hdr->addr3, meshda, ETH_ALEN); memcpy(hdr->addr4, meshsa, ETH_ALEN); @@ -574,7 +574,8 @@ static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u32 changed; - ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); + if (ifmsh->mshcfg.plink_timeout > 0) + ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ); mesh_path_expire(sdata); changed = mesh_accept_plinks_update(sdata); @@ -679,6 +680,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) *pos++ = 0x0; *pos++ = ieee80211_frequency_to_channel( csa->settings.chandef.chan->center_freq); + bcn->csa_current_counter = csa->settings.count; bcn->csa_counter_offsets[0] = hdr_len + 6; *pos++ = csa->settings.count; *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 214e63b84e5c..085edc1d056b 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -510,14 +510,14 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, - const u8 *preq_elem, u32 metric) + const u8 *preq_elem, u32 orig_metric) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath = NULL; const u8 *target_addr, *orig_addr; const u8 *da; u8 target_flags, ttl, flags; - u32 orig_sn, target_sn, lifetime, orig_metric; + u32 orig_sn, target_sn, lifetime, target_metric; bool reply = false; bool forward = true; bool root_is_gate; @@ -528,7 +528,6 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, target_sn = PREQ_IE_TARGET_SN(preq_elem); orig_sn = PREQ_IE_ORIG_SN(preq_elem); target_flags = PREQ_IE_TARGET_F(preq_elem); - orig_metric = metric; /* Proactive PREQ gate announcements */ flags = PREQ_IE_FLAGS(preq_elem); root_is_gate = !!(flags & RANN_FLAG_IS_GATE); @@ -539,7 +538,7 @@ static void 
hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mhwmp_dbg(sdata, "PREQ is for us\n"); forward = false; reply = true; - metric = 0; + target_metric = 0; if (time_after(jiffies, ifmsh->last_sn_update + net_traversal_jiffies(sdata)) || time_before(jiffies, ifmsh->last_sn_update)) { @@ -556,7 +555,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, reply = true; target_addr = sdata->vif.addr; target_sn = ++ifmsh->sn; - metric = 0; + target_metric = 0; ifmsh->last_sn_update = jiffies; } if (root_is_gate) @@ -574,7 +573,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } else if ((!(target_flags & MP_F_DO)) && (mpath->flags & MESH_PATH_ACTIVE)) { reply = true; - metric = mpath->metric; + target_metric = mpath->metric; target_sn = mpath->sn; if (target_flags & MP_F_RF) target_flags |= MP_F_DO; @@ -593,7 +592,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_PREP, 0, orig_addr, orig_sn, 0, target_addr, target_sn, mgmt->sa, 0, ttl, - lifetime, metric, 0, sdata); + lifetime, target_metric, 0, + sdata); } else { ifmsh->mshstats.dropped_frames_ttl++; } @@ -619,13 +619,12 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) { target_addr = PREQ_IE_TARGET_ADDR(preq_elem); target_sn = PREQ_IE_TARGET_SN(preq_elem); - metric = orig_metric; } mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr, orig_sn, target_flags, target_addr, target_sn, da, hopcount, ttl, lifetime, - metric, preq_id, sdata); + orig_metric, preq_id, sdata); if (!is_multicast_ether_addr(da)) ifmsh->mshstats.fwded_unicast++; else @@ -854,7 +853,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, { struct ieee802_11_elems elems; size_t baselen; - u32 last_hop_metric; + u32 path_metric; struct sta_info *sta; /* need action_code */ @@ -877,21 +876,21 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data 
*sdata, if (elems.preq_len != 37) /* Right now we support just 1 destination and no AE */ return; - last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, - MPATH_PREQ); - if (last_hop_metric) + path_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, + MPATH_PREQ); + if (path_metric) hwmp_preq_frame_process(sdata, mgmt, elems.preq, - last_hop_metric); + path_metric); } if (elems.prep) { if (elems.prep_len != 31) /* Right now we support no AE */ return; - last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, - MPATH_PREP); - if (last_hop_metric) + path_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, + MPATH_PREP); + if (path_metric) hwmp_prep_frame_process(sdata, mgmt, elems.prep, - last_hop_metric); + path_metric); } if (elems.perr) { if (elems.perr_len != 15) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index b488e1859b18..3b59099413fb 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -17,7 +17,7 @@ #define PLINK_GET_PLID(p) (p + 4) #define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \ - jiffies + HZ * t / 1000)) + jiffies + msecs_to_jiffies(t))) enum plink_event { PLINK_UNDEFINED, @@ -72,10 +72,11 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, * * @sta: mesh peer link to restart * - * Locking: this function must be called holding sta->lock + * Locking: this function must be called holding sta->plink_lock */ static inline void mesh_plink_fsm_restart(struct sta_info *sta) { + lockdep_assert_held(&sta->plink_lock); sta->plink_state = NL80211_PLINK_LISTEN; sta->llid = sta->plid = sta->reason = 0; sta->plink_retries = 0; @@ -105,9 +106,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) /* (IEEE 802.11-2012 19.4.5) */ short_slot = true; goto out; - } else if (band != IEEE80211_BAND_2GHZ || - (band == IEEE80211_BAND_2GHZ && - local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) + } else if (band != IEEE80211_BAND_2GHZ) goto out; for (i 
= 0; i < sband->n_bitrates; i++) @@ -213,13 +212,15 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) * All mesh paths with this peer as next hop will be flushed * Returns beacon changed flag if the beacon content changed. * - * Locking: the caller must hold sta->lock + * Locking: the caller must hold sta->plink_lock */ static u32 __mesh_plink_deactivate(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed = 0; + lockdep_assert_held(&sta->plink_lock); + if (sta->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count(sdata); sta->plink_state = NL80211_PLINK_BLOCKED; @@ -244,13 +245,13 @@ u32 mesh_plink_deactivate(struct sta_info *sta) struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); changed = __mesh_plink_deactivate(sta); sta->reason = WLAN_REASON_MESH_PEER_CANCELED; mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, sta->sta.addr, sta->llid, sta->plid, sta->reason); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return changed; } @@ -305,7 +306,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action == WLAN_SP_MESH_PEERING_CONFIRM) { /* AID */ pos = skb_put(skb, 2); - put_unaligned_le16(plid, pos + 2); + put_unaligned_le16(plid, pos); } if (ieee80211_add_srates_ie(sdata, skb, true, band) || ieee80211_add_ext_srates_ie(sdata, skb, true, band) || @@ -382,16 +383,18 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, enum ieee80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u32 rates, basic_rates = 0, changed = 0; + enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; sband = local->hw.wiphy->bands[band]; rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates); - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); sta->last_rx = jiffies; /* rates and capabilities don't change during 
peering */ - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->plink_state == NL80211_PLINK_ESTAB && sta->processed_beacon) goto out; + sta->processed_beacon = true; if (sta->sta.supp_rates[band] != rates) changed |= IEEE80211_RC_SUPP_RATES_CHANGED; @@ -401,6 +404,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->ht_cap_elem, sta)) changed |= IEEE80211_RC_BW_CHANGED; + if (bw != sta->sta.bandwidth) + changed |= IEEE80211_RC_BW_CHANGED; + /* HT peer is operating 20MHz-only */ if (elems->ht_operation && !(elems->ht_operation->ht_param & @@ -415,7 +421,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, else rate_control_rate_update(local, sband, sta, changed); out: - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); } static struct sta_info * @@ -548,7 +554,7 @@ static void mesh_plink_timer(unsigned long data) if (sta->sdata->local->quiescing) return; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); /* If a timer fires just before a state transition on another CPU, * we may have already extended the timeout and changed state by the @@ -559,7 +565,7 @@ static void mesh_plink_timer(unsigned long data) mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer adjusted)", sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return; } @@ -569,7 +575,7 @@ static void mesh_plink_timer(unsigned long data) mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer deleted)", sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return; } @@ -615,15 +621,15 @@ static void mesh_plink_timer(unsigned long data) default: break; } - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); if (action) mesh_plink_frame_tx(sdata, action, sta->sta.addr, sta->llid, sta->plid, reason); } -static inline void mesh_plink_timer_set(struct sta_info *sta, int timeout) +static inline 
void mesh_plink_timer_set(struct sta_info *sta, u32 timeout) { - sta->plink_timer.expires = jiffies + (HZ * timeout / 1000); + sta->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); sta->plink_timer.data = (unsigned long) sta; sta->plink_timer.function = mesh_plink_timer; sta->plink_timeout = timeout; @@ -670,16 +676,16 @@ u32 mesh_plink_open(struct sta_info *sta) if (!test_sta_flag(sta, WLAN_STA_AUTH)) return 0; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); sta->llid = mesh_get_new_llid(sdata); if (sta->plink_state != NL80211_PLINK_LISTEN && sta->plink_state != NL80211_PLINK_BLOCKED) { - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return 0; } sta->plink_state = NL80211_PLINK_OPN_SNT; mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout); - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); mpl_dbg(sdata, "Mesh plink: starting establishment with %pM\n", sta->sta.addr); @@ -696,10 +702,10 @@ u32 mesh_plink_block(struct sta_info *sta) { u32 changed; - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); changed = __mesh_plink_deactivate(sta); sta->plink_state = NL80211_PLINK_BLOCKED; - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); return changed; } @@ -754,7 +760,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr, mplstates[sta->plink_state], mplevents[event]); - spin_lock_bh(&sta->lock); + spin_lock_bh(&sta->plink_lock); switch (sta->plink_state) { case NL80211_PLINK_LISTEN: switch (event) { @@ -868,7 +874,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, */ break; } - spin_unlock_bh(&sta->lock); + spin_unlock_bh(&sta->plink_lock); if (action) { mesh_plink_frame_tx(sdata, action, sta->sta.addr, sta->llid, sta->plid, sta->reason); @@ -1116,6 +1122,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, WLAN_SP_MESH_PEERING_CONFIRM) { baseaddr += 4; baselen 
+= 4; + + if (baselen > len) + return; } ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems); mesh_process_plink_frame(sdata, mgmt, &elems); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 142f66aece18..9b2cc278ac2a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -118,7 +118,7 @@ void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.bcn_mon_timer, @@ -134,7 +134,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) ifmgd->probe_send_count = 0; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.conn_mon_timer, @@ -669,17 +669,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) capab = WLAN_CAPABILITY_ESS; if (sband->band == IEEE80211_BAND_2GHZ) { - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; + capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } if (assoc_data->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; if ((assoc_data->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && - (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) + ieee80211_hw_check(&local->hw, SPECTRUM_MGMT)) capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) @@ -887,7 +885,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) drv_mgd_prepare_tx(local, sdata); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; - if (local->hw.flags & 
IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); @@ -929,7 +927,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) @@ -1098,6 +1096,24 @@ static void ieee80211_chswitch_timer(unsigned long data) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work); } +static void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; + + ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr, + NL80211_TDLS_TEARDOWN, reason, + GFP_ATOMIC); + } + rcu_read_unlock(); +} + static void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, u64 timestamp, u32 device_timestamp, @@ -1161,6 +1177,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + /* + * Drop all TDLS peers - either we disconnect or move to a different + * channel from this point on. There's no telling what our peer will do. + * The TDLS WIDER_BW scenario is also problematic, as peers might now + * have an incompatible wider chandef. 
+ */ + ieee80211_teardown_tdls_peers(sdata); + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, @@ -1168,24 +1192,16 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!conf) { sdata_info(sdata, "no channel context assigned to vif?, disconnecting\n"); - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - mutex_unlock(&local->chanctx_mtx); - mutex_unlock(&local->mtx); - return; + goto drop_connection; } chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (local->use_chanctx && - !(local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)) { + !ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { sdata_info(sdata, "driver doesn't support chan-switch with channel contexts\n"); - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - mutex_unlock(&local->chanctx_mtx); - mutex_unlock(&local->mtx); - return; + goto drop_connection; } ch_switch.timestamp = timestamp; @@ -1197,11 +1213,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (drv_pre_channel_switch(sdata, &ch_switch)) { sdata_info(sdata, "preparing for channel switch failed, disconnecting\n"); - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - mutex_unlock(&local->chanctx_mtx); - mutex_unlock(&local->mtx); - return; + goto drop_connection; } res = ieee80211_vif_reserve_chanctx(sdata, &csa_ie.chandef, @@ -1210,11 +1222,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", res); - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - mutex_unlock(&local->chanctx_mtx); - mutex_unlock(&local->mtx); - return; + goto drop_connection; } mutex_unlock(&local->chanctx_mtx); @@ -1244,6 +1252,11 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, mod_timer(&ifmgd->chswitch_timer, 
TU_TO_EXP_TIME((csa_ie.count - 1) * cbss->beacon_interval)); + return; + drop_connection: + ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); + mutex_unlock(&local->chanctx_mtx); + mutex_unlock(&local->mtx); } static bool @@ -1359,15 +1372,15 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, */ if (has_80211h_pwr && (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { - sdata_info(sdata, - "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", - pwr_level_80211h, chan_pwr, pwr_reduction_80211h, - sdata->u.mgd.bssid); + sdata_dbg(sdata, + "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", + pwr_level_80211h, chan_pwr, pwr_reduction_80211h, + sdata->u.mgd.bssid); new_ap_level = pwr_level_80211h; } else { /* has_cisco_pwr is always true here. */ - sdata_info(sdata, - "Limiting TX power to %d dBm as advertised by %pM\n", - pwr_level_cisco, sdata->u.mgd.bssid); + sdata_dbg(sdata, + "Limiting TX power to %d dBm as advertised by %pM\n", + pwr_level_cisco, sdata->u.mgd.bssid); new_ap_level = pwr_level_cisco; } @@ -1394,15 +1407,15 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, return; if (conf->dynamic_ps_timeout > 0 && - !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) { + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) ieee80211_send_nullfunc(local, sdata, 1); - if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) return; conf->flags |= IEEE80211_CONF_PS; @@ -1461,7 +1474,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) int count = 0; int timeout; - if 
(!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) { + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) { local->ps_sdata = NULL; return; } @@ -1607,7 +1620,7 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } - if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) && + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + @@ -1620,8 +1633,8 @@ void ieee80211_dynamic_ps_enable_work(struct work_struct *work) } } - if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && - (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) || + if (!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && + ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) || (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; @@ -1633,9 +1646,6 @@ void ieee80211_dynamic_ps_timer(unsigned long data) { struct ieee80211_local *local = (void *) data; - if (local->quiescing || local->suspended) - return; - ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work); } @@ -2045,7 +2055,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_flush_queues(local, sdata, false); /* clear bssid only after building the needed mgmt frames */ - memset(ifmgd->bssid, 0, ETH_ALEN); + eth_zero_addr(ifmgd->bssid); /* remove AP and TDLS peers */ sta_info_flush(sdata); @@ -2149,7 +2159,7 @@ static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) ieee80211_recalc_ps(local, -1); mutex_unlock(&local->iflist_mtx); - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) goto out; /* @@ -2247,7 +2257,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) */ 
ifmgd->probe_send_count++; - if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; ieee80211_send_nullfunc(sdata->local, sdata, 0); } else { @@ -2260,7 +2270,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) else ssid_len = ssid[1]; - ieee80211_send_probe_req(sdata, sdata->vif.addr, NULL, + ieee80211_send_probe_req(sdata, sdata->vif.addr, dst, ssid + 2, ssid_len, NULL, 0, (u32) -1, true, 0, ifmgd->associated->channel, false); @@ -2372,6 +2382,24 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_ap_probereq_get); +static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata, + const u8 *buf, size_t len, bool tx, + u16 reason) +{ + struct ieee80211_event event = { + .type = MLME_EVENT, + .u.mlme.data = tx ? DEAUTH_TX_EVENT : DEAUTH_RX_EVENT, + .u.mlme.reason = reason, + }; + + if (tx) + cfg80211_tx_mlme_mgmt(sdata->dev, buf, len); + else + cfg80211_rx_mlme_mgmt(sdata->dev, buf, len); + + drv_event_callback(sdata->local, sdata, &event); +} + static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -2397,8 +2425,9 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) } mutex_unlock(&local->mtx); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - IEEE80211_DEAUTH_FRAME_LEN); + ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, + WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY); + sdata_unlock(sdata); } @@ -2477,7 +2506,7 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, del_timer_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, auth_data->bss->bssid); - memset(sdata->u.mgd.bssid, 0, ETH_ALEN); + eth_zero_addr(sdata->u.mgd.bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; 
mutex_lock(&sdata->local->mtx); @@ -2490,6 +2519,34 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.auth_data = NULL; } +static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, + bool assoc) +{ + struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; + + sdata_assert_lock(sdata); + + if (!assoc) { + /* + * we are not associated yet, the only timer that could be + * running is the timeout for the association response which + * which is not relevant anymore. + */ + del_timer_sync(&sdata->u.mgd.timer); + sta_info_destroy_addr(sdata, assoc_data->bss->bssid); + + eth_zero_addr(sdata->u.mgd.bssid); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); + sdata->u.mgd.flags = 0; + mutex_lock(&sdata->local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); + } + + kfree(assoc_data); + sdata->u.mgd.assoc_data = NULL; +} + static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { @@ -2505,7 +2562,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, @@ -2522,6 +2579,10 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, u8 bssid[ETH_ALEN]; u16 auth_alg, auth_transaction, status_code; struct sta_info *sta; + struct ieee80211_event event = { + .type = MLME_EVENT, + .u.mlme.data = AUTH_EVENT, + }; sdata_assert_lock(sdata); @@ -2554,6 +2615,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); cfg80211_rx_mlme_mgmt(sdata->dev, (u8 
*)mgmt, len); + event.u.mlme.status = MLME_DENIED; + event.u.mlme.reason = status_code; + drv_event_callback(sdata->local, sdata, &event); return; } @@ -2576,6 +2640,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, return; } + event.u.mlme.status = MLME_SUCCESS; + drv_event_callback(sdata->local, sdata, &event); sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; @@ -2673,28 +2739,42 @@ static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - const u8 *bssid = NULL; - u16 reason_code; + u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); sdata_assert_lock(sdata); if (len < 24 + 2) return; - if (!ifmgd->associated || - !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) - return; + if (ifmgd->associated && + ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) { + const u8 *bssid = ifmgd->associated->bssid; - bssid = ifmgd->associated->bssid; + sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", + bssid, reason_code, + ieee80211_get_reason_code_string(reason_code)); - reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); + ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", - bssid, reason_code, ieee80211_get_reason_code_string(reason_code)); + ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, + reason_code); + return; + } - ieee80211_set_disassoc(sdata, 0, 0, false, NULL); + if (ifmgd->assoc_data && + ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { + const u8 *bssid = ifmgd->assoc_data->bss->bssid; - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + sdata_info(sdata, + "deauthenticated from %pM while associating (Reason: %u=%s)\n", + bssid, reason_code, + ieee80211_get_reason_code_string(reason_code)); + + 
ieee80211_destroy_assoc_data(sdata, false); + + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + return; + } } @@ -2720,7 +2800,7 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, 0, 0, false, NULL); - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code); } static void ieee80211_get_rates(struct ieee80211_supported_band *sband, @@ -2774,34 +2854,6 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, } } -static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, - bool assoc) -{ - struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; - - sdata_assert_lock(sdata); - - if (!assoc) { - /* - * we are not associated yet, the only timer that could be - * running is the timeout for the association response which - * which is not relevant anymore. - */ - del_timer_sync(&sdata->u.mgd.timer); - sta_info_destroy_addr(sdata, assoc_data->bss->bssid); - - memset(sdata->u.mgd.bssid, 0, ETH_ALEN); - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); - sdata->u.mgd.flags = 0; - mutex_lock(&sdata->local->mtx); - ieee80211_vif_release_channel(sdata); - mutex_unlock(&sdata->local->mtx); - } - - kfree(assoc_data); - sdata->u.mgd.assoc_data = NULL; -} - static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, size_t len) @@ -2982,10 +3034,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, rate_control_rate_init(sta); - if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) + if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { set_sta_flag(sta, WLAN_STA_MFP); + sta->sta.mfp = true; + } else { + sta->sta.mfp = false; + } - sta->sta.wme = elems.wmm_param; + sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && 
!(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) @@ -3055,6 +3111,10 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, u8 *pos; bool reassoc; struct cfg80211_bss *bss; + struct ieee80211_event event = { + .type = MLME_EVENT, + .u.mlme.data = ASSOC_EVENT, + }; sdata_assert_lock(sdata); @@ -3106,6 +3166,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "%pM denied association (code=%d)\n", mgmt->sa, status_code); ieee80211_destroy_assoc_data(sdata, false); + event.u.mlme.status = MLME_DENIED; + event.u.mlme.reason = status_code; + drv_event_callback(sdata->local, sdata, &event); } else { if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) { /* oops -- internal error -- send timeout for now */ @@ -3113,6 +3176,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, cfg80211_assoc_timeout(sdata->dev, bss); return; } + event.u.mlme.status = MLME_SUCCESS; + drv_event_callback(sdata->local, sdata, &event); sdata_info(sdata, "associated\n"); /* @@ -3272,7 +3337,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } ifmgd->have_beacon = true; ifmgd->assoc_data->need_beacon = false; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = @@ -3315,6 +3380,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = ifmgd->ave_beacon_signal; int last_sig = ifmgd->last_ave_beacon_signal; + struct ieee80211_event event = { + .type = RSSI_EVENT, + }; /* * if signal crosses either of the boundaries, invoke callback @@ -3323,12 +3391,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (sig > ifmgd->rssi_max_thold && (last_sig <= ifmgd->rssi_min_thold || last_sig 
== 0)) { ifmgd->last_ave_beacon_signal = sig; - drv_rssi_callback(local, sdata, RSSI_EVENT_HIGH); + event.u.rssi.data = RSSI_EVENT_HIGH; + drv_event_callback(local, sdata, &event); } else if (sig < ifmgd->rssi_min_thold && (last_sig >= ifmgd->rssi_max_thold || last_sig == 0)) { ifmgd->last_ave_beacon_signal = sig; - drv_rssi_callback(local, sdata, RSSI_EVENT_LOW); + event.u.rssi.data = RSSI_EVENT_LOW; + drv_event_callback(local, sdata, &event); } } @@ -3373,7 +3443,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, len - baselen, false, &elems, care_about_ies, ncrc); - if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) { + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) { bool directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid); @@ -3433,6 +3503,26 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ifmgd->csa_waiting_bcn) ieee80211_chswitch_post_beacon(sdata); + /* + * Update beacon timing and dtim count on every beacon appearance. This + * will allow the driver to use the most updated values. Do it before + * comparing this one with last received beacon. + * IMPORTANT: These parameters would possibly be out of sync by the time + * the driver will use them. The synchronized view is currently + * guaranteed only in certain callbacks. 
+ */ + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { + sdata->vif.bss_conf.sync_tsf = + le64_to_cpu(mgmt->u.beacon.timestamp); + sdata->vif.bss_conf.sync_device_ts = + rx_status->device_timestamp; + if (elems.tim) + sdata->vif.bss_conf.sync_dtim_count = + elems.tim->dtim_count; + else + sdata->vif.bss_conf.sync_dtim_count = 0; + } + if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid) return; ifmgd->beacon_crc = ncrc; @@ -3460,18 +3550,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, else bss_conf->dtim_period = 1; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { - sdata->vif.bss_conf.sync_tsf = - le64_to_cpu(mgmt->u.beacon.timestamp); - sdata->vif.bss_conf.sync_device_ts = - rx_status->device_timestamp; - if (elems.tim) - sdata->vif.bss_conf.sync_dtim_count = - elems.tim->dtim_count; - else - sdata->vif.bss_conf.sync_dtim_count = 0; - } - changed |= BSS_CHANGED_BEACON_INFO; ifmgd->have_beacon = true; @@ -3502,8 +3580,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); - cfg80211_tx_mlme_mgmt(sdata->dev, deauth_buf, - sizeof(deauth_buf)); + ieee80211_report_disconnect(sdata, deauth_buf, + sizeof(deauth_buf), true, + WLAN_REASON_DEAUTH_LEAVING); return; } @@ -3621,8 +3700,8 @@ static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, reason, tx, frame_buf); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - IEEE80211_DEAUTH_FRAME_LEN); + ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, + reason); } static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) @@ -3670,7 +3749,7 @@ static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata) auth_data->expected_transaction = trans; } - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if 
(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -3743,7 +3822,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) IEEE80211_ASSOC_MAX_TRIES); ieee80211_send_assoc(sdata); - if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); @@ -3816,12 +3895,18 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_probe_auth(sdata)) { u8 bssid[ETH_ALEN]; + struct ieee80211_event event = { + .type = MLME_EVENT, + .u.mlme.data = AUTH_EVENT, + .u.mlme.status = MLME_TIMEOUT, + }; memcpy(bssid, ifmgd->auth_data->bss->bssid, ETH_ALEN); ieee80211_destroy_auth_data(sdata, false); cfg80211_auth_timeout(sdata->dev, bssid); + drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->auth_data && ifmgd->auth_data->timeout_started) run_again(sdata, ifmgd->auth_data->timeout); @@ -3831,9 +3916,15 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) if ((ifmgd->assoc_data->need_beacon && !ifmgd->have_beacon) || ieee80211_do_assoc(sdata)) { struct cfg80211_bss *bss = ifmgd->assoc_data->bss; + struct ieee80211_event event = { + .type = MLME_EVENT, + .u.mlme.data = ASSOC_EVENT, + .u.mlme.status = MLME_TIMEOUT, + }; ieee80211_destroy_assoc_data(sdata, false); cfg80211_assoc_timeout(sdata->dev, bss); + drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started) run_again(sdata, ifmgd->assoc_data->timeout); @@ -3845,7 +3936,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, 
REPORTS_TX_ACK_STATUS)) max_tries = max_nullfunc_tries; else max_tries = max_probe_tries; @@ -3870,7 +3961,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) } } else if (time_is_after_jiffies(ifmgd->probe_timeout)) run_again(sdata, ifmgd->probe_timeout); - else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + else if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { mlme_dbg(sdata, "Failed to send nullfunc to AP %pM after %dms, disconnecting\n", bssid, probe_wait_ms); @@ -3905,12 +3996,8 @@ static void ieee80211_sta_bcn_mon_timer(unsigned long data) { struct ieee80211_sub_if_data *sdata = (struct ieee80211_sub_if_data *) data; - struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - if (local->quiescing) - return; - if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) return; @@ -3926,9 +4013,6 @@ static void ieee80211_sta_conn_mon_timer(unsigned long data) struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - if (local->quiescing) - return; - if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) return; @@ -3946,14 +4030,11 @@ static void ieee80211_sta_monitor_work(struct work_struct *work) static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { - u32 flags; - if (sdata->vif.type == NL80211_IFTYPE_STATION) { __ieee80211_stop_poll(sdata); /* let's probe the connection once */ - flags = sdata->local->hw.flags; - if (!(flags & IEEE80211_HW_CONNECTION_MONITOR)) + if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.monitor_work); /* and do all the other regular work too */ @@ -3991,6 +4072,34 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) IEEE80211_DEAUTH_FRAME_LEN); } + /* This is a bit of a hack - we should find a better and more generic + * solution to this. Normally when suspending, cfg80211 will in fact + * deauthenticate. 
However, it doesn't (and cannot) stop an ongoing + * auth (not so important) or assoc (this is the problem) process. + * + * As a consequence, it can happen that we are in the process of both + * associating and suspending, and receive an association response + * after cfg80211 has checked if it needs to disconnect, but before + * we actually set the flag to drop incoming frames. This will then + * cause the workqueue flush to process the association response in + * the suspend, resulting in a successful association just before it + * tries to remove the interface from the driver, which now though + * has a channel context assigned ... this results in issues. + * + * To work around this (for now) simply deauth here again if we're + * now connected. + */ + if (ifmgd->associated && !sdata->local->wowlan) { + u8 bssid[ETH_ALEN]; + struct cfg80211_deauth_request req = { + .reason_code = WLAN_REASON_DEAUTH_LEAVING, + .bssid = bssid, + }; + + memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN); + ieee80211_mgd_deauth(sdata, &req); + } + sdata_unlock(sdata); } @@ -4233,15 +4342,15 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, - struct cfg80211_bss *cbss, bool assoc) + struct cfg80211_bss *cbss, bool assoc, + bool override) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_supported_band *sband; - struct ieee80211_sta_ht_cap sta_ht_cap; - bool have_sta = false, is_override = false; + bool have_sta = false; int err; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4261,14 +4370,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return -ENOMEM; } - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - - is_override = (sta_ht_cap.cap & 
IEEE80211_HT_CAP_SUP_WIDTH_20_40) != - (sband->ht_cap.cap & - IEEE80211_HT_CAP_SUP_WIDTH_20_40); - - if (new_sta || is_override) { + if (new_sta || override) { err = ieee80211_prep_channel(sdata, cbss); if (err) { if (new_sta) @@ -4345,8 +4447,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = tim_ie[2]; else sdata->vif.bss_conf.sync_dtim_count = 0; - } else if (!(local->hw.flags & - IEEE80211_HW_TIMING_BEACON_ONLY)) { + } else if (!ieee80211_hw_check(&sdata->local->hw, + TIMING_BEACON_ONLY)) { ies = rcu_dereference(cbss->proberesp_ies); /* must be non-NULL since beacon IEs were NULL */ sdata->vif.bss_conf.sync_tsf = ies->tsf; @@ -4379,6 +4481,10 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, } else WARN_ON_ONCE(!ether_addr_equal(ifmgd->bssid, cbss->bssid)); + /* Cancel scan to ensure that nothing interferes with connection */ + if (local->scanning) + ieee80211_scan_cancel(local); + return 0; } @@ -4467,13 +4573,14 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, WLAN_REASON_UNSPECIFIED, false, frame_buf); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - sizeof(frame_buf)); + ieee80211_report_disconnect(sdata, frame_buf, + sizeof(frame_buf), true, + WLAN_REASON_UNSPECIFIED); } sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid); - err = ieee80211_prep_connection(sdata, req->bss, false); + err = ieee80211_prep_connection(sdata, req->bss, false, false); if (err) goto err_clear; @@ -4488,9 +4595,12 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, return 0; err_clear: - memset(ifmgd->bssid, 0, ETH_ALEN); + eth_zero_addr(ifmgd->bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->auth_data = NULL; + mutex_lock(&sdata->local->mtx); + ieee80211_vif_release_channel(sdata); + mutex_unlock(&sdata->local->mtx); err_free: kfree(auth_data); return err; @@ -4545,6 +4655,7 @@ int ieee80211_mgd_assoc(struct 
ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; const u8 *ssidie, *ht_ie, *vht_ie; int i, err; + bool override = false; assoc_data = kzalloc(sizeof(*assoc_data) + req->ie_len, GFP_KERNEL); if (!assoc_data) @@ -4568,8 +4679,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, WLAN_REASON_UNSPECIFIED, false, frame_buf); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - sizeof(frame_buf)); + ieee80211_report_disconnect(sdata, frame_buf, + sizeof(frame_buf), true, + WLAN_REASON_UNSPECIFIED); } if (ifmgd->auth_data && !ifmgd->auth_data->done) { @@ -4648,14 +4760,6 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, } } - if (req->flags & ASSOC_REQ_DISABLE_HT) { - ifmgd->flags |= IEEE80211_STA_DISABLE_HT; - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - } - - if (req->flags & ASSOC_REQ_DISABLE_VHT) - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - /* Also disable HT if we don't support it or the AP doesn't use WMM */ sband = local->hw.wiphy->bands[req->bss->channel->band]; if (!sband->ht_cap.ht_supported || @@ -4722,7 +4826,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && - (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK), + ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK), "U-APSD not supported with HW_PS_NULLFUNC_STACK\n")) sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD; @@ -4767,14 +4871,43 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->dtim_period = 0; ifmgd->have_beacon = false; - err = ieee80211_prep_connection(sdata, req->bss, true); + /* override HT/VHT configuration only if the AP and we support it */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + struct ieee80211_sta_ht_cap sta_ht_cap; + + if (req->flags & ASSOC_REQ_DISABLE_HT) + override = true; + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + + /* check 
for 40 MHz disable override */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ) && + sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && + !(sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) + override = true; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + req->flags & ASSOC_REQ_DISABLE_VHT) + override = true; + } + + if (req->flags & ASSOC_REQ_DISABLE_HT) { + ifmgd->flags |= IEEE80211_STA_DISABLE_HT; + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + } + + if (req->flags & ASSOC_REQ_DISABLE_VHT) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + + err = ieee80211_prep_connection(sdata, req->bss, true, override); if (err) goto err_clear; rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); - if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC && + if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC) && !beacon_ies) { /* * Wait up to one beacon interval ... @@ -4801,7 +4934,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout = jiffies; assoc_data->timeout_started = true; - if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { sdata->vif.bss_conf.sync_tsf = beacon_ies->tsf; sdata->vif.bss_conf.sync_device_ts = bss->device_ts_beacon; @@ -4831,7 +4964,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, return 0; err_clear: - memset(ifmgd->bssid, 0, ETH_ALEN); + eth_zero_addr(ifmgd->bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->assoc_data = NULL; err_free: @@ -4859,8 +4992,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, req->reason_code, tx, frame_buf); ieee80211_destroy_auth_data(sdata, false); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - IEEE80211_DEAUTH_FRAME_LEN); + ieee80211_report_disconnect(sdata, frame_buf, + sizeof(frame_buf), true, + req->reason_code); return 0; } @@ -4874,8 +5008,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, 
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - IEEE80211_DEAUTH_FRAME_LEN); + ieee80211_report_disconnect(sdata, frame_buf, + sizeof(frame_buf), true, + req->reason_code); return 0; } @@ -4907,8 +5042,8 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, req->reason_code, !req->local_state_change, frame_buf); - cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, - IEEE80211_DEAUTH_FRAME_LEN); + ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, + req->reason_code); return 0; } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 683f0e3cb124..f2c75cf491fc 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -46,7 +46,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) } if (!local->offchannel_ps_enabled || - !(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) + !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. 
But if the diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index ca405b6b686d..b676b9fa707b 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -23,7 +23,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) ieee80211_del_virtual_monitor(local); - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); @@ -59,13 +59,46 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) cancel_work_sync(&local->dynamic_ps_enable_work); del_timer_sync(&local->dynamic_ps_timer); - local->wowlan = wowlan && local->open_count; + local->wowlan = wowlan; if (local->wowlan) { - int err = drv_suspend(local, wowlan); + int err; + + /* Drivers don't expect to suspend while some operations like + * authenticating or associating are in progress. It doesn't + * make sense anyway to accept that, since the authentication + * or association would never finish since the driver can't do + * that on its own. + * Thus, clean up in-progress auth/assoc first. + */ + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + if (sdata->vif.type != NL80211_IFTYPE_STATION) + continue; + ieee80211_mgd_quiesce(sdata); + /* If suspended during TX in progress, and wowlan + * is enabled (connection will be active) there + * can be a race where the driver is put out + * of power-save due to TX and during suspend + * dynamic_ps_timer is cancelled and TX packet + * is flushed, leaving the driver in ACTIVE even + * after resuming until dynamic_ps_timer puts + * driver back in DOZE. 
+ */ + if (sdata->u.mgd.associated && + sdata->u.mgd.powersave && + !(local->hw.conf.flags & IEEE80211_CONF_PS)) { + local->hw.conf.flags |= IEEE80211_CONF_PS; + ieee80211_hw_config(local, + IEEE80211_CONF_CHANGE_PS); + } + } + + err = drv_suspend(local, wowlan); if (err < 0) { local->quiescing = false; local->wowlan = false; - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { @@ -80,6 +113,13 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) return err; } else if (err > 0) { WARN_ON(err != 1); + /* cfg80211 will call back into mac80211 to disconnect + * all interfaces, allow that to proceed properly + */ + ieee80211_wake_queues_by_reason(hw, + IEEE80211_MAX_QUEUE_MAP, + IEEE80211_QUEUE_STOP_REASON_SUSPEND, + false); return err; } else { goto suspend; diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index d53355b011f5..fda33f961d83 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -103,7 +103,7 @@ ieee80211_rate_control_ops_get(const char *name) const struct rate_control_ops *ops; const char *alg_name; - kparam_block_sysfs_write(ieee80211_default_rc_algo); + kernel_param_lock(THIS_MODULE); if (!name) alg_name = ieee80211_default_rc_algo; else @@ -117,7 +117,7 @@ ieee80211_rate_control_ops_get(const char *name) /* try built-in one if specific alg requested but not found */ if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT)) ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); - kparam_unblock_sysfs_write(ieee80211_default_rc_algo); + kernel_param_unlock(THIS_MODULE); return ops; } @@ -680,12 +680,18 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, info->control.rates[i].count = 0; } - if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; - ref->ops->get_rate(ref->priv, ista, 
priv_sta, txrc); + if (ista) { + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); + spin_unlock_bh(&sta->rate_ctrl_lock); + } else { + ref->ops->get_rate(ref->priv, NULL, NULL, txrc); + } - if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE) + if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) return; ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, @@ -727,7 +733,7 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, if (local->open_count) return -EBUSY; - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) { + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { if (WARN_ON(!local->ops->set_rts_threshold)) return -EINVAL; return 0; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 38652f09feaf..25c9be5dd7fd 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -42,10 +42,12 @@ static inline void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + spin_lock_bh(&sta->rate_ctrl_lock); if (ref->ops->tx_status) ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); else ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); + spin_unlock_bh(&sta->rate_ctrl_lock); } static inline void @@ -64,7 +66,9 @@ rate_control_tx_status_noskb(struct ieee80211_local *local, if (WARN_ON_ONCE(!ref->ops->tx_status_noskb)) return; + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); + spin_unlock_bh(&sta->rate_ctrl_lock); } static inline void rate_control_rate_init(struct sta_info *sta) @@ -91,8 +95,10 @@ static inline void rate_control_rate_init(struct sta_info *sta) sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, priv_sta); + spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } @@ 
-115,18 +121,20 @@ static inline void rate_control_rate_update(struct ieee80211_local *local, return; } + spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, ista, priv_sta, changed); + spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); } drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); } static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, - struct ieee80211_sta *sta, - gfp_t gfp) + struct sta_info *sta, gfp_t gfp) { - return ref->ops->alloc_sta(ref->priv, sta, gfp); + spin_lock_init(&sta->rate_ctrl_lock); + return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp); } static inline void rate_control_free_sta(struct sta_info *sta) diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index ef6e8a6c4253..247552a7f6c2 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -69,14 +69,39 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix) return i; } +/* return current EMWA throughput */ +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma) +{ + int usecs; + + usecs = mr->perfect_tx_time; + if (!usecs) + usecs = 1000000; + + /* reset thr. 
below 10% success */ + if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100)) + return 0; + + if (prob_ewma > MINSTREL_FRAC(90, 100)) + return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs)); + else + return MINSTREL_TRUNC(100000 * (prob_ewma / usecs)); +} + /* find & sort topmost throughput rates */ static inline void minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) { int j = MAX_THR_RATES; + struct minstrel_rate_stats *tmp_mrs = &mi->r[j - 1].stats; + struct minstrel_rate_stats *cur_mrs = &mi->r[i].stats; - while (j > 0 && mi->r[i].stats.cur_tp > mi->r[tp_list[j - 1]].stats.cur_tp) + while (j > 0 && (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) > + minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma))) { j--; + tmp_mrs = &mi->r[tp_list[j - 1]].stats; + } + if (j < MAX_THR_RATES - 1) memmove(&tp_list[j + 1], &tp_list[j], MAX_THR_RATES - (j + 1)); if (j < MAX_THR_RATES) @@ -127,13 +152,47 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) rate_control_set_rates(mp->hw, mi->sta, ratetbl); } +/* +* Recalculate statistics and counters of a given rate +*/ +void +minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) +{ + if (unlikely(mrs->attempts > 0)) { + mrs->sample_skipped = 0; + mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); + if (unlikely(!mrs->att_hist)) { + mrs->prob_ewma = mrs->cur_prob; + } else { + /* update exponential weighted moving variance */ + mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd, + mrs->cur_prob, + mrs->prob_ewma, + EWMA_LEVEL); + + /*update exponential weighted moving avarage */ + mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, + mrs->cur_prob, + EWMA_LEVEL); + } + mrs->att_hist += mrs->attempts; + mrs->succ_hist += mrs->success; + } else { + mrs->sample_skipped++; + } + + mrs->last_success = mrs->success; + mrs->last_attempts = mrs->attempts; + mrs->success = 0; + mrs->attempts = 0; +} + static void minstrel_update_stats(struct minstrel_priv 
*mp, struct minstrel_sta_info *mi) { u8 tmp_tp_rate[MAX_THR_RATES]; u8 tmp_prob_rate = 0; - u32 usecs; - int i; + int i, tmp_cur_tp, tmp_prob_tp; for (i = 0; i < MAX_THR_RATES; i++) tmp_tp_rate[i] = 0; @@ -141,38 +200,15 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; + struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats; - usecs = mr->perfect_tx_time; - if (!usecs) - usecs = 1000000; - - if (unlikely(mrs->attempts > 0)) { - mrs->sample_skipped = 0; - mrs->cur_prob = MINSTREL_FRAC(mrs->success, - mrs->attempts); - mrs->succ_hist += mrs->success; - mrs->att_hist += mrs->attempts; - mrs->probability = minstrel_ewma(mrs->probability, - mrs->cur_prob, - EWMA_LEVEL); - } else - mrs->sample_skipped++; - - mrs->last_success = mrs->success; - mrs->last_attempts = mrs->attempts; - mrs->success = 0; - mrs->attempts = 0; - - /* Update throughput per rate, reset thr. below 10% success */ - if (mrs->probability < MINSTREL_FRAC(10, 100)) - mrs->cur_tp = 0; - else - mrs->cur_tp = mrs->probability * (1000000 / usecs); + /* Update statistics of success probability per rate */ + minstrel_calc_rate_stats(mrs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. 
*/ - if (mrs->probability > MINSTREL_FRAC(95, 100) || - mrs->probability < MINSTREL_FRAC(10, 100)) { + if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) || + mrs->prob_ewma < MINSTREL_FRAC(10, 100)) { mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; @@ -192,11 +228,14 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is chosen as max_prob_rate */ - if (mrs->probability >= MINSTREL_FRAC(95, 100)) { - if (mrs->cur_tp >= mi->r[tmp_prob_rate].stats.cur_tp) + if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) { + tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma); + tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate], + tmp_mrs->prob_ewma); + if (tmp_cur_tp >= tmp_prob_tp) tmp_prob_rate = i; } else { - if (mrs->probability >= mi->r[tmp_prob_rate].stats.probability) + if (mrs->prob_ewma >= tmp_mrs->prob_ewma) tmp_prob_rate = i; } } @@ -215,7 +254,7 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) #endif /* Reset update timer */ - mi->stats_update = jiffies; + mi->last_stats_update = jiffies; minstrel_update_rates(mp, mi); } @@ -253,7 +292,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, if (mi->sample_deferred > 0) mi->sample_deferred--; - if (time_after(jiffies, mi->stats_update + + if (time_after(jiffies, mi->last_stats_update + (mp->update_interval * HZ) / 1000)) minstrel_update_stats(mp, mi); } @@ -385,7 +424,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].stats.probability > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -519,7 +558,7 @@ 
minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, } mi->n_rates = n; - mi->stats_update = jiffies; + mi->last_stats_update = jiffies; init_sample_table(mi); minstrel_update_rates(mp, mi); @@ -553,7 +592,7 @@ minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) if (!mi->sample_table) goto error1; - mi->stats_update = jiffies; + mi->last_stats_update = jiffies; return mi; error1: @@ -663,12 +702,18 @@ minstrel_free(void *priv) static u32 minstrel_get_expected_throughput(void *priv_sta) { struct minstrel_sta_info *mi = priv_sta; + struct minstrel_rate_stats *tmp_mrs; int idx = mi->max_tp_rate[0]; + int tmp_cur_tp; /* convert pkt per sec in kbps (1200 is the average pkt size used for * computing cur_tp */ - return MINSTREL_TRUNC(mi->r[idx].stats.cur_tp) * 1200 * 8 / 1024; + tmp_mrs = &mi->r[idx].stats; + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma); + tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; + + return tmp_cur_tp; } const struct rate_control_ops mac80211_minstrel = { diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 410efe620c57..c230bbe93262 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -13,7 +13,6 @@ #define EWMA_DIV 128 #define SAMPLE_COLUMNS 10 /* number of columns in sample table */ - /* scaled fraction values */ #define MINSTREL_SCALE 16 #define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) @@ -24,11 +23,34 @@ /* * Perform EWMA (Exponentially Weighted Moving Average) calculation - */ + */ static inline int minstrel_ewma(int old, int new, int weight) { - return (new * (EWMA_DIV - weight) + old * weight) / EWMA_DIV; + int diff, incr; + + diff = new - old; + incr = (EWMA_DIV - weight) * diff / EWMA_DIV; + + return old + incr; +} + +/* + * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation + */ +static inline int +minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight) +{ + int 
diff, incr, tmp_var; + + /* calculate exponential weighted moving variance */ + diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000); + incr = (EWMA_DIV - weight) * diff / EWMA_DIV; + tmp_var = old_ewmsd * old_ewmsd; + tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV; + + /* return standard deviation */ + return (u16) int_sqrt(tmp_var); } struct minstrel_rate_stats { @@ -39,11 +61,13 @@ struct minstrel_rate_stats { /* total attempts/success counters */ u64 att_hist, succ_hist; - /* current throughput */ - unsigned int cur_tp; - - /* packet delivery probabilities */ - unsigned int cur_prob, probability; + /* statistis of packet delivery probability + * cur_prob - current prob within last update intervall + * prob_ewma - exponential weighted moving average of prob + * prob_ewmsd - exp. weighted moving standard deviation of prob */ + unsigned int cur_prob; + unsigned int prob_ewma; + u16 prob_ewmsd; /* maximum retry counts */ u8 retry_count; @@ -71,7 +95,7 @@ struct minstrel_rate { struct minstrel_sta_info { struct ieee80211_sta *sta; - unsigned long stats_update; + unsigned long last_stats_update; unsigned int sp_ack_dur; unsigned int rate_avg; @@ -95,6 +119,7 @@ struct minstrel_sta_info { #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *dbg_stats; + struct dentry *dbg_stats_csv; #endif }; @@ -121,7 +146,6 @@ struct minstrel_priv { u32 fixed_rate_idx; struct dentry *dbg_fixed_rate; #endif - }; struct minstrel_debugfs_info { @@ -133,8 +157,13 @@ extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); +/* Recalculate success probabilities and counters for a given rate using EWMA */ +void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs); +int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma); + /* debugfs */ int minstrel_stats_open(struct inode *inode, struct file *file); +int 
minstrel_stats_csv_open(struct inode *inode, struct file *file); ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); int minstrel_stats_release(struct inode *inode, struct file *file); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 2acab1bcaa4b..1db5f7c3318a 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -54,12 +54,28 @@ #include <net/mac80211.h> #include "rc80211_minstrel.h" +ssize_t +minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) +{ + struct minstrel_debugfs_info *ms; + + ms = file->private_data; + return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len); +} + +int +minstrel_stats_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + int minstrel_stats_open(struct inode *inode, struct file *file) { struct minstrel_sta_info *mi = inode->i_private; struct minstrel_debugfs_info *ms; - unsigned int i, tp, prob, eprob; + unsigned int i, tp_max, tp_avg, prob, eprob; char *p; ms = kmalloc(2048, GFP_KERNEL); @@ -68,8 +84,14 @@ minstrel_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; - p += sprintf(p, "rate tpt eprob *prob" - " *ok(*cum) ok( cum)\n"); + p += sprintf(p, "\n"); + p += sprintf(p, "best __________rate_________ ______" + "statistics______ ________last_______ " + "______sum-of________\n"); + p += sprintf(p, "rate [name idx airtime max_tp] [ ø(tp) ø(prob) " + "sd(prob)] [prob.|retry|suc|att] " + "[#success | #attempts]\n"); + for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; @@ -79,18 +101,26 @@ minstrel_stats_open(struct inode *inode, struct file *file) *(p++) = (i == mi->max_tp_rate[2]) ? 'C' : ' '; *(p++) = (i == mi->max_tp_rate[3]) ? 'D' : ' '; *(p++) = (i == mi->max_prob_rate) ? 
'P' : ' '; - p += sprintf(p, "%3u%s", mr->bitrate / 2, + + p += sprintf(p, " %3u%s ", mr->bitrate / 2, (mr->bitrate & 1 ? ".5" : " ")); + p += sprintf(p, "%3u ", i); + p += sprintf(p, "%6u ", mr->perfect_tx_time); - tp = MINSTREL_TRUNC(mrs->cur_tp / 10); + tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); - eprob = MINSTREL_TRUNC(mrs->probability * 1000); + eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); - p += sprintf(p, " %4u.%1u %3u.%1u %3u.%1u" - " %4u(%4u) %9llu(%9llu)\n", - tp / 10, tp % 10, + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + " %3u.%1u %3u %3u %-3u " + "%9llu %-9llu\n", + tp_max / 10, tp_max % 10, + tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, + mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, prob / 10, prob % 10, + mrs->retry_count, mrs->last_success, mrs->last_attempts, (unsigned long long)mrs->succ_hist, @@ -107,25 +137,75 @@ minstrel_stats_open(struct inode *inode, struct file *file) return 0; } -ssize_t -minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) +static const struct file_operations minstrel_stat_fops = { + .owner = THIS_MODULE, + .open = minstrel_stats_open, + .read = minstrel_stats_read, + .release = minstrel_stats_release, + .llseek = default_llseek, +}; + +int +minstrel_stats_csv_open(struct inode *inode, struct file *file) { + struct minstrel_sta_info *mi = inode->i_private; struct minstrel_debugfs_info *ms; + unsigned int i, tp_max, tp_avg, prob, eprob; + char *p; - ms = file->private_data; - return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len); -} + ms = kmalloc(2048, GFP_KERNEL); + if (!ms) + return -ENOMEM; + + file->private_data = ms; + p = ms->buf; + + for (i = 0; i < mi->n_rates; i++) { + struct minstrel_rate *mr = &mi->r[i]; + struct minstrel_rate_stats *mrs = &mi->r[i].stats; + + p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? 
"A" : "")); + p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); + p += sprintf(p, "%s" ,((i == mi->max_tp_rate[2]) ? "C" : "")); + p += sprintf(p, "%s" ,((i == mi->max_tp_rate[3]) ? "D" : "")); + p += sprintf(p, "%s" ,((i == mi->max_prob_rate) ? "P" : "")); + + p += sprintf(p, ",%u%s", mr->bitrate / 2, + (mr->bitrate & 1 ? ".5," : ",")); + p += sprintf(p, "%u,", i); + p += sprintf(p, "%u,",mr->perfect_tx_time); + + tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); + tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); + prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," + "%llu,%llu,%d,%d\n", + tp_max / 10, tp_max % 10, + tp_avg / 10, tp_avg % 10, + eprob / 10, eprob % 10, + mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, + prob / 10, prob % 10, + mrs->retry_count, + mrs->last_success, + mrs->last_attempts, + (unsigned long long)mrs->succ_hist, + (unsigned long long)mrs->att_hist, + mi->total_packets - mi->sample_packets, + mi->sample_packets); + + } + ms->len = p - ms->buf; + + WARN_ON(ms->len + sizeof(*ms) > 2048); -int -minstrel_stats_release(struct inode *inode, struct file *file) -{ - kfree(file->private_data); return 0; } -static const struct file_operations minstrel_stat_fops = { +static const struct file_operations minstrel_stat_csv_fops = { .owner = THIS_MODULE, - .open = minstrel_stats_open, + .open = minstrel_stats_csv_open, .read = minstrel_stats_read, .release = minstrel_stats_release, .llseek = default_llseek, @@ -138,6 +218,9 @@ minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi, &minstrel_stat_fops); + + mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, dir, + mi, &minstrel_stat_csv_fops); } void @@ -146,4 +229,6 @@ minstrel_remove_sta_debugfs(void *priv, void *priv_sta) struct minstrel_sta_info *mi = priv_sta; 
debugfs_remove(mi->dbg_stats); + + debugfs_remove(mi->dbg_stats_csv); } diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 80452cfd2dc5..543b67233535 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -17,10 +17,11 @@ #include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" +#define AVG_AMPDU_SIZE 16 #define AVG_PKT_SIZE 1200 /* Number of bits for an average sized packet */ -#define MCS_NBITS (AVG_PKT_SIZE << 3) +#define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3) /* Number of symbols for a packet with (bps) bits per symbol */ #define MCS_NSYMS(bps) DIV_ROUND_UP(MCS_NBITS, (bps)) @@ -33,7 +34,8 @@ ) /* Transmit duration for the raw data part of an average sized packet */ -#define MCS_DURATION(streams, sgi, bps) MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) +#define MCS_DURATION(streams, sgi, bps) \ + (MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) / AVG_AMPDU_SIZE) #define BW_20 0 #define BW_40 1 @@ -311,67 +313,35 @@ minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) return &mi->groups[index / MCS_GROUP_RATES].rates[index % MCS_GROUP_RATES]; } - /* - * Recalculate success probabilities and counters for a rate using EWMA + * Return current throughput based on the average A-MPDU length, taking into + * account the expected number of retransmissions and their expected length */ -static void -minstrel_calc_rate_ewma(struct minstrel_rate_stats *mr) +int +minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, + int prob_ewma) { - if (unlikely(mr->attempts > 0)) { - mr->sample_skipped = 0; - mr->cur_prob = MINSTREL_FRAC(mr->success, mr->attempts); - if (!mr->att_hist) - mr->probability = mr->cur_prob; - else - mr->probability = minstrel_ewma(mr->probability, - mr->cur_prob, EWMA_LEVEL); - mr->att_hist += mr->attempts; - mr->succ_hist += mr->success; - } else { - mr->sample_skipped++; - } - mr->last_success = mr->success; - mr->last_attempts = 
mr->attempts; - mr->success = 0; - mr->attempts = 0; -} - -/* - * Calculate throughput based on the average A-MPDU length, taking into account - * the expected number of retransmissions and their expected length - */ -static void -minstrel_ht_calc_tp(struct minstrel_ht_sta *mi, int group, int rate) -{ - struct minstrel_rate_stats *mr; unsigned int nsecs = 0; - unsigned int tp; - unsigned int prob; - mr = &mi->groups[group].rates[rate]; - prob = mr->probability; - - if (prob < MINSTREL_FRAC(1, 10)) { - mr->cur_tp = 0; - return; - } - - /* - * For the throughput calculation, limit the probability value to 90% to - * account for collision related packet error rate fluctuation - */ - if (prob > MINSTREL_FRAC(9, 10)) - prob = MINSTREL_FRAC(9, 10); + /* do not account throughput if sucess prob is below 10% */ + if (prob_ewma < MINSTREL_FRAC(10, 100)) + return 0; if (group != MINSTREL_CCK_GROUP) nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len); nsecs += minstrel_mcs_groups[group].duration[rate]; - /* prob is scaled - see MINSTREL_FRAC above */ - tp = 1000000 * ((prob * 1000) / nsecs); - mr->cur_tp = MINSTREL_TRUNC(tp); + /* + * For the throughput calculation, limit the probability value to 90% to + * account for collision related packet error rate fluctuation + * (prob is scaled - see MINSTREL_FRAC above) + */ + if (prob_ewma > MINSTREL_FRAC(90, 100)) + return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000) + / nsecs)); + else + return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs)); } /* @@ -385,22 +355,23 @@ static void minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, u16 *tp_list) { - int cur_group, cur_idx, cur_thr, cur_prob; - int tmp_group, tmp_idx, tmp_thr, tmp_prob; + int cur_group, cur_idx, cur_tp_avg, cur_prob; + int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int j = MAX_THR_RATES; cur_group = index / MCS_GROUP_RATES; cur_idx = index % MCS_GROUP_RATES; - cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp; - 
cur_prob = mi->groups[cur_group].rates[cur_idx].probability; + cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma; + cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; - if (cur_thr < tmp_thr || - (cur_thr == tmp_thr && cur_prob <= tmp_prob)) + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, + tmp_prob); + if (cur_tp_avg < tmp_tp_avg || + (cur_tp_avg == tmp_tp_avg && cur_prob <= tmp_prob)) break; j--; } while (j > 0); @@ -420,16 +391,21 @@ static void minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) { struct minstrel_mcs_group_data *mg; - struct minstrel_rate_stats *mr; - int tmp_group, tmp_idx, tmp_tp, tmp_prob, max_tp_group; + struct minstrel_rate_stats *mrs; + int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; + int max_tp_group, cur_tp_avg, cur_group, cur_idx; + int max_gpr_group, max_gpr_idx; + int max_gpr_tp_avg, max_gpr_prob; + cur_group = index / MCS_GROUP_RATES; + cur_idx = index % MCS_GROUP_RATES; mg = &mi->groups[index / MCS_GROUP_RATES]; - mr = &mg->rates[index % MCS_GROUP_RATES]; + mrs = &mg->rates[index % MCS_GROUP_RATES]; tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; - tmp_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */ @@ -438,15 +414,24 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) (max_tp_group != 
MINSTREL_CCK_GROUP)) return; - if (mr->probability > MINSTREL_FRAC(75, 100)) { - if (mr->cur_tp > tmp_tp) + if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { + cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, + mrs->prob_ewma); + if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; - if (mr->cur_tp > mg->rates[mg->max_group_prob_rate].cur_tp) + + max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; + max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, + max_gpr_idx, + max_gpr_prob); + if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { - if (mr->probability > tmp_prob) + if (mrs->prob_ewma > tmp_prob) mi->max_prob_rate = index; - if (mr->probability > mg->rates[mg->max_group_prob_rate].probability) + if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma) mg->max_group_prob_rate = index; } } @@ -463,16 +448,18 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, u16 tmp_mcs_tp_rate[MAX_THR_RATES], u16 tmp_cck_tp_rate[MAX_THR_RATES]) { - unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp; + unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp, tmp_prob; int i; tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; - tmp_cck_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; - tmp_mcs_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma; + tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { @@ -491,8 +478,7 @@ static inline void 
minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; - struct minstrel_rate_stats *mr; - int tmp_max_streams, group; + int tmp_max_streams, group, tmp_idx, tmp_prob; int tmp_tp = 0; tmp_max_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / @@ -501,11 +487,16 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) mg = &mi->groups[group]; if (!mg->supported || group == MINSTREL_CCK_GROUP) continue; - mr = minstrel_get_ratestats(mi, mg->max_group_prob_rate); - if (tmp_tp < mr->cur_tp && + + tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma; + + if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { mi->max_prob_rate = mg->max_group_prob_rate; - tmp_tp = mr->cur_tp; + tmp_tp = minstrel_ht_get_tp_avg(mi, group, + tmp_idx, + tmp_prob); } } } @@ -523,8 +514,8 @@ static void minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; - struct minstrel_rate_stats *mr; - int group, i, j; + struct minstrel_rate_stats *mrs; + int group, i, j, cur_prob; u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; u16 tmp_cck_tp_rate[MAX_THR_RATES], index; @@ -563,12 +554,12 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) index = MCS_GROUP_RATES * group + i; - mr = &mg->rates[i]; - mr->retry_updated = false; - minstrel_calc_rate_ewma(mr); - minstrel_ht_calc_tp(mi, group, i); + mrs = &mg->rates[i]; + mrs->retry_updated = false; + minstrel_calc_rate_stats(mrs); + cur_prob = mrs->prob_ewma; - if (!mr->cur_tp) + if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; /* Find max throughput rate set */ @@ -612,7 +603,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) #endif /* Reset update timer */ - mi->stats_update = jiffies; + mi->last_stats_update 
= jiffies; } static bool @@ -635,7 +626,7 @@ minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct ieee80211_tx_rate *rat } static void -minstrel_next_sample_idx(struct minstrel_ht_sta *mi) +minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; @@ -776,7 +767,8 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, update = true; } - if (time_after(jiffies, mi->stats_update + (mp->update_interval / 2 * HZ) / 1000)) { + if (time_after(jiffies, mi->last_stats_update + + (mp->update_interval / 2 * HZ) / 1000)) { update = true; minstrel_ht_update_stats(mp, mi); } @@ -789,7 +781,7 @@ static void minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, int index) { - struct minstrel_rate_stats *mr; + struct minstrel_rate_stats *mrs; const struct mcs_group *group; unsigned int tx_time, tx_time_rtscts, tx_time_data; unsigned int cw = mp->cw_min; @@ -798,16 +790,16 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, unsigned int ampdu_len = MINSTREL_TRUNC(mi->avg_ampdu_len); unsigned int overhead = 0, overhead_rtscts = 0; - mr = minstrel_get_ratestats(mi, index); - if (mr->probability < MINSTREL_FRAC(1, 10)) { - mr->retry_count = 1; - mr->retry_count_rtscts = 1; + mrs = minstrel_get_ratestats(mi, index); + if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) { + mrs->retry_count = 1; + mrs->retry_count_rtscts = 1; return; } - mr->retry_count = 2; - mr->retry_count_rtscts = 2; - mr->retry_updated = true; + mrs->retry_count = 2; + mrs->retry_count_rtscts = 2; + mrs->retry_updated = true; group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len / 1000; @@ -838,9 +830,9 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, tx_time_rtscts += ctime + overhead_rtscts + tx_time_data; if (tx_time_rtscts < mp->segment_size) - mr->retry_count_rtscts++; + mrs->retry_count_rtscts++; } 
while ((tx_time < mp->segment_size) && - (++mr->retry_count < mp->max_retry)); + (++mrs->retry_count < mp->max_retry)); } @@ -849,22 +841,22 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_sta_rates *ratetbl, int offset, int index) { const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES]; - struct minstrel_rate_stats *mr; + struct minstrel_rate_stats *mrs; u8 idx; u16 flags = group->flags; - mr = minstrel_get_ratestats(mi, index); - if (!mr->retry_updated) + mrs = minstrel_get_ratestats(mi, index); + if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); - if (mr->probability < MINSTREL_FRAC(20, 100) || !mr->retry_count) { + if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; } else { - ratetbl->rate[offset].count = mr->retry_count; - ratetbl->rate[offset].count_cts = mr->retry_count; - ratetbl->rate[offset].count_rts = mr->retry_count_rtscts; + ratetbl->rate[offset].count = mrs->retry_count; + ratetbl->rate[offset].count_cts = mrs->retry_count; + ratetbl->rate[offset].count_rts = mrs->retry_count_rtscts; } if (index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) @@ -922,7 +914,7 @@ minstrel_get_duration(int index) static int minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { - struct minstrel_rate_stats *mr; + struct minstrel_rate_stats *mrs; struct minstrel_mcs_group_data *mg; unsigned int sample_dur, sample_group, cur_max_tp_streams; int sample_idx = 0; @@ -938,12 +930,12 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) sample_group = mi->sample_group; mg = &mi->groups[sample_group]; sample_idx = sample_table[mg->column][mg->index]; - minstrel_next_sample_idx(mi); + minstrel_set_next_sample_idx(mi); if (!(mg->supported & BIT(sample_idx))) return -1; - mr = &mg->rates[sample_idx]; + mrs = 
&mg->rates[sample_idx]; sample_idx += sample_group * MCS_GROUP_RATES; /* @@ -960,7 +952,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * Do not sample if the probability is already higher than 95% * to avoid wasting airtime. */ - if (mr->probability > MINSTREL_FRAC(95, 100)) + if (mrs->prob_ewma > MINSTREL_FRAC(95, 100)) return -1; /* @@ -975,7 +967,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) (cur_max_tp_streams - 1 < minstrel_mcs_groups[sample_group].streams || sample_dur >= minstrel_get_duration(mi->max_prob_rate))) { - if (mr->sample_skipped < 20) + if (mrs->sample_skipped < 20) return -1; if (mi->sample_slow++ > 2) @@ -1078,7 +1070,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, if (sband->band != IEEE80211_BAND_2GHZ) return; - if (!(mp->hw->flags & IEEE80211_HW_SUPPORTS_HT_CCK_RATES)) + if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; mi->cck_supported = 0; @@ -1129,7 +1121,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, memset(mi, 0, sizeof(*mi)); mi->sta = sta; - mi->stats_update = jiffies; + mi->last_stats_update = jiffies; ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1, 0); mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1, 0); @@ -1326,16 +1318,19 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) { struct minstrel_ht_sta_priv *msp = priv_sta; struct minstrel_ht_sta *mi = &msp->ht; - int i, j; + int i, j, prob, tp_avg; if (!msp->is_ht) return mac80211_minstrel.get_expected_throughput(priv_sta); i = mi->max_tp_rate[0] / MCS_GROUP_RATES; j = mi->max_tp_rate[0] % MCS_GROUP_RATES; + prob = mi->groups[i].rates[j].prob_ewma; + + /* convert tp_avg from pkt per second in kbps */ + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024; - /* convert cur_tp from pkt per second in kbps */ - return mi->groups[i].rates[j].cur_tp * AVG_PKT_SIZE * 8 / 
1024; + return tp_avg; } static const struct rate_control_ops mac80211_minstrel_ht = { diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index f2217d6aa0c2..e8b52a94d24b 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -78,7 +78,7 @@ struct minstrel_ht_sta { u16 max_prob_rate; /* time of last status update */ - unsigned long stats_update; + unsigned long last_stats_update; /* overhead time in usec for each frame */ unsigned int overhead; @@ -112,6 +112,7 @@ struct minstrel_ht_sta_priv { }; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *dbg_stats; + struct dentry *dbg_stats_csv; #endif void *ratelist; void *sample_table; @@ -120,5 +121,7 @@ struct minstrel_ht_sta_priv { void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta); +int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, + int prob_ewma); #endif diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 20c676b8e5b6..6822ce0f95e5 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -19,7 +19,7 @@ static char * minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) { const struct mcs_group *mg; - unsigned int j, tp, prob, eprob; + unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; char htmode = '2'; char gimode = 'L'; u32 gflags; @@ -38,19 +38,26 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) gimode = 'S'; for (j = 0; j < MCS_GROUP_RATES; j++) { - struct minstrel_rate_stats *mr = &mi->groups[i].rates[j]; + struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; if (!(mi->groups[i].supported & BIT(j))) continue; - if (gflags & IEEE80211_TX_RC_MCS) - p += sprintf(p, " HT%c0/%cGI ", htmode, gimode); 
- else if (gflags & IEEE80211_TX_RC_VHT_MCS) - p += sprintf(p, "VHT%c0/%cGI ", htmode, gimode); - else - p += sprintf(p, " CCK/%cP ", j < 4 ? 'L' : 'S'); + if (gflags & IEEE80211_TX_RC_MCS) { + p += sprintf(p, "HT%c0 ", htmode); + p += sprintf(p, "%cGI ", gimode); + p += sprintf(p, "%d ", mg->streams); + } else if (gflags & IEEE80211_TX_RC_VHT_MCS) { + p += sprintf(p, "VHT%c0 ", htmode); + p += sprintf(p, "%cGI ", gimode); + p += sprintf(p, "%d ", mg->streams); + } else { + p += sprintf(p, "CCK "); + p += sprintf(p, "%cP ", j < 4 ? 'L' : 'S'); + p += sprintf(p, "1 "); + } *(p++) = (idx == mi->max_tp_rate[0]) ? 'A' : ' '; *(p++) = (idx == mi->max_tp_rate[1]) ? 'B' : ' '; @@ -59,29 +66,39 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) *(p++) = (idx == mi->max_prob_rate) ? 'P' : ' '; if (gflags & IEEE80211_TX_RC_MCS) { - p += sprintf(p, " MCS%-2u ", (mg->streams - 1) * 8 + j); + p += sprintf(p, " MCS%-2u", (mg->streams - 1) * 8 + j); } else if (gflags & IEEE80211_TX_RC_VHT_MCS) { - p += sprintf(p, " MCS%-1u/%1u", j, mg->streams); + p += sprintf(p, " MCS%-1u/%1u", j, mg->streams); } else { int r = bitrates[j % 4]; - p += sprintf(p, " %2u.%1uM ", r / 10, r % 10); + p += sprintf(p, " %2u.%1uM", r / 10, r % 10); } - tp = mr->cur_tp / 10; - prob = MINSTREL_TRUNC(mr->cur_prob * 1000); - eprob = MINSTREL_TRUNC(mr->probability * 1000); + p += sprintf(p, " %3u ", idx); - p += sprintf(p, " %4u.%1u %3u.%1u %3u.%1u " - "%3u %4u(%4u) %9llu(%9llu)\n", - tp / 10, tp % 10, + /* tx_time[rate(i)] in usec */ + tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000); + p += sprintf(p, "%6u ", tx_time); + + tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); + prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + + p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" + " %3u.%1u %3u %3u %-3u " + "%9llu %-9llu\n", + tp_max / 10, tp_max % 10, + tp_avg / 10, 
tp_avg % 10, eprob / 10, eprob % 10, + mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, prob / 10, prob % 10, - mr->retry_count, - mr->last_success, - mr->last_attempts, - (unsigned long long)mr->succ_hist, - (unsigned long long)mr->att_hist); + mrs->retry_count, + mrs->last_success, + mrs->last_attempts, + (unsigned long long)mrs->succ_hist, + (unsigned long long)mrs->att_hist); } return p; @@ -94,8 +111,8 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) struct minstrel_ht_sta *mi = &msp->ht; struct minstrel_debugfs_info *ms; unsigned int i; - char *p; int ret; + char *p; if (!msp->is_ht) { inode->i_private = &msp->legacy; @@ -110,8 +127,14 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; - p += sprintf(p, " type rate tpt eprob *prob " - "ret *ok(*cum) ok( cum)\n"); + + p += sprintf(p, "\n"); + p += sprintf(p, " best ____________rate__________ " + "______statistics______ ________last_______ " + "______sum-of________\n"); + p += sprintf(p, "mode guard # rate [name idx airtime max_tp] " + "[ ø(tp) ø(prob) sd(prob)] [prob.|retry|suc|att] [#success | " + "#attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) @@ -123,11 +146,10 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) "lookaround %d\n", max(0, (int) mi->total_packets - (int) mi->sample_packets), mi->sample_packets); - p += sprintf(p, "Average A-MPDU length: %d.%d\n", + p += sprintf(p, "Average # of aggregated frames per A-MPDU: %d.%d\n", MINSTREL_TRUNC(mi->avg_ampdu_len), MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10); ms->len = p - ms->buf; - WARN_ON(ms->len + sizeof(*ms) > 32768); return nonseekable_open(inode, file); @@ -141,6 +163,143 @@ static const struct file_operations minstrel_ht_stat_fops = { .llseek = no_llseek, }; +static char * +minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) +{ + const struct mcs_group *mg; + unsigned int j, tp_max, 
tp_avg, prob, eprob, tx_time; + char htmode = '2'; + char gimode = 'L'; + u32 gflags; + + if (!mi->groups[i].supported) + return p; + + mg = &minstrel_mcs_groups[i]; + gflags = mg->flags; + + if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) + htmode = '4'; + else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) + htmode = '8'; + if (gflags & IEEE80211_TX_RC_SHORT_GI) + gimode = 'S'; + + for (j = 0; j < MCS_GROUP_RATES; j++) { + struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; + static const int bitrates[4] = { 10, 20, 55, 110 }; + int idx = i * MCS_GROUP_RATES + j; + + if (!(mi->groups[i].supported & BIT(j))) + continue; + + if (gflags & IEEE80211_TX_RC_MCS) { + p += sprintf(p, "HT%c0,", htmode); + p += sprintf(p, "%cGI,", gimode); + p += sprintf(p, "%d,", mg->streams); + } else if (gflags & IEEE80211_TX_RC_VHT_MCS) { + p += sprintf(p, "VHT%c0,", htmode); + p += sprintf(p, "%cGI,", gimode); + p += sprintf(p, "%d,", mg->streams); + } else { + p += sprintf(p, "CCK,"); + p += sprintf(p, "%cP,", j < 4 ? 'L' : 'S'); + p += sprintf(p, "1,"); + } + + p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[0]) ? "A" : "")); + p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[1]) ? "B" : "")); + p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[2]) ? "C" : "")); + p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[3]) ? "D" : "")); + p += sprintf(p, "%s" ,((idx == mi->max_prob_rate) ? 
"P" : "")); + + if (gflags & IEEE80211_TX_RC_MCS) { + p += sprintf(p, ",MCS%-2u,", (mg->streams - 1) * 8 + j); + } else if (gflags & IEEE80211_TX_RC_VHT_MCS) { + p += sprintf(p, ",MCS%-1u/%1u,", j, mg->streams); + } else { + int r = bitrates[j % 4]; + p += sprintf(p, ",%2u.%1uM,", r / 10, r % 10); + } + + p += sprintf(p, "%u,", idx); + tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000); + p += sprintf(p, "%u,", tx_time); + + tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); + prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," + "%u,%llu,%llu,", + tp_max / 10, tp_max % 10, + tp_avg / 10, tp_avg % 10, + eprob / 10, eprob % 10, + mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, + prob / 10, prob % 10, + mrs->retry_count, + mrs->last_success, + mrs->last_attempts, + (unsigned long long)mrs->succ_hist, + (unsigned long long)mrs->att_hist); + p += sprintf(p, "%d,%d,%d.%d\n", + max(0, (int) mi->total_packets - + (int) mi->sample_packets), + mi->sample_packets, + MINSTREL_TRUNC(mi->avg_ampdu_len), + MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10); + } + + return p; +} + +static int +minstrel_ht_stats_csv_open(struct inode *inode, struct file *file) +{ + struct minstrel_ht_sta_priv *msp = inode->i_private; + struct minstrel_ht_sta *mi = &msp->ht; + struct minstrel_debugfs_info *ms; + unsigned int i; + int ret; + char *p; + + if (!msp->is_ht) { + inode->i_private = &msp->legacy; + ret = minstrel_stats_csv_open(inode, file); + inode->i_private = msp; + return ret; + } + + ms = kmalloc(32768, GFP_KERNEL); + + if (!ms) + return -ENOMEM; + + file->private_data = ms; + + p = ms->buf; + + p = minstrel_ht_stats_csv_dump(mi, MINSTREL_CCK_GROUP, p); + for (i = 0; i < MINSTREL_CCK_GROUP; i++) + p = minstrel_ht_stats_csv_dump(mi, i, p); + for (i++; i < ARRAY_SIZE(mi->groups); i++) + p = 
minstrel_ht_stats_csv_dump(mi, i, p); + + ms->len = p - ms->buf; + WARN_ON(ms->len + sizeof(*ms) > 32768); + + return nonseekable_open(inode, file); +} + +static const struct file_operations minstrel_ht_stat_csv_fops = { + .owner = THIS_MODULE, + .open = minstrel_ht_stats_csv_open, + .read = minstrel_stats_read, + .release = minstrel_stats_release, + .llseek = no_llseek, +}; + void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { @@ -148,6 +307,8 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp, &minstrel_ht_stat_fops); + msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, + dir, msp, &minstrel_ht_stat_csv_fops); } void @@ -156,4 +317,5 @@ minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta) struct minstrel_ht_sta_priv *msp = priv_sta; debugfs_remove(msp->dbg_stats); + debugfs_remove(msp->dbg_stats_csv); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1eb730bf8752..5dae166cb7f5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -32,6 +32,16 @@ #include "wme.h" #include "rate.h" +static inline void ieee80211_rx_stats(struct net_device *dev, u32 len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += len; + u64_stats_update_end(&tstats->syncp); +} + /* * monitor mode reception * @@ -42,7 +52,7 @@ static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, struct sk_buff *skb, unsigned int rtap_vendor_space) { - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) { + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { if (likely(skb->len > FCS_LEN)) __pskb_trim(skb, skb->len - FCS_LEN); else { @@ -100,7 +110,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len = ALIGN(len, 8); len += 8; } - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if 
(ieee80211_hw_check(&local->hw, SIGNAL_DBM)) len += 1; /* antenna field, if we don't have per-chain info */ @@ -175,7 +185,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } mpdulen = skb->len; - if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))) + if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS))) mpdulen += FCS_LEN; rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len); @@ -229,7 +239,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } /* IEEE80211_RADIOTAP_FLAGS */ - if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)) + if (has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) *pos |= IEEE80211_RADIOTAP_F_FCS; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) *pos |= IEEE80211_RADIOTAP_F_BADFCS; @@ -279,7 +289,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */ - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM && + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM) && !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { *pos = status->signal; rthdr->it_present |= @@ -448,7 +458,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, * the SKB because it has a bad FCS/PLCP checksum. 
*/ - if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) present_fcs_len = FCS_LEN; /* ensure hdr->frame_control and vendor radiotap data are in skb head */ @@ -529,8 +539,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, } prev_dev = sdata->dev; - sdata->dev->stats.rx_packets++; - sdata->dev->stats.rx_bytes += skb->len; + ieee80211_rx_stats(sdata->dev, skb->len); } if (prev_dev) { @@ -981,7 +990,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_local *local = rx->local; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct sta_info *sta = rx->sta; struct tid_ampdu_rx *tid_agg_rx; u16 sc; @@ -1016,10 +1024,6 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL) goto dont_reorder; - /* not actually part of this BA session */ - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - goto dont_reorder; - /* new, potentially un-ordered, ampdu frame - process it */ /* reset session timer */ @@ -1073,10 +1077,8 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) if (unlikely(ieee80211_has_retry(hdr->frame_control) && rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) { - if (status->rx_flags & IEEE80211_RX_RA_MATCH) { - rx->local->dot11FrameDuplicateCount++; - rx->sta->num_duplicates++; - } + I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount); + rx->sta->num_duplicates++; return RX_DROP_UNUSABLE; } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) { rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl; @@ -1185,6 +1187,7 @@ static void sta_ps_start(struct sta_info *sta) struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; + int tid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || 
sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) @@ -1194,10 +1197,24 @@ static void sta_ps_start(struct sta_info *sta) atomic_inc(&ps->num_sta_ps); set_sta_flag(sta, WLAN_STA_PS_STA); - if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta); ps_dbg(sdata, "STA %pM aid %d enters power save mode\n", sta->sta.addr, sta->sta.aid); + + ieee80211_clear_fast_xmit(sta); + + if (!sta->sta.txq[0]) + return; + + for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { + struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); + + if (!skb_queue_len(&txqi->queue)) + set_bit(tid, &sta->txq_buffered_tids); + else + clear_bit(tid, &sta->txq_buffered_tids); + } } static void sta_ps_end(struct sta_info *sta) @@ -1228,7 +1245,7 @@ int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start) struct sta_info *sta_inf = container_of(sta, struct sta_info, sta); bool in_ps; - WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS)); + WARN_ON(!ieee80211_hw_check(&sta_inf->local->hw, AP_LINK_PS)); /* Don't let the same PS state be set twice */ in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA); @@ -1252,7 +1269,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); int tid, ac; - if (!rx->sta || !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + if (!rx->sta) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_AP && @@ -1264,7 +1281,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) * uAPSD and PS-Poll frames (the latter shouldn't even come up from * it to mac80211 since they're handled.) 
*/ - if (sdata->local->hw.flags & IEEE80211_HW_AP_LINK_PS) + if (ieee80211_hw_check(&sdata->local->hw, AP_LINK_PS)) return RX_CONTINUE; /* @@ -1354,11 +1371,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) } } } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) { - u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, - NL80211_IFTYPE_OCB); - /* OCB uses wild-card BSSID */ - if (is_broadcast_ether_addr(bssid)) - sta->last_rx = jiffies; + sta->last_rx = jiffies; } else if (!is_multicast_ether_addr(hdr->addr1)) { /* * Mesh beacons will update last_rx when if they are found to @@ -1373,9 +1386,6 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) } } - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_CONTINUE; - if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_notify(rx->sdata, hdr); @@ -1403,7 +1413,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Change STA power saving mode only at the end of a frame * exchange sequence. */ - if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) && + if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) && !ieee80211_has_morefrags(hdr->frame_control) && !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) && (rx->sdata->vif.type == NL80211_IFTYPE_AP || @@ -1504,13 +1514,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * possible. */ - /* - * No point in finding a key and decrypting if the frame is neither - * addressed to us nor a multicast frame. 
- */ - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_CONTINUE; - /* start without a key */ rx->key = NULL; fc = hdr->frame_control; @@ -1782,7 +1785,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) frag = sc & IEEE80211_SCTL_FRAG; if (is_multicast_ether_addr(hdr->addr1)) { - rx->local->dot11MulticastReceivedFrameCount++; + I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount); goto out_no_led; } @@ -1865,7 +1868,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->skb = __skb_dequeue(&entry->skb_list); if (skb_tailroom(rx->skb) < entry->extra_len) { - I802_DEBUG_INC(rx->local->rx_expand_skb_head2); + I802_DEBUG_INC(rx->local->rx_expand_skb_head_defrag); if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len, GFP_ATOMIC))) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); @@ -1913,8 +1916,7 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) /* Drop unencrypted frames if key is set. */ if (unlikely(!ieee80211_has_protected(fc) && !ieee80211_is_nullfunc(fc) && - ieee80211_is_data(fc) && - (rx->key || rx->sdata->drop_unencrypted))) + ieee80211_is_data(fc) && rx->key)) return -EACCES; return 0; @@ -2042,15 +2044,15 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) struct sk_buff *skb, *xmit_skb; struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; struct sta_info *dsta; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); skb = rx->skb; xmit_skb = NULL; + ieee80211_rx_stats(dev, skb->len); + if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && - (status->rx_flags & IEEE80211_RX_RA_MATCH) && (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { if (is_multicast_ether_addr(ehdr->h_dest)) { /* @@ -2106,7 +2108,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, 
sizeof(skb->cb)); - if (rx->local->napi) + if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) && + rx->local->napi) napi_gro_receive(rx->local->napi, skb); else netif_receive_skb(skb); @@ -2174,8 +2177,6 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) dev_kfree_skb(rx->skb); continue; } - dev->stats.rx_packets++; - dev->stats.rx_bytes += rx->skb->len; ieee80211_deliver_skb(rx); } @@ -2193,7 +2194,6 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct sk_buff *skb = rx->skb, *fwd_skb; struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u16 q, hdrlen; @@ -2224,8 +2224,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr)) return RX_DROP_MONITOR; - if (!ieee80211_is_data(hdr->frame_control) || - !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + if (!ieee80211_is_data(hdr->frame_control)) return RX_CONTINUE; if (!mesh_hdr->ttl) @@ -2316,11 +2315,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames); ieee80211_add_pending_skb(local, fwd_skb); out: - if (is_multicast_ether_addr(hdr->addr1) || - sdata->dev->flags & IFF_PROMISC) + if (is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; - else - return RX_DROP_MONITOR; + return RX_DROP_MONITOR; } #endif @@ -2401,9 +2398,6 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) rx->skb->dev = dev; - dev->stats.rx_packets++; - dev->stats.rx_bytes += rx->skb->len; - if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 && !is_multicast_ether_addr( ((struct ethhdr *)rx->skb->data)->h_dest) && @@ -2434,6 +2428,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) struct { __le16 control, start_seq_num; } __packed bar_data; + struct ieee80211_event event = { + .type = BAR_RX_EVENT, + }; if (!rx->sta) return 
RX_DROP_MONITOR; @@ -2449,6 +2446,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_DROP_MONITOR; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; + event.u.ba.tid = tid; + event.u.ba.ssn = start_seq_num; + event.u.ba.sta = &rx->sta->sta; /* reset session timer */ if (tid_agg_rx->timeout) @@ -2461,6 +2461,8 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) start_seq_num, frames); spin_unlock(&tid_agg_rx->reorder_lock); + drv_event_callback(rx->local, rx->sdata, &event); + kfree_skb(skb); return RX_QUEUED; } @@ -2541,7 +2543,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) { int sig = 0; - if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) sig = status->signal; cfg80211_report_obss_beacon(rx->local->hw.wiphy, @@ -2550,9 +2552,6 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) rx->flags |= IEEE80211_RX_BEACON_REPORTED; } - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_MONITOR; - if (ieee80211_drop_unencrypted_mgmt(rx)) return RX_DROP_UNUSABLE; @@ -2580,9 +2579,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT) return RX_DROP_UNUSABLE; - if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_UNUSABLE; - switch (mgmt->u.action.category) { case WLAN_CATEGORY_HT: /* reject HT action frames from stations not supporting HT */ @@ -2878,7 +2874,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) * it transmitted were processed or returned. 
*/ - if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, @@ -2943,7 +2939,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) info->flags = IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK | IEEE80211_TX_CTL_NO_CCK_RATE; - if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = local->hw.offchannel_tx_hw_queue; } @@ -3066,8 +3062,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, } prev_dev = sdata->dev; - sdata->dev->stats.rx_packets++; - sdata->dev->stats.rx_bytes += skb->len; + ieee80211_rx_stats(sdata->dev, skb->len); } if (prev_dev) { @@ -3129,6 +3124,12 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, goto rxh_next; \ } while (0); + /* Lock here to avoid hitting all of the data used in the RX + * path (e.g. key data, station data, ...) concurrently when + * a frame is released from the reorder buffer due to timeout + * from the timer, potentially concurrently with RX from the + * driver. 
+ */ spin_lock_bh(&rx->local->rx_path_lock); while ((skb = __skb_dequeue(frames))) { @@ -3215,7 +3216,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .flags = 0, + .flags = IEEE80211_RX_REORDER_TIMER, }; struct tid_ampdu_rx *tid_agg_rx; @@ -3229,16 +3230,25 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames); spin_unlock(&tid_agg_rx->reorder_lock); + if (!skb_queue_empty(&frames)) { + struct ieee80211_event event = { + .type = BA_FRAME_TIMEOUT, + .u.ba.tid = tid, + .u.ba.sta = &sta->sta, + }; + drv_event_callback(rx.local, rx.sdata, &event); + } + ieee80211_rx_handlers(&rx, &frames); } /* main receive path */ -static bool prepare_for_handlers(struct ieee80211_rx_data *rx, - struct ieee80211_hdr *hdr) +static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) { struct ieee80211_sub_if_data *sdata = rx->sdata; struct sk_buff *skb = rx->skb; + struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); int multicast = is_multicast_ether_addr(hdr->addr1); @@ -3247,30 +3257,23 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, case NL80211_IFTYPE_STATION: if (!bssid && !sdata->u.mgd.use_4addr) return false; - if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC) || - sdata->u.mgd.use_4addr) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } - break; + if (multicast) + return true; + return ether_addr_equal(sdata->vif.addr, hdr->addr1); case NL80211_IFTYPE_ADHOC: if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) return false; - if (ieee80211_is_beacon(hdr->frame_control)) { + if 
(ieee80211_is_beacon(hdr->frame_control)) return true; - } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { + if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) return false; - } else if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } else if (!rx->sta) { + if (!multicast && + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) + return false; + if (!rx->sta) { int rate_idx; if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) rate_idx = 0; /* TODO: HT/VHT rates */ @@ -3279,25 +3282,18 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2, BIT(rate_idx)); } - break; + return true; case NL80211_IFTYPE_OCB: if (!bssid) return false; - if (ieee80211_is_beacon(hdr->frame_control)) { + if (ieee80211_is_beacon(hdr->frame_control)) return false; - } else if (!is_broadcast_ether_addr(bssid)) { - ocb_dbg(sdata, "BSSID mismatch in OCB mode!\n"); + if (!is_broadcast_ether_addr(bssid)) return false; - } else if (!multicast && - !ether_addr_equal(sdata->dev->dev_addr, - hdr->addr1)) { - /* if we are in promisc mode we also accept - * packets not destined for us - */ - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - rx->flags &= ~IEEE80211_RX_RA_MATCH; - } else if (!rx->sta) { + if (!multicast && + !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1)) + return false; + if (!rx->sta) { int rate_idx; if (status->flag & RX_FLAG_HT) rate_idx = 0; /* TODO: HT rates */ @@ -3306,22 +3302,17 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, ieee80211_ocb_rx_no_sta(sdata, bssid, hdr->addr2, BIT(rate_idx)); } - break; + return true; case NL80211_IFTYPE_MESH_POINT: - if (!multicast && - !ether_addr_equal(sdata->vif.addr, hdr->addr1)) { - if (!(sdata->dev->flags & IFF_PROMISC)) - return false; - - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } - break; + if (multicast) + 
return true; + return ether_addr_equal(sdata->vif.addr, hdr->addr1); case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: - if (!bssid) { - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1)) - return false; - } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { + if (!bssid) + return ether_addr_equal(sdata->vif.addr, hdr->addr1); + + if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { /* * Accept public action frames even when the * BSSID doesn't match, this is used for P2P @@ -3333,10 +3324,10 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, return false; if (ieee80211_is_public_action(hdr, skb->len)) return true; - if (!ieee80211_is_beacon(hdr->frame_control)) - return false; - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - } else if (!ieee80211_has_tods(hdr->frame_control)) { + return ieee80211_is_beacon(hdr->frame_control); + } + + if (!ieee80211_has_tods(hdr->frame_control)) { /* ignore data frames to TDLS-peers */ if (ieee80211_is_data(hdr->frame_control)) return false; @@ -3345,30 +3336,22 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, !ether_addr_equal(bssid, hdr->addr1)) return false; } - break; + return true; case NL80211_IFTYPE_WDS: if (bssid || !ieee80211_is_data(hdr->frame_control)) return false; - if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2)) - return false; - break; + return ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2); case NL80211_IFTYPE_P2P_DEVICE: - if (!ieee80211_is_public_action(hdr, skb->len) && - !ieee80211_is_probe_req(hdr->frame_control) && - !ieee80211_is_probe_resp(hdr->frame_control) && - !ieee80211_is_beacon(hdr->frame_control)) - return false; - if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) && - !multicast) - status->rx_flags &= ~IEEE80211_RX_RA_MATCH; - break; + return ieee80211_is_public_action(hdr, skb->len) || + ieee80211_is_probe_req(hdr->frame_control) || + ieee80211_is_probe_resp(hdr->frame_control) || + ieee80211_is_beacon(hdr->frame_control); default: 
- /* should never get here */ - WARN_ON_ONCE(1); break; } - return true; + WARN_ON_ONCE(1); + return false; } /* @@ -3382,13 +3365,10 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - struct ieee80211_hdr *hdr = (void *)skb->data; rx->skb = skb; - status->rx_flags |= IEEE80211_RX_RA_MATCH; - if (!prepare_for_handlers(rx, hdr)) + if (!ieee80211_accept_frame(rx)) return false; if (!consume) { @@ -3421,7 +3401,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, __le16 fc; struct ieee80211_rx_data rx; struct ieee80211_sub_if_data *prev; - struct sta_info *sta, *tmp, *prev_sta; + struct sta_info *sta, *prev_sta; + struct rhash_head *tmp; int err = 0; fc = ((struct ieee80211_hdr *)skb->data)->frame_control; @@ -3430,7 +3411,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, rx.local = local; if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) - local->dot11ReceivedFragmentCount++; + I802_DEBUG_INC(local->dot11ReceivedFragmentCount); if (ieee80211_is_mgmt(fc)) { /* drop frame if too short for header */ @@ -3456,9 +3437,13 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, ieee80211_scan_rx(local, skb); if (ieee80211_is_data(fc)) { + const struct bucket_table *tbl; + prev_sta = NULL; - for_each_sta_info(local, hdr->addr2, sta, tmp) { + tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); + + for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) { if (!prev_sta) { prev_sta = sta; continue; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 05f0d711b6d8..11d0901ebb7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -6,7 +6,7 @@ * Copyright 2005, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -69,10 +69,11 @@ ieee80211_bss_info_update(struct ieee80211_local *local, int clen, srlen; enum nl80211_bss_scan_width scan_width; s32 signal = 0; + bool signal_valid; - if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) + if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) signal = rx_status->signal * 100; - else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) + else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) signal = (rx_status->signal * 100) / local->hw.max_signal; scan_width = NL80211_BSS_CHAN_WIDTH_20; @@ -86,6 +87,11 @@ ieee80211_bss_info_update(struct ieee80211_local *local, GFP_ATOMIC); if (!cbss) return NULL; + /* In case the signal is invalid update the status */ + signal_valid = abs(channel->center_freq - cbss->channel->center_freq) + <= local->hw.wiphy->max_adj_channel_rssi_comp; + if (!signal_valid) + rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; bss = (void *)cbss->priv; @@ -257,7 +263,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; - if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); @@ -326,7 +332,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) return; if (hw_scan && !aborted && - !(local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) && + !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(local)) { int rc; @@ -520,7 +526,7 @@ 
static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; - if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; @@ -928,11 +934,12 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, - struct ieee80211_channel *chan, + struct ieee80211_channel **channels, + unsigned int n_channels, enum nl80211_bss_scan_width scan_width) { struct ieee80211_local *local = sdata->local; - int ret = -EBUSY; + int ret = -EBUSY, i, n_ch = 0; enum ieee80211_band band; mutex_lock(&local->mtx); @@ -942,9 +949,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, goto unlock; /* fill internal scan request */ - if (!chan) { - int i, max_n; - int n_ch = 0; + if (!channels) { + int max_n; for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (!local->hw.wiphy->bands[band]) @@ -969,12 +975,19 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, local->int_scan_req->n_channels = n_ch; } else { - if (WARN_ON_ONCE(chan->flags & (IEEE80211_CHAN_NO_IR | - IEEE80211_CHAN_DISABLED))) + for (i = 0; i < n_channels; i++) { + if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | + IEEE80211_CHAN_DISABLED)) + continue; + + local->int_scan_req->channels[n_ch] = channels[i]; + n_ch++; + } + + if (WARN_ON_ONCE(n_ch == 0)) goto unlock; - local->int_scan_req->channels[0] = chan; - local->int_scan_req->n_channels = 1; + local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 00ca8dcc2bcf..666ddac3c87c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -64,32 +64,22 @@ * freed before they are done using it. 
*/ +static const struct rhashtable_params sta_rht_params = { + .nelem_hint = 3, /* start small */ + .automatic_shrinking = true, + .head_offset = offsetof(struct sta_info, hash_node), + .key_offset = offsetof(struct sta_info, sta.addr), + .key_len = ETH_ALEN, + .hashfn = sta_addr_hash, + .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, +}; + /* Caller must hold local->sta_mtx */ static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { - struct sta_info *s; - - s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)], - lockdep_is_held(&local->sta_mtx)); - if (!s) - return -ENOENT; - if (s == sta) { - rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], - s->hnext); - return 0; - } - - while (rcu_access_pointer(s->hnext) && - rcu_access_pointer(s->hnext) != sta) - s = rcu_dereference_protected(s->hnext, - lockdep_is_held(&local->sta_mtx)); - if (rcu_access_pointer(s->hnext)) { - rcu_assign_pointer(s->hnext, sta->hnext); - return 0; - } - - return -ENOENT; + return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void __cleanup_single_sta(struct sta_info *sta) @@ -118,6 +108,16 @@ static void __cleanup_single_sta(struct sta_info *sta) atomic_dec(&ps->num_sta_ps); } + if (sta->sta.txq[0]) { + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); + int n = skb_queue_len(&txqi->queue); + + ieee80211_purge_tx_queue(&local->hw, &txqi->queue); + atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]); + } + } + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); @@ -160,17 +160,23 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct sta_info *sta; + struct rhash_head *tmp; + const struct bucket_table *tbl; - sta = 
rcu_dereference_check(local->sta_hash[STA_HASH(addr)], - lockdep_is_held(&local->sta_mtx)); - while (sta) { - if (sta->sdata == sdata && - ether_addr_equal(sta->sta.addr, addr)) - break; - sta = rcu_dereference_check(sta->hnext, - lockdep_is_held(&local->sta_mtx)); + rcu_read_lock(); + tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); + + for_each_sta_info(local, tbl, addr, sta, tmp) { + if (sta->sdata == sdata) { + rcu_read_unlock(); + /* this is safe as the caller must already hold + * another rcu read section or the mutex + */ + return sta; + } } - return sta; + rcu_read_unlock(); + return NULL; } /* @@ -182,18 +188,24 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct sta_info *sta; + struct rhash_head *tmp; + const struct bucket_table *tbl; - sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)], - lockdep_is_held(&local->sta_mtx)); - while (sta) { - if ((sta->sdata == sdata || - (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && - ether_addr_equal(sta->sta.addr, addr)) - break; - sta = rcu_dereference_check(sta->hnext, - lockdep_is_held(&local->sta_mtx)); + rcu_read_lock(); + tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); + + for_each_sta_info(local, tbl, addr, sta, tmp) { + if (sta->sdata == sdata || + (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { + rcu_read_unlock(); + /* this is safe as the caller must already hold + * another rcu read section or the mutex + */ + return sta; + } } - return sta; + rcu_read_unlock(); + return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, @@ -229,19 +241,13 @@ struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { - int i; - if (sta->rate_ctrl) rate_control_free_sta(sta); - if (sta->tx_lat) { - for (i = 0; i < IEEE80211_NUM_TIDS; i++) - kfree(sta->tx_lat[i].bins); - 
kfree(sta->tx_lat); - } - sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); + if (sta->sta.txq[0]) + kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); kfree(sta); } @@ -250,9 +256,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) static void sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { - lockdep_assert_held(&local->sta_mtx); - sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)]; - rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); + rhashtable_insert_fast(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) @@ -277,12 +282,12 @@ static void sta_deliver_ps_frames(struct work_struct *wk) static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { - if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, - &sta->sta, gfp); + sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; @@ -293,50 +298,22 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; + struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; struct timespec uptime; - struct ieee80211_tx_latency_bin_ranges *tx_latency; int i; - sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp); + sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; - rcu_read_lock(); - tx_latency = rcu_dereference(local->tx_latency); - /* init stations Tx latency statistics && TID bins */ - if (tx_latency) { - sta->tx_lat = kzalloc(IEEE80211_NUM_TIDS * - sizeof(struct ieee80211_tx_latency_stat), - GFP_ATOMIC); - if (!sta->tx_lat) { - rcu_read_unlock(); - goto free; - } - - if (tx_latency->n_ranges) { - for (i = 0; i < IEEE80211_NUM_TIDS; i++) 
{ - /* size of bins is size of the ranges +1 */ - sta->tx_lat[i].bin_count = - tx_latency->n_ranges + 1; - sta->tx_lat[i].bins = - kcalloc(sta->tx_lat[i].bin_count, - sizeof(u32), GFP_ATOMIC); - if (!sta->tx_lat[i].bins) { - rcu_read_unlock(); - goto free; - } - } - } - } - rcu_read_unlock(); - spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); #ifdef CONFIG_MAC80211_MESH + spin_lock_init(&sta->plink_lock); if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.user_mpm) init_timer(&sta->plink_timer); @@ -359,8 +336,24 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++) ewma_init(&sta->chain_signal_avg[i], 1024, 8); + if (local->ops->wake_tx_queue) { + void *txq_data; + int size = sizeof(struct txq_info) + + ALIGN(hw->txq_data_size, sizeof(void *)); + + txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); + if (!txq_data) + goto free; + + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txq = txq_data + i * size; + + ieee80211_init_tx_queue(sdata, sta, txq, i); + } + } + if (sta_prepare_rate_control(local, sta, gfp)) - goto free; + goto free_txq; for (i = 0; i < IEEE80211_NUM_TIDS; i++) { /* @@ -382,7 +375,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { struct ieee80211_supported_band *sband = - local->hw.wiphy->bands[ieee80211_get_sdata_band(sdata)]; + hw->wiphy->bands[ieee80211_get_sdata_band(sdata)]; u8 smps = (sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> IEEE80211_HT_CAP_SM_PS_SHIFT; /* @@ -405,14 +398,13 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, } sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); + return sta; +free_txq: + if (sta->sta.txq[0]) + 
kfree(to_txq_info(sta->sta.txq[0])); free: - if (sta->tx_lat) { - for (i = 0; i < IEEE80211_NUM_TIDS; i++) - kfree(sta->tx_lat[i].bins); - kfree(sta->tx_lat); - } kfree(sta); return NULL; } @@ -651,7 +643,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (local->hw.flags & IEEE80211_HW_AP_LINK_PS) + if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) return; if (sta->dead) @@ -684,6 +676,8 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) indicate_tim |= sta->driver_buffered_tids & tids; + indicate_tim |= + sta->txq_buffered_tids & tids; } done: @@ -992,19 +986,32 @@ static void sta_info_cleanup(unsigned long data) round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } -void sta_info_init(struct ieee80211_local *local) +u32 sta_addr_hash(const void *key, u32 length, u32 seed) +{ + return jhash(key, ETH_ALEN, seed); +} + +int sta_info_init(struct ieee80211_local *local) { + int err; + + err = rhashtable_init(&local->sta_hash, &sta_rht_params); + if (err) + return err; + spin_lock_init(&local->tim_lock); mutex_init(&local->sta_mtx); INIT_LIST_HEAD(&local->sta_list); setup_timer(&local->sta_cleanup, sta_info_cleanup, (unsigned long)local); + return 0; } void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); + rhashtable_destroy(&local->sta_hash); } @@ -1068,16 +1075,21 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, - const u8 *addr, - const u8 *localaddr) + const u8 *addr, + const u8 *localaddr) { - struct sta_info *sta, *nxt; + struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta; + struct rhash_head *tmp; + const struct bucket_table *tbl; + + tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); /* * Just return a random station if localaddr is NULL * ... first in list. 
*/ - for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { + for_each_sta_info(local, tbl, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; @@ -1115,7 +1127,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; - int filtered = 0, buffered = 0, ac; + int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; @@ -1134,10 +1146,22 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; + sta->txq_buffered_tids = 0; - if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS)) + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); + if (sta->sta.txq[0]) { + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); + + if (!skb_queue_len(&txqi->queue)) + continue; + + drv_wake_tx_queue(local, txqi); + } + } + skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ @@ -1195,6 +1219,8 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n", sta->sta.addr, sta->sta.aid, filtered, buffered); + + ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, @@ -1275,7 +1301,7 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, } info->band = chanctx_conf->def.chan->band; - ieee80211_xmit(sdata, skb); + ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } @@ -1319,8 +1345,10 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* if we already have frames from software, then we can't also * release from hardware queues */ - if (skb_queue_empty(&frames)) + if (skb_queue_empty(&frames)) { 
driver_release_tids |= sta->driver_buffered_tids & tids; + driver_release_tids |= sta->txq_buffered_tids & tids; + } if (driver_release_tids) { /* If the driver has data on more than one TID then @@ -1491,6 +1519,9 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, sta_info_recalc_tim(sta); } else { + unsigned long tids = sta->txq_buffered_tids & driver_release_tids; + int tid; + /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. @@ -1510,8 +1541,22 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) - * became empty we'll do the TIM recalculation. + * became empty or we find that a txq became empty, we'll do the + * TIM recalculation. */ + + if (!sta->sta.txq[0]) + return; + + for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { + struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); + + if (!(tids & BIT(tid)) || skb_queue_len(&txqi->queue)) + continue; + + sta_info_recalc_tim(sta); + break; + } } } @@ -1574,6 +1619,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_clear_fast_xmit(sta); return; } @@ -1591,6 +1637,7 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); @@ -1695,6 +1742,7 @@ int sta_info_move_state(struct sta_info *sta, !sta->sdata->u.vlan.sta)) atomic_dec(&sta->sdata->bss->num_mcast_sta); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); + ieee80211_clear_fast_xmit(sta); } break; case IEEE80211_STA_AUTHORIZED: @@ -1704,6 +1752,7 @@ int sta_info_move_state(struct sta_info *sta, !sta->sdata->u.vlan.sta)) atomic_inc(&sta->sdata->bss->num_mcast_sta); set_bit(WLAN_STA_AUTHORIZED, 
&sta->_flags); + ieee80211_check_fast_xmit(sta); } break; default: @@ -1830,8 +1879,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } - if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || - (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { + if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || + ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)sta->last_signal; sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); @@ -1883,7 +1932,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && - local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->tx_msdu_retries[i]; @@ -1891,7 +1940,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && - local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->tx_msdu_failed[i]; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index fb0fc1302a58..226f8ca47ad6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -16,6 +16,7 @@ #include <linux/workqueue.h> #include <linux/average.h> #include <linux/etherdevice.h> +#include <linux/rhashtable.h> #include "key.h" /** @@ -236,29 +237,38 @@ struct sta_ampdu_mlme { u8 dialog_token_allocator; }; -/* - * struct ieee80211_tx_latency_stat - Tx latency statistics - * - * Measures TX latency and jitter for a station per TID. 
- * - * @max: worst case latency - * @sum: sum of all latencies - * @counter: amount of Tx frames sent from interface - * @bins: each bin counts how many frames transmitted within a certain - * latency range. when disabled it is NULL. - * @bin_count: amount of bins. - */ -struct ieee80211_tx_latency_stat { - u32 max; - u32 sum; - u32 counter; - u32 *bins; - u32 bin_count; -}; /* Value to indicate no TID reservation */ #define IEEE80211_TID_UNRESERVED 0xff +#define IEEE80211_FAST_XMIT_MAX_IV 18 + +/** + * struct ieee80211_fast_tx - TX fastpath information + * @key: key to use for hw crypto + * @hdr: the 802.11 header to put with the frame + * @hdr_len: actual 802.11 header length + * @sa_offs: offset of the SA + * @da_offs: offset of the DA + * @pn_offs: offset where to put PN for crypto (or 0 if not needed) + * @band: band this will be transmitted on, for tx_info + * @rcu_head: RCU head to free this struct + * + * This struct is small enough so that the common case (maximum crypto + * header length of 8 like for CCMP/GCMP) fits into a single 64-byte + * cache line. 
+ */ +struct ieee80211_fast_tx { + struct ieee80211_key *key; + u8 hdr_len; + u8 sa_offs, da_offs, pn_offs; + u8 band; + u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV + + sizeof(rfc1042_header)]; + + struct rcu_head rcu_head; +}; + /** * struct sta_info - STA information * @@ -267,7 +277,7 @@ struct ieee80211_tx_latency_stat { * * @list: global linked list entry * @free_list: list entry for keeping track of stations to free - * @hnext: hash table linked list pointer + * @hash_node: hash node for rhashtable * @local: pointer to the global information * @sdata: virtual interface this station belongs to * @ptk: peer keys negotiated with this station, if any @@ -275,6 +285,8 @@ struct ieee80211_tx_latency_stat { * @gtk: group keys negotiated with this station, if any * @gtk_idx: last installed group key index * @rate_ctrl: rate control algorithm reference + * @rate_ctrl_lock: spinlock used to protect rate control data + * (data inside the algorithm, so serializes calls there) * @rate_ctrl_priv: rate control private per-STA pointer * @last_tx_rate: rate used for last transmit, to report to userspace as * "the" transmit rate @@ -295,6 +307,7 @@ struct ieee80211_tx_latency_stat { * entered power saving state, these are also delivered to * the station when it leaves powersave or polls for frames * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on + * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on * @rx_packets: Number of MSDUs received from this STA * @rx_bytes: Number of bytes received from this STA * @last_rx: time (in jiffies) when last frame was received from this STA @@ -312,11 +325,10 @@ struct ieee80211_tx_latency_stat { * @fail_avg: moving percentage of failed MSDUs * @tx_packets: number of RX/TX MSDUs * @tx_bytes: number of bytes transmitted to this STA - * @tx_fragments: number of transmitted MPDUs * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: 
identity mapping to ID timers - * @tx_lat: Tx latency statistics + * @plink_lock: serialize access to plink fields * @llid: Local link ID * @plid: Peer link ID * @reason: Cancel reason on PLINK_HOLDING state @@ -356,12 +368,15 @@ struct ieee80211_tx_latency_stat { * using IEEE80211_NUM_TID entry for non-QoS frames * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID * entry for non-QoS frames + * @fast_tx: TX fastpath information + * @processed_beacon: set to true after peer rates and capabilities are + * processed */ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; - struct sta_info __rcu *hnext; + struct rhash_head hash_node; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; @@ -370,8 +385,11 @@ struct sta_info { u8 ptk_idx; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; + spinlock_t rate_ctrl_lock; spinlock_t lock; + struct ieee80211_fast_tx __rcu *fast_tx; + struct work_struct drv_deliver_wk; u16 listen_interval; @@ -390,6 +408,7 @@ struct sta_info { struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS]; struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS]; unsigned long driver_buffered_tids; + unsigned long txq_buffered_tids; /* Updated from RX path only, no locking requirements */ unsigned long rx_packets; @@ -417,7 +436,6 @@ struct sta_info { unsigned int fail_avg; /* Updated from TX path only, no locking requirements */ - u32 tx_fragments; u64 tx_packets[IEEE80211_NUM_ACS]; u64 tx_bytes[IEEE80211_NUM_ACS]; struct ieee80211_tx_rate last_tx_rate; @@ -437,13 +455,12 @@ struct sta_info { struct sta_ampdu_mlme ampdu_mlme; u8 timer_to_tid[IEEE80211_NUM_TIDS]; - struct ieee80211_tx_latency_stat *tx_lat; - #ifdef CONFIG_MAC80211_MESH /* - * Mesh peer link attributes + * Mesh peer link attributes, protected by plink_lock. * TODO: move to a sub-structure that is referenced with pointer? 
*/ + spinlock_t plink_lock; u16 llid; u16 plid; u16 reason; @@ -451,12 +468,14 @@ struct sta_info { enum nl80211_plink_state plink_state; u32 plink_timeout; struct timer_list plink_timer; + s64 t_offset; s64 t_offset_setpoint; /* mesh power save */ enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; + bool processed_beacon; #endif #ifdef CONFIG_MAC80211_DEBUGFS @@ -559,10 +578,6 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid) lockdep_is_held(&sta->ampdu_mlme.mtx)); } -#define STA_HASH_SIZE 256 -#define STA_HASH(sta) (sta[5]) - - /* Maximum number of frames to buffer per power saving station per AC */ #define STA_MAX_TX_BUFFER 64 @@ -583,26 +598,15 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); -static inline -void for_each_sta_info_type_check(struct ieee80211_local *local, - const u8 *addr, - struct sta_info *sta, - struct sta_info *nxt) -{ -} +u32 sta_addr_hash(const void *key, u32 length, u32 seed); + +#define _sta_bucket_idx(_tbl, _a) \ + rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd)) -#define for_each_sta_info(local, _addr, _sta, nxt) \ - for ( /* initialise loop */ \ - _sta = rcu_dereference(local->sta_hash[STA_HASH(_addr)]),\ - nxt = _sta ? rcu_dereference(_sta->hnext) : NULL; \ - /* typecheck */ \ - for_each_sta_info_type_check(local, (_addr), _sta, nxt),\ - /* continue condition */ \ - _sta; \ - /* advance loop */ \ - _sta = nxt, \ - nxt = _sta ? 
rcu_dereference(_sta->hnext) : NULL \ - ) \ +#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \ + rht_for_each_entry_rcu(_sta, _tmp, tbl, \ + _sta_bucket_idx(tbl, _addr), \ + hash_node) \ /* compare address and run code only if it matches */ \ if (ether_addr_equal(_sta->sta.addr, (_addr))) @@ -639,7 +643,7 @@ int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, void sta_info_recalc_tim(struct sta_info *sta); -void sta_info_init(struct ieee80211_local *local); +int sta_info_init(struct ieee80211_local *local); void sta_info_stop(struct ieee80211_local *local); /** diff --git a/net/mac80211/status.c b/net/mac80211/status.c index e679b7c9b160..45628f37c083 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -12,7 +12,6 @@ #include <linux/export.h> #include <linux/etherdevice.h> -#include <linux/time.h> #include <net/mac80211.h> #include <asm/unaligned.h> #include "ieee80211_i.h" @@ -182,7 +181,7 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) sta->last_rx = jiffies; if (ieee80211_is_data_qos(mgmt->frame_control)) { @@ -415,8 +414,7 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, if (is_teardown) { /* This mechanism relies on being able to get ACKs */ - WARN_ON(!(local->hw.flags & - IEEE80211_HW_REPORTS_TX_ACK_STATUS)); + WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { @@ -430,6 +428,74 @@ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, } } +static struct ieee80211_sub_if_data * +ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) +{ + struct ieee80211_sub_if_data *sdata; + + if (skb->dev) { + list_for_each_entry_rcu(sdata, 
&local->interfaces, list) { + if (!sdata->dev) + continue; + + if (skb->dev == sdata->dev) + return sdata; + } + + return NULL; + } + + return rcu_dereference(local->p2p_sdata); +} + +static void ieee80211_report_ack_skb(struct ieee80211_local *local, + struct ieee80211_tx_info *info, + bool acked, bool dropped) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&local->ack_status_lock, flags); + skb = idr_find(&local->ack_status_frames, info->ack_frame_id); + if (skb) + idr_remove(&local->ack_status_frames, info->ack_frame_id); + spin_unlock_irqrestore(&local->ack_status_lock, flags); + + if (!skb) + return; + + if (dropped) { + dev_kfree_skb_any(skb); + return; + } + + if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { + u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (void *)skb->data; + + rcu_read_lock(); + sdata = ieee80211_sdata_from_skb(local, skb); + if (sdata) { + if (ieee80211_is_nullfunc(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control)) + cfg80211_probe_status(sdata->dev, hdr->addr1, + cookie, acked, + GFP_ATOMIC); + else + cfg80211_mgmt_tx_status(&sdata->wdev, cookie, + skb->data, skb->len, + acked, GFP_ATOMIC); + } + rcu_read_unlock(); + + dev_kfree_skb_any(skb); + } else { + /* consumes skb */ + skb_complete_wifi_ack(skb, acked); + } +} + static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { @@ -440,28 +506,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (dropped) acked = false; - if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_INTFL_MLME_CONN_TX)) { - struct ieee80211_sub_if_data *sdata = NULL; - struct ieee80211_sub_if_data *iter_sdata; - u64 cookie = (unsigned long)skb; + if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { + struct ieee80211_sub_if_data *sdata; rcu_read_lock(); - if (skb->dev) { - 
list_for_each_entry_rcu(iter_sdata, &local->interfaces, - list) { - if (!iter_sdata->dev) - continue; - - if (skb->dev == iter_sdata->dev) { - sdata = iter_sdata; - break; - } - } - } else { - sdata = rcu_dereference(local->p2p_sdata); - } + sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; @@ -479,106 +529,15 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); - } else if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) { - cfg80211_probe_status(sdata->dev, hdr->addr1, - cookie, acked, GFP_ATOMIC); } else { - cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, - skb->len, acked, GFP_ATOMIC); + /* we assign ack frame ID for the others */ + WARN_ON(1); } rcu_read_unlock(); + } else if (info->ack_frame_id) { + ieee80211_report_ack_skb(local, info, acked, dropped); } - - if (unlikely(info->ack_frame_id)) { - struct sk_buff *ack_skb; - unsigned long flags; - - spin_lock_irqsave(&local->ack_status_lock, flags); - ack_skb = idr_find(&local->ack_status_frames, - info->ack_frame_id); - if (ack_skb) - idr_remove(&local->ack_status_frames, - info->ack_frame_id); - spin_unlock_irqrestore(&local->ack_status_lock, flags); - - if (ack_skb) { - if (!dropped) { - /* consumes ack_skb */ - skb_complete_wifi_ack(ack_skb, acked); - } else { - dev_kfree_skb_any(ack_skb); - } - } - } -} - -/* - * Measure Tx frame completion and removal time for Tx latency statistics - * calculation. A single Tx frame latency should be measured from when it - * is entering the Kernel until we receive Tx complete confirmation indication - * and remove the skb. 
- */ -static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, - struct sk_buff *skb, - struct sta_info *sta, - struct ieee80211_hdr *hdr) -{ - u32 msrmnt; - u16 tid; - u8 *qc; - int i, bin_range_count; - u32 *bin_ranges; - __le16 fc; - struct ieee80211_tx_latency_stat *tx_lat; - struct ieee80211_tx_latency_bin_ranges *tx_latency; - ktime_t skb_arv = skb->tstamp; - - tx_latency = rcu_dereference(local->tx_latency); - - /* assert Tx latency stats are enabled & frame arrived when enabled */ - if (!tx_latency || !ktime_to_ns(skb_arv)) - return; - - fc = hdr->frame_control; - - if (!ieee80211_is_data(fc)) /* make sure it is a data frame */ - return; - - /* get frame tid */ - if (ieee80211_is_data_qos(hdr->frame_control)) { - qc = ieee80211_get_qos_ctl(hdr); - tid = qc[0] & IEEE80211_QOS_CTL_TID_MASK; - } else { - tid = 0; - } - - tx_lat = &sta->tx_lat[tid]; - - /* Calculate the latency */ - msrmnt = ktime_to_ms(ktime_sub(ktime_get(), skb_arv)); - - if (tx_lat->max < msrmnt) /* update stats */ - tx_lat->max = msrmnt; - tx_lat->counter++; - tx_lat->sum += msrmnt; - - if (!tx_lat->bins) /* bins not activated */ - return; - - /* count how many Tx frames transmitted with the appropriate latency */ - bin_range_count = tx_latency->n_ranges; - bin_ranges = tx_latency->ranges; - - for (i = 0; i < bin_range_count; i++) { - if (msrmnt <= bin_ranges[i]) { - tx_lat->bins[i]++; - break; - } - } - if (i == bin_range_count) /* msrmnt is bigger than the biggest range */ - tx_lat->bins[i]++; } /* @@ -699,15 +658,15 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, } if (acked || noack_success) { - local->dot11TransmittedFrameCount++; - if (!pubsta) - local->dot11MulticastTransmittedFrameCount++; - if (retry_count > 0) - local->dot11RetryCount++; - if (retry_count > 1) - local->dot11MultipleRetryCount++; + I802_DEBUG_INC(local->dot11TransmittedFrameCount); + if (!pubsta) + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); + if (retry_count > 0) + 
I802_DEBUG_INC(local->dot11RetryCount); + if (retry_count > 1) + I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { - local->dot11FailedCount++; + I802_DEBUG_INC(local->dot11FailedCount); } } EXPORT_SYMBOL(ieee80211_tx_status_noskb); @@ -722,7 +681,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_supported_band *sband; struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; - struct sta_info *sta, *tmp; + struct sta_info *sta; + struct rhash_head *tmp; int retry_count; int rates_idx; bool send_to_cooked; @@ -731,6 +691,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) int rtap_len; int shift = 0; int tid = IEEE80211_NUM_TIDS; + const struct bucket_table *tbl; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -739,7 +700,9 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; - for_each_sta_info(local, hdr->addr1, sta, tmp) { + tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); + + for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) { /* skip wrong virtual interface */ if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) continue; @@ -767,7 +730,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_get_qos_ctl(hdr), sta, true, acked); - if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) && + if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->last_tx_rate = info->status.rates[rates_idx]; @@ -834,11 +797,11 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_frame_acked(sta, skb); if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data, acked, 
info->status.tx_time); - if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) { + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->lost_packets) sta->lost_packets = 0; @@ -853,12 +816,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (acked) sta->last_ack_signal = info->status.ack_signal; - - /* - * Measure frame removal for tx latency - * statistics calculation - */ - ieee80211_tx_latency_end_msrmnt(local, skb, sta, hdr); } rcu_read_unlock(); @@ -872,13 +829,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { - local->dot11TransmittedFrameCount++; + I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) - local->dot11MulticastTransmittedFrameCount++; + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) - local->dot11RetryCount++; + I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) - local->dot11MultipleRetryCount++; + I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU @@ -888,14 +845,14 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) - local->dot11TransmittedFragmentCount++; + I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) - local->dot11FailedCount++; + I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) && - (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & 
IEEE80211_TX_STAT_ACK) { diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index c9f9752217ac..8db6e2994bbc 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -60,6 +60,7 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *ch; struct cfg80211_chan_def chandef; int i, subband_start; + struct wiphy *wiphy = sdata->local->hw.wiphy; for (i = start; i <= end; i += spacing) { if (!ch_cnt) @@ -70,9 +71,8 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata, /* we will be active on the channel */ cfg80211_chandef_create(&chandef, ch, NL80211_CHAN_NO_HT); - if (cfg80211_reg_can_beacon(sdata->local->hw.wiphy, - &chandef, - sdata->wdev.iftype)) { + if (cfg80211_reg_can_beacon_relax(wiphy, &chandef, + sdata->wdev.iftype)) { ch_cnt++; /* * check if the next channel is also part of @@ -136,6 +136,24 @@ ieee80211_tdls_add_supp_channels(struct ieee80211_sub_if_data *sdata, *pos = 2 * subband_cnt; } +static void ieee80211_tdls_add_oper_classes(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + u8 *pos; + u8 op_class; + + if (!ieee80211_chandef_to_operating_class(&sdata->vif.bss_conf.chandef, + &op_class)) + return; + + pos = skb_put(skb, 4); + *pos++ = WLAN_EID_SUPPORTED_REGULATORY_CLASSES; + *pos++ = 2; /* len */ + + *pos++ = op_class; + *pos++ = op_class; /* give current operating class as alternate too */ +} + static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb) { u8 *pos = (void *)skb_put(skb, 3); @@ -149,23 +167,16 @@ static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb) static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, u16 status_code) { - struct ieee80211_local *local = sdata->local; - u16 capab; - /* The capability will be 0 when sending a failure code */ if (status_code != 0) return 0; - capab = 0; - if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ) - return capab; - - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)) 
- capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; - if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)) - capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; + if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) { + return WLAN_CAPABILITY_SHORT_SLOT_TIME | + WLAN_CAPABILITY_SHORT_PREAMBLE; + } - return capab; + return 0; } static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, @@ -193,6 +204,17 @@ static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, memcpy(lnkid->resp_sta, rsp_addr, ETH_ALEN); } +static void +ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + u8 *pos = (void *)skb_put(skb, 4); + + *pos++ = WLAN_EID_AID; + *pos++ = 2; /* len */ + put_unaligned_le16(ifmgd->aid, pos); +} + /* translate numbering in the WMM parameter IE to the mac80211 notation */ static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac) { @@ -271,21 +293,11 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct ieee80211_sta_ht_cap ht_cap; + struct ieee80211_sta_vht_cap vht_cap; struct sta_info *sta = NULL; size_t offset = 0, noffset; u8 *pos; - rcu_read_lock(); - - /* we should have the peer STA if we're already responding */ - if (action_code == WLAN_TDLS_SETUP_RESPONSE) { - sta = sta_info_get(sdata, peer); - if (WARN_ON_ONCE(!sta)) { - rcu_read_unlock(); - return; - } - } - ieee80211_add_srates_ie(sdata, skb, false, band); ieee80211_add_ext_srates_ie(sdata, skb, false, band); ieee80211_tdls_add_supp_channels(sdata, skb); @@ -338,6 +350,19 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, offset = noffset; } + rcu_read_lock(); + + /* we should have the peer STA if we're already responding */ + if (action_code == WLAN_TDLS_SETUP_RESPONSE) { + sta = sta_info_get(sdata, peer); + if (WARN_ON_ONCE(!sta)) { + 
rcu_read_unlock(); + return; + } + } + + ieee80211_tdls_add_oper_classes(sdata, skb); + /* * with TDLS we can switch channels, and HT-caps are not necessarily * the same on all bands. The specification limits the setup to a @@ -346,7 +371,9 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[band]; memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); - if (action_code == WLAN_TDLS_SETUP_REQUEST && ht_cap.ht_supported) { + if ((action_code == WLAN_TDLS_SETUP_REQUEST || + action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) && + ht_cap.ht_supported) { ieee80211_apply_htcap_overrides(sdata, &ht_cap); /* disable SMPS in TDLS initiator */ @@ -368,12 +395,63 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); } - rcu_read_unlock(); - if (ht_cap.ht_supported && (ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)) ieee80211_tdls_add_bss_coex_ie(skb); + ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); + + /* add any custom IEs that go before VHT capabilities */ + if (extra_ies_len) { + static const u8 before_vht_cap[] = { + WLAN_EID_SUPP_RATES, + WLAN_EID_COUNTRY, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + WLAN_EID_EXT_CAPABILITY, + WLAN_EID_QOS_CAPA, + WLAN_EID_FAST_BSS_TRANSITION, + WLAN_EID_TIMEOUT_INTERVAL, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + WLAN_EID_MULTI_BAND, + }; + noffset = ieee80211_ie_split(extra_ies, extra_ies_len, + before_vht_cap, + ARRAY_SIZE(before_vht_cap), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - offset); + offset = noffset; + } + + /* build the VHT-cap similarly to the HT-cap */ + memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); + if ((action_code == WLAN_TDLS_SETUP_REQUEST || + action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) && + vht_cap.vht_supported) { + ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); + + /* the AID is present 
only when VHT is implemented */ + if (action_code == WLAN_TDLS_SETUP_REQUEST) + ieee80211_tdls_add_aid(sdata, skb); + + pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); + ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap); + } else if (action_code == WLAN_TDLS_SETUP_RESPONSE && + vht_cap.vht_supported && sta->sta.vht_cap.vht_supported) { + /* the peer caps are already intersected with our own */ + memcpy(&vht_cap, &sta->sta.vht_cap, sizeof(vht_cap)); + + /* the AID is present only when VHT is implemented */ + ieee80211_tdls_add_aid(sdata, skb); + + pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); + ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap); + } + + rcu_read_unlock(); + /* add any remaining IEs */ if (extra_ies_len) { noffset = extra_ies_len; @@ -381,7 +459,6 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, memcpy(pos, extra_ies + offset, noffset - offset); } - ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); } static void @@ -394,6 +471,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t offset = 0, noffset; struct sta_info *sta, *ap_sta; + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); u8 *pos; rcu_read_lock(); @@ -442,15 +520,19 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, /* if HT support is only added in TDLS, we need an HT-operation IE */ if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { - struct ieee80211_chanctx_conf *chanctx_conf = - rcu_dereference(sdata->vif.chanctx_conf); - if (!WARN_ON(!chanctx_conf)) { - pos = skb_put(skb, 2 + - sizeof(struct ieee80211_ht_operation)); - /* send an empty HT operation IE */ - ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, - &chanctx_conf->def, 0); - } + pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); + /* send an empty HT operation IE */ + ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, + 
&sdata->vif.bss_conf.chandef, 0); + } + + ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); + + /* only include VHT-operation if not on the 2.4GHz band */ + if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); + ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, + &sdata->vif.bss_conf.chandef); } rcu_read_unlock(); @@ -461,8 +543,6 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, noffset - offset); memcpy(pos, extra_ies + offset, noffset - offset); } - - ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); } static void @@ -708,8 +788,12 @@ ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata, 26 + /* max(WMM-info, WMM-param) */ 2 + max(sizeof(struct ieee80211_ht_cap), sizeof(struct ieee80211_ht_operation)) + + 2 + max(sizeof(struct ieee80211_vht_cap), + sizeof(struct ieee80211_vht_operation)) + 50 + /* supported channels */ 3 + /* 40/20 BSS coex */ + 4 + /* AID */ + 4 + /* oper classes */ extra_ies_len + sizeof(struct ieee80211_tdls_lnkie)); if (!skb) @@ -851,7 +935,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, * packet through the AP. 
*/ if ((action_code == WLAN_TDLS_TEARDOWN) && - (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) { + ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { bool try_resend; /* Should we keep skb for possible resend */ /* If not sending directly to peer - no point in keeping skb */ @@ -907,7 +991,7 @@ ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev, if (!is_zero_ether_addr(sdata->u.mgd.tdls_peer) && !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) { ret = -EBUSY; - goto exit; + goto out_unlock; } /* @@ -922,27 +1006,34 @@ ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev, if (!sta_info_get(sdata, peer)) { rcu_read_unlock(); ret = -ENOLINK; - goto exit; + goto out_unlock; } rcu_read_unlock(); } ieee80211_flush_queues(local, sdata, false); + memcpy(sdata->u.mgd.tdls_peer, peer, ETH_ALEN); + mutex_unlock(&local->mtx); + /* we cannot take the mutex while preparing the setup packet */ ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer, action_code, dialog_token, status_code, peer_capability, initiator, extra_ies, extra_ies_len, 0, NULL); - if (ret < 0) - goto exit; + if (ret < 0) { + mutex_lock(&local->mtx); + eth_zero_addr(sdata->u.mgd.tdls_peer); + mutex_unlock(&local->mtx); + return ret; + } - memcpy(sdata->u.mgd.tdls_peer, peer, ETH_ALEN); ieee80211_queue_delayed_work(&sdata->local->hw, &sdata->u.mgd.tdls_peer_del_work, TDLS_PEER_SETUP_TIMEOUT); + return 0; -exit: +out_unlock: mutex_unlock(&local->mtx); return ret; } @@ -1085,6 +1176,12 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, switch (oper) { case NL80211_TDLS_ENABLE_LINK: + if (sdata->vif.csa_active) { + tdls_dbg(sdata, "TDLS: disallow link during CSA\n"); + ret = -EBUSY; + break; + } + rcu_read_lock(); sta = sta_info_get(sdata, peer); if (!sta) { diff --git a/net/mac80211/trace.c b/net/mac80211/trace.c index 386e45d8a958..edfe0c170a1c 100644 --- a/net/mac80211/trace.c +++ b/net/mac80211/trace.c @@ -8,6 +8,7 @@ 
#include "debug.h" #define CREATE_TRACE_POINTS #include "trace.h" +#include "trace_msg.h" #ifdef CONFIG_MAC80211_MESSAGE_TRACING void __sdata_info(const char *fmt, ...) diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 263a9561eb26..6f14591d8ca9 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -69,6 +69,17 @@ #define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic +#define KEY_ENTRY __field(u32, cipher) \ + __field(u8, hw_key_idx) \ + __field(u8, flags) \ + __field(s8, keyidx) +#define KEY_ASSIGN(k) __entry->cipher = (k)->cipher; \ + __entry->flags = (k)->flags; \ + __entry->keyidx = (k)->keyidx; \ + __entry->hw_key_idx = (k)->hw_key_idx; +#define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d" +#define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx + /* @@ -522,25 +533,19 @@ TRACE_EVENT(drv_set_key, LOCAL_ENTRY VIF_ENTRY STA_ENTRY - __field(u32, cipher) - __field(u8, hw_key_idx) - __field(u8, flags) - __field(s8, keyidx) + KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; - __entry->cipher = key->cipher; - __entry->flags = key->flags; - __entry->keyidx = key->keyidx; - __entry->hw_key_idx = key->hw_key_idx; + KEY_ASSIGN(key); ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT, - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT KEY_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, KEY_PR_ARG ) ); @@ -656,28 +661,25 @@ TRACE_EVENT(drv_get_stats, ) ); -TRACE_EVENT(drv_get_tkip_seq, +TRACE_EVENT(drv_get_key_seq, TP_PROTO(struct ieee80211_local *local, - u8 hw_key_idx, u32 *iv32, u16 *iv16), + struct ieee80211_key_conf *key), - TP_ARGS(local, hw_key_idx, iv32, iv16), + TP_ARGS(local, key), TP_STRUCT__entry( LOCAL_ENTRY - __field(u8, hw_key_idx) - __field(u32, iv32) - __field(u16, iv16) + KEY_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; - __entry->hw_key_idx = hw_key_idx; - __entry->iv32 = *iv32; - 
__entry->iv16 = *iv16; + KEY_ASSIGN(key); ), TP_printk( - LOCAL_PR_FMT, LOCAL_PR_ARG + LOCAL_PR_FMT KEY_PR_FMT, + LOCAL_PR_ARG, KEY_PR_ARG ) ); @@ -1256,28 +1258,28 @@ TRACE_EVENT(drv_set_rekey_data, LOCAL_PR_ARG, VIF_PR_ARG) ); -TRACE_EVENT(drv_rssi_callback, +TRACE_EVENT(drv_event_callback, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum ieee80211_rssi_event rssi_event), + const struct ieee80211_event *_event), - TP_ARGS(local, sdata, rssi_event), + TP_ARGS(local, sdata, _event), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY - __field(u32, rssi_event) + __field(u32, type) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - __entry->rssi_event = rssi_event; + __entry->type = _event->type; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " rssi_event:%d", - LOCAL_PR_ARG, VIF_PR_ARG, __entry->rssi_event + LOCAL_PR_FMT VIF_PR_FMT " event:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->type ) ); @@ -2312,43 +2314,36 @@ TRACE_EVENT(drv_tdls_recv_channel_switch, ) ); -#ifdef CONFIG_MAC80211_MESSAGE_TRACING -#undef TRACE_SYSTEM -#define TRACE_SYSTEM mac80211_msg - -#define MAX_MSG_LEN 100 - -DECLARE_EVENT_CLASS(mac80211_msg_event, - TP_PROTO(struct va_format *vaf), +TRACE_EVENT(drv_wake_tx_queue, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct txq_info *txq), - TP_ARGS(vaf), + TP_ARGS(local, sdata, txq), TP_STRUCT__entry( - __dynamic_array(char, msg, MAX_MSG_LEN) + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u8, ac) + __field(u8, tid) ), TP_fast_assign( - WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), - MAX_MSG_LEN, vaf->fmt, - *vaf->va) >= MAX_MSG_LEN); - ), + struct ieee80211_sta *sta = txq->txq.sta; - TP_printk("%s", __get_str(msg)) -); + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->ac = txq->txq.ac; + __entry->tid = txq->txq.tid; + ), -DEFINE_EVENT(mac80211_msg_event, mac80211_info, - TP_PROTO(struct va_format *vaf), - TP_ARGS(vaf) -); -DEFINE_EVENT(mac80211_msg_event, mac80211_dbg, - TP_PROTO(struct 
va_format *vaf), - TP_ARGS(vaf) -); -DEFINE_EVENT(mac80211_msg_event, mac80211_err, - TP_PROTO(struct va_format *vaf), - TP_ARGS(vaf) + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " ac:%d tid:%d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid + ) ); -#endif #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h new file mode 100644 index 000000000000..768f7c22a190 --- /dev/null +++ b/net/mac80211/trace_msg.h @@ -0,0 +1,53 @@ +#ifdef CONFIG_MAC80211_MESSAGE_TRACING + +#if !defined(__MAC80211_MSG_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __MAC80211_MSG_DRIVER_TRACE + +#include <linux/tracepoint.h> +#include <net/mac80211.h> +#include "ieee80211_i.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mac80211_msg + +#define MAX_MSG_LEN 100 + +DECLARE_EVENT_CLASS(mac80211_msg_event, + TP_PROTO(struct va_format *vaf), + + TP_ARGS(vaf), + + TP_STRUCT__entry( + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + + TP_fast_assign( + WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= MAX_MSG_LEN); + ), + + TP_printk("%s", __get_str(msg)) +); + +DEFINE_EVENT(mac80211_msg_event, mac80211_info, + TP_PROTO(struct va_format *vaf), + TP_ARGS(vaf) +); +DEFINE_EVENT(mac80211_msg_event, mac80211_dbg, + TP_PROTO(struct va_format *vaf), + TP_ARGS(vaf) +); +DEFINE_EVENT(mac80211_msg_event, mac80211_err, + TP_PROTO(struct va_format *vaf), + TP_ARGS(vaf) +); +#endif /* !__MAC80211_MSG_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_msg +#include <trace/define_trace.h> + +#endif diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 07bd8db00af8..b8233505bf9f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -20,7 +20,6 @@ #include <linux/bitmap.h> #include <linux/rcupdate.h> #include <linux/export.h> -#include <linux/time.h> #include <net/net_namespace.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> @@ -38,6 +37,16 @@ /* misc utils */ +static inline void ieee80211_tx_stats(struct net_device *dev, u32 len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_packets++; + tstats->tx_bytes += len; + u64_stats_update_end(&tstats->syncp); +} + static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct sk_buff *skb, int group_addr, int next_frag_len) @@ -202,11 +211,11 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) struct ieee80211_if_managed *ifmgd; /* driver doesn't support power save */ - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) + if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) return TX_CONTINUE; /* hardware does dynamic power save */ - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS) + if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) return TX_CONTINUE; /* dynamic power save disabled */ @@ -422,7 +431,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) if (ieee80211_is_probe_req(hdr->frame_control)) return TX_CONTINUE; - if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) + if (ieee80211_hw_check(&tx->local->hw, QUEUE_CONTROL)) info->hw_queue = tx->sdata->vif.cab_queue; /* no stations in PS mode */ @@ -432,7 +441,7 @@ ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx) info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM; /* device releases frame after DTIM beacon */ - if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING)) + if 
(!ieee80211_hw_check(&tx->local->hw, HOST_BROADCAST_PS_BUFFERING)) return TX_CONTINUE; /* buffered in mac80211 */ @@ -595,23 +604,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) else if (!is_multicast_ether_addr(hdr->addr1) && (key = rcu_dereference(tx->sdata->default_unicast_key))) tx->key = key; - else if (info->flags & IEEE80211_TX_CTL_INJECTED) - tx->key = NULL; - else if (!tx->sdata->drop_unencrypted) - tx->key = NULL; - else if (tx->skb->protocol == tx->sdata->control_port_protocol) - tx->key = NULL; - else if (ieee80211_is_robust_mgmt_frame(tx->skb) && - !(ieee80211_is_action(hdr->frame_control) && - tx->sta && test_sta_flag(tx->sta, WLAN_STA_MFP))) - tx->key = NULL; - else if (ieee80211_is_mgmt(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(tx->skb)) + else tx->key = NULL; - else { - I802_DEBUG_INC(tx->local->tx_handlers_drop_unencrypted); - return TX_DROP; - } if (tx->key) { bool skip_hw = false; @@ -783,12 +777,22 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) return TX_CONTINUE; } +static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) +{ + u16 *seq = &sta->tid_seq[tid]; + __le16 ret = cpu_to_le16(*seq); + + /* Increase the sequence number. */ + *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ; + + return ret; +} + static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; - u16 *seq; u8 *qc; int tid; @@ -839,13 +843,10 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) qc = ieee80211_get_qos_ctl(hdr); tid = *qc & IEEE80211_QOS_CTL_TID_MASK; - seq = &tx->sta->tid_seq[tid]; tx->sta->tx_msdu[tid]++; - hdr->seq_ctrl = cpu_to_le16(*seq); - - /* Increase the sequence number. 
*/ - *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ; + if (!tx->sta->sta.txq[0]) + hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; } @@ -996,7 +997,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) skb_queue_walk(&tx->skbs, skb) { ac = skb_get_queue_mapping(skb); - tx->sta->tx_fragments++; tx->sta->tx_bytes[ac] += skb->len; } if (ac >= 0) @@ -1086,7 +1086,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, * nothing -- this aggregation session is being started * but that might still fail with the driver */ - } else { + } else if (!tx->sta->sta.txq[tid]) { spin_lock(&tx->sta->lock); /* * Need to re-check now, because we may get here @@ -1117,7 +1117,9 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, queued = true; info->control.vif = &tx->sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; - info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS | + IEEE80211_TX_CTL_NO_PS_BUFFER | + IEEE80211_TX_STATUS_EOSP; __skb_queue_tail(&tid_tx->pending, skb); if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER) purge_skb = __skb_dequeue(&tid_tx->pending); @@ -1137,11 +1139,13 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, /* * initialises @tx + * pass %NULL for the station if unknown, a valid pointer if known + * or an ERR_PTR() if the station is known not to exist */ static ieee80211_tx_result ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_data *tx, - struct sk_buff *skb) + struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr; @@ -1164,22 +1168,27 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, hdr = (struct ieee80211_hdr *) skb->data; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { - tx->sta = rcu_dereference(sdata->u.vlan.sta); - if (!tx->sta && sdata->dev->ieee80211_ptr->use_4addr) - return TX_DROP; - } else if (info->flags & 
(IEEE80211_TX_CTL_INJECTED | - IEEE80211_TX_INTFL_NL80211_FRAME_TX) || - tx->sdata->control_port_protocol == tx->skb->protocol) { - tx->sta = sta_info_get_bss(sdata, hdr->addr1); + if (likely(sta)) { + if (!IS_ERR(sta)) + tx->sta = sta; + } else { + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + tx->sta = rcu_dereference(sdata->u.vlan.sta); + if (!tx->sta && sdata->wdev.use_4addr) + return TX_DROP; + } else if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_INJECTED) || + tx->sdata->control_port_protocol == tx->skb->protocol) { + tx->sta = sta_info_get_bss(sdata, hdr->addr1); + } + if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) + tx->sta = sta_info_get(sdata, hdr->addr1); } - if (!tx->sta) - tx->sta = sta_info_get(sdata, hdr->addr1); if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && !ieee80211_is_qos_nullfunc(hdr->frame_control) && - (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) && - !(local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) { + ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && + !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; qc = ieee80211_get_qos_ctl(hdr); @@ -1220,13 +1229,102 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } +static void ieee80211_drv_tx(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_control control = { + .sta = pubsta, + }; + struct ieee80211_txq *txq = NULL; + struct txq_info *txqi; + u8 ac; + + if (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE) + goto tx_normal; + + if (!ieee80211_is_data(hdr->frame_control)) + goto tx_normal; + + if (pubsta) { + u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + 
} else if (vif) { + txq = vif->txq; + } + + if (!txq) + goto tx_normal; + + ac = txq->ac; + txqi = to_txq_info(txq); + atomic_inc(&sdata->txqs_len[ac]); + if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending) + netif_stop_subqueue(sdata->dev, ac); + + skb_queue_tail(&txqi->queue, skb); + drv_wake_tx_queue(local, txqi); + + return; + +tx_normal: + drv_tx(local, &control, skb); +} + +struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); + struct txq_info *txqi = container_of(txq, struct txq_info, txq); + struct ieee80211_hdr *hdr; + struct sk_buff *skb = NULL; + u8 ac = txq->ac; + + spin_lock_bh(&txqi->queue.lock); + + if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) + goto out; + + skb = __skb_dequeue(&txqi->queue); + if (!skb) + goto out; + + atomic_dec(&sdata->txqs_len[ac]); + if (__netif_subqueue_stopped(sdata->dev, ac)) + ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]); + + hdr = (struct ieee80211_hdr *)skb->data; + if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { + struct sta_info *sta = container_of(txq->sta, struct sta_info, + sta); + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid); + if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) + info->flags |= IEEE80211_TX_CTL_AMPDU; + else + info->flags &= ~IEEE80211_TX_CTL_AMPDU; + } + +out: + spin_unlock_bh(&txqi->queue.lock); + + return skb; +} +EXPORT_SYMBOL(ieee80211_tx_dequeue); + static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff_head *skbs, bool txpending) { - struct ieee80211_tx_control control; struct sk_buff *skb, *tmp; unsigned long flags; @@ -1284,10 +1382,9 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, 
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); info->control.vif = vif; - control.sta = sta; __skb_unlink(skb, skbs); - drv_tx(local, &control, skb); + ieee80211_drv_tx(local, vif, sta, skb); } return true; @@ -1334,7 +1431,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; - } else if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { dev_kfree_skb(skb); return true; } else @@ -1380,7 +1477,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_ps_buf); CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); - if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { @@ -1395,7 +1492,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) /* handlers after fragment must be aware of tx info fragmentation! 
*/ CALL_TXH(ieee80211_tx_h_stats); CALL_TXH(ieee80211_tx_h_encrypt); - if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) + if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_calculate_duration); #undef CALL_TXH @@ -1422,8 +1519,9 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_tx_data tx; + struct sk_buff *skb2; - if (ieee80211_tx_prepare(sdata, &tx, skb) == TX_DROP) + if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) return false; info->band = band; @@ -1440,6 +1538,14 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, *sta = NULL; } + /* this function isn't suitable for fragmented data frames */ + skb2 = __skb_dequeue(&tx.skbs); + if (WARN_ON(skb2 != skb || !skb_queue_empty(&tx.skbs))) { + ieee80211_free_txskb(hw, skb2); + ieee80211_purge_tx_queue(hw, &tx.skbs); + return false; + } + return true; } EXPORT_SYMBOL(ieee80211_tx_prepare_skb); @@ -1448,7 +1554,8 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb); * Returns false if the frame couldn't be transmitted but was queued instead. 
*/ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, bool txpending) + struct sta_info *sta, struct sk_buff *skb, + bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; @@ -1464,7 +1571,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, /* initialises tx */ led_len = skb->len; - res_prepare = ieee80211_tx_prepare(sdata, &tx, skb); + res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); @@ -1475,7 +1582,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, /* set up hw_queue value early */ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) || - !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) + !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; @@ -1502,9 +1609,9 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } if (skb_cloned(skb) && - (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || + (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) || !skb_clone_writable(skb, ETH_HLEN) || - sdata->crypto_tx_tailroom_needed_cnt)) + (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt))) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); @@ -1520,7 +1627,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, return 0; } -void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) +void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -1555,7 +1663,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) } ieee80211_set_qos_hdr(sdata, skb); - ieee80211_tx(sdata, skb, false); + ieee80211_tx(sdata, sta, skb, 
false); } static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb) @@ -1776,7 +1884,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, goto fail_rcu; info->band = chandef->chan->band; - ieee80211_xmit(sdata, skb); + ieee80211_xmit(sdata, NULL, skb); rcu_read_unlock(); return NETDEV_TX_OK; @@ -1788,21 +1896,89 @@ fail: return NETDEV_TX_OK; /* meaning, we dealt with the skb */ } -/* - * Measure Tx frame arrival time for Tx latency statistics calculation - * A single Tx frame latency should be measured from when it is entering the - * Kernel until we receive Tx complete confirmation indication and the skb is - * freed. - */ -static void ieee80211_tx_latency_start_msrmnt(struct ieee80211_local *local, - struct sk_buff *skb) +static inline bool ieee80211_is_tdls_setup(struct sk_buff *skb) { - struct ieee80211_tx_latency_bin_ranges *tx_latency; + u16 ethertype = (skb->data[12] << 8) | skb->data[13]; - tx_latency = rcu_dereference(local->tx_latency); - if (!tx_latency) - return; - skb->tstamp = ktime_get(); + return ethertype == ETH_P_TDLS && + skb->len > 14 && + skb->data[14] == WLAN_TDLS_SNAP_RFTYPE; +} + +static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct sta_info **sta_out) +{ + struct sta_info *sta; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + sta = rcu_dereference(sdata->u.vlan.sta); + if (sta) { + *sta_out = sta; + return 0; + } else if (sdata->wdev.use_4addr) { + return -ENOLINK; + } + /* fall through */ + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_OCB: + case NL80211_IFTYPE_ADHOC: + if (is_multicast_ether_addr(skb->data)) { + *sta_out = ERR_PTR(-ENOENT); + return 0; + } + sta = sta_info_get_bss(sdata, skb->data); + break; + case NL80211_IFTYPE_WDS: + sta = sta_info_get(sdata, sdata->u.wds.remote_addr); + break; +#ifdef CONFIG_MAC80211_MESH + case NL80211_IFTYPE_MESH_POINT: + /* determined much later */ + *sta_out = NULL; + return 0; +#endif + case 
NL80211_IFTYPE_STATION: + if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { + sta = sta_info_get(sdata, skb->data); + if (sta) { + bool tdls_peer, tdls_auth; + + tdls_peer = test_sta_flag(sta, + WLAN_STA_TDLS_PEER); + tdls_auth = test_sta_flag(sta, + WLAN_STA_TDLS_PEER_AUTH); + + if (tdls_peer && tdls_auth) { + *sta_out = sta; + return 0; + } + + /* + * TDLS link during setup - throw out frames to + * peer. Allow TDLS-setup frames to unauthorized + * peers for the special case of a link teardown + * after a TDLS sta is removed due to being + * unreachable. + */ + if (tdls_peer && !tdls_auth && + !ieee80211_is_tdls_setup(skb)) + return -EINVAL; + } + + } + + sta = sta_info_get(sdata, sdata->u.mgd.bssid); + if (!sta) + return -ENOLINK; + break; + default: + return -EINVAL; + } + + *sta_out = sta ?: ERR_PTR(-ENOENT); + return 0; } /** @@ -1824,7 +2000,8 @@ static void ieee80211_tx_latency_start_msrmnt(struct ieee80211_local *local, * Returns: the (possibly reallocated) skb or an ERR_PTR() code */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, u32 info_flags) + struct sk_buff *skb, u32 info_flags, + struct sta_info *sta) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; @@ -1837,9 +2014,8 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, const u8 *encaps_data; int encaps_len, skip_header_bytes; int nh_pos, h_pos; - struct sta_info *sta = NULL; - bool wme_sta = false, authorized = false, tdls_auth = false; - bool tdls_peer = false, tdls_setup_frame = false; + bool wme_sta = false, authorized = false; + bool tdls_peer; bool multicast; u16 info_id = 0; struct ieee80211_chanctx_conf *chanctx_conf; @@ -1847,6 +2023,9 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, enum ieee80211_band band; int ret; + if (IS_ERR(sta)) + sta = NULL; + /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ 
ethertype = (skb->data[12] << 8) | skb->data[13]; @@ -1854,8 +2033,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: - sta = rcu_dereference(sdata->u.vlan.sta); - if (sta) { + if (sdata->wdev.use_4addr) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN); @@ -1874,7 +2052,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } band = chanctx_conf->def.chan->band; - if (sta) + if (sdata->wdev.use_4addr) break; /* fall through */ case NL80211_IFTYPE_AP: @@ -1978,38 +2156,10 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, break; #endif case NL80211_IFTYPE_STATION: - if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { - sta = sta_info_get(sdata, skb->data); - if (sta) { - authorized = test_sta_flag(sta, - WLAN_STA_AUTHORIZED); - wme_sta = sta->sta.wme; - tdls_peer = test_sta_flag(sta, - WLAN_STA_TDLS_PEER); - tdls_auth = test_sta_flag(sta, - WLAN_STA_TDLS_PEER_AUTH); - } - - if (tdls_peer) - tdls_setup_frame = - ethertype == ETH_P_TDLS && - skb->len > 14 && - skb->data[14] == WLAN_TDLS_SNAP_RFTYPE; - } + /* we already did checks when looking up the RA STA */ + tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER); - /* - * TDLS link during setup - throw out frames to peer. We allow - * TDLS-setup frames to unauthorized peers for the special case - * of a link teardown after a TDLS sta is removed due to being - * unreachable. 
- */ - if (tdls_peer && !tdls_auth && !tdls_setup_frame) { - ret = -EINVAL; - goto free; - } - - /* send direct packets to authorized TDLS peers */ - if (tdls_peer && tdls_auth) { + if (tdls_peer) { /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); @@ -2071,26 +2221,19 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } - /* - * There's no need to try to look up the destination - * if it is a multicast address (which can only happen - * in AP mode) - */ multicast = is_multicast_ether_addr(hdr.addr1); - if (!multicast) { - sta = sta_info_get(sdata, hdr.addr1); - if (sta) { - authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = sta->sta.wme; - } - } - /* For mesh, the use of the QoS header is mandatory */ - if (ieee80211_vif_is_mesh(&sdata->vif)) + /* sta is always NULL for mesh */ + if (sta) { + authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); + wme_sta = sta->sta.wme; + } else if (ieee80211_vif_is_mesh(&sdata->vif)) { + /* For mesh, the use of the QoS header is mandatory */ wme_sta = true; + } - /* receiver and we are QoS enabled, use a QoS type frame */ - if (wme_sta && local->hw.queues >= IEEE80211_NUM_ACS) { + /* receiver does QoS (which also means we do) use it */ + if (wme_sta) { fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); hdrlen += 2; } @@ -2255,12 +2398,455 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, return ERR_PTR(ret); } +/* + * fast-xmit overview + * + * The core idea of this fast-xmit is to remove per-packet checks by checking + * them out of band. ieee80211_check_fast_xmit() implements the out-of-band + * checks that are needed to get the sta->fast_tx pointer assigned, after which + * much less work can be done per packet. For example, fragmentation must be + * disabled or the fast_tx pointer will not be set. All the conditions are seen + * in the code here. 
+ * + * Once assigned, the fast_tx data structure also caches the per-packet 802.11 + * header and other data to aid packet processing in ieee80211_xmit_fast(). + * + * The most difficult part of this is that when any of these assumptions + * change, an external trigger (i.e. a call to ieee80211_clear_fast_xmit(), + * ieee80211_check_fast_xmit() or friends) is required to reset the data, + * since the per-packet code no longer checks the conditions. This is reflected + * by the calls to these functions throughout the rest of the code, and must be + * maintained if any of the TX path checks change. + */ + +void ieee80211_check_fast_xmit(struct sta_info *sta) +{ + struct ieee80211_fast_tx build = {}, *fast_tx = NULL, *old; + struct ieee80211_local *local = sta->local; + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_hdr *hdr = (void *)build.hdr; + struct ieee80211_chanctx_conf *chanctx_conf; + __le16 fc; + + if (!ieee80211_hw_check(&local->hw, SUPPORT_FAST_XMIT)) + return; + + /* Locking here protects both the pointer itself, and against concurrent + * invocations winning data access races to, e.g., the key pointer that + * is used. + * Without it, the invocation of this function right after the key + * pointer changes wouldn't be sufficient, as another CPU could access + * the pointer, then stall, and then do the cache update after the CPU + * that invalidated the key. + * With the locking, such scenarios cannot happen as the check for the + * key and the fast-tx assignment are done atomically, so the CPU that + * modifies the key will either wait or other one will see the key + * cleared/changed already. 
+ */ + spin_lock_bh(&sta->lock); + if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && + sdata->vif.type == NL80211_IFTYPE_STATION) + goto out; + + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + goto out; + + if (test_sta_flag(sta, WLAN_STA_PS_STA) || + test_sta_flag(sta, WLAN_STA_PS_DRIVER) || + test_sta_flag(sta, WLAN_STA_PS_DELIVER)) + goto out; + + if (sdata->noack_map) + goto out; + + /* fast-xmit doesn't handle fragmentation at all */ + if (local->hw.wiphy->frag_threshold != (u32)-1 && + !local->ops->set_frag_threshold) + goto out; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) { + rcu_read_unlock(); + goto out; + } + build.band = chanctx_conf->def.chan->band; + rcu_read_unlock(); + + fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_ADHOC: + /* DA SA BSSID */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + memcpy(hdr->addr3, sdata->u.ibss.bssid, ETH_ALEN); + build.hdr_len = 24; + break; + case NL80211_IFTYPE_STATION: + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + /* DA SA BSSID */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + memcpy(hdr->addr3, sdata->u.mgd.bssid, ETH_ALEN); + build.hdr_len = 24; + break; + } + + if (sdata->u.mgd.use_4addr) { + /* non-regular ethertype cannot use the fastpath */ + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr4); + build.hdr_len = 30; + break; + } + fc |= cpu_to_le16(IEEE80211_FCTL_TODS); + /* BSSID SA DA */ + memcpy(hdr->addr1, sdata->u.mgd.bssid, ETH_ALEN); + 
build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr2); + build.hdr_len = 24; + break; + case NL80211_IFTYPE_AP_VLAN: + if (sdata->wdev.use_4addr) { + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS); + /* RA TA DA SA */ + memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.da_offs = offsetof(struct ieee80211_hdr, addr3); + build.sa_offs = offsetof(struct ieee80211_hdr, addr4); + build.hdr_len = 30; + break; + } + /* fall through */ + case NL80211_IFTYPE_AP: + fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); + /* DA BSSID SA */ + build.da_offs = offsetof(struct ieee80211_hdr, addr1); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + build.sa_offs = offsetof(struct ieee80211_hdr, addr3); + build.hdr_len = 24; + break; + default: + /* not handled on fast-xmit */ + goto out; + } + + if (sta->sta.wme) { + build.hdr_len += 2; + fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); + } + + /* We store the key here so there's no point in using rcu_dereference() + * but that's fine because the code that changes the pointers will call + * this function after doing so. For a single CPU that would be enough, + * for multiple see the comment above. 
+ */ + build.key = rcu_access_pointer(sta->ptk[sta->ptk_idx]); + if (!build.key) + build.key = rcu_access_pointer(sdata->default_unicast_key); + if (build.key) { + bool gen_iv, iv_spc, mmic; + + gen_iv = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV; + iv_spc = build.key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE; + mmic = build.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC; + + /* don't handle software crypto */ + if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + goto out; + + switch (build.key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + /* add fixed key ID */ + if (gen_iv) { + (build.hdr + build.hdr_len)[3] = + 0x20 | (build.key->conf.keyidx << 6); + build.pn_offs = build.hdr_len; + } + if (gen_iv || iv_spc) + build.hdr_len += IEEE80211_CCMP_HDR_LEN; + break; + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + /* add fixed key ID */ + if (gen_iv) { + (build.hdr + build.hdr_len)[3] = + 0x20 | (build.key->conf.keyidx << 6); + build.pn_offs = build.hdr_len; + } + if (gen_iv || iv_spc) + build.hdr_len += IEEE80211_GCMP_HDR_LEN; + break; + case WLAN_CIPHER_SUITE_TKIP: + /* cannot handle MMIC or IV generation in xmit-fast */ + if (mmic || gen_iv) + goto out; + if (iv_spc) + build.hdr_len += IEEE80211_TKIP_IV_LEN; + break; + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + /* cannot handle IV generation in fast-xmit */ + if (gen_iv) + goto out; + if (iv_spc) + build.hdr_len += IEEE80211_WEP_IV_LEN; + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + WARN(1, + "management cipher suite 0x%x enabled for data\n", + build.key->conf.cipher); + goto out; + default: + /* we don't know how to generate IVs for this at all */ + if (WARN_ON(gen_iv)) + goto out; + /* pure hardware keys are OK, of course */ + if (!(build.key->flags & KEY_FLAG_CIPHER_SCHEME)) + break; + /* cipher 
scheme might require space allocation */ + if (iv_spc && + build.key->conf.iv_len > IEEE80211_FAST_XMIT_MAX_IV) + goto out; + if (iv_spc) + build.hdr_len += build.key->conf.iv_len; + } + + fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); + } + + hdr->frame_control = fc; + + memcpy(build.hdr + build.hdr_len, + rfc1042_header, sizeof(rfc1042_header)); + build.hdr_len += sizeof(rfc1042_header); + + fast_tx = kmemdup(&build, sizeof(build), GFP_ATOMIC); + /* if the kmemdup fails, continue w/o fast_tx */ + if (!fast_tx) + goto out; + + out: + /* we might have raced against another call to this function */ + old = rcu_dereference_protected(sta->fast_tx, + lockdep_is_held(&sta->lock)); + rcu_assign_pointer(sta->fast_tx, fast_tx); + if (old) + kfree_rcu(old, rcu_head); + spin_unlock_bh(&sta->lock); +} + +void ieee80211_check_fast_xmit_all(struct ieee80211_local *local) +{ + struct sta_info *sta; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) + ieee80211_check_fast_xmit(sta); + rcu_read_unlock(); +} + +void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata && + (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) + continue; + ieee80211_check_fast_xmit(sta); + } + + rcu_read_unlock(); +} + +void ieee80211_clear_fast_xmit(struct sta_info *sta) +{ + struct ieee80211_fast_tx *fast_tx; + + spin_lock_bh(&sta->lock); + fast_tx = rcu_dereference_protected(sta->fast_tx, + lockdep_is_held(&sta->lock)); + RCU_INIT_POINTER(sta->fast_tx, NULL); + spin_unlock_bh(&sta->lock); + + if (fast_tx) + kfree_rcu(fast_tx, rcu_head); +} + +static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct net_device *dev, struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + u16 
ethertype = (skb->data[12] << 8) | skb->data[13]; + int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); + int hw_headroom = sdata->local->hw.extra_tx_headroom; + struct ethhdr eth; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; + struct tid_ampdu_tx *tid_tx = NULL; + u8 tid = IEEE80211_NUM_TIDS; + + /* control port protocol needs a lot of special handling */ + if (cpu_to_be16(ethertype) == sdata->control_port_protocol) + return false; + + /* only RFC 1042 SNAP */ + if (ethertype < ETH_P_802_3_MIN) + return false; + + /* don't handle TX status request here either */ + if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + return false; + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx) { + if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) + return false; + if (tid_tx->timeout) + tid_tx->last_tx = jiffies; + } + } + + /* after this point (skb is modified) we cannot return false */ + + if (skb_shared(skb)) { + struct sk_buff *tmp_skb = skb; + + skb = skb_clone(skb, GFP_ATOMIC); + kfree_skb(tmp_skb); + + if (!skb) + return true; + } + + ieee80211_tx_stats(dev, skb->len + extra_head); + + /* will not be crypto-handled beyond what we do here, so use false + * as the may-encrypt argument for the resize to not account for + * more room than we already have in 'extra_head' + */ + if (unlikely(ieee80211_skb_resize(sdata, skb, + max_t(int, extra_head + hw_headroom - + skb_headroom(skb), 0), + false))) { + kfree_skb(skb); + return true; + } + + memcpy(&eth, skb->data, ETH_HLEN - 2); + hdr = (void *)skb_push(skb, extra_head); + memcpy(skb->data, fast_tx->hdr, fast_tx->hdr_len); + memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); + memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); + + 
memset(info, 0, sizeof(*info)); + info->band = fast_tx->band; + info->control.vif = &sdata->vif; + info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | + IEEE80211_TX_CTL_DONTFRAG | + (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + *ieee80211_get_qos_ctl(hdr) = tid; + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); + } else { + info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; + hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); + sdata->sequence_number += 0x10; + } + + sta->tx_msdu[tid]++; + + info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + + __skb_queue_head_init(&tx.skbs); + + tx.flags = IEEE80211_TX_UNICAST; + tx.local = local; + tx.sdata = sdata; + tx.sta = sta; + tx.key = fast_tx->key; + + if (fast_tx->key) + info->control.hw_key = &fast_tx->key->conf; + + if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { + tx.skb = skb; + r = ieee80211_tx_h_rate_ctrl(&tx); + skb = tx.skb; + tx.skb = NULL; + + if (r != TX_CONTINUE) { + if (r != TX_QUEUED) + kfree_skb(skb); + return true; + } + } + + /* statistics normally done by ieee80211_tx_h_stats (but that + * has to consider fragmentation, so is more complex) + */ + sta->tx_bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->tx_packets[skb_get_queue_mapping(skb)]++; + + if (fast_tx->pn_offs) { + u64 pn; + u8 *crypto_hdr = skb->data + fast_tx->pn_offs; + + switch (fast_tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn); + crypto_hdr[0] = pn; + crypto_hdr[1] = pn >> 8; + crypto_hdr[4] = pn >> 16; + crypto_hdr[5] = pn >> 24; + crypto_hdr[6] = pn >> 32; + crypto_hdr[7] = pn >> 40; + break; + } + } + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + + __skb_queue_tail(&tx.skbs, skb); + ieee80211_tx_frags(local, 
&sdata->vif, &sta->sta, &tx.skbs, false); + return true; +} + void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + struct sk_buff *next; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -2269,18 +2855,67 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, rcu_read_lock(); - /* Measure frame arrival for Tx latency statistics calculation */ - ieee80211_tx_latency_start_msrmnt(local, skb); + if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) + goto out_free; - skb = ieee80211_build_hdr(sdata, skb, info_flags); - if (IS_ERR(skb)) - goto out; + if (!IS_ERR_OR_NULL(sta)) { + struct ieee80211_fast_tx *fast_tx; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - dev->trans_start = jiffies; + fast_tx = rcu_dereference(sta->fast_tx); - ieee80211_xmit(sdata, skb); + if (fast_tx && + ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb)) + goto out; + } + + if (skb_is_gso(skb)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, 0); + if (IS_ERR(segs)) { + goto out_free; + } else if (segs) { + consume_skb(skb); + skb = segs; + } + } else { + /* we cannot process non-linear frames on this path */ + if (skb_linearize(skb)) { + kfree_skb(skb); + goto out; + } + + /* the frame could be fragmented, software-encrypted, and other + * things so we cannot really handle checksum offload with it - + * fix it up in software before we handle anything else. 
+ */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_checksum_help(skb)) + goto out_free; + } + } + + next = skb; + while (next) { + skb = next; + next = skb->next; + + skb->prev = NULL; + skb->next = NULL; + + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); + if (IS_ERR(skb)) + goto out; + + ieee80211_tx_stats(dev, skb->len); + + ieee80211_xmit(sdata, sta, skb); + } + goto out; + out_free: + kfree_skb(skb); out: rcu_read_unlock(); } @@ -2308,10 +2943,17 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, .local = sdata->local, .sdata = sdata, }; + struct sta_info *sta; rcu_read_lock(); - skb = ieee80211_build_hdr(sdata, skb, info_flags); + if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) { + kfree_skb(skb); + skb = ERR_PTR(-EINVAL); + goto out; + } + + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); if (IS_ERR(skb)) goto out; @@ -2369,7 +3011,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, return true; } info->band = chanctx_conf->def.chan->band; - result = ieee80211_tx(sdata, skb, true); + result = ieee80211_tx(sdata, NULL, skb, true); } else { struct sk_buff_head skbs; @@ -3107,7 +3749,7 @@ ieee80211_get_buffered_bc(struct ieee80211_hw *hw, if (sdata->vif.type == NL80211_IFTYPE_AP) sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev); - if (!ieee80211_tx_prepare(sdata, &tx, skb)) + if (!ieee80211_tx_prepare(sdata, &tx, NULL, skb)) break; dev_kfree_skb_any(skb); } @@ -3167,7 +3809,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) synchronize_net(); /* Tear down BA sessions so we stop aggregating on this TID */ - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_LOCAL_REQUEST); @@ -3181,7 +3823,7 @@ int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid) 
ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_RESERVE_TID); - if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ret = 0; @@ -3239,6 +3881,6 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; - ieee80211_xmit(sdata, skb); + ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 747bdcf72e92..43e5aadd7a89 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -308,6 +308,11 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) for (ac = 0; ac < n_acs; ac++) { int ac_queue = sdata->vif.hw_queue[ac]; + if (local->ops->wake_tx_queue && + (atomic_read(&sdata->txqs_len[ac]) > + local->hw.txq_ac_max_pending)) + continue; + if (ac_queue == queue || (sdata->vif.cab_queue == queue && local->queue_stop_reasons[ac_queue] == 0 && @@ -559,7 +564,7 @@ ieee80211_get_vif_queues(struct ieee80211_local *local, { unsigned int queues; - if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { + if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { int ac; queues = 0; @@ -587,7 +592,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, * If no queue was set, or if the HW doesn't support * IEEE80211_HW_QUEUE_CONTROL - flush all queues */ - if (!queues || !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) + if (!queues || !ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) queues = ieee80211_get_vif_queues(local, sdata); ieee80211_stop_queues_by_reason(&local->hw, queues, @@ -625,13 +630,14 @@ void ieee80211_wake_vif_queues(struct ieee80211_local *local, reason, true); } -static void __iterate_active_interfaces(struct ieee80211_local *local, - u32 iter_flags, - void (*iterator)(void *data, u8 *mac, - struct ieee80211_vif *vif), - void *data) +static void 
__iterate_interfaces(struct ieee80211_local *local, + u32 iter_flags, + void (*iterator)(void *data, u8 *mac, + struct ieee80211_vif *vif), + void *data) { struct ieee80211_sub_if_data *sdata; + bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { @@ -645,9 +651,9 @@ static void __iterate_active_interfaces(struct ieee80211_local *local, break; } if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && - !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) continue; - if (ieee80211_sdata_running(sdata)) + if (ieee80211_sdata_running(sdata) || !active_only) iterator(data, sdata->vif.addr, &sdata->vif); } @@ -656,12 +662,12 @@ static void __iterate_active_interfaces(struct ieee80211_local *local, lockdep_is_held(&local->iflist_mtx) || lockdep_rtnl_is_held()); if (sdata && - (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || + (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); } -void ieee80211_iterate_active_interfaces( +void ieee80211_iterate_interfaces( struct ieee80211_hw *hw, u32 iter_flags, void (*iterator)(void *data, u8 *mac, struct ieee80211_vif *vif), @@ -670,10 +676,10 @@ void ieee80211_iterate_active_interfaces( struct ieee80211_local *local = hw_to_local(hw); mutex_lock(&local->iflist_mtx); - __iterate_active_interfaces(local, iter_flags, iterator, data); + __iterate_interfaces(local, iter_flags, iterator, data); mutex_unlock(&local->iflist_mtx); } -EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces); +EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces); void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_hw *hw, u32 iter_flags, @@ -684,7 +690,8 @@ void ieee80211_iterate_active_interfaces_atomic( struct ieee80211_local *local = hw_to_local(hw); rcu_read_lock(); - __iterate_active_interfaces(local, 
iter_flags, iterator, data); + __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, + iterator, data); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); @@ -699,7 +706,8 @@ void ieee80211_iterate_active_interfaces_rtnl( ASSERT_RTNL(); - __iterate_active_interfaces(local, iter_flags, iterator, data); + __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, + iterator, data); } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_rtnl); @@ -742,6 +750,18 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) } EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); +struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (!ieee80211_sdata_running(sdata) || + !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + return NULL; + + return &sdata->wdev; +} +EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); + /* * Nothing should have been stuffed into the workqueue during * the suspend->resume cycle. Since we can't check each caller @@ -1811,8 +1831,25 @@ int ieee80211_reconfig(struct ieee80211_local *local) list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && sdata->vif.type != NL80211_IFTYPE_MONITOR && - ieee80211_sdata_running(sdata)) + ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); + if (WARN_ON(res)) + break; + } + } + + /* If adding any of the interfaces failed above, roll back and + * report failure. 
+ */ + if (res) { + list_for_each_entry_continue_reverse(sdata, &local->interfaces, + list) + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && + sdata->vif.type != NL80211_IFTYPE_MONITOR && + ieee80211_sdata_running(sdata)) + drv_remove_interface(local, sdata); + ieee80211_handle_reconfig_failure(local); + return res; } /* add channel contexts */ @@ -1986,6 +2023,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* add back keys */ list_for_each_entry(sdata, &local->interfaces, list) + ieee80211_reset_crypto_tx_tailroom(sdata); + + list_for_each_entry(sdata, &local->interfaces, list) if (ieee80211_sdata_running(sdata)) ieee80211_enable_keys(sdata); @@ -2006,7 +2046,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) * about the sessions, but we and the AP still think they * are active. This is really a workaround though. */ - if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { + if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) { mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { @@ -2157,46 +2197,6 @@ void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } -static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) -{ - int i; - - for (i = 0; i < n_ids; i++) - if (ids[i] == id) - return true; - return false; -} - -size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, - const u8 *ids, int n_ids, - const u8 *after_ric, int n_after_ric, - size_t offset) -{ - size_t pos = offset; - - while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) { - if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { - pos += 2 + ies[pos + 1]; - - while (pos < ielen && - !ieee80211_id_in_list(after_ric, n_after_ric, - ies[pos])) - pos += 2 + ies[pos + 1]; - } else { - pos += 2 + ies[pos + 1]; - } - } - - return pos; -} - -size_t ieee80211_ie_split(const u8 *ies, size_t ielen, - const u8 *ids, int n_ids, size_t offset) -{ - return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, 
NULL, 0, offset); -} -EXPORT_SYMBOL(ieee80211_ie_split); - size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) { size_t pos = offset; @@ -2344,6 +2344,41 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, return pos + sizeof(struct ieee80211_ht_operation); } +u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, + const struct cfg80211_chan_def *chandef) +{ + struct ieee80211_vht_operation *vht_oper; + + *pos++ = WLAN_EID_VHT_OPERATION; + *pos++ = sizeof(struct ieee80211_vht_operation); + vht_oper = (struct ieee80211_vht_operation *)pos; + vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel( + chandef->center_freq1); + if (chandef->center_freq2) + vht_oper->center_freq_seg2_idx = + ieee80211_frequency_to_channel(chandef->center_freq2); + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_160: + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_160MHZ; + break; + case NL80211_CHAN_WIDTH_80P80: + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ; + break; + case NL80211_CHAN_WIDTH_80: + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; + break; + default: + vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; + break; + } + + /* don't require special VHT peer rates */ + vht_oper->basic_mcs_set = cpu_to_le16(0xffff); + + return pos + sizeof(struct ieee80211_vht_operation); +} + void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef) @@ -2373,6 +2408,39 @@ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, cfg80211_chandef_create(chandef, control_chan, channel_type); } +void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan, + const struct ieee80211_vht_operation *oper, + struct cfg80211_chan_def *chandef) +{ + if (!oper) + return; + + chandef->chan = control_chan; + + switch (oper->chan_width) { + case 
IEEE80211_VHT_CHANWIDTH_USE_HT: + break; + case IEEE80211_VHT_CHANWIDTH_80MHZ: + chandef->width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_VHT_CHANWIDTH_160MHZ: + chandef->width = NL80211_CHAN_WIDTH_160; + break; + case IEEE80211_VHT_CHANWIDTH_80P80MHZ: + chandef->width = NL80211_CHAN_WIDTH_80P80; + break; + default: + break; + } + + chandef->center_freq1 = + ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, + control_chan->band); + chandef->center_freq2 = + ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, + control_chan->band); +} + int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) @@ -3252,3 +3320,20 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) return buf; } + +void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct txq_info *txqi, int tid) +{ + skb_queue_head_init(&txqi->queue); + txqi->txq.vif = &sdata->vif; + + if (sta) { + txqi->txq.sta = &sta->sta; + sta->sta.txq[tid] = &txqi->txq; + txqi->txq.ac = ieee802_1d_to_ac[tid & 7]; + } else { + sdata->vif.txq = &txqi->txq; + txqi->txq.ac = IEEE80211_AC_BE; + } +} diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 85f9596da07b..80694d55db74 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -129,10 +129,6 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; - /* don't support VHT for TDLS peers for now */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) - return; - /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. 
Netgear R6300v2 diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index a4220e92f0cc..efa3f48f1ec5 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -98,8 +98,7 @@ static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); - if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN || - skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) + if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) return NULL; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -167,6 +166,9 @@ int ieee80211_wep_encrypt(struct ieee80211_local *local, size_t len; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; + if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN)) + return -1; + iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); if (!iv) return -1; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 75de6fac40d1..943f7606527e 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -444,7 +444,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, hdr = (struct ieee80211_hdr *) pos; pos += hdrlen; - pn64 = atomic64_inc_return(&key->u.ccmp.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; @@ -670,7 +670,7 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) hdr = (struct ieee80211_hdr *)pos; pos += hdrlen; - pn64 = atomic64_inc_return(&key->u.gcmp.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; @@ -780,9 +780,8 @@ ieee80211_crypto_cs_encrypt(struct ieee80211_tx_data *tx, struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - const struct ieee80211_cipher_scheme *cs = key->sta->cipher_scheme; int hdrlen; - u8 *pos; + u8 *pos, iv_len = key->conf.iv_len; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { @@ -790,14 +789,14 @@ 
ieee80211_crypto_cs_encrypt(struct ieee80211_tx_data *tx, return TX_CONTINUE; } - if (unlikely(skb_headroom(skb) < cs->hdr_len && - pskb_expand_head(skb, cs->hdr_len, 0, GFP_ATOMIC))) + if (unlikely(skb_headroom(skb) < iv_len && + pskb_expand_head(skb, iv_len, 0, GFP_ATOMIC))) return TX_DROP; hdrlen = ieee80211_hdrlen(hdr->frame_control); - pos = skb_push(skb, cs->hdr_len); - memmove(pos, pos + cs->hdr_len, hdrlen); + pos = skb_push(skb, iv_len); + memmove(pos, pos + iv_len, hdrlen); return TX_CONTINUE; } @@ -941,7 +940,7 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -985,7 +984,7 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -1130,7 +1129,7 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->u.aes_gmac.tx_pn); + pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); @@ -1217,7 +1216,7 @@ ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx) if (!info->control.hw_key) return TX_DROP; - if (tx->key->sta->cipher_scheme) { + if (tx->key->flags & KEY_FLAG_CIPHER_SCHEME) { res = ieee80211_crypto_cs_encrypt(tx, skb); if (res != TX_CONTINUE) return res; diff --git a/net/mac802154/Kconfig b/net/mac802154/Kconfig index aa462b480a39..fb45287ebac3 100644 --- a/net/mac802154/Kconfig +++ b/net/mac802154/Kconfig @@ -2,6 +2,7 @@ config MAC802154 tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)" depends on IEEE802154 select CRC_CCITT + select CRYPTO 
select CRYPTO_AUTHENC select CRYPTO_CCM select CRYPTO_CTR diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile index 702d8b466317..17a51e8389e2 100644 --- a/net/mac802154/Makefile +++ b/net/mac802154/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_MAC802154) += mac802154.o mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \ - iface.o llsec.o util.o cfg.o + iface.o llsec.o util.o cfg.o trace.o + +CFLAGS_trace.o := -I$(src) ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 5d9f68c75e5f..317c4662e544 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -22,13 +22,14 @@ static struct net_device * ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy, - const char *name, int type) + const char *name, + unsigned char name_assign_type, int type) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct net_device *dev; rtnl_lock(); - dev = ieee802154_if_add(local, name, type, + dev = ieee802154_if_add(local, name, name_assign_type, type, cpu_to_le64(0x0000000000000000ULL)); rtnl_unlock(); @@ -45,12 +46,14 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy, static int ieee802154_add_iface(struct wpan_phy *phy, const char *name, + unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { struct ieee802154_local *local = wpan_phy_priv(phy); struct net_device *err; - err = ieee802154_if_add(local, name, type, extended_addr); + err = ieee802154_if_add(local, name, name_assign_type, type, + extended_addr); return PTR_ERR_OR_ZERO(err); } @@ -70,9 +73,9 @@ ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel) ASSERT_RTNL(); - /* check if phy support this setting */ - if (!(wpan_phy->channels_supported[page] & BIT(channel))) - return -EINVAL; + if (wpan_phy->current_page == page && + wpan_phy->current_channel == channel) + return 0; ret = drv_set_channel(local, page, channel); if (!ret) { @@ -92,9 +95,8 @@ ieee802154_set_cca_mode(struct wpan_phy 
*wpan_phy, ASSERT_RTNL(); - /* check if phy support this setting */ - if (!(local->hw.flags & IEEE802154_HW_CCA_MODE)) - return -EOPNOTSUPP; + if (wpan_phy_cca_cmp(&wpan_phy->cca, cca)) + return 0; ret = drv_set_cca_mode(local, cca); if (!ret) @@ -104,20 +106,49 @@ ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, } static int +ieee802154_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + ASSERT_RTNL(); + + if (wpan_phy->cca_ed_level == ed_level) + return 0; + + ret = drv_set_cca_ed_level(local, ed_level); + if (!ret) + wpan_phy->cca_ed_level = ed_level; + + return ret; +} + +static int +ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + ASSERT_RTNL(); + + if (wpan_phy->transmit_power == power) + return 0; + + ret = drv_set_tx_power(local, power); + if (!ret) + wpan_phy->transmit_power = power; + + return ret; +} + +static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 pan_id) { ASSERT_RTNL(); - /* TODO - * I am not sure about to check here on broadcast pan_id. - * Broadcast is a valid setting, comment from 802.15.4: - * If this value is 0xffff, the device is not associated. - * - * This could useful to simple deassociate an device. 
- */ - if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) - return -EINVAL; + if (wpan_dev->pan_id == pan_id) + return 0; wpan_dev->pan_id = pan_id; return 0; @@ -128,12 +159,11 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) - return -EOPNOTSUPP; + if (wpan_dev->min_be == min_be && + wpan_dev->max_be == max_be) + return 0; wpan_dev->min_be = min_be; wpan_dev->max_be = max_be; @@ -146,20 +176,8 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); - /* TODO - * I am not sure about to check here on broadcast short_addr. - * Broadcast is a valid setting, comment from 802.15.4: - * A value of 0xfffe indicates that the device has - * associated but has not been allocated an address. A - * value of 0xffff indicates that the device does not - * have a short address. - * - * I think we should allow to set these settings but - * don't allow to allow socket communication with it. 
- */ - if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || - short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) - return -EINVAL; + if (wpan_dev->short_addr == short_addr) + return 0; wpan_dev->short_addr = short_addr; return 0; @@ -170,12 +188,10 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS)) - return -EOPNOTSUPP; + if (wpan_dev->csma_retries == max_csma_backoffs) + return 0; wpan_dev->csma_retries = max_csma_backoffs; return 0; @@ -186,12 +202,10 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, s8 max_frame_retries) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES)) - return -EOPNOTSUPP; + if (wpan_dev->frame_retries == max_frame_retries) + return 0; wpan_dev->frame_retries = max_frame_retries; return 0; @@ -201,12 +215,10 @@ static int ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode) { - struct ieee802154_local *local = wpan_phy_priv(wpan_phy); - ASSERT_RTNL(); - if (!(local->hw.flags & IEEE802154_HW_LBT)) - return -EOPNOTSUPP; + if (wpan_dev->lbt == mode) + return 0; wpan_dev->lbt = mode; return 0; @@ -219,6 +231,8 @@ const struct cfg802154_ops mac802154_config_ops = { .del_virtual_intf = ieee802154_del_iface, .set_channel = ieee802154_set_channel, .set_cca_mode = ieee802154_set_cca_mode, + .set_cca_ed_level = ieee802154_set_cca_ed_level, + .set_tx_power = ieee802154_set_tx_power, .set_pan_id = ieee802154_set_pan_id, .set_short_addr = ieee802154_set_short_addr, .set_backoff_exponent = ieee802154_set_backoff_exponent, diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index 98180a9fff4a..0550f3365e33 100644 --- a/net/mac802154/driver-ops.h +++ 
b/net/mac802154/driver-ops.h @@ -1,4 +1,4 @@ -#ifndef __MAC802154_DRVIER_OPS +#ifndef __MAC802154_DRIVER_OPS #define __MAC802154_DRIVER_OPS #include <linux/types.h> @@ -7,6 +7,7 @@ #include <net/mac802154.h> #include "ieee802154_i.h" +#include "trace.h" static inline int drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb) @@ -27,19 +28,25 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) static inline int drv_start(struct ieee802154_local *local) { + int ret; + might_sleep(); + trace_802154_drv_start(local); local->started = true; smp_mb(); - - return local->ops->start(&local->hw); + ret = local->ops->start(&local->hw); + trace_802154_drv_return_int(local, ret); + return ret; } static inline void drv_stop(struct ieee802154_local *local) { might_sleep(); + trace_802154_drv_stop(local); local->ops->stop(&local->hw); + trace_802154_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); @@ -53,13 +60,20 @@ static inline void drv_stop(struct ieee802154_local *local) static inline int drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { + int ret; + might_sleep(); - return local->ops->set_channel(&local->hw, page, channel); + trace_802154_drv_set_channel(local, page, channel); + ret = local->ops->set_channel(&local->hw, page, channel); + trace_802154_drv_return_int(local, ret); + return ret; } -static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm) +static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { + int ret; + might_sleep(); if (!local->ops->set_txpower) { @@ -67,12 +81,17 @@ static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm) return -EOPNOTSUPP; } - return local->ops->set_txpower(&local->hw, dbm); + trace_802154_drv_set_tx_power(local, mbm); + ret = local->ops->set_txpower(&local->hw, mbm); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int 
drv_set_cca_mode(struct ieee802154_local *local, const struct wpan_phy_cca *cca) { + int ret; + might_sleep(); if (!local->ops->set_cca_mode) { @@ -80,11 +99,16 @@ static inline int drv_set_cca_mode(struct ieee802154_local *local, return -EOPNOTSUPP; } - return local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_set_cca_mode(local, cca); + ret = local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { + int ret; + might_sleep(); if (!local->ops->set_lbt) { @@ -92,12 +116,17 @@ static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) return -EOPNOTSUPP; } - return local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_set_lbt_mode(local, mode); + ret = local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int -drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level) +drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { + int ret; + might_sleep(); if (!local->ops->set_cca_ed_level) { @@ -105,12 +134,16 @@ drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level) return -EOPNOTSUPP; } - return local->ops->set_cca_ed_level(&local->hw, ed_level); + trace_802154_drv_set_cca_ed_level(local, mbm); + ret = local->ops->set_cca_ed_level(&local->hw, mbm); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -121,14 +154,18 @@ static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) filt.pan_id = pan_id; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_pan_id(local, pan_id); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANID_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static 
inline int drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -139,14 +176,18 @@ drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) filt.ieee_addr = extended_addr; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_extended_addr(local, extended_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -157,14 +198,18 @@ drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) filt.short_addr = short_addr; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_short_addr(local, short_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_SADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { struct ieee802154_hw_addr_filt filt; + int ret; might_sleep(); @@ -175,14 +220,19 @@ drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) filt.pan_coord = is_coord; - return local->ops->set_hw_addr_filt(&local->hw, &filt, + trace_802154_drv_set_pan_coord(local, is_coord); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANC_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, u8 max_csma_backoffs) { + int ret; + might_sleep(); if (!local->ops->set_csma_params) { @@ -190,13 +240,19 @@ drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, return -EOPNOTSUPP; } - return local->ops->set_csma_params(&local->hw, min_be, max_be, + 
trace_802154_drv_set_csma_params(local, min_be, max_be, + max_csma_backoffs); + ret = local->ops->set_csma_params(&local->hw, min_be, max_be, max_csma_backoffs); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) { + int ret; + might_sleep(); if (!local->ops->set_frame_retries) { @@ -204,12 +260,17 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) return -EOPNOTSUPP; } - return local->ops->set_frame_retries(&local->hw, max_frame_retries); + trace_802154_drv_set_max_frame_retries(local, max_frame_retries); + ret = local->ops->set_frame_retries(&local->hw, max_frame_retries); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { + int ret; + might_sleep(); if (!local->ops->set_promiscuous_mode) { @@ -217,7 +278,10 @@ drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) return -EOPNOTSUPP; } - return local->ops->set_promiscuous_mode(&local->hw, on); + trace_802154_drv_set_promiscuous_mode(local, on); + ret = local->ops->set_promiscuous_mode(&local->hw, on); + trace_802154_drv_return_int(local, ret); + return ret; } -#endif /* __MAC802154_DRVIER_OPS */ +#endif /* __MAC802154_DRIVER_OPS */ diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index bebd70ffc7a3..34755d5751a4 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -86,16 +86,12 @@ struct ieee802154_sub_if_data { unsigned long state; char name[IFNAMSIZ]; - spinlock_t mib_lock; - /* protects sec from concurrent access by netlink. access by * encrypt/decrypt/header_create safe without additional protection. */ struct mutex sec_mtx; struct mac802154_llsec sec; - /* must be last, dynamically sized area in this! 
*/ - struct ieee802154_vif vif; }; #define MAC802154_CHAN_NONE 0xff /* No channel is assigned */ @@ -136,12 +132,7 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer); /* MIB callbacks */ -void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val); -__le16 mac802154_dev_get_short_addr(const struct net_device *dev); -__le16 mac802154_dev_get_pan_id(const struct net_device *dev); -void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val); void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); -u8 mac802154_dev_get_dsn(const struct net_device *dev); int mac802154_get_params(struct net_device *dev, struct ieee802154_llsec_params *params); @@ -182,7 +173,8 @@ void ieee802154_iface_exit(void); void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata); struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, - enum nl802154_iftype type, __le64 extended_addr); + unsigned char name_assign_type, enum nl802154_iftype type, + __le64 extended_addr); void ieee802154_remove_interfaces(struct ieee802154_local *local); #endif /* __IEEE802154_I_H */ diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 6fb6bdf9868c..8b698246a51b 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -62,9 +62,10 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; - ASSERT_RTNL(); + if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) + return err; - spin_lock_bh(&sdata->mib_lock); + rtnl_lock(); switch (cmd) { case SIOCGIFADDR: @@ -89,7 +90,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } case SIOCSIFADDR: if (netif_running(dev)) { - spin_unlock_bh(&sdata->mib_lock); + rtnl_unlock(); return -EBUSY; } @@ -111,7 +112,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int 
cmd) break; } - spin_unlock_bh(&sdata->mib_lock); + rtnl_unlock(); return err; } @@ -125,7 +126,7 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) return -EBUSY; ieee802154_be64_to_le64(&extended_addr, addr->sa_data); - if (!ieee802154_is_valid_extended_addr(extended_addr)) + if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); @@ -134,19 +135,72 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) return mac802154_wpan_update_llsec(dev); } +static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) +{ + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + int ret; + + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, + wpan_dev->promiscuous_mode); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, wpan_dev->pan_id); + if (ret < 0) + return ret; + + ret = drv_set_extended_addr(local, wpan_dev->extended_addr); + if (ret < 0) + return ret; + + ret = drv_set_short_addr(local, wpan_dev->short_addr); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_LBT) { + ret = drv_set_lbt_mode(local, wpan_dev->lbt); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { + ret = drv_set_csma_params(local, wpan_dev->min_be, + wpan_dev->max_be, + wpan_dev->csma_retries); + if (ret < 0) + return ret; + } + + if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { + ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); + if (ret < 0) + return ret; + } + + return 0; +} + static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; - int res = 0; + int res; ASSERT_RTNL(); set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { + 
res = ieee802154_setup_hw(sdata); + if (res) + goto err; + res = drv_start(local); - WARN_ON(res); if (res) goto err; } @@ -174,24 +228,16 @@ ieee802154_check_mac_settings(struct ieee802154_local *local, } if (local->hw.flags & IEEE802154_HW_AFILT) { - if (wpan_dev->pan_id != nwpan_dev->pan_id) - return -EBUSY; - - if (wpan_dev->short_addr != nwpan_dev->short_addr) - return -EBUSY; - - if (wpan_dev->extended_addr != nwpan_dev->extended_addr) + if (wpan_dev->pan_id != nwpan_dev->pan_id || + wpan_dev->short_addr != nwpan_dev->short_addr || + wpan_dev->extended_addr != nwpan_dev->extended_addr) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { - if (wpan_dev->min_be != nwpan_dev->min_be) - return -EBUSY; - - if (wpan_dev->max_be != nwpan_dev->max_be) - return -EBUSY; - - if (wpan_dev->csma_retries != nwpan_dev->csma_retries) + if (wpan_dev->min_be != nwpan_dev->min_be || + wpan_dev->max_be != nwpan_dev->max_be || + wpan_dev->csma_retries != nwpan_dev->csma_retries) return -EBUSY; } @@ -226,8 +272,8 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, * exist really an use case if we need to support * multiple node types at the same time. */ - if (sdata->vif.type == NL802154_IFTYPE_NODE && - nsdata->vif.type == NL802154_IFTYPE_NODE) + if (wpan_dev->iftype == NL802154_IFTYPE_NODE && + nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE) return -EBUSY; /* check all phy mac sublayer settings are the same. 
@@ -247,67 +293,13 @@ static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; - struct wpan_phy *phy = sdata->local->phy; - rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type); + rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; - rc = mac802154_slave_open(dev); - if (rc < 0) - return rc; - - mutex_lock(&phy->pib_lock); - - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - rc = drv_set_promiscuous_mode(local, - wpan_dev->promiscuous_mode); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_AFILT) { - rc = drv_set_pan_id(local, wpan_dev->pan_id); - if (rc < 0) - goto out; - - rc = drv_set_extended_addr(local, wpan_dev->extended_addr); - if (rc < 0) - goto out; - - rc = drv_set_short_addr(local, wpan_dev->short_addr); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_LBT) { - rc = drv_set_lbt_mode(local, wpan_dev->lbt); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { - rc = drv_set_csma_params(local, wpan_dev->min_be, - wpan_dev->max_be, - wpan_dev->csma_retries); - if (rc < 0) - goto out; - } - - if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { - rc = drv_set_max_frame_retries(local, wpan_dev->frame_retries); - if (rc < 0) - goto out; - } - - mutex_unlock(&phy->pib_lock); - return 0; - -out: - mutex_unlock(&phy->pib_lock); - return rc; + return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) @@ -317,15 +309,16 @@ static int mac802154_slave_close(struct net_device *dev) ASSERT_RTNL(); - hrtimer_cancel(&local->ifs_timer); - netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); - if (!local->open_count) + if (!local->open_count) { + flush_workqueue(local->workqueue); + 
hrtimer_cancel(&local->ifs_timer); drv_stop(local); + } return 0; } @@ -382,14 +375,12 @@ static int mac802154_header_create(struct sk_buff *skb, hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; - hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev); + hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; if (!saddr) { - spin_lock_bh(&sdata->mib_lock); - if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { @@ -401,8 +392,6 @@ static int mac802154_header_create(struct sk_buff *skb, } hdr.source.pan_id = wpan_dev->pan_id; - - spin_unlock_bh(&sdata->mib_lock); } else { hdr.source = *(const struct ieee802154_addr *)saddr; } @@ -482,13 +471,15 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; + u8 tmp; /* set some type-dependent values */ - sdata->vif.type = type; sdata->wpan_dev.iftype = type; - get_random_bytes(&wpan_dev->bsn, 1); - get_random_bytes(&wpan_dev->dsn, 1); + get_random_bytes(&tmp, sizeof(tmp)); + atomic_set(&wpan_dev->bsn, tmp); + get_random_bytes(&tmp, sizeof(tmp)); + atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; @@ -511,7 +502,6 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->ml_priv = &mac802154_mlme_wpan; wpan_dev->promiscuous_mode = false; - spin_lock_init(&sdata->mib_lock); mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); @@ -530,7 +520,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, - enum nl802154_iftype type, __le64 extended_addr) + unsigned char name_assign_type, enum nl802154_iftype type, + __le64 
extended_addr) { struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; @@ -538,8 +529,8 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, ASSERT_RTNL(); - ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, - NET_NAME_UNKNOWN, ieee802154_if_setup); + ndev = alloc_netdev(sizeof(*sdata), name, + name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); @@ -554,7 +545,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, switch (type) { case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; - if (ieee802154_is_valid_extended_addr(extended_addr)) + if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr); else memcpy(ndev->dev_addr, ndev->perm_addr, diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index dcf73958133a..985e9394e2af 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -17,8 +17,9 @@ #include <linux/err.h> #include <linux/bug.h> #include <linux/completion.h> +#include <linux/crypto.h> #include <linux/ieee802154.h> -#include <crypto/algapi.h> +#include <crypto/aead.h> #include "ieee802154_i.h" #include "llsec.h" @@ -134,7 +135,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) for (i = 0; i < ARRAY_SIZE(key->tfm); i++) { key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); - if (!key->tfm[i]) + if (IS_ERR(key->tfm[i])) goto err_tfm; if (crypto_aead_setkey(key->tfm[i], template->key, IEEE802154_LLSEC_KEY_SIZE)) @@ -144,7 +145,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template) } key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); - if (!key->tfm0) + if (IS_ERR(key->tfm0)) goto err_tfm; if (crypto_blkcipher_setkey(key->tfm0, template->key, @@ -649,7 +650,7 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; unsigned char *data; int authlen, assoclen, datalen, rc; - struct 
scatterlist src, assoc[2], dst[2]; + struct scatterlist sg; struct aead_request *req; authlen = ieee802154_sechdr_authtag_len(&hdr->sec); @@ -659,30 +660,23 @@ llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, if (!req) return -ENOMEM; - sg_init_table(assoc, 2); - sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len); assoclen = skb->mac_len; data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; - if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) { - sg_set_buf(&assoc[1], data, 0); - } else { - sg_set_buf(&assoc[1], data, datalen); + skb_put(skb, authlen); + + sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen + authlen); + + if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) { assoclen += datalen; datalen = 0; } - sg_init_one(&src, data, datalen); - - sg_init_table(dst, 2); - sg_set_buf(&dst[0], data, datalen); - sg_set_buf(&dst[1], skb_put(skb, authlen), authlen); - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_assoc(req, assoc, assoclen); - aead_request_set_crypt(req, &src, dst, datalen, iv); + aead_request_set_crypt(req, &sg, &sg, datalen, iv); + aead_request_set_ad(req, assoclen); rc = crypto_aead_encrypt(req); @@ -858,7 +852,7 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; unsigned char *data; int authlen, datalen, assoclen, rc; - struct scatterlist src, assoc[2]; + struct scatterlist sg; struct aead_request *req; authlen = ieee802154_sechdr_authtag_len(&hdr->sec); @@ -868,27 +862,21 @@ llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec, if (!req) return -ENOMEM; - sg_init_table(assoc, 2); - sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len); assoclen = skb->mac_len; data = skb_mac_header(skb) + skb->mac_len; datalen = skb_tail_pointer(skb) - data; - if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) { - sg_set_buf(&assoc[1], data, 0); - } else { - sg_set_buf(&assoc[1], data, datalen - authlen); + 
sg_init_one(&sg, skb_mac_header(skb), assoclen + datalen); + + if (!(hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC)) { assoclen += datalen - authlen; - data += datalen - authlen; datalen = authlen; } - sg_init_one(&src, data, datalen); - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_assoc(req, assoc, assoclen); - aead_request_set_crypt(req, &src, &src, datalen, iv); + aead_request_set_crypt(req, &sg, &sg, datalen, iv); + aead_request_set_ad(req, assoclen); rc = crypto_aead_decrypt(req); diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index bdccb4ecd30f..8606da459ff3 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -36,37 +36,30 @@ static int mac802154_mlme_start_req(struct net_device *dev, u8 pan_coord, u8 blx, u8 coord_realign) { - struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); - int rc = 0; + struct ieee802154_llsec_params params; + int changed = 0; ASSERT_RTNL(); BUG_ON(addr->mode != IEEE802154_ADDR_SHORT); - mac802154_dev_set_pan_id(dev, addr->pan_id); - mac802154_dev_set_short_addr(dev, addr->short_addr); + dev->ieee802154_ptr->pan_id = addr->pan_id; + dev->ieee802154_ptr->short_addr = addr->short_addr; mac802154_dev_set_page_channel(dev, page, channel); - if (ops->llsec) { - struct ieee802154_llsec_params params; - int changed = 0; + params.pan_id = addr->pan_id; + changed |= IEEE802154_LLSEC_PARAM_PAN_ID; - params.coord_shortaddr = addr->short_addr; - changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; + params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); + changed |= IEEE802154_LLSEC_PARAM_HWADDR; - params.pan_id = addr->pan_id; - changed |= IEEE802154_LLSEC_PARAM_PAN_ID; + params.coord_hwaddr = params.hwaddr; + changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; - params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr); - changed |= IEEE802154_LLSEC_PARAM_HWADDR; + params.coord_shortaddr = addr->short_addr; + changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR; - params.coord_hwaddr = 
params.hwaddr; - changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR; - - rc = ops->llsec->set_params(dev, &params, changed); - } - - return rc; + return mac802154_set_params(dev, &params, changed); } static int mac802154_set_mac_params(struct net_device *dev, @@ -91,19 +84,19 @@ static int mac802154_set_mac_params(struct net_device *dev, wpan_dev->frame_retries = params->frame_retries; wpan_dev->lbt = params->lbt; - if (local->hw.flags & IEEE802154_HW_TXPOWER) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_TXPOWER) { ret = drv_set_tx_power(local, params->transmit_power); if (ret < 0) return ret; } - if (local->hw.flags & IEEE802154_HW_CCA_MODE) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_MODE) { ret = drv_set_cca_mode(local, &params->cca); if (ret < 0) return ret; } - if (local->hw.flags & IEEE802154_HW_CCA_ED_LEVEL) { + if (local->hw.phy->flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { ret = drv_set_cca_ed_level(local, params->cca_ed_level); if (ret < 0) return ret; @@ -151,9 +144,6 @@ static struct ieee802154_llsec_ops mac802154_llsec_ops = { struct ieee802154_mlme_ops mac802154_mlme_wpan = { .start_req = mac802154_mlme_start_req, - .get_pan_id = mac802154_dev_get_pan_id, - .get_short_addr = mac802154_dev_get_short_addr, - .get_dsn = mac802154_dev_get_dsn, .llsec = &mac802154_llsec_ops, diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 8500378c8318..356b346e1ee8 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -107,6 +107,18 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); + /* init supported flags with 802.15.4 default ranges */ + phy->supported.max_minbe = 8; + phy->supported.min_maxbe = 3; + phy->supported.max_maxbe = 8; + phy->supported.min_frame_retries = -1; + phy->supported.max_frame_retries = 7; + phy->supported.max_csma_backoffs = 5; + phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; + + /* always supported */ + phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE); + return 
&local->hw; } EXPORT_SYMBOL(ieee802154_alloc_hw); @@ -155,24 +167,47 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) ieee802154_setup_wpan_phy_pib(local->phy); + if (!(hw->flags & IEEE802154_HW_CSMA_PARAMS)) { + local->phy->supported.min_csma_backoffs = 4; + local->phy->supported.max_csma_backoffs = 4; + local->phy->supported.min_maxbe = 5; + local->phy->supported.max_maxbe = 5; + local->phy->supported.min_minbe = 3; + local->phy->supported.max_minbe = 3; + } + + if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) { + /* TODO should be 3, but our default value is -1 which means + * no ARET handling. + */ + local->phy->supported.min_frame_retries = -1; + local->phy->supported.max_frame_retries = -1; + } + + if (hw->flags & IEEE802154_HW_PROMISCUOUS) + local->phy->supported.iftypes |= BIT(NL802154_IFTYPE_MONITOR); + rc = wpan_phy_register(local->phy); if (rc < 0) goto out_wq; rtnl_lock(); - dev = ieee802154_if_add(local, "wpan%d", NL802154_IFTYPE_NODE, + dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM, + NL802154_IFTYPE_NODE, cpu_to_le64(0x0000000000000000ULL)); if (IS_ERR(dev)) { rtnl_unlock(); rc = PTR_ERR(dev); - goto out_wq; + goto out_phy; } rtnl_unlock(); return 0; +out_phy: + wpan_phy_unregister(local->phy); out_wq: destroy_workqueue(local->workqueue); out: diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 5cf019a57fd7..73f94fbf8785 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -26,81 +26,22 @@ #include "ieee802154_i.h" #include "driver-ops.h" -void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - sdata->wpan_dev.short_addr = val; - spin_unlock_bh(&sdata->mib_lock); -} - -__le16 mac802154_dev_get_short_addr(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - __le16 ret; - - BUG_ON(dev->type != 
ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - ret = sdata->wpan_dev.short_addr; - spin_unlock_bh(&sdata->mib_lock); - - return ret; -} - -__le16 mac802154_dev_get_pan_id(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - __le16 ret; - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - ret = sdata->wpan_dev.pan_id; - spin_unlock_bh(&sdata->mib_lock); - - return ret; -} - -void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - spin_lock_bh(&sdata->mib_lock); - sdata->wpan_dev.pan_id = val; - spin_unlock_bh(&sdata->mib_lock); -} - -u8 mac802154_dev_get_dsn(const struct net_device *dev) -{ - struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); - - BUG_ON(dev->type != ARPHRD_IEEE802154); - - return sdata->wpan_dev.dsn++; -} - void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; int res; + ASSERT_RTNL(); + BUG_ON(dev->type != ARPHRD_IEEE802154); res = drv_set_channel(local, page, chan); if (res) { pr_debug("set_channel failed\n"); } else { - mutex_lock(&local->phy->pib_lock); local->phy->current_channel = chan; local->phy->current_page = page; - mutex_unlock(&local->phy->pib_lock); } } diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index c0d67b2b4132..d93ad2d4a4fc 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -47,8 +47,6 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, pr_debug("getting packet via slave interface %s\n", sdata->dev->name); - spin_lock_bh(&sdata->mib_lock); - span = wpan_dev->pan_id; sshort = wpan_dev->short_addr; @@ -83,13 +81,10 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, skb->pkt_type = PACKET_OTHERHOST; break; 
default: - spin_unlock_bh(&sdata->mib_lock); pr_debug("invalid dest mode\n"); goto fail; } - spin_unlock_bh(&sdata->mib_lock); - skb->dev = sdata->dev; rc = mac802154_llsec_decrypt(&sdata->sec, skb); @@ -207,8 +202,10 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, } list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL802154_IFTYPE_NODE || - !netif_running(sdata->dev)) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE) + continue; + + if (!ieee802154_sdata_running(sdata)) continue; ieee802154_subif_frame(sdata, skb, &hdr); @@ -232,7 +229,7 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb) skb->protocol = htons(ETH_P_IEEE802154); list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL802154_IFTYPE_MONITOR) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) continue; if (!ieee802154_sdata_running(sdata)) diff --git a/net/mac802154/trace.c b/net/mac802154/trace.c new file mode 100644 index 000000000000..863e5e6b983d --- /dev/null +++ b/net/mac802154/trace.c @@ -0,0 +1,9 @@ +#include <linux/module.h> + +#ifndef __CHECKER__ +#include <net/cfg802154.h> +#include "driver-ops.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + +#endif diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h new file mode 100644 index 000000000000..6f30e0c93a16 --- /dev/null +++ b/net/mac802154/trace.h @@ -0,0 +1,272 @@ +/* Based on net/mac80211/trace.h */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mac802154 + +#if !defined(__MAC802154_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) +#define __MAC802154_DRIVER_TRACE + +#include <linux/tracepoint.h> + +#include <net/mac802154.h> +#include "ieee802154_i.h" + +#define MAXNAME 32 +#define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) +#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \ + wpan_phy_name(local->hw.phy), MAXNAME) +#define LOCAL_PR_FMT "%s" +#define LOCAL_PR_ARG __entry->wpan_phy_name + +#define 
CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ + __field(enum nl802154_cca_opts, cca_opt) +#define CCA_ASSIGN \ + do { \ + (__entry->cca_mode) = cca->mode; \ + (__entry->cca_opt) = cca->opt; \ + } while (0) +#define CCA_PR_FMT "cca_mode: %d, cca_opt: %d" +#define CCA_PR_ARG __entry->cca_mode, __entry->cca_opt + +#define BOOL_TO_STR(bo) (bo) ? "true" : "false" + +/* Tracing for driver callbacks */ + +DECLARE_EVENT_CLASS(local_only_evt, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local), + TP_STRUCT__entry( + LOCAL_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_return_void, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(802154_drv_return_int, + TP_PROTO(struct ieee802154_local *local, int ret), + TP_ARGS(local, ret), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, ret) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->ret = ret; + ), + TP_printk(LOCAL_PR_FMT ", returned: %d", LOCAL_PR_ARG, + __entry->ret) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_start, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +DEFINE_EVENT(local_only_evt, 802154_drv_stop, + TP_PROTO(struct ieee802154_local *local), + TP_ARGS(local) +); + +TRACE_EVENT(802154_drv_set_channel, + TP_PROTO(struct ieee802154_local *local, u8 page, u8 channel), + TP_ARGS(local, page, channel), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, page) + __field(u8, channel) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->page = page; + __entry->channel = channel; + ), + TP_printk(LOCAL_PR_FMT ", page: %d, channel: %d", LOCAL_PR_ARG, + __entry->page, __entry->channel) +); + +TRACE_EVENT(802154_drv_set_cca_mode, + TP_PROTO(struct ieee802154_local *local, + const struct wpan_phy_cca *cca), + TP_ARGS(local, cca), + TP_STRUCT__entry( + LOCAL_ENTRY + CCA_ENTRY + ), + TP_fast_assign( + LOCAL_ASSIGN; + CCA_ASSIGN; + ), + TP_printk(LOCAL_PR_FMT ", " CCA_PR_FMT, 
LOCAL_PR_ARG, + CCA_PR_ARG) +); + +TRACE_EVENT(802154_drv_set_cca_ed_level, + TP_PROTO(struct ieee802154_local *local, s32 mbm), + TP_ARGS(local, mbm), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s32, mbm) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mbm = mbm; + ), + TP_printk(LOCAL_PR_FMT ", ed level: %d", LOCAL_PR_ARG, + __entry->mbm) +); + +TRACE_EVENT(802154_drv_set_tx_power, + TP_PROTO(struct ieee802154_local *local, s32 power), + TP_ARGS(local, power), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s32, power) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->power = power; + ), + TP_printk(LOCAL_PR_FMT ", mbm: %d", LOCAL_PR_ARG, + __entry->power) +); + +TRACE_EVENT(802154_drv_set_lbt_mode, + TP_PROTO(struct ieee802154_local *local, bool mode), + TP_ARGS(local, mode), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, mode) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->mode = mode; + ), + TP_printk(LOCAL_PR_FMT ", lbt mode: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->mode)) +); + +TRACE_EVENT(802154_drv_set_short_addr, + TP_PROTO(struct ieee802154_local *local, __le16 short_addr), + TP_ARGS(local, short_addr), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le16, short_addr) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->short_addr = short_addr; + ), + TP_printk(LOCAL_PR_FMT ", short addr: 0x%04x", LOCAL_PR_ARG, + le16_to_cpu(__entry->short_addr)) +); + +TRACE_EVENT(802154_drv_set_pan_id, + TP_PROTO(struct ieee802154_local *local, __le16 pan_id), + TP_ARGS(local, pan_id), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le16, pan_id) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->pan_id = pan_id; + ), + TP_printk(LOCAL_PR_FMT ", pan id: 0x%04x", LOCAL_PR_ARG, + le16_to_cpu(__entry->pan_id)) +); + +TRACE_EVENT(802154_drv_set_extended_addr, + TP_PROTO(struct ieee802154_local *local, __le64 extended_addr), + TP_ARGS(local, extended_addr), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(__le64, extended_addr) + ), + TP_fast_assign( + LOCAL_ASSIGN; + 
__entry->extended_addr = extended_addr; + ), + TP_printk(LOCAL_PR_FMT ", extended addr: 0x%llx", LOCAL_PR_ARG, + le64_to_cpu(__entry->extended_addr)) +); + +TRACE_EVENT(802154_drv_set_pan_coord, + TP_PROTO(struct ieee802154_local *local, bool is_coord), + TP_ARGS(local, is_coord), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, is_coord) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->is_coord = is_coord; + ), + TP_printk(LOCAL_PR_FMT ", is_coord: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->is_coord)) +); + +TRACE_EVENT(802154_drv_set_csma_params, + TP_PROTO(struct ieee802154_local *local, u8 min_be, u8 max_be, + u8 max_csma_backoffs), + TP_ARGS(local, min_be, max_be, max_csma_backoffs), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(u8, min_be) + __field(u8, max_be) + __field(u8, max_csma_backoffs) + ), + TP_fast_assign( + LOCAL_ASSIGN, + __entry->min_be = min_be; + __entry->max_be = max_be; + __entry->max_csma_backoffs = max_csma_backoffs; + ), + TP_printk(LOCAL_PR_FMT ", min be: %d, max be: %d, max csma backoffs: %d", + LOCAL_PR_ARG, __entry->min_be, __entry->max_be, + __entry->max_csma_backoffs) +); + +TRACE_EVENT(802154_drv_set_max_frame_retries, + TP_PROTO(struct ieee802154_local *local, s8 max_frame_retries), + TP_ARGS(local, max_frame_retries), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(s8, max_frame_retries) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->max_frame_retries = max_frame_retries; + ), + TP_printk(LOCAL_PR_FMT ", max frame retries: %d", LOCAL_PR_ARG, + __entry->max_frame_retries) +); + +TRACE_EVENT(802154_drv_set_promiscuous_mode, + TP_PROTO(struct ieee802154_local *local, bool on), + TP_ARGS(local, on), + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, on) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->on = on; + ), + TP_printk(LOCAL_PR_FMT ", promiscuous mode: %s", LOCAL_PR_ARG, + BOOL_TO_STR(__entry->on)) +); + +#endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 5fc979027919..583435f38930 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -65,8 +65,19 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, { if (ifs_handling) { struct ieee802154_local *local = hw_to_local(hw); + u8 max_sifs_size; - if (skb->len > 18) + /* If transceiver sets CRC on his own we need to use lifs + * threshold len above 16 otherwise 18, because it's not + * part of skb->len. + */ + if (hw->flags & IEEE802154_HW_TX_OMIT_CKSUM) + max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE - + IEEE802154_FCS_LEN; + else + max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE; + + if (skb->len > max_sifs_size) hrtimer_start(&local->ifs_timer, ktime_set(0, hw->phy->lifs_period * NSEC_PER_USEC), HRTIMER_MODE_REL); @@ -74,11 +85,10 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, hrtimer_start(&local->ifs_timer, ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC), HRTIMER_MODE_REL); - - consume_skb(skb); } else { ieee802154_wake_queue(hw); - consume_skb(skb); } + + dev_consume_skb_any(skb); } EXPORT_SYMBOL(ieee802154_xmit_complete); diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 37421db88965..17bde799c854 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -1,9 +1,30 @@ # # MPLS configuration # + +menuconfig MPLS + bool "MultiProtocol Label Switching" + default n + ---help--- + MultiProtocol Label Switching routes packets through logical + circuits. Originally conceived as a way of routing packets at + hardware speeds (before hardware was capable of routing ipv4 packets), + MPLS remains a simple way of making tunnels. + + If you have not heard of MPLS you probably want to say N here. 
+ +if MPLS + config NET_MPLS_GSO tristate "MPLS: GSO support" help This is helper module to allow segmentation of non-MPLS GSO packets that have had MPLS stack entries pushed onto them and thus become MPLS GSO packets. + +config MPLS_ROUTING + tristate "MPLS: routing support" + help + Add support for forwarding of mpls packets. + +endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 6dec088c2d0f..65bbe68c72e6 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -2,3 +2,6 @@ # Makefile for MPLS. # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o +obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o + +mpls_router-y := af_mpls.o diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c new file mode 100644 index 000000000000..1f93a5978f2a --- /dev/null +++ b/net/mpls/af_mpls.c @@ -0,0 +1,1153 @@ +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/sysctl.h> +#include <linux/net.h> +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/mpls.h> +#include <linux/vmalloc.h> +#include <net/ip.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/ip_fib.h> +#include <net/netevent.h> +#include <net/netns/generic.h> +#include "internal.h" + +#define LABEL_NOT_SPECIFIED (1<<20) +#define MAX_NEW_LABELS 2 + +/* This maximum ha length copied from the definition of struct neighbour */ +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) + +struct mpls_route { /* next hop label forwarding entry */ + struct net_device __rcu *rt_dev; + struct rcu_head rt_rcu; + u32 rt_label[MAX_NEW_LABELS]; + u8 rt_protocol; /* routing protocol that set this entry */ + u8 rt_labels; + u8 rt_via_alen; + u8 rt_via_table; + u8 rt_via[0]; +}; + +static int zero = 0; +static int label_limit = (1 << 20) - 1; + +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags); + +static struct 
mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) +{ + struct mpls_route *rt = NULL; + + if (index < net->mpls.platform_labels) { + struct mpls_route __rcu **platform_label = + rcu_dereference(net->mpls.platform_label); + rt = rcu_dereference(platform_label[index]); + } + return rt; +} + +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + +static bool mpls_output_possible(const struct net_device *dev) +{ + return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); +} + +static unsigned int mpls_rt_header_size(const struct mpls_route *rt) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return rt->rt_labels * sizeof(struct mpls_shim_hdr); +} + +static unsigned int mpls_dev_mtu(const struct net_device *dev) +{ + /* The amount of data the layer 2 frame can hold */ + return dev->mtu; +} + +static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, + struct mpls_entry_decoded dec) +{ + /* RFC4385 and RFC5586 encode other packets in mpls such that + * they don't conflict with the ip version number, making + * decoding by examining the ip version correct in everything + * except for the strangest cases. + * + * The strange cases if we choose to support them will require + * manual configuration. + */ + struct iphdr *hdr4; + bool success = true; + + /* The IPv4 code below accesses through the IPv4 header + * checksum, which is 12 bytes into the packet. + * The IPv6 code below accesses through the IPv6 hop limit + * which is 8 bytes into the packet. + * + * For all supported cases there should always be at least 12 + * bytes of packet data present. 
The IPv4 header is 20 bytes + * without options and the IPv6 header is always 40 bytes + * long. + */ + if (!pskb_may_pull(skb, 12)) + return false; + + /* Use ip_hdr to find the ip protocol version */ + hdr4 = ip_hdr(skb); + if (hdr4->version == 4) { + skb->protocol = htons(ETH_P_IP); + csum_replace2(&hdr4->check, + htons(hdr4->ttl << 8), + htons(dec.ttl << 8)); + hdr4->ttl = dec.ttl; + } + else if (hdr4->version == 6) { + struct ipv6hdr *hdr6 = ipv6_hdr(skb); + skb->protocol = htons(ETH_P_IPV6); + hdr6->hop_limit = dec.ttl; + } + else + /* version 0 and version 1 are used by pseudo wires */ + success = false; + return success; +} + +static int mpls_forward(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct mpls_shim_hdr *hdr; + struct mpls_route *rt; + struct mpls_entry_decoded dec; + struct net_device *out_dev; + struct mpls_dev *mdev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + int err; + + /* Careful this entire function runs inside of an rcu critical section */ + + mdev = mpls_dev_get(dev); + if (!mdev || !mdev->input_enabled) + goto drop; + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto drop; + + if (!pskb_may_pull(skb, sizeof(*hdr))) + goto drop; + + /* Read and decode the label */ + hdr = mpls_hdr(skb); + dec = mpls_entry_decode(hdr); + + /* Pop the label */ + skb_pull(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + + skb_orphan(skb); + + rt = mpls_route_input_rcu(net, dec.label); + if (!rt) + goto drop; + + /* Find the output device */ + out_dev = rcu_dereference(rt->rt_dev); + if (!mpls_output_possible(out_dev)) + goto drop; + + if (skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + /* Verify ttl is valid */ + if (dec.ttl <= 1) + goto drop; + dec.ttl -= 1; + + /* Verify the destination can hold the packet */ + new_header_size = 
mpls_rt_header_size(rt); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + if (unlikely(!new_header_size && dec.bos)) { + /* Penultimate hop popping */ + if (!mpls_egress(rt, skb, dec)) + goto drop; + } else { + bool bos; + int i; + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = dec.bos; + for (i = rt->rt_labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos); + bos = false; + } + } + + err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + return 0; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type mpls_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_MPLS_UC), + .func = mpls_forward, +}; + +static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { + [RTA_DST] = { .type = NLA_U32 }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + +struct mpls_route_config { + u32 rc_protocol; + u32 rc_ifindex; + u16 rc_via_table; + u16 rc_via_alen; + u8 rc_via[MAX_VIA_ALEN]; + u32 rc_label; + u32 rc_output_labels; + u32 rc_output_label[MAX_NEW_LABELS]; + u32 rc_nlflags; + struct nl_info rc_nlinfo; +}; + +static struct mpls_route *mpls_rt_alloc(size_t alen) +{ + struct mpls_route *rt; + + rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL); + if (rt) + rt->rt_via_alen = alen; + return rt; +} + +static void mpls_rt_free(struct mpls_route *rt) +{ + if (rt) + kfree_rcu(rt, rt_rcu); +} + +static void mpls_notify_route(struct net *net, unsigned index, + struct mpls_route *old, struct mpls_route *new, + const struct nl_info *info) +{ 
+ struct nlmsghdr *nlh = info ? info->nlh : NULL; + unsigned portid = info ? info->portid : 0; + int event = new ? RTM_NEWROUTE : RTM_DELROUTE; + struct mpls_route *rt = new ? new : old; + unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; + /* Ignore reserved labels for now */ + if (rt && (index >= 16)) + rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); +} + +static void mpls_route_update(struct net *net, unsigned index, + struct net_device *dev, struct mpls_route *new, + const struct nl_info *info) +{ + struct mpls_route __rcu **platform_label; + struct mpls_route *rt, *old = NULL; + + ASSERT_RTNL(); + + platform_label = rtnl_dereference(net->mpls.platform_label); + rt = rtnl_dereference(platform_label[index]); + if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) { + rcu_assign_pointer(platform_label[index], new); + old = rt; + } + + mpls_notify_route(net, index, old, new, info); + + /* If we removed a route free it now */ + mpls_rt_free(old); +} + +static unsigned find_free_label(struct net *net) +{ + struct mpls_route __rcu **platform_label; + size_t platform_labels; + unsigned index; + + platform_label = rtnl_dereference(net->mpls.platform_label); + platform_labels = net->mpls.platform_labels; + for (index = 16; index < platform_labels; index++) { + if (!rtnl_dereference(platform_label[index])) + return index; + } + return LABEL_NOT_SPECIFIED; +} + +static int mpls_route_add(struct mpls_route_config *cfg) +{ + struct mpls_route __rcu **platform_label; + struct net *net = cfg->rc_nlinfo.nl_net; + struct net_device *dev = NULL; + struct mpls_route *rt, *old; + unsigned index; + int i; + int err = -EINVAL; + + index = cfg->rc_label; + + /* If a label was not specified during insert pick one */ + if ((index == LABEL_NOT_SPECIFIED) && + (cfg->rc_nlflags & NLM_F_CREATE)) { + index = find_free_label(net); + } + + /* The first 16 labels are reserved, and may not be set */ + if (index < 16) + goto errout; + + /* The full 20 bit range may not be 
supported. */ + if (index >= net->mpls.platform_labels) + goto errout; + + /* Ensure only a supported number of labels are present */ + if (cfg->rc_output_labels > MAX_NEW_LABELS) + goto errout; + + err = -ENODEV; + dev = dev_get_by_index(net, cfg->rc_ifindex); + if (!dev) + goto errout; + + /* Ensure this is a supported device */ + err = -EINVAL; + if (!mpls_dev_get(dev)) + goto errout; + + err = -EINVAL; + if ((cfg->rc_via_table == NEIGH_LINK_TABLE) && + (dev->addr_len != cfg->rc_via_alen)) + goto errout; + + /* Append makes no sense with mpls */ + err = -EOPNOTSUPP; + if (cfg->rc_nlflags & NLM_F_APPEND) + goto errout; + + err = -EEXIST; + platform_label = rtnl_dereference(net->mpls.platform_label); + old = rtnl_dereference(platform_label[index]); + if ((cfg->rc_nlflags & NLM_F_EXCL) && old) + goto errout; + + err = -EEXIST; + if (!(cfg->rc_nlflags & NLM_F_REPLACE) && old) + goto errout; + + err = -ENOENT; + if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) + goto errout; + + err = -ENOMEM; + rt = mpls_rt_alloc(cfg->rc_via_alen); + if (!rt) + goto errout; + + rt->rt_labels = cfg->rc_output_labels; + for (i = 0; i < rt->rt_labels; i++) + rt->rt_label[i] = cfg->rc_output_label[i]; + rt->rt_protocol = cfg->rc_protocol; + RCU_INIT_POINTER(rt->rt_dev, dev); + rt->rt_via_table = cfg->rc_via_table; + memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen); + + mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo); + + dev_put(dev); + return 0; + +errout: + if (dev) + dev_put(dev); + return err; +} + +static int mpls_route_del(struct mpls_route_config *cfg) +{ + struct net *net = cfg->rc_nlinfo.nl_net; + unsigned index; + int err = -EINVAL; + + index = cfg->rc_label; + + /* The first 16 labels are reserved, and may not be removed */ + if (index < 16) + goto errout; + + /* The full 20 bit range may not be supported */ + if (index >= net->mpls.platform_labels) + goto errout; + + mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo); + + err = 0; +errout: + return err; +} + 
+#define MPLS_PERDEV_SYSCTL_OFFSET(field) \ + (&((struct mpls_dev *)0)->field) + +static const struct ctl_table mpls_dev_table[] = { + { + .procname = "input", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), + }, + { } +}; + +static int mpls_dev_sysctl_register(struct net_device *dev, + struct mpls_dev *mdev) +{ + char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + struct ctl_table *table; + int i; + + table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL); + if (!table) + goto out; + + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute. + */ + for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) + table[i].data = (char *)mdev + (uintptr_t)table[i].data; + + snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); + + mdev->sysctl = register_net_sysctl(dev_net(dev), path, table); + if (!mdev->sysctl) + goto free; + + return 0; + +free: + kfree(table); +out: + return -ENOBUFS; +} + +static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev) +{ + struct ctl_table *table; + + table = mdev->sysctl->ctl_table_arg; + unregister_net_sysctl_table(mdev->sysctl); + kfree(table); +} + +static struct mpls_dev *mpls_add_dev(struct net_device *dev) +{ + struct mpls_dev *mdev; + int err = -ENOMEM; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(err); + + err = mpls_dev_sysctl_register(dev, mdev); + if (err) + goto free; + + rcu_assign_pointer(dev->mpls_ptr, mdev); + + return mdev; + +free: + kfree(mdev); + return ERR_PTR(err); +} + +static void mpls_ifdown(struct net_device *dev) +{ + struct mpls_route __rcu **platform_label; + struct net *net = dev_net(dev); + struct mpls_dev *mdev; + unsigned index; + + platform_label = rtnl_dereference(net->mpls.platform_label); + for (index = 0; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt = 
rtnl_dereference(platform_label[index]); + if (!rt) + continue; + if (rtnl_dereference(rt->rt_dev) != dev) + continue; + rt->rt_dev = NULL; + } + + mdev = mpls_dev_get(dev); + if (!mdev) + return; + + mpls_dev_sysctl_unregister(mdev); + + RCU_INIT_POINTER(dev->mpls_ptr, NULL); + + kfree_rcu(mdev, rcu); +} + +static int mpls_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct mpls_dev *mdev; + + switch(event) { + case NETDEV_REGISTER: + /* For now just support ethernet devices */ + if ((dev->type == ARPHRD_ETHER) || + (dev->type == ARPHRD_LOOPBACK)) { + mdev = mpls_add_dev(dev); + if (IS_ERR(mdev)) + return notifier_from_errno(PTR_ERR(mdev)); + } + break; + + case NETDEV_UNREGISTER: + mpls_ifdown(dev); + break; + case NETDEV_CHANGENAME: + mdev = mpls_dev_get(dev); + if (mdev) { + int err; + + mpls_dev_sysctl_unregister(mdev); + err = mpls_dev_sysctl_register(dev, mdev); + if (err) + return notifier_from_errno(err); + } + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mpls_dev_notifier = { + .notifier_call = mpls_dev_notify, +}; + +static int nla_put_via(struct sk_buff *skb, + u8 table, const void *addr, int alen) +{ + static const int table_to_family[NEIGH_NR_TABLES + 1] = { + AF_INET, AF_INET6, AF_DECnet, AF_PACKET, + }; + struct nlattr *nla; + struct rtvia *via; + int family = AF_UNSPEC; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + return -EMSGSIZE; + + if (table <= NEIGH_NR_TABLES) + family = table_to_family[table]; + + via = nla_data(nla); + via->rtvia_family = family; + memcpy(via->rtvia_addr, addr, alen); + return 0; +} + +int nla_put_labels(struct sk_buff *skb, int attrtype, + u8 labels, const u32 label[]) +{ + struct nlattr *nla; + struct mpls_shim_hdr *nla_label; + bool bos; + int i; + nla = nla_reserve(skb, attrtype, labels*4); + if (!nla) + return -EMSGSIZE; + + nla_label = nla_data(nla); + bos = true; + for (i = labels - 1; i 
>= 0; i--) { + nla_label[i] = mpls_entry_encode(label[i], 0, 0, bos); + bos = false; + } + + return 0; +} + +int nla_get_labels(const struct nlattr *nla, + u32 max_labels, u32 *labels, u32 label[]) +{ + unsigned len = nla_len(nla); + unsigned nla_labels; + struct mpls_shim_hdr *nla_label; + bool bos; + int i; + + /* len needs to be an even multiple of 4 (the label size) */ + if (len & 3) + return -EINVAL; + + /* Limit the number of new labels allowed */ + nla_labels = len/4; + if (nla_labels > max_labels) + return -EINVAL; + + nla_label = nla_data(nla); + bos = true; + for (i = nla_labels - 1; i >= 0; i--, bos = false) { + struct mpls_entry_decoded dec; + dec = mpls_entry_decode(nla_label + i); + + /* Ensure the bottom of stack flag is properly set + * and ttl and tc are both clear. + */ + if ((dec.bos != bos) || dec.ttl || dec.tc) + return -EINVAL; + + switch (dec.label) { + case MPLS_LABEL_IMPLNULL: + /* RFC3032: This is a label that an LSR may + * assign and distribute, but which never + * actually appears in the encapsulation. + */ + return -EINVAL; + } + + label[i] = dec.label; + } + *labels = nla_labels; + return 0; +} + +static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, + struct mpls_route_config *cfg) +{ + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + int index; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rtm = nlmsg_data(nlh); + memset(cfg, 0, sizeof(*cfg)); + + if (rtm->rtm_family != AF_MPLS) + goto errout; + if (rtm->rtm_dst_len != 20) + goto errout; + if (rtm->rtm_src_len != 0) + goto errout; + if (rtm->rtm_tos != 0) + goto errout; + if (rtm->rtm_table != RT_TABLE_MAIN) + goto errout; + /* Any value is acceptable for rtm_protocol */ + + /* As mpls uses destination specific addresses + * (or source specific address in the case of multicast) + * all addresses have universal scope. 
+ */ + if (rtm->rtm_scope != RT_SCOPE_UNIVERSE) + goto errout; + if (rtm->rtm_type != RTN_UNICAST) + goto errout; + if (rtm->rtm_flags != 0) + goto errout; + + cfg->rc_label = LABEL_NOT_SPECIFIED; + cfg->rc_protocol = rtm->rtm_protocol; + cfg->rc_nlflags = nlh->nlmsg_flags; + cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; + cfg->rc_nlinfo.nlh = nlh; + cfg->rc_nlinfo.nl_net = sock_net(skb->sk); + + for (index = 0; index <= RTA_MAX; index++) { + struct nlattr *nla = tb[index]; + if (!nla) + continue; + + switch(index) { + case RTA_OIF: + cfg->rc_ifindex = nla_get_u32(nla); + break; + case RTA_NEWDST: + if (nla_get_labels(nla, MAX_NEW_LABELS, + &cfg->rc_output_labels, + cfg->rc_output_label)) + goto errout; + break; + case RTA_DST: + { + u32 label_count; + if (nla_get_labels(nla, 1, &label_count, + &cfg->rc_label)) + goto errout; + + /* The first 16 labels are reserved, and may not be set */ + if (cfg->rc_label < 16) + goto errout; + + break; + } + case RTA_VIA: + { + struct rtvia *via = nla_data(nla); + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) + goto errout; + cfg->rc_via_alen = nla_len(nla) - + offsetof(struct rtvia, rtvia_addr); + if (cfg->rc_via_alen > MAX_VIA_ALEN) + goto errout; + + /* Validate the address family */ + switch(via->rtvia_family) { + case AF_PACKET: + cfg->rc_via_table = NEIGH_LINK_TABLE; + break; + case AF_INET: + cfg->rc_via_table = NEIGH_ARP_TABLE; + if (cfg->rc_via_alen != 4) + goto errout; + break; + case AF_INET6: + cfg->rc_via_table = NEIGH_ND_TABLE; + if (cfg->rc_via_alen != 16) + goto errout; + break; + default: + /* Unsupported address family */ + goto errout; + } + + memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen); + break; + } + default: + /* Unsupported attribute */ + goto errout; + } + } + + err = 0; +errout: + return err; +} + +static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + 
return err; + + return mpls_route_del(&cfg); +} + + +static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct mpls_route_config cfg; + int err; + + err = rtm_to_route_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return mpls_route_add(&cfg); +} + +static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, + u32 label, struct mpls_route *rt, int flags) +{ + struct net_device *dev; + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_MPLS; + rtm->rtm_dst_len = 20; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_protocol = rt->rt_protocol; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + + if (rt->rt_labels && + nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) + goto nla_put_failure; + if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen)) + goto nla_put_failure; + dev = rtnl_dereference(rt->rt_dev); + if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) + goto nla_put_failure; + if (nla_put_labels(skb, RTA_DST, 1, &label)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mpls_route __rcu **platform_label; + size_t platform_labels; + unsigned int index; + + ASSERT_RTNL(); + + index = cb->args[0]; + if (index < 16) + index = 16; + + platform_label = rtnl_dereference(net->mpls.platform_label); + platform_labels = net->mpls.platform_labels; + for (; index < platform_labels; index++) { + struct mpls_route *rt; + rt = rtnl_dereference(platform_label[index]); + if (!rt) + continue; + + if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid, + 
cb->nlh->nlmsg_seq, RTM_NEWROUTE, + index, rt, NLM_F_MULTI) < 0) + break; + } + cb->args[0] = index; + + return skb->len; +} + +static inline size_t lfib_nlmsg_size(struct mpls_route *rt) +{ + size_t payload = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */ + + nla_total_size(4); /* RTA_DST */ + if (rt->rt_labels) /* RTA_NEWDST */ + payload += nla_total_size(rt->rt_labels * 4); + if (rt->rt_dev) /* RTA_OIF */ + payload += nla_total_size(4); + return payload; +} + +static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, + struct nlmsghdr *nlh, struct net *net, u32 portid, + unsigned int nlm_flags) +{ + struct sk_buff *skb; + u32 seq = nlh ? nlh->nlmsg_seq : 0; + int err = -ENOBUFS; + + skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in lfib_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL); + + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err); +} + +static int resize_platform_label_table(struct net *net, size_t limit) +{ + size_t size = sizeof(struct mpls_route *) * limit; + size_t old_limit; + size_t cp_size; + struct mpls_route __rcu **labels = NULL, **old; + struct mpls_route *rt0 = NULL, *rt2 = NULL; + unsigned index; + + if (size) { + labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!labels) + labels = vzalloc(size); + + if (!labels) + goto nolabels; + } + + /* In case the predefined labels need to be populated */ + if (limit > MPLS_LABEL_IPV4NULL) { + struct net_device *lo = net->loopback_dev; + rt0 = mpls_rt_alloc(lo->addr_len); + if (!rt0) + goto nort0; + RCU_INIT_POINTER(rt0->rt_dev, lo); + rt0->rt_protocol = RTPROT_KERNEL; + rt0->rt_via_table = NEIGH_LINK_TABLE; + memcpy(rt0->rt_via, lo->dev_addr, 
lo->addr_len); + } + if (limit > MPLS_LABEL_IPV6NULL) { + struct net_device *lo = net->loopback_dev; + rt2 = mpls_rt_alloc(lo->addr_len); + if (!rt2) + goto nort2; + RCU_INIT_POINTER(rt2->rt_dev, lo); + rt2->rt_protocol = RTPROT_KERNEL; + rt2->rt_via_table = NEIGH_LINK_TABLE; + memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len); + } + + rtnl_lock(); + /* Remember the original table */ + old = rtnl_dereference(net->mpls.platform_label); + old_limit = net->mpls.platform_labels; + + /* Free any labels beyond the new table */ + for (index = limit; index < old_limit; index++) + mpls_route_update(net, index, NULL, NULL, NULL); + + /* Copy over the old labels */ + cp_size = size; + if (old_limit < limit) + cp_size = old_limit * sizeof(struct mpls_route *); + + memcpy(labels, old, cp_size); + + /* If needed set the predefined labels */ + if ((old_limit <= MPLS_LABEL_IPV6NULL) && + (limit > MPLS_LABEL_IPV6NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); + rt2 = NULL; + } + + if ((old_limit <= MPLS_LABEL_IPV4NULL) && + (limit > MPLS_LABEL_IPV4NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); + rt0 = NULL; + } + + /* Update the global pointers */ + net->mpls.platform_labels = limit; + rcu_assign_pointer(net->mpls.platform_label, labels); + + rtnl_unlock(); + + mpls_rt_free(rt2); + mpls_rt_free(rt0); + + if (old) { + synchronize_rcu(); + kvfree(old); + } + return 0; + +nort2: + mpls_rt_free(rt0); +nort0: + kvfree(labels); +nolabels: + return -ENOMEM; +} + +static int mpls_platform_labels(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = table->data; + int platform_labels = net->mpls.platform_labels; + int ret; + struct ctl_table tmp = { + .procname = table->procname, + .data = &platform_labels, + .maxlen = sizeof(int), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &label_limit, + }; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + ret = 
resize_platform_label_table(net, platform_labels); + + return ret; +} + +static const struct ctl_table mpls_table[] = { + { + .procname = "platform_labels", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = mpls_platform_labels, + }, + { } +}; + +static int mpls_net_init(struct net *net) +{ + struct ctl_table *table; + + net->mpls.platform_labels = 0; + net->mpls.platform_label = NULL; + + table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); + if (table == NULL) + return -ENOMEM; + + table[0].data = net; + net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); + if (net->mpls.ctl == NULL) + return -ENOMEM; + + return 0; +} + +static void mpls_net_exit(struct net *net) +{ + struct mpls_route __rcu **platform_label; + size_t platform_labels; + struct ctl_table *table; + unsigned int index; + + table = net->mpls.ctl->ctl_table_arg; + unregister_net_sysctl_table(net->mpls.ctl); + kfree(table); + + /* An rcu grace period has passed since there was a device in + * the network namespace (and thus the last in flight packet) + * left this network namespace. This is because + * unregister_netdevice_many and netdev_run_todo has completed + * for each network device that was in this network namespace. + * + * As such no additional rcu synchronization is necessary when + * freeing the platform_label table. 
+ */ + rtnl_lock(); + platform_label = rtnl_dereference(net->mpls.platform_label); + platform_labels = net->mpls.platform_labels; + for (index = 0; index < platform_labels; index++) { + struct mpls_route *rt = rtnl_dereference(platform_label[index]); + RCU_INIT_POINTER(platform_label[index], NULL); + mpls_rt_free(rt); + } + rtnl_unlock(); + + kvfree(platform_label); +} + +static struct pernet_operations mpls_net_ops = { + .init = mpls_net_init, + .exit = mpls_net_exit, +}; + +static int __init mpls_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4); + + err = register_pernet_subsys(&mpls_net_ops); + if (err) + goto out; + + err = register_netdevice_notifier(&mpls_dev_notifier); + if (err) + goto out_unregister_pernet; + + dev_add_pack(&mpls_packet_type); + + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); + err = 0; +out: + return err; + +out_unregister_pernet: + unregister_pernet_subsys(&mpls_net_ops); + goto out; +} +module_init(mpls_init); + +static void __exit mpls_exit(void) +{ + rtnl_unregister_all(PF_MPLS); + dev_remove_pack(&mpls_packet_type); + unregister_netdevice_notifier(&mpls_dev_notifier); + unregister_pernet_subsys(&mpls_net_ops); +} +module_exit(mpls_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_NETPROTO(PF_MPLS); diff --git a/net/mpls/internal.h b/net/mpls/internal.h new file mode 100644 index 000000000000..8cabeb5a1cb9 --- /dev/null +++ b/net/mpls/internal.h @@ -0,0 +1,56 @@ +#ifndef MPLS_INTERNAL_H +#define MPLS_INTERNAL_H + +struct mpls_shim_hdr { + __be32 label_stack_entry; +}; + +struct mpls_entry_decoded { + u32 label; + u8 ttl; + u8 tc; + u8 bos; +}; + +struct mpls_dev { + int input_enabled; + + struct ctl_table_header *sysctl; + struct rcu_head rcu; +}; + +struct sk_buff; + +static inline struct 
mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) +{ + return (struct mpls_shim_hdr *)skb_network_header(skb); +} + +static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) +{ + struct mpls_shim_hdr result; + result.label_stack_entry = + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | + (tc << MPLS_LS_TC_SHIFT) | + (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | + (ttl << MPLS_LS_TTL_SHIFT)); + return result; +} + +static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) +{ + struct mpls_entry_decoded result; + unsigned entry = be32_to_cpu(hdr->label_stack_entry); + + result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; + + return result; +} + +int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); +int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]); + +#endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 809df534a720..0183b32da942 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -62,6 +62,7 @@ out: static struct packet_offload mpls_mc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_MC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, @@ -69,6 +70,7 @@ static struct packet_offload mpls_mc_offload __read_mostly = { static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index b02660fa9eb0..6eae69a698ed 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,6 +1,14 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER +config NETFILTER_INGRESS + bool "Netfilter 
ingress support" + default y + select NET_INGRESS + help + This allows you to classify packets from ingress using the Netfilter + infrastructure. + config NETFILTER_NETLINK tristate @@ -198,7 +206,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -438,30 +446,34 @@ config NF_TABLES To compile it as a module, choose M here. +if NF_TABLES + config NF_TABLES_INET - depends on NF_TABLES && IPV6 + depends on IPV6 select NF_TABLES_IPV4 select NF_TABLES_IPV6 tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" help This option enables support for a mixed IPv4/IPv6 "inet" table. +config NF_TABLES_NETDEV + tristate "Netfilter nf_tables netdev tables support" + help + This option enables support for the "netdev" table. + config NFT_EXTHDR - depends on NF_TABLES tristate "Netfilter nf_tables IPv6 exthdr module" help This option adds the "exthdr" expression that you can use to match IPv6 extension headers. config NFT_META - depends on NF_TABLES tristate "Netfilter nf_tables meta module" help This option adds the "meta" expression that you can use to match and to set packet metainformation such as the packet mark. config NFT_CT - depends on NF_TABLES depends on NF_CONNTRACK tristate "Netfilter nf_tables conntrack module" help @@ -469,42 +481,36 @@ config NFT_CT connection tracking information such as the flow state. config NFT_RBTREE - depends on NF_TABLES tristate "Netfilter nf_tables rbtree set module" help This option adds the "rbtree" set type (Red Black tree) that is used to build interval-based sets. config NFT_HASH - depends on NF_TABLES tristate "Netfilter nf_tables hash set module" help This option adds the "hash" set type that is used to build one-way mappings between matchings and actions. 
config NFT_COUNTER - depends on NF_TABLES tristate "Netfilter nf_tables counter module" help This option adds the "counter" expression that you can use to include packet and byte counters in a rule. config NFT_LOG - depends on NF_TABLES tristate "Netfilter nf_tables log module" help This option adds the "log" expression that you can use to log packets matching some criteria. config NFT_LIMIT - depends on NF_TABLES tristate "Netfilter nf_tables limit module" help This option adds the "limit" expression that you can use to ratelimit rule matchings. config NFT_MASQ - depends on NF_TABLES depends on NF_CONNTRACK depends on NF_NAT tristate "Netfilter nf_tables masquerade support" @@ -513,7 +519,6 @@ config NFT_MASQ to perform NAT in the masquerade flavour. config NFT_REDIR - depends on NF_TABLES depends on NF_CONNTRACK depends on NF_NAT tristate "Netfilter nf_tables redirect support" @@ -522,7 +527,6 @@ config NFT_REDIR to perform NAT in the redirect flavour. config NFT_NAT - depends on NF_TABLES depends on NF_CONNTRACK select NF_NAT tristate "Netfilter nf_tables nat module" @@ -531,8 +535,6 @@ config NFT_NAT typical Network Address Translation (NAT) packet transformations. config NFT_QUEUE - depends on NF_TABLES - depends on NETFILTER_XTABLES depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" help @@ -540,7 +542,6 @@ config NFT_QUEUE infrastructure (also known as NFQUEUE) from nftables. config NFT_REJECT - depends on NF_TABLES default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" help @@ -554,7 +555,6 @@ config NFT_REJECT_INET tristate config NFT_COMPAT - depends on NF_TABLES depends on NETFILTER_XTABLES tristate "Netfilter x_tables over nf_tables module" help @@ -562,6 +562,8 @@ config NFT_COMPAT x_tables match/target extensions over the nf_tables framework. 
+endif # NF_TABLES + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n @@ -721,7 +723,7 @@ config NETFILTER_XT_TARGET_HL config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED ---help--- This option adds the "HMARK" target. @@ -863,7 +865,7 @@ config NETFILTER_XT_TARGET_REDIRECT config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK ---help--- This option adds a "TEE" target with which a packet can be cloned and @@ -873,7 +875,8 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES @@ -912,7 +915,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -951,7 +954,7 @@ comment "Xtables matches" config NETFILTER_XT_MATCH_ADDRTYPE tristate '"addrtype" address type match support' - depends on NETFILTER_ADVANCED + default m if NETFILTER_ADVANCED=n ---help--- This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... 
@@ -1124,7 +1127,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -1366,7 +1369,8 @@ config NETFILTER_XT_MATCH_SOCKET depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED depends on !NF_CONNTRACK || NF_CONNTRACK - depends on (IPV6 || IPV6=n) + depends on IPV6 || IPV6=n + depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 89f73a9e9874..70d026d46fe7 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -70,11 +70,12 @@ obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o # nf_tables nf_tables-objs += nf_tables_core.o nf_tables_api.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o +obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index fea9ef566427..a0e54974e2c9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -64,10 +64,27 @@ static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; struct nf_hook_ops *elem; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + BUG_ON(reg->dev == NULL); + nf_hook_list = ®->dev->nf_hooks_ingress; + 
net_inc_ingress_queue(); + break; + } +#endif + /* Fall through. */ + default: + nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + break; + } + + list_for_each_entry(elem, nf_hook_list, list) { if (reg->priority < elem->priority) break; } @@ -85,10 +102,23 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + net_dec_ingress_queue(); + break; + } + break; +#endif + default: + break; + } #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); + nf_queue_nf_hook_drop(reg); } EXPORT_SYMBOL(nf_unregister_hook); @@ -120,12 +150,8 @@ EXPORT_SYMBOL(nf_unregister_hooks); unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - unsigned int hook, - const struct net_device *indev, - const struct net_device *outdev, - struct nf_hook_ops **elemp, - int (*okfn)(struct sk_buff *), - int hook_thresh) + struct nf_hook_state *state, + struct nf_hook_ops **elemp) { unsigned int verdict; @@ -134,19 +160,19 @@ unsigned int nf_iterate(struct list_head *head, * function because of risk of continuing from deleted element. */ list_for_each_entry_continue_rcu((*elemp), head, list) { - if (hook_thresh > (*elemp)->priority) + if (state->thresh > (*elemp)->priority) continue; /* Optimization: we don't need to hold module reference here, since function can't sleep. 
--RR */ repeat: - verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn); + verdict = (*elemp)->hook(*elemp, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - (*elemp)->hook, hook); + (*elemp)->hook, state->hook); continue; } #endif @@ -161,11 +187,7 @@ repeat: /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */ -int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - int hook_thresh) +int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) { struct nf_hook_ops *elem; unsigned int verdict; @@ -174,10 +196,9 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); - elem = list_entry_rcu(&nf_hooks[pf][hook], struct nf_hook_ops, list); + elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); next_hook: - verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, - outdev, &elem, okfn, hook_thresh); + verdict = nf_iterate(state->hook_list, skb, state, &elem); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -186,8 +207,8 @@ next_hook: if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - verdict >> NF_VERDICT_QBITS); + int err = nf_queue(skb, elem, state, + verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ECANCELED) goto next_hook; diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 6f024a8a1534..d05e759ed0fa 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void 
(*gc)(unsigned long ul_set)) struct mtype *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -144,10 +144,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(x, set))) + ip_set_timeout_expired(ext_timeout(x, set))) { ret = 0; - else if (!(flags & IPSET_FLAG_EXIST)) + } else if (!(flags & IPSET_FLAG_EXIST)) { + set_bit(e->id, map->members); return -IPSET_ERR_EXIST; + } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } @@ -165,6 +167,10 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_comment(ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); + + /* Activate element */ + set_bit(e->id, map->members); + return 0; } @@ -203,10 +209,13 @@ mtype_list(const struct ip_set *set, struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; + int ret = 0; adt = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; + /* Extensions may be replaced */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { id = cb->args[IPSET_CB_ARG0]; @@ -214,7 +223,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT - mtype_is_filled((const struct mtype_elem *) x) && + mtype_is_filled((const struct mtype_elem *)x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; @@ -222,14 +231,16 @@ mtype_list(const struct ip_set *set, if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + + goto nla_put_failure; } if (mtype_do_list(skb, 
map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, - mtype_is_filled((const struct mtype_elem *) x))) + mtype_is_filled((const struct mtype_elem *)x))) goto nla_put_failure; ipset_nest_end(skb, nested); } @@ -238,29 +249,32 @@ mtype_list(const struct ip_set *set, /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } ipset_nest_end(skb, adt); - return 0; +out: + rcu_read_unlock(); + return ret; } static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct mtype *map = set->data; void *x; u32 id; /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); + * but adding/deleting new entries is locked out + */ + spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); @@ -269,7 +283,7 @@ mtype_gc(unsigned long ul_set) ip_set_ext_destroy(set, x); } } - read_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 55b083ec587a..64a564334418 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); #define MTYPE bitmap_ip +#define HOST_MASK 32 /* Type structure */ struct bitmap_ip { @@ -58,7 +59,7 @@ struct bitmap_ip_adt_elem { static inline u32 ip_to_id(const struct bitmap_ip *m, u32 ip) { - return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; + return ((ip & 
ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts; } /* Common functions */ @@ -80,7 +81,7 @@ static inline int bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -137,20 +138,17 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -174,11 +172,12 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); - } else + } else { ip_to = ip; + } if (ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; @@ -189,8 +188,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -277,16 +276,17 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if 
(cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); - if (netmask > 32) + if (netmask > HOST_MASK) return -IPSET_ERR_INVALID_NETMASK; first_ip &= ip_set_hostmask(netmask); @@ -360,7 +360,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -377,6 +378,7 @@ bitmap_ip_init(void) static void __exit bitmap_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ip_type); } diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 86104744b00f..1430535118fb 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -36,6 +36,7 @@ IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac +#define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { @@ -89,7 +90,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, return 0; elem = get_elem(map->extensions, e->id, dsize); if (elem->filled == MAC_FILLED) - return e->ether == NULL || + return !e->ether || ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; @@ -130,7 +131,8 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, - * possibly by the 
kernel */ + * possibly by the kernel + */ if (e->ether) ip_set_timeout_set(timeout, t); else @@ -146,28 +148,35 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); - if (test_and_set_bit(e->id, map->members)) { + if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { - if (e->ether && (flags & IPSET_FLAG_EXIST)) - memcpy(elem->ether, e->ether, ETH_ALEN); + if (e->ether && + (flags & IPSET_FLAG_EXIST) && + !ether_addr_equal(e->ether, elem->ether)) { + /* memcpy isn't atomic */ + clear_bit(e->id, map->members); + smp_mb__after_atomic(); + ether_addr_copy(elem->ether, e->ether); + } return IPSET_ADD_FAILED; } else if (!e->ether) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ - memcpy(elem->ether, e->ether, ETH_ALEN); + clear_bit(e->id, map->members); + smp_mb__after_atomic(); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; } else if (e->ether) { /* We can store MAC too */ - memcpy(elem->ether, e->ether, ETH_ALEN); + ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; - } else { - elem->filled = MAC_UNSET; - /* MAC is not stored yet, don't start timer */ - return IPSET_ADD_STORE_PLAIN_TIMEOUT; } + elem->filled = MAC_UNSET; + /* MAC is not stored yet, don't start timer */ + return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static inline int @@ -238,20 +247,17 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return 
-IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -343,11 +349,12 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (cidr >= 32) + if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); - } else + } else { return -IPSET_ERR_PROTOCOL; + } elements = (u64)last_ip - first_ip + 1; @@ -397,7 +404,8 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -414,6 +422,7 @@ bitmap_ipmac_init(void) static void __exit bitmap_ipmac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 005dd36444c3..5338ccd5da46 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -73,7 +73,7 @@ static inline int bitmap_port_do_add(const struct bitmap_port_adt_elem *e, struct bitmap_port *map, u32 flags, size_t dsize) { - return !!test_and_set_bit(e->id, map->members); + return !!test_bit(e->id, map->members); } static inline int @@ -136,19 +136,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], u16 
port_to; int ret = 0; - if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) + return -IPSET_ERR_PROTOCOL; + port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); if (port < map->first_port || port > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -168,8 +162,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (port < map->first_port) return -IPSET_ERR_BITMAP_RANGE; } - } else + } else { port_to = port; + } if (port_to > map->last_port) return -IPSET_ERR_BITMAP_RANGE; @@ -180,8 +175,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -294,7 +289,8 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -311,6 +307,7 @@ bitmap_port_init(void) static void __exit bitmap_port_fini(void) { + rcu_barrier(); ip_set_type_unregister(&bitmap_port_type); } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index d259da3ce67a..338b4047776f 100644 
--- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -32,8 +32,10 @@ static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ struct ip_set_net { struct ip_set * __rcu *ip_set_list; /* all individual sets */ ip_set_id_t ip_set_max; /* max number of sets */ - int is_deleted; /* deleted by ip_set_net_exit */ + bool is_deleted; /* deleted by ip_set_net_exit */ + bool is_destroyed; /* all sets are destroyed */ }; + static int ip_set_net_id __read_mostly; static inline struct ip_set_net *ip_set_pernet(struct net *net) @@ -42,7 +44,7 @@ static inline struct ip_set_net *ip_set_pernet(struct net *net) } #define IP_SET_INC 64 -#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) +#define STRNCMP(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) static unsigned int max_sets; @@ -59,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); #define ip_set(inst, id) \ ip_set_dereference((inst)->ip_set_list)[id] -/* - * The set types are implemented in modules and registered set types +/* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is * serialized by ip_set_type_mutex. 
*/ @@ -85,7 +86,7 @@ find_set_type(const char *name, u8 family, u8 revision) struct ip_set_type *type; list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && revision >= type->revision_min && @@ -130,9 +131,10 @@ __find_set_type_get(const char *name, u8 family, u8 revision, goto unlock; } /* Make sure the type is already loaded - * but we don't support the revision */ + * but we don't support the revision + */ list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name)) { + if (STRNCMP(type->name, name)) { err = -IPSET_ERR_FIND_TYPE; goto unlock; } @@ -166,7 +168,7 @@ __find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, *min = 255; *max = 0; rcu_read_lock(); list_for_each_entry_rcu(type, &ip_set_type_list, list) - if (STREQ(type->name, name) && + if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC)) { found = true; @@ -208,15 +210,15 @@ ip_set_type_register(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", type->name, family_name(type->family), type->revision_min); - ret = -EINVAL; - goto unlock; + ip_set_type_unlock(); + return -EINVAL; } list_add_rcu(&type->list, &ip_set_type_list); pr_debug("type %s, family %s, revision %u:%u registered.\n", type->name, family_name(type->family), type->revision_min, type->revision_max); -unlock: ip_set_type_unlock(); + return ret; } EXPORT_SYMBOL_GPL(ip_set_type_register); @@ -230,12 +232,12 @@ ip_set_type_unregister(struct ip_set_type *type) pr_warn("ip_set type %s, family %s with revision min %u not registered\n", type->name, family_name(type->family), type->revision_min); - goto unlock; + ip_set_type_unlock(); + return; } list_del_rcu(&type->list); pr_debug("type %s, family %s with revision min %u unregistered.\n", type->name, family_name(type->family), 
type->revision_min); -unlock: ip_set_type_unlock(); synchronize_rcu(); @@ -289,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -306,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) { - struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1]; if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; @@ -317,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) return -IPSET_ERR_PROTOCOL; memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), - sizeof(struct in6_addr)); + sizeof(struct in6_addr)); return 0; } EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); @@ -365,7 +367,7 @@ size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) { enum ip_set_ext_id id; - size_t offset = 0; + size_t offset = len; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) @@ -375,12 +377,12 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; - offset += ALIGN(len + offset, ip_set_extensions[id].align); + offset = ALIGN(offset, ip_set_extensions[id].align); set->offset[id] = offset; set->extensions |= ip_set_extensions[id].type; offset += ip_set_extensions[id].len; } - return len + offset; + return offset; } EXPORT_SYMBOL_GPL(ip_set_elem_len); @@ -389,13 +391,22 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { u64 fullmark; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, 
IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + if (tb[IPSET_ATTR_TIMEOUT]) { - if (!(set->extensions & IPSET_EXT_TIMEOUT)) + if (!SET_WITH_TIMEOUT(set)) return -IPSET_ERR_TIMEOUT; ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); } if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) { - if (!(set->extensions & IPSET_EXT_COUNTER)) + if (!SET_WITH_COUNTER(set)) return -IPSET_ERR_COUNTER; if (tb[IPSET_ATTR_BYTES]) ext->bytes = be64_to_cpu(nla_get_be64( @@ -405,25 +416,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], tb[IPSET_ATTR_PACKETS])); } if (tb[IPSET_ATTR_COMMENT]) { - if (!(set->extensions & IPSET_EXT_COMMENT)) + if (!SET_WITH_COMMENT(set)) return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } if (tb[IPSET_ATTR_SKBMARK]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); ext->skbmark = fullmark >> 32; ext->skbmarkmask = fullmark & 0xffffffff; } if (tb[IPSET_ATTR_SKBPRIO]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbprio = be32_to_cpu(nla_get_be32( tb[IPSET_ATTR_SKBPRIO])); } if (tb[IPSET_ATTR_SKBQUEUE]) { - if (!(set->extensions & IPSET_EXT_SKBINFO)) + if (!SET_WITH_SKBINFO(set)) return -IPSET_ERR_SKBINFO; ext->skbqueue = be16_to_cpu(nla_get_be16( tb[IPSET_ATTR_SKBQUEUE])); @@ -432,8 +443,32 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -/* - * Creating/destroying/renaming/swapping affect the existence and +int +ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, + const void *e, bool active) +{ + if (SET_WITH_TIMEOUT(set)) { + unsigned long *timeout = ext_timeout(e, set); + + if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, + 
htonl(active ? ip_set_timeout_get(timeout) + : *timeout))) + return -EMSGSIZE; + } + if (SET_WITH_COUNTER(set) && + ip_set_put_counter(skb, ext_counter(e, set))) + return -EMSGSIZE; + if (SET_WITH_COMMENT(set) && + ip_set_put_comment(skb, ext_comment(e, set))) + return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_put_extensions); + +/* Creating/destroying/renaming/swapping affect the existence and * the properties of a set. All of these can be executed from userspace * only and serialized by the nfnl mutex indirectly from nfnetlink. * @@ -460,8 +495,7 @@ __ip_set_put(struct ip_set *set) write_unlock_bh(&ip_set_ref_lock); } -/* - * Add, del and test set entries from kernel. +/* Add, del and test set entries from kernel. * * The set behind the index must exist and must be referenced * so it can't be destroyed (or changed) under our foot. @@ -489,23 +523,23 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, dev_net(par->in ? par->in : par->out), index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return 0; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -528,16 +562,16 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, dev_net(par->in ? 
par->in : par->out), index); int ret; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } @@ -551,23 +585,22 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, dev_net(par->in ? par->in : par->out), index); int ret = 0; - BUG_ON(set == NULL); + BUG_ON(!set); pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); return ret; } EXPORT_SYMBOL_GPL(ip_set_del); -/* - * Find set by name, reference it once. The reference makes sure the +/* Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * */ @@ -581,7 +614,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) rcu_read_lock(); for (i = 0; i < inst->ip_set_max; i++) { s = rcu_dereference(inst->ip_set_list)[i]; - if (s != NULL && STREQ(s->name, name)) { + if (s && STRNCMP(s->name, name)) { __ip_set_get(s); index = i; *set = s; @@ -594,8 +627,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set) } EXPORT_SYMBOL_GPL(ip_set_get_byname); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. 
* @@ -608,7 +640,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index) rcu_read_lock(); set = rcu_dereference(inst->ip_set_list)[index]; - if (set != NULL) + if (set) __ip_set_put(set); rcu_read_unlock(); } @@ -622,8 +654,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_put_byindex); -/* - * Get the name of a set behind a set index. +/* Get the name of a set behind a set index. * We assume the set is referenced, so it does exist and * can't be destroyed. The set cannot be renamed due to * the referencing either. @@ -634,7 +665,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) { const struct ip_set *set = ip_set_rcu_get(net, index); - BUG_ON(set == NULL); + BUG_ON(!set); BUG_ON(set->ref == 0); /* Referenced, so it's safe */ @@ -642,13 +673,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_name_byindex); -/* - * Routines to call by external subsystems, which do not +/* Routines to call by external subsystems, which do not * call nfnl_lock for us. */ -/* - * Find set by index, reference it once. The reference makes sure the +/* Find set by index, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * * The nfnl mutex is used in the function. @@ -674,8 +703,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index) } EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex); -/* - * If the given set pointer points to a valid set, decrement +/* If the given set pointer points to a valid set, decrement * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. 
* @@ -690,15 +718,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index) nfnl_lock(NFNL_SUBSYS_IPSET); if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */ set = ip_set(inst, index); - if (set != NULL) + if (set) __ip_set_put(set); } nfnl_unlock(NFNL_SUBSYS_IPSET); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); -/* - * Communication protocol with userspace over netlink. +/* Communication protocol with userspace over netlink. * * The commands are serialized by the nfnl mutex. */ @@ -725,7 +752,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), sizeof(*nfmsg), flags); - if (nlh == NULL) + if (!nlh) return NULL; nfmsg = nlmsg_data(nlh); @@ -758,7 +785,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id) *id = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL && STREQ(set->name, name)) { + if (set && STRNCMP(set->name, name)) { *id = i; break; } @@ -784,10 +811,10 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, *index = IPSET_INVALID_ID; for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s == NULL) { + if (!s) { if (*index == IPSET_INVALID_ID) *index = i; - } else if (STREQ(name, s->name)) { + } else if (STRNCMP(name, s->name)) { /* Name clash */ *set = s; return -EEXIST; @@ -816,18 +843,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; - struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; u32 flags = flag_exist(nlh); int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_REVISION] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL || - 
(attr[IPSET_ATTR_DATA] != NULL && + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_REVISION] || + !attr[IPSET_ATTR_FAMILY] || + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])))) return -IPSET_ERR_PROTOCOL; @@ -838,33 +865,29 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", name, typename, family_name(family), revision); - /* - * First, and without any locks, allocate and initialize + /* First, and without any locks, allocate and initialize * a normal base set structure. */ - set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; - rwlock_init(&set->lock); + spin_lock_init(&set->lock); strlcpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; - /* - * Next, check that we know the type, and take + /* Next, check that we know the type, and take * a reference on the type, to make sure it stays available * while constructing our new set. * * After referencing the type, we try to create the type * specific part of the set without holding any locks. */ - ret = find_set_type_get(typename, family, revision, &(set->type)); + ret = find_set_type_get(typename, family, revision, &set->type); if (ret) goto out; - /* - * Without holding any locks, create private part. - */ + /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy)) { @@ -878,8 +901,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* BTW, ret==0 here. */ - /* - * Here, we have a valid, constructed set and we are protected + /* Here, we have a valid, constructed set and we are protected * by the nfnl mutex. Find the first free index in ip_set_list * and check clashing. 
*/ @@ -887,7 +909,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, if (ret == -EEXIST) { /* If this is the same set and requested, ignore error */ if ((flags & IPSET_FLAG_EXIST) && - STREQ(set->type->name, clash->type->name) && + STRNCMP(set->type->name, clash->type->name) && set->type->family == clash->type->family && set->type->revision_min == clash->type->revision_min && set->type->revision_max == clash->type->revision_max && @@ -902,7 +924,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* Wraparound */ goto cleanup; - list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL); + list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL); if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ @@ -916,12 +938,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, inst->ip_set_max = i; kfree(tmp); ret = 0; - } else if (ret) + } else if (ret) { goto cleanup; + } - /* - * Finally! Add our shiny new set to the list, and be done. - */ + /* Finally! Add our shiny new set to the list, and be done. 
*/ pr_debug("create: '%s' created with index %u!\n", set->name, index); ip_set(inst, index) = set; @@ -946,12 +967,9 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { }; static void -ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index) +ip_set_destroy_set(struct ip_set *set) { - struct ip_set *set = ip_set(inst, index); - pr_debug("set: %s\n", set->name); - ip_set(inst, index) = NULL; /* Must call it without holding any lock */ set->variant->destroy(set); @@ -986,30 +1004,36 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && s->ref) { + if (s && s->ref) { ret = -IPSET_ERR_BUSY; goto out; } } + inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) - ip_set_destroy_set(inst, i); + if (s) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(s); + } } + /* Modified by ip_set_destroy() only, which is serialized */ + inst->is_destroyed = false; } else { s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); - if (s == NULL) { + if (!s) { ret = -ENOENT; goto out; } else if (s->ref) { ret = -IPSET_ERR_BUSY; goto out; } + ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - ip_set_destroy_set(inst, i); + ip_set_destroy_set(s); } return 0; out: @@ -1024,9 +1048,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set->variant->flush(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); } static int @@ -1044,12 +1068,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL) + if (s) ip_set_flush_set(s); } } else { s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (s == NULL) + if (!s) return -ENOENT; ip_set_flush_set(s); @@ 
-1081,12 +1105,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; read_lock_bh(&ip_set_ref_lock); @@ -1098,7 +1122,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); - if (s != NULL && STREQ(s->name, name2)) { + if (s && STRNCMP(s->name, name2)) { ret = -IPSET_ERR_EXIST_SETNAME2; goto out; } @@ -1130,23 +1154,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, char from_name[IPSET_MAXNAMELEN]; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_SETNAME2] == NULL)) + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_SETNAME2])) return -IPSET_ERR_PROTOCOL; from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &from_id); - if (from == NULL) + if (!from) return -ENOENT; to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id); - if (to == NULL) + if (!to) return -IPSET_ERR_EXIST_SETNAME2; /* Features must not change. - * Not an artificial restriction anymore, as we must prevent - * possible loops created by swapping in setlist type of sets. */ + * Not an artifical restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. 
+ */ if (!(from->type->features == to->type->features && from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; @@ -1177,12 +1202,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, static int ip_set_dump_done(struct netlink_callback *cb) { - struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET]; if (cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", - ip_set(inst, cb->args[IPSET_CB_INDEX])->name); - __ip_set_put_byindex(inst, - (ip_set_id_t) cb->args[IPSET_CB_INDEX]); + struct ip_set_net *inst = + (struct ip_set_net *)cb->args[IPSET_CB_NET]; + ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + struct ip_set *set = ip_set(inst, index); + + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); + __ip_set_put_byindex(inst, index); } return 0; } @@ -1204,7 +1233,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) { struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; @@ -1213,27 +1242,23 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); - /* cb->args[IPSET_CB_NET]: net namespace - * [IPSET_CB_DUMP]: dump single set/all sets - * [IPSET_CB_INDEX]: set index - * [IPSET_CB_ARG0]: type specific - */ - if (cda[IPSET_ATTR_SETNAME]) { struct ip_set *set; set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]), &index); - if (set == NULL) + if (!set) return -ENOENT; dump_type = DUMP_ONE; cb->args[IPSET_CB_INDEX] = index; - } else + } else { dump_type = DUMP_ALL; + } if (cda[IPSET_ATTR_FLAGS]) { u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); + dump_type |= (f << 16); } cb->args[IPSET_CB_NET] = (unsigned long)inst; @@ -1251,6 +1276,7 @@ 
ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0; struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk)); u32 dump_type, dump_flags; + bool is_destroyed; int ret = 0; if (!cb->args[IPSET_CB_DUMP]) { @@ -1258,7 +1284,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) if (ret < 0) { nlh = nlmsg_hdr(cb->skb); /* We have to create and send the error message - * manually :-( */ + * manually :-( + */ if (nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(cb->skb, nlh, ret); return ret; @@ -1276,13 +1303,21 @@ dump_last: pr_debug("dump type, flag: %u %u index: %ld\n", dump_type, dump_flags, cb->args[IPSET_CB_INDEX]); for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) { - index = (ip_set_id_t) cb->args[IPSET_CB_INDEX]; + index = (ip_set_id_t)cb->args[IPSET_CB_INDEX]; + write_lock_bh(&ip_set_ref_lock); set = ip_set(inst, index); - if (set == NULL) { + is_destroyed = inst->is_destroyed; + if (!set || is_destroyed) { + write_unlock_bh(&ip_set_ref_lock); if (dump_type == DUMP_ONE) { ret = -ENOENT; goto out; } + if (is_destroyed) { + /* All sets are just being destroyed */ + ret = 0; + goto out; + } continue; } /* When dumping all sets, we must dump "sorted" @@ -1290,14 +1325,17 @@ dump_last: */ if (dump_type != DUMP_ONE && ((dump_type == DUMP_ALL) == - !!(set->type->features & IPSET_DUMP_LAST))) + !!(set->type->features & IPSET_DUMP_LAST))) { + write_unlock_bh(&ip_set_ref_lock); continue; + } pr_debug("List set: %s\n", set->name); if (!cb->args[IPSET_CB_ARG0]) { /* Start listing: make sure set won't be destroyed */ pr_debug("reference set\n"); - __ip_set_get(set); + set->ref++; } + write_unlock_bh(&ip_set_ref_lock); nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, IPSET_CMD_LIST); @@ -1325,11 +1363,13 @@ dump_last: goto release_refcount; if (dump_flags & IPSET_FLAG_LIST_HEADER) goto next_set; + if (set->variant->uref) + 
set->variant->uref(set, cb, true); /* Fall through and add elements */ default: - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->list(set, skb, cb); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); if (!cb->args[IPSET_CB_ARG0]) /* Set is done, proceed with next one */ goto next_set; @@ -1341,6 +1381,8 @@ dump_last: dump_type = DUMP_LAST; cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16); cb->args[IPSET_CB_INDEX] = 0; + if (set && set->variant->uref) + set->variant->uref(set, cb, false); goto dump_last; } goto out; @@ -1355,7 +1397,10 @@ next_set: release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[IPSET_CB_ARG0]) { - pr_debug("release set %s\n", ip_set(inst, index)->name); + set = ip_set(inst, index); + if (set->variant->uref) + set->variant->uref(set, cb, false); + pr_debug("release set %s\n", set->name); __ip_set_put_byindex(inst, index); cb->args[IPSET_CB_ARG0] = 0; } @@ -1407,9 +1452,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); retried = true; } while (ret == -EAGAIN && set->variant->resize && @@ -1425,12 +1470,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, size_t payload = min(SIZE_MAX, sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1]; struct nlattr *cmdattr; u32 *errline; skb2 = nlmsg_new(payload, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); @@ -1447,7 +1492,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, *errline = lineno; - 
netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1462,25 +1508,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1517,25 +1563,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; u32 flags = flag_exist(nlh); bool use_lineno; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || + !attr[IPSET_ATTR_SETNAME] || !((attr[IPSET_ATTR_DATA] != NULL) ^ (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] != NULL && + (attr[IPSET_ATTR_DATA] && !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] != NULL && + (attr[IPSET_ATTR_ADT] && (!flag_nested(attr[IPSET_ATTR_ADT]) || - attr[IPSET_ATTR_LINENO] == NULL)))) + !attr[IPSET_ATTR_LINENO])))) return -IPSET_ERR_PROTOCOL; set = 
find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; use_lineno = !!attr[IPSET_ATTR_LINENO]; @@ -1572,26 +1618,26 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, { struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {}; + struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL || - attr[IPSET_ATTR_DATA] == NULL || + !attr[IPSET_ATTR_SETNAME] || + !attr[IPSET_ATTR_DATA] || !flag_nested(attr[IPSET_ATTR_DATA]))) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy)) return -IPSET_ERR_PROTOCOL; - read_lock_bh(&set->lock); + rcu_read_lock_bh(); ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); - read_unlock_bh(&set->lock); + rcu_read_unlock_bh(); /* Userspace can't trigger element to be re-added */ if (ret == -EAGAIN) ret = 1; @@ -1613,15 +1659,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_SETNAME] == NULL)) + !attr[IPSET_ATTR_SETNAME])) return -IPSET_ERR_PROTOCOL; set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (set == NULL) + if (!set) return -ENOENT; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1670,8 +1716,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb, int ret = 0; if (unlikely(protocol_failed(attr) || - attr[IPSET_ATTR_TYPENAME] == NULL || - attr[IPSET_ATTR_FAMILY] == NULL)) + !attr[IPSET_ATTR_TYPENAME] || + !attr[IPSET_ATTR_FAMILY])) return -IPSET_ERR_PROTOCOL; family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); @@ -1681,7 +1727,7 @@ ip_set_type(struct sock *ctnl, 
struct sk_buff *skb, return ret; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1726,11 +1772,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh2; int ret = 0; - if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (!skb2) return -ENOMEM; nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, @@ -1858,7 +1904,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EFAULT; goto done; } - op = (unsigned int *) data; + op = (unsigned int *)data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ @@ -1970,10 +2016,11 @@ ip_set_net_init(struct net *net) if (inst->ip_set_max >= IPSET_INVALID_ID) inst->ip_set_max = IPSET_INVALID_ID - 1; - list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL); + list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL); if (!list) return -ENOMEM; - inst->is_deleted = 0; + inst->is_deleted = false; + inst->is_destroyed = false; rcu_assign_pointer(inst->ip_set_list, list); return 0; } @@ -1986,12 +2033,14 @@ ip_set_net_exit(struct net *net) struct ip_set *set = NULL; ip_set_id_t i; - inst->is_deleted = 1; /* flag for ip_set_nfnl_put */ + inst->is_deleted = true; /* flag for ip_set_nfnl_put */ for (i = 0; i < inst->ip_set_max; i++) { set = ip_set(inst, i); - if (set != NULL) - ip_set_destroy_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + ip_set_destroy_set(set); + } } kfree(rcu_dereference_protected(inst->ip_set_list, 1)); } @@ -2003,11 +2052,11 @@ static struct pernet_operations ip_set_net_ops = { .size = sizeof(struct ip_set_net) }; - static int __init ip_set_init(void) { int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if 
(ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); return ret; diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 29fb01ddff93..42c3e3ba1b94 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct tcphdr *th; th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); - if (th == NULL) + if (!th) /* No choice either */ return false; @@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const sctp_sctphdr_t *sh; sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); - if (sh == NULL) + if (!sh) /* No choice either */ return false; @@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct udphdr *uh; uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); - if (uh == NULL) + if (!uh) /* No choice either */ return false; @@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmphdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16)htons((ic->type << 8) | ic->code); @@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, const struct icmp6hdr *ic; ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); - if (ic == NULL) + if (!ic) return false; *port = (__force __be16) @@ -98,7 +98,7 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, __be16 *port, u8 *proto) { const struct iphdr *iph = ip_hdr(skb); - unsigned int protooff = ip_hdrlen(skb); + unsigned int protooff = skb_network_offset(skb) + ip_hdrlen(skb); int protocol = iph->protocol; /* See comments at tcp_match in ip_tables.c */ @@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src, return false; default: /* Other protocols 
doesn't have ports, - so we can match fragments */ + * so we can match fragments. + */ *proto = protocol; return true; } @@ -135,7 +136,9 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src, __be16 frag_off = 0; nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + protoff = ipv6_skip_exthdr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) return false; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 974ff386db0f..afe905c208af 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -10,19 +10,19 @@ #include <linux/rcupdate.h> #include <linux/jhash.h> +#include <linux/types.h> #include <linux/netfilter/ipset/ip_set_timeout.h> -#ifndef rcu_dereference_bh -#define rcu_dereference_bh(p) rcu_dereference(p) -#endif + +#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) +#define ipset_dereference_protected(p, set) \ + __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. * Internally jhash is used with the assumption that the size of the - * stored data is a multiple of sizeof(u32). If storage supports timeout, - * the timeout field must be the last one in the data structure - that field - * is ignored when computing the hash key. + * stored data is a multiple of sizeof(u32). 
* * Readers and resizing * @@ -35,7 +35,9 @@ /* Number of elements to store in an initial array block */ #define AHASH_INIT_SIZE 4 /* Max number of elements to store in an array block */ -#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE) +#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE) +/* Max muber of elements in the array block when tuned */ +#define AHASH_MAX_TUNED 64 /* Max number of elements can be tuned */ #ifdef IP_SET_HASH_WITH_MULTI @@ -53,8 +55,9 @@ tune_ahash_max(u8 curr, u32 multi) /* Currently, at listing one hash bucket must fit into a message. * Therefore we have a hard limit here. */ - return n > curr && n <= 64 ? n : curr; + return n > curr && n <= AHASH_MAX_TUNED ? n : curr; } + #define TUNE_AHASH_MAX(h, multi) \ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi)) #else @@ -64,18 +67,23 @@ tune_ahash_max(u8 curr, u32 multi) /* A hash bucket */ struct hbucket { - void *value; /* the array of the values */ + struct rcu_head rcu; /* for call_rcu_bh */ + /* Which positions are used in the array */ + DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ -}; + unsigned char value[0]; /* the array of the values */ +} __attribute__ ((aligned)); /* The hash table: the table size stored here in order to make resizing easy */ struct htable { + atomic_t ref; /* References for resizing */ + atomic_t uref; /* References for dumping */ u8 htable_bits; /* size of hash table == 2^htable_bits */ - struct hbucket bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; -#define hbucket(h, i) (&((h)->bucket[i])) +#define hbucket(h, i) ((h)->bucket[i]) #ifndef IPSET_NET_COUNT #define IPSET_NET_COUNT 1 @@ -83,8 +91,8 @@ struct htable { /* Book-keeping of the prefixes added to the set */ struct net_prefixes { - u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */ - u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */ + u32 nets[IPSET_NET_COUNT]; /* 
number of elements for this cidr */ + u8 cidr[IPSET_NET_COUNT]; /* the cidr value */ }; /* Compute the hash table size */ @@ -97,11 +105,11 @@ htable_size(u8 hbits) if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket) + if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; - return hsize * sizeof(struct hbucket) + sizeof(struct htable); + return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } /* Compute htable_bits from the user input parameter hashsize */ @@ -110,6 +118,7 @@ htable_bits(u32 hashsize) { /* Assume that hashsize == 2^htable_bits */ u8 bits = fls(hashsize - 1); + if (jhash_size(bits) != hashsize) /* Round up to the first 2^n value */ bits = fls(hashsize); @@ -117,30 +126,6 @@ htable_bits(u32 hashsize) return bits; } -static int -hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) -{ - if (n->pos >= n->size) { - void *tmp; - - if (n->size >= ahash_max) - /* Trigger rehashing */ - return -EAGAIN; - - tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); - if (!tmp) - return -ENOMEM; - if (n->size) { - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - } - n->value = tmp; - n->size += AHASH_INIT_SIZE; - } - return 0; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -149,17 +134,21 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #endif /* cidr + 1 is stored in net_prefixes to support /0 */ -#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1) +#define NCIDR_PUT(cidr) ((cidr) + 1) +#define NCIDR_GET(cidr) ((cidr) - 1) #ifdef IP_SET_HASH_WITH_NETS_PACKED /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */ -#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1) -#define NCIDR(cidr) (cidr) +#define DCIDR_PUT(cidr) ((cidr) - 1) +#define DCIDR_GET(cidr, i) (__CIDR(cidr, i) + 1) #else -#define GCIDR(cidr, i) (__CIDR(cidr, i)) -#define 
NCIDR(cidr) (cidr - 1) +#define DCIDR_PUT(cidr) (cidr) +#define DCIDR_GET(cidr, i) __CIDR(cidr, i) #endif +#define INIT_CIDR(cidr, host_mask) \ + DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask)) + #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 @@ -180,6 +169,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_data_equal #undef mtype_do_data_match #undef mtype_data_set_flags +#undef mtype_data_reset_elem #undef mtype_data_reset_flags #undef mtype_data_netmask #undef mtype_data_list @@ -193,7 +183,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_ahash_memsize #undef mtype_flush #undef mtype_destroy -#undef mtype_gc_init #undef mtype_same_set #undef mtype_kadt #undef mtype_uadt @@ -203,6 +192,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #undef mtype_del #undef mtype_test_cidrs #undef mtype_test +#undef mtype_uref #undef mtype_expire #undef mtype_resize #undef mtype_head @@ -227,6 +217,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_data_list IPSET_TOKEN(MTYPE, _data_list) #define mtype_data_next IPSET_TOKEN(MTYPE, _data_next) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) + #define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr) @@ -234,7 +225,6 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) -#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) @@ -244,23 +234,36 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize) #define mtype_del IPSET_TOKEN(MTYPE, _del) 
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) +#define mtype_uref IPSET_TOKEN(MTYPE, _uref) #define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) +#ifndef MTYPE +#error "MTYPE is not defined!" +#endif + +#ifndef HOST_MASK +#error "HOST_MASK is not defined!" +#endif + #ifndef HKEY_DATALEN #define HKEY_DATALEN sizeof(struct mtype_elem) #endif #define HKEY(data, initval, htable_bits) \ -(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \ +(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval) \ & jhash_mask(htable_bits)) #ifndef htype +#ifndef HTYPE +#error "HTYPE is not defined!" +#endif /* HTYPE */ #define htype HTYPE /* The generic hash structure */ @@ -280,18 +283,16 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif -#ifdef IP_SET_HASH_WITH_RBTREE - struct rb_root rbtree; -#endif #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[0]; /* book-keeping of prefixes */ #endif }; -#endif +#endif /* htype */ #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different - * sized networks */ + * sized networks. cidr == real cidr + 1 to support /0. 
+ */ static void mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) { @@ -299,11 +300,11 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) { - if (j != -1) + if (j != -1) { continue; - else if (h->nets[i].cidr[n] < cidr) + } else if (h->nets[i].cidr[n] < cidr) { j = i; - else if (h->nets[i].cidr[n] == cidr) { + } else if (h->nets[i].cidr[n] == cidr) { h->nets[cidr - 1].nets[n]++; return; } @@ -322,15 +323,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) u8 i, j, net_end = nets_length - 1; for (i = 0; i < nets_length; i++) { - if (h->nets[i].cidr[n] != cidr) - continue; - h->nets[cidr -1].nets[n]--; - if (h->nets[cidr -1].nets[n] > 0) - return; + if (h->nets[i].cidr[n] != cidr) + continue; + h->nets[cidr - 1].nets[n]--; + if (h->nets[cidr - 1].nets[n] > 0) + return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) - h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + return; } } #endif @@ -341,15 +342,18 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t, u8 nets_length, size_t dsize) { u32 i; - size_t memsize = sizeof(*h) - + sizeof(*t) + struct hbucket *n; + size_t memsize = sizeof(*h) + sizeof(*t); + #ifdef IP_SET_HASH_WITH_NETS - + sizeof(struct net_prefixes) * nets_length + memsize += sizeof(struct net_prefixes) * nets_length; #endif - + jhash_size(t->htable_bits) * sizeof(struct hbucket); - - for (i = 0; i < jhash_size(t->htable_bits); i++) - memsize += t->bucket[i].size * dsize; + for (i = 0; i < jhash_size(t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + memsize += sizeof(struct hbucket) + n->size * dsize; + } return memsize; } @@ -364,7 +368,8 @@ mtype_ext_cleanup(struct ip_set *set, struct hbucket *n) int i; for (i = 0; i < n->pos; i++) - 
ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); + if (test_bit(i, n->used)) + ip_set_ext_destroy(set, ahash_data(n, i, set->dsize)); } /* Flush a hash type of set: destroy all elements */ @@ -376,16 +381,16 @@ mtype_flush(struct ip_set *set) struct hbucket *n; u32 i; - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - n->size = n->pos = 0; - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family)); @@ -401,13 +406,13 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - if (n->size) { - if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - kfree(n->value); - } + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + kfree(n); } ip_set_free(t); @@ -419,13 +424,11 @@ mtype_destroy(struct ip_set *set) { struct htype *h = set->data; - if (set->extensions & IPSET_EXT_TIMEOUT) + if (SET_WITH_TIMEOUT(set)) del_timer_sync(&h->gc); - mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true); -#ifdef IP_SET_HASH_WITH_RBTREE - rbtree_destroy(&h->rbtree); -#endif + mtype_ahash_destroy(set, + __ipset_dereference_protected(h->table, 1), true); kfree(h); set->data = NULL; @@ -437,7 +440,7 @@ mtype_gc_init(struct ip_set *set, void 
(*gc)(unsigned long ul_set)) struct htype *h = set->data; init_timer(&h->gc); - h->gc.data = (unsigned long) set; + h->gc.data = (unsigned long)set; h->gc.function = gc; h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -470,61 +473,71 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) struct htable *t; struct hbucket *n; struct mtype_elem *data; - u32 i; - int j; + u32 i, j, d; #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = hbucket(t, i); - for (j = 0; j < n->pos; j++) { + n = __ipset_dereference_protected(hbucket(t, i), 1); + if (!n) + continue; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) { + d++; + continue; + } data = ahash_data(n, j, dsize); if (ip_set_timeout_expired(ext_timeout(data, set))) { pr_debug("expired %u/%u\n", i, j); + clear_bit(j, n->used); + smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, SCIDR(data->cidr, k), - nets_length, k); + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, + k)), + nets_length, k); #endif ip_set_ext_destroy(set, data); - if (j != n->pos - 1) - /* Not last one */ - memcpy(data, - ahash_data(n, n->pos - 1, dsize), - dsize); - n->pos--; h->elements--; + d++; } } - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * dsize, - GFP_ATOMIC); + if (d >= AHASH_INIT_SIZE) { + struct hbucket *tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) /* Still try to delete expired elements */ continue; - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * dsize); - kfree(n->value); - n->value = tmp; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = 
ahash_data(n, j, dsize); + memcpy(tmp->value + d * dsize, data, dsize); + set_bit(j, tmp->used); + d++; + } + tmp->pos = d; + rcu_assign_pointer(hbucket(t, i), tmp); + kfree_rcu(n, rcu); } } - rcu_read_unlock_bh(); } static void mtype_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct htype *h = set->data; pr_debug("called\n"); - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); mtype_expire(set, h, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&h->gc); @@ -532,93 +545,152 @@ mtype_gc(unsigned long ul_set) /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or - * fail due to memory pressures. */ + * fail due to memory pressures. + */ static int mtype_resize(struct ip_set *set, bool retried) { struct htype *h = set->data; - struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table); - u8 htable_bits = orig->htable_bits; + struct htable *t, *orig; + u8 htable_bits; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; + struct mtype_elem *tmp; #endif struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j; + u32 i, j, key; int ret; - /* Try to cleanup once */ - if (SET_WITH_TIMEOUT(set) && !retried) { - i = h->elements; - write_lock_bh(&set->lock); - mtype_expire(set, set->data, NLEN(set->family), set->dsize); - write_unlock_bh(&set->lock); - if (h->elements < i) - return 0; - } +#ifdef IP_SET_HASH_WITH_NETS + tmp = kmalloc(dsize, GFP_KERNEL); + if (!tmp) + return -ENOMEM; +#endif + rcu_read_lock_bh(); + orig = rcu_dereference_bh_nfnl(h->table); + htable_bits = orig->htable_bits; + rcu_read_unlock_bh(); retry: ret = 0; htable_bits++; - pr_debug("attempt to resize set %s from %u to %u, t %p\n", - set->name, orig->htable_bits, htable_bits, orig); if 
(!htable_bits) { /* In case we have plenty of memory :-) */ pr_warn("Cannot increase the hashsize of set %s further\n", set->name); - return -IPSET_ERR_HASH_FULL; + ret = -IPSET_ERR_HASH_FULL; + goto out; + } + t = ip_set_alloc(htable_size(htable_bits)); + if (!t) { + ret = -ENOMEM; + goto out; } - t = ip_set_alloc(sizeof(*t) - + jhash_size(htable_bits) * sizeof(struct hbucket)); - if (!t) - return -ENOMEM; t->htable_bits = htable_bits; - read_lock_bh(&set->lock); + spin_lock_bh(&set->lock); + orig = __ipset_dereference_protected(h->table, 1); + /* There can't be another parallel resizing, but dumping is possible */ + atomic_set(&orig->ref, 1); + atomic_inc(&orig->uref); + pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = hbucket(orig, i); + n = __ipset_dereference_protected(hbucket(orig, i), 1); + if (!n) + continue; for (j = 0; j < n->pos; j++) { - data = ahash_data(n, j, set->dsize); + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); #ifdef IP_SET_HASH_WITH_NETS + /* We have readers running parallel with us, + * so the live data cannot be modified. 
+ */ flags = 0; + memcpy(tmp, data, dsize); + data = tmp; mtype_data_reset_flags(data, &flags); #endif - m = hbucket(t, HKEY(data, h->initval, htable_bits)); - ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize); - if (ret < 0) { -#ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(data, &flags); -#endif - read_unlock_bh(&set->lock); - mtype_ahash_destroy(set, t, false); - if (ret == -EAGAIN) - goto retry; - return ret; + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference_protected(hbucket(t, key), 1); + if (!m) { + m = kzalloc(sizeof(*m) + + AHASH_INIT_SIZE * dsize, + GFP_ATOMIC); + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + + (m->size + AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - d = ahash_data(m, m->pos++, set->dsize); - memcpy(d, data, set->dsize); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); #ifdef IP_SET_HASH_WITH_NETS mtype_data_reset_flags(d, &flags); #endif } } - rcu_assign_pointer(h->table, t); - read_unlock_bh(&set->lock); + + spin_unlock_bh(&set->lock); /* Give time to other readers of the set */ synchronize_rcu_bh(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - mtype_ahash_destroy(set, orig, false); + /* If there's nobody else dumping the table, destroy it */ + if (atomic_dec_and_test(&orig->uref)) { + pr_debug("Table destroy by resize %p\n", orig); + mtype_ahash_destroy(set, orig, false); + } - return 0; +out: +#ifdef IP_SET_HASH_WITH_NETS + kfree(tmp); +#endif + return ret; + 
+cleanup: + atomic_set(&orig->ref, 0); + atomic_dec(&orig->uref); + spin_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + goto out; } /* Add an element to a hash and update the internal counters when succeeded, - * otherwise report the proper error code. */ + * otherwise report the proper error code. + */ static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) @@ -627,17 +699,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct htable *t; const struct mtype_elem *d = value; struct mtype_elem *data; - struct hbucket *n; - int i, ret = 0; - int j = AHASH_MAX(h) + 1; + struct hbucket *n, *old = ERR_PTR(-ENOENT); + int i, j = -1; bool flag_exist = flags & IPSET_FLAG_EXIST; + bool deleted = false, forceadd = false, reuse = false; u32 key, multi = 0; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + if (h->elements >= h->maxelem) { + if (SET_WITH_TIMEOUT(set)) + /* FIXME: when set is full, we slow down here */ + mtype_expire(set, h, NLEN(set->family), set->dsize); + if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + forceadd = true; + } + + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) { + if (forceadd) { + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; + } else if (h->elements >= h->maxelem) { + goto set_full; + } + old = NULL; + n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + n->size = AHASH_INIT_SIZE; + goto copy_elem; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + /* Reuse first deleted entry */ + if (j == -1) { + deleted = reuse = true; + j = i; + } + continue; + } data = ahash_data(n, i, set->dsize); if 
(mtype_data_equal(data, d, &multi)) { if (flag_exist || @@ -645,85 +749,94 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_timeout_expired(ext_timeout(data, set)))) { /* Just the extensions could be overwritten */ j = i; - goto reuse_slot; - } else { - ret = -IPSET_ERR_EXIST; - goto out; + goto overwrite_extensions; } + return -IPSET_ERR_EXIST; } /* Reuse first timed out entry */ if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set)) && - j != AHASH_MAX(h) + 1) + j == -1) { j = i; + reuse = true; + } } - if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) { - /* Choosing the first entry in the array to replace */ - j = 0; - goto reuse_slot; - } - if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h, NLEN(set->family), set->dsize); - - if (h->elements >= h->maxelem) { - if (net_ratelimit()) - pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - -reuse_slot: - if (j != AHASH_MAX(h) + 1) { - /* Fill out reused slot */ + if (reuse || forceadd) { data = ahash_data(n, j, set->dsize); + if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) { - mtype_del_cidr(h, SCIDR(data->cidr, i), - NLEN(set->family), i); - mtype_add_cidr(h, SCIDR(d->cidr, i), - NLEN(set->family), i); - } + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_del_cidr(h, + NCIDR_PUT(DCIDR_GET(data->cidr, i)), + NLEN(set->family), i); #endif - ip_set_ext_destroy(set, data); - } else { - /* Use/create a new slot */ + ip_set_ext_destroy(set, data); + h->elements--; + } + goto copy_data; + } + if (h->elements >= h->maxelem) + goto set_full; + /* Create a new slot */ + if (n->pos >= n->size) { TUNE_AHASH_MAX(h, multi); - ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize); - if (ret != 0) { - if (ret == -EAGAIN) - mtype_data_next(&h->next, d); - goto out; + if (n->size >= 
AHASH_MAX(h)) { + /* Trigger rehashing */ + mtype_data_next(&h->next, d); + return -EAGAIN; } - data = ahash_data(n, n->pos++, set->dsize); + old = n; + n = kzalloc(sizeof(*n) + + (old->size + AHASH_INIT_SIZE) * set->dsize, + GFP_ATOMIC); + if (!n) + return -ENOMEM; + memcpy(n, old, sizeof(struct hbucket) + + old->size * set->dsize); + n->size = old->size + AHASH_INIT_SIZE; + } + +copy_elem: + j = n->pos++; + data = ahash_data(n, j, set->dsize); +copy_data: + h->elements++; #ifdef IP_SET_HASH_WITH_NETS - for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family), - i); + for (i = 0; i < IPSET_NET_COUNT; i++) + mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), + NLEN(set->family), i); #endif - h->elements++; - } memcpy(data, d, sizeof(struct mtype_elem)); +overwrite_extensions: #ifdef IP_SET_HASH_WITH_NETS mtype_data_set_flags(data, flags); #endif - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(data, set), ext->timeout); if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(data, set), ext); + /* Must come last for the case when timed out entry is reused */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(data, set), ext->timeout); + smp_mb__before_atomic(); + set_bit(j, n->used); + if (old != ERR_PTR(-ENOENT)) { + rcu_assign_pointer(hbucket(t, key), n); + if (old) + kfree_rcu(old, rcu); + } -out: - rcu_read_unlock_bh(); - return ret; + return 0; +set_full: + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, h->maxelem); + return -IPSET_ERR_HASH_FULL; } -/* Delete an element from the hash: swap it with the last element - * and free up space if possible. +/* Delete an element from the hash and free up space if possible. 
*/ static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -734,55 +847,70 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, ret = -IPSET_ERR_EXIST; -#ifdef IP_SET_HASH_WITH_NETS - u8 j; -#endif + int i, j, k, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; + size_t dsize = set->dsize; - rcu_read_lock_bh(); - t = rcu_dereference_bh(h->table); + t = ipset_dereference_protected(h->table, set); key = HKEY(value, h->initval, t->htable_bits); - n = hbucket(t, key); - for (i = 0; i < n->pos; i++) { - data = ahash_data(n, i, set->dsize); + n = __ipset_dereference_protected(hbucket(t, key), 1); + if (!n) + goto out; + for (i = 0, k = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) { + k++; + continue; + } + data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(data, set))) goto out; - if (i != n->pos - 1) - /* Not last one */ - memcpy(data, ahash_data(n, n->pos - 1, set->dsize), - set->dsize); - n->pos--; + ret = 0; + clear_bit(i, n->used); + smp_mb__after_atomic(); + if (i + 1 == n->pos) + n->pos--; h->elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family), - j); + mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), + NLEN(set->family), j); #endif ip_set_ext_destroy(set, data); - if (n->pos + AHASH_INIT_SIZE < n->size) { - void *tmp = kzalloc((n->size - AHASH_INIT_SIZE) - * set->dsize, - GFP_ATOMIC); - if (!tmp) { - ret = 0; + + for (; i < n->pos; i++) { + if (!test_bit(i, n->used)) + k++; + } + if (n->pos == 0 && k == 0) { + rcu_assign_pointer(hbucket(t, key), NULL); + kfree_rcu(n, rcu); + } else if (k >= AHASH_INIT_SIZE) { + struct hbucket *tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); + if (!tmp) goto out; + tmp->size = 
n->size - AHASH_INIT_SIZE; + for (j = 0, k = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + memcpy(tmp->value + k * dsize, data, dsize); + set_bit(j, tmp->used); + k++; } - n->size -= AHASH_INIT_SIZE; - memcpy(tmp, n->value, n->size * set->dsize); - kfree(n->value); - n->value = tmp; + tmp->pos = k; + rcu_assign_pointer(hbucket(t, key), tmp); + kfree_rcu(n, rcu); } - ret = 0; goto out; } out: - rcu_read_unlock_bh(); return ret; } @@ -801,7 +929,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, #ifdef IP_SET_HASH_WITH_NETS /* Special test function which takes into account the different network - * sizes added to the set */ + * sizes added to the set + */ static int mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, const struct ip_set_ext *ext, @@ -824,16 +953,21 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d, for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) { #if IPSET_NET_COUNT == 2 mtype_data_reset_elem(d, &orig); - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false); for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi; k++) { - mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true); + mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]), + true); #else - mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0])); + mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0])); #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (!mtype_data_equal(data, d, &multi)) continue; @@ -871,13 +1005,13 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; - rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test 
an IP address and not a network address, - * try all possible network sizes */ + * try all possible network sizes + */ for (i = 0; i < IPSET_NET_COUNT; i++) - if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family)) + if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family)) break; if (i == IPSET_NET_COUNT) { ret = mtype_test_cidrs(set, d, ext, mext, flags); @@ -886,8 +1020,14 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, #endif key = HKEY(d, h->initval, t->htable_bits); - n = hbucket(t, key); + n = rcu_dereference_bh(hbucket(t, key)); + if (!n) { + ret = 0; + goto out; + } for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi) && !(SET_WITH_TIMEOUT(set) && @@ -897,7 +1037,6 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, } } out: - rcu_read_unlock_bh(); return ret; } @@ -909,15 +1048,19 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u8 htable_bits; + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize); + htable_bits = t->htable_bits; + rcu_read_unlock_bh(); nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, - htonl(jhash_size(t->htable_bits))) || + htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; #ifdef IP_SET_HASH_WITH_NETMASK @@ -941,32 +1084,63 @@ nla_put_failure: return -EMSGSIZE; } +/* Make possible to run dumping parallel with resizing */ +static void +mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) +{ + struct htype *h = set->data; + struct htable *t; + + if (start) { + rcu_read_lock_bh(); + t = rcu_dereference_bh_nfnl(h->table); + atomic_inc(&t->uref); + cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; + 
rcu_read_unlock_bh(); + } else if (cb->args[IPSET_CB_PRIVATE]) { + t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + /* Resizing didn't destroy the hash table */ + pr_debug("Table destroy by dump: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + cb->args[IPSET_CB_PRIVATE] = 0; + } +} + /* Reply a LIST/SAVE request: dump the elements of the specified set */ static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { - const struct htype *h = set->data; - const struct htable *t = rcu_dereference_bh_nfnl(h->table); + const struct htable *t; struct nlattr *atd, *nested; const struct hbucket *n; const struct mtype_elem *e; u32 first = cb->args[IPSET_CB_ARG0]; /* We assume that one hash bucket fills into one page */ void *incomplete; - int i; + int i, ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; + pr_debug("list hash set %s\n", set->name); + t = (const struct htable *)cb->args[IPSET_CB_PRIVATE]; + /* Expire may replace a hbucket with another one */ + rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits); cb->args[IPSET_CB_ARG0]++) { incomplete = skb_tail_pointer(skb); - n = hbucket(t, cb->args[IPSET_CB_ARG0]); + n = rcu_dereference(hbucket(t, cb->args[IPSET_CB_ARG0])); pr_debug("cb->arg bucket: %lu, t %p n %p\n", cb->args[IPSET_CB_ARG0], t, n); + if (!n) + continue; for (i = 0; i < n->pos; i++) { + if (!test_bit(i, n->used)) + continue; e = ahash_data(n, i, set->dsize); if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) @@ -977,9 +1151,10 @@ mtype_list(const struct ip_set *set, if (!nested) { if (cb->args[IPSET_CB_ARG0] == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (mtype_data_list(skb, e)) goto nla_put_failure; @@ -992,7 +1167,7 @@ mtype_list(const struct ip_set *set, /* 
Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: nlmsg_trim(skb, incomplete); @@ -1000,20 +1175,24 @@ nla_put_failure: pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n", set->name); cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; + } else { + ipset_nest_end(skb, atd); } - ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static int IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt); + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt); static int IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); + enum ipset_adt adt, u32 *lineno, u32 flags, + bool retried); static const struct ip_set_type_variant mtype_variant = { .kadt = mtype_kadt, @@ -1027,6 +1206,7 @@ static const struct ip_set_type_variant mtype_variant = { .flush = mtype_flush, .head = mtype_head, .list = mtype_list, + .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, }; @@ -1045,7 +1225,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u8 netmask; #endif size_t hsize; - struct HTYPE *h; + struct htype *h; struct htable *t; #ifndef IP_SET_PROTO_UNDEF @@ -1064,12 +1244,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || -#ifdef IP_SET_HASH_WITH_MARKMASK - !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) || -#endif !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; +#ifdef IP_SET_HASH_WITH_MARKMASK + /* Separated condition in order to avoid directive in argument list */ + if 
(unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK))) + return -IPSET_ERR_PROTOCOL; +#endif if (tb[IPSET_ATTR_HASHSIZE]) { hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); @@ -1092,7 +1274,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_MARKMASK if (tb[IPSET_ATTR_MARKMASK]) { - markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); + markmask = ntohl(nla_get_be32(tb[IPSET_ATTR_MARKMASK])); if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; @@ -1165,3 +1347,5 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, return 0; } #endif /* IP_SET_EMIT_CREATE */ + +#undef HKEY_DATALEN diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 76959d79e9d1..9d6bf19f7b78 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -56,15 +56,15 @@ hash_ip4_data_equal(const struct hash_ip4_elem *e1, return e1->ip == e2->ip; } -static inline bool +static bool hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -74,7 +74,6 @@ hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e) } #define MTYPE hash_ip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -109,20 +108,17 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, hosts; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = 
nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -145,7 +141,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -162,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -196,10 +192,10 @@ hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -208,12 +204,9 @@ hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e) } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE @@ -247,22 +240,25 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); - ret = 
ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + if (unlikely(!tb[IPSET_ATTR_IP])) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -301,7 +297,8 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -318,6 +315,7 @@ hash_ip_init(void) static void __exit hash_ip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ip_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 7abf9788cfa8..a0695a2ab585 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -63,10 +63,10 @@ hash_ipmark4_data_list(struct sk_buff *skb, if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -76,10 +76,8 @@ hash_ipmark4_data_next(struct hash_ipmark4_elem *next, next->ip = d->ip; } -#define MTYPE hash_ipmark4 -#define PF 4 -#define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem) +#define MTYPE hash_ipmark4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -110,25 +108,22 @@ 
hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip, ip_to = 0; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST || @@ -147,7 +142,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -160,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -191,10 +186,10 @@ hash_ipmark6_data_list(struct sk_buff *skb, if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -204,18 +199,13 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK 
-#undef HKEY_DATALEN #define MTYPE hash_ipmark6 -#define PF 6 #define HOST_MASK 128 -#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem) -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" - static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -243,27 +233,30 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; - e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK])); + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { @@ -274,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; - return ret; + return 
0; } static struct ip_set_type hash_ipmark_type __read_mostly = { @@ -307,7 +298,8 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -324,6 +316,7 @@ hash_ipmark_init(void) static void __exit hash_ipmark_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index dcbcceb9a52f..9d84b3dff603 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -69,10 +69,10 @@ hash_ipport4_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -83,10 +83,8 @@ hash_ipport4_data_next(struct hash_ipport4_elem *next, next->port = d->port; } -#define MTYPE hash_ipport4 -#define PF 4 -#define HOST_MASK 32 -#define HKEY_DATALEN sizeof(struct hash_ipport4_elem) +#define MTYPE hash_ipport4 +#define HOST_MASK 32 #include "ip_set_hash_gen.h" static int @@ -118,29 +116,23 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) 
|| - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -148,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -171,7 +164,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -195,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -231,10 +224,10 @@ hash_ipport6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -245,15 +238,11 @@ hash_ipport6_data_next(struct hash_ipport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_ipport6 -#define PF 6 #define HOST_MASK 128 -#define 
HKEY_DATALEN sizeof(struct hash_ipport6_elem) -#define IP_SET_EMIT_CREATE +#define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int @@ -285,31 +274,31 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -317,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -341,8 +331,8 @@ hash_ipport6_uadt(struct 
ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -376,7 +366,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -393,6 +384,7 @@ hash_ipport_init(void) static void __exit hash_ipport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 7ef93fc887a1..215b7b942038 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -63,17 +63,17 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, static bool hash_ipportip4_data_list(struct sk_buff *skb, - const struct hash_ipportip4_elem *data) + const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -86,7 +86,6 @@ hash_ipportip4_data_next(struct hash_ipportip4_elem *next, /* Common functions */ #define MTYPE hash_ipportip4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -120,22 +119,19 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || 
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -143,10 +139,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -154,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -177,7 +171,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -201,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } } return ret; @@ -240,10 +234,10 @@ hash_ipportip6_data_list(struct sk_buff *skb, nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || 
nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -254,11 +248,9 @@ hash_ipportip6_data_next(struct hash_ipportip4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportip6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -293,24 +285,27 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], bool with_ports = false; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -318,10 +313,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = 
nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -329,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; @@ -353,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -388,7 +381,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -405,6 +399,7 @@ hash_ipportip_init(void) static void __exit hash_ipportip_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index b6012ad92781..9ca719625ea3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -114,10 +114,10 @@ hash_ipportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -130,7 +130,6 @@ hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, } #define MTYPE hash_ipportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -142,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = 
set->variant->adt[adt]; struct hash_ipportnet4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -174,23 +173,20 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -205,10 +201,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -216,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = 
ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -249,7 +244,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > 32) + if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } @@ -270,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -294,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -367,10 +363,10 @@ hash_ipportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -381,11 +377,9 @@ hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_ipportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -398,7 +392,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_ipportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -429,27 +423,28 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] 
|| !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || - tb[IPSET_ATTR_IP_TO] || - tb[IPSET_ATTR_CIDR])) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + if (unlikely(tb[IPSET_ATTR_CIDR])) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (cidr != HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -466,10 +461,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip2, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -477,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -508,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct 
nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -547,7 +541,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -564,6 +559,7 @@ hash_ipportnet_init(void) static void __exit hash_ipportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 65690b52a4d5..f1e7d2c0f685 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -52,7 +52,12 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1, static inline bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { - return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); + if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) + goto nla_put_failure; + return false; + +nla_put_failure: + return true; } static inline void @@ -62,7 +67,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next, } #define MTYPE hash_mac4 -#define PF 4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF @@ -85,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, return 0; if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, 
opt->cmdflags); @@ -103,22 +107,16 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - if (unlikely(!tb[IPSET_ATTR_ETHER] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_ETHER])) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) return -IPSET_ERR_HASH_ELEM; @@ -149,7 +147,8 @@ static struct ip_set_type hash_mac_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -166,6 +165,7 @@ hash_mac_init(void) static void __exit hash_mac_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 6b3ac10ac2f1..3e4bffdc1cc0 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -95,10 +95,10 @@ hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return 
false; nla_put_failure: - return 1; + return true; } static inline void @@ -109,7 +109,6 @@ hash_net4_data_next(struct hash_net4_elem *next, } #define MTYPE hash_net4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -121,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -147,21 +146,18 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], u32 ip = 0, ip_to = 0, last; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -173,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -180,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip 
& ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt, set) ? -ret: + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } @@ -202,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -264,10 +261,10 @@ hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -277,11 +274,9 @@ hash_net6_data_next(struct hash_net4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_net6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -294,7 +289,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_net *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -318,36 +313,34 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, 
IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - - if (!e.cidr || e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr || e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip, e.cidr); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -383,7 +376,8 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -400,6 +394,7 @@ hash_net_init(void) static void __exit hash_net_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_net_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 758b002130d9..43d8c9896fa3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -13,12 +13,12 @@ #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> -#include <linux/rbtree.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> +#include 
<linux/netfilter_bridge.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> @@ -36,88 +36,13 @@ MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); -/* Interface name rbtree */ - -struct iface_node { - struct rb_node node; - char iface[IFNAMSIZ]; -}; - -#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) - -static void -rbtree_destroy(struct rb_root *root) -{ - struct iface_node *node, *next; - - rbtree_postorder_for_each_entry_safe(node, next, root, node) - kfree(node); - - *root = RB_ROOT; -} - -static int -iface_test(struct rb_root *root, const char **iface) -{ - struct rb_node *n = root->rb_node; - - while (n) { - const char *d = iface_data(n); - int res = strcmp(*iface, d); - - if (res < 0) - n = n->rb_left; - else if (res > 0) - n = n->rb_right; - else { - *iface = d; - return 1; - } - } - return 0; -} - -static int -iface_add(struct rb_root *root, const char **iface) -{ - struct rb_node **n = &(root->rb_node), *p = NULL; - struct iface_node *d; - - while (*n) { - char *ifname = iface_data(*n); - int res = strcmp(*iface, ifname); - - p = *n; - if (res < 0) - n = &((*n)->rb_left); - else if (res > 0) - n = &((*n)->rb_right); - else { - *iface = ifname; - return 0; - } - } - - d = kzalloc(sizeof(*d), GFP_ATOMIC); - if (!d) - return -ENOMEM; - strcpy(d->iface, *iface); - - rb_link_node(&d->node, p, n); - rb_insert_color(&d->node, root); - - *iface = d->iface; - return 0; -} - /* Type specific function prefix */ #define HTYPE hash_netiface #define IP_SET_HASH_WITH_NETS -#define IP_SET_HASH_WITH_RBTREE #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 -#define STREQ(a, b) (strcmp(a, b) == 0) +#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ) /* IPv4 variant */ @@ -136,7 +61,7 @@ struct hash_netiface4_elem { u8 cidr; u8 nomatch; u8 elem; 
- const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -150,7 +75,7 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -192,10 +117,10 @@ hash_netiface4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -206,11 +131,26 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next, } #define MTYPE hash_netiface4 -#define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) #include "ip_set_hash_gen.h" +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +static const char *get_physindev_name(const struct sk_buff *skb) +{ + struct net_device *dev = nf_bridge_get_physindev(skb); + + return dev ? dev->name : NULL; +} + +static const char *get_physoutdev_name(const struct sk_buff *skb) +{ + struct net_device *dev = nf_bridge_get_physoutdev(skb); + + return dev ? dev->name : NULL; +} +#endif + static int hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -219,11 +159,10 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -233,36 +172,25 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); -#define IFACE(dir) (par->dir ? 
par->dir->name : NULL) -#define PHYSDEV(dir) (nf_bridge->dir ? nf_bridge->dir->name : NULL) +#define IFACE(dir) (par->dir ? par->dir->name : "") #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_physoutdev_name(skb); - if (!nf_bridge) + if (!eiface) return -EINVAL; - e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif - } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + } else { + STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); + } - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; - return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -275,25 +203,21 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; - char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = 
ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -302,21 +226,11 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -337,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr); + } if (retried) ip = ntohl(h->next.ip); @@ -349,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip = last + 1; } return ret; @@ -372,7 +287,7 @@ struct hash_netiface6_elem { u8 cidr; u8 nomatch; u8 elem; - const char *iface; + char iface[IFNAMSIZ]; }; /* Common functions */ @@ -386,7 +301,7 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && - ip1->iface == ip2->iface; + strcmp(ip1->iface, ip2->iface) == 0; } static inline int @@ -428,10 +343,10 @@ hash_netiface6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -441,12 +356,9 @@ 
hash_netiface6_data_next(struct hash_netiface4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK -#undef HKEY_DATALEN #define MTYPE hash_netiface6 -#define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) #define IP_SET_EMIT_CREATE @@ -460,11 +372,10 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - int ret; if (e.cidr == 0) return -EINVAL; @@ -476,85 +387,64 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + const char *eiface = SRCDIR ? get_physindev_name(skb) : + get_physoutdev_name(skb); - if (!nf_bridge) + if (!eiface) return -EINVAL; - e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + STRLCPY(e.iface, eiface); e.physdev = 1; -#else - e.iface = NULL; #endif - } else - e.iface = SRCDIR ? IFACE(in) : IFACE(out); + } else { + STRLCPY(e.iface, SRCDIR ? 
IFACE(in) : IFACE(out)); + } - if (!e.iface) + if (strlen(e.iface) == 0) return -EINVAL; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { - struct hash_netiface *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - char iface[IFNAMSIZ]; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (e.cidr > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (e.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + ip6_netmask(&e.ip, 
e.cidr); - strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); - e.iface = iface; - ret = iface_test(&h->rbtree, &e.iface); - if (adt == IPSET_ADD) { - if (!ret) { - ret = iface_add(&h->rbtree, &e.iface); - if (ret) - return ret; - } - } else if (!ret) - return ret; + nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) @@ -597,7 +487,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -614,6 +505,7 @@ hash_netiface_init(void) static void __exit hash_netiface_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netiface_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index ea8772afb6e7..3c862c0a76d1 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -57,8 +57,8 @@ struct hash_netnet4_elem { static inline bool hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1, - const struct hash_netnet4_elem *ip2, - u32 *multi) + const struct hash_netnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp; @@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags) static inline void hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem, - struct hash_netnet4_elem *orig) + struct hash_netnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct 
hash_netnet4_elem *elem, u8 cidr, bool inner) static bool hash_netnet4_data_list(struct sk_buff *skb, - const struct hash_netnet4_elem *data) + const struct hash_netnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -122,28 +122,27 @@ nla_put_failure: static inline void hash_netnet4_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet4_elem *d) + const struct hash_netnet4_elem *d) { next->ipcmp = d->ipcmp; } #define MTYPE hash_netnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; @@ -157,53 +156,50 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet4_elem e = { }; + struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; - u8 cidr, cidr2; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = 
nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr2 || cidr2 > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr2; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -226,8 +222,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { @@ -238,28 +235,27 
@@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); - last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2); - e.cidr[1] = cidr2; + last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = last2 + 1; } ip = last + 1; @@ -283,8 +279,8 @@ struct hash_netnet6_elem { static inline bool hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1, - const struct hash_netnet6_elem *ip2, - u32 *multi) + const struct hash_netnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -311,7 +307,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags) static inline void hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem, - struct hash_netnet6_elem *orig) + struct hash_netnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -330,7 +326,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner) static bool hash_netnet6_data_list(struct sk_buff *skb, - const struct hash_netnet6_elem *data) + const struct hash_netnet6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; @@ -349,34 +345,32 @@ nla_put_failure: static inline void hash_netnet6_data_next(struct hash_netnet4_elem *next, - const struct hash_netnet6_elem *d) + const struct hash_netnet6_elem *d) { } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) - e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK; + e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6); @@ -388,50 +382,52 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet6_elem e = { }; + struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || - !ip_set_optattr_netorder(tb, 
IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -470,7 +466,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + 
[IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -487,6 +484,7 @@ hash_netnet_init(void) static void __exit hash_netnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netnet_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index c0ddb58d19dc..731813e0f08c 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -110,10 +110,10 @@ hash_netport4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void @@ -125,7 +125,6 @@ hash_netport4_data_next(struct hash_netport4_elem *next, } #define MTYPE hash_netport4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" @@ -137,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -167,23 +166,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - 
!ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -194,10 +190,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], e.cidr = cidr - 1; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -205,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; @@ -215,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -240,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); + } if (retried) ip = ntohl(h->next.ip); @@ -257,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } ip = last + 1; } @@ -326,10 +322,10 @@ hash_netport6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + 
return true; } static inline void @@ -340,11 +336,9 @@ hash_netport6_data_next(struct hash_netport4_elem *next, } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netport6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" @@ -357,7 +351,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct hash_netport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem e = { - .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1, + .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); @@ -387,25 +381,22 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], u8 cidr; int ret; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -417,10 +408,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], } ip6_netmask(&e.ip, e.cidr + 1); - if (tb[IPSET_ATTR_PORT]) - e.port = 
nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -428,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -459,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -495,7 +485,8 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -512,6 +503,7 @@ hash_netport_init(void) static void __exit hash_netport_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netport_type); } diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index bfaa94c7baa7..0c68734f5cc4 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -54,7 +54,7 @@ struct hash_netportnet4_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -62,8 +62,8 @@ struct hash_netportnet4_elem { static inline bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, - const struct hash_netportnet4_elem *ip2, - u32 *multi) + const 
struct hash_netportnet4_elem *ip2, + u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp && @@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) static inline void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, - struct hash_netportnet4_elem *orig) + struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, static bool hash_netportnet4_data_list(struct sk_buff *skb, - const struct hash_netportnet4_elem *data) + const struct hash_netportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; @@ -124,37 +124,36 @@ hash_netportnet4_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet4_elem *d) + const struct hash_netportnet4_elem *d) { next->ipcmp = d->ipcmp; next->port = d->port; } #define MTYPE hash_netportnet4 -#define PF 4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; 
@@ -172,58 +171,51 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet4_elem e = { }; + struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; bool with_ports = false; - u8 cidr, cidr2; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; - ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) || - ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); - if (!cidr || cidr > HOST_MASK) + 
e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[0] = cidr; } if (tb[IPSET_ATTR_CIDR2]) { - cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - if (!cidr || cidr > HOST_MASK) + e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; - e.cidr[1] = cidr; } - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -231,14 +223,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -262,8 +256,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); + } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { @@ -281,16 +276,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } else + } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); + } if (retried) ip = ntohl(h->next.ip[0]); while (!after(ip, ip_to)) { e.ip[0] = htonl(ip); - ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr); - e.cidr[0] = cidr; + ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? 
ntohs(h->next.port) : port; for (; p <= port_to; p++) { @@ -301,13 +296,12 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], while (!after(ip2, ip2_to)) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, - &cidr2); - e.cidr[1] = cidr2; + &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; ip2 = ip2_last + 1; } } @@ -326,7 +320,7 @@ struct hash_netportnet6_elem { u16 ccmp; }; u16 padding; - u8 nomatch:1; + u8 nomatch; u8 proto; }; @@ -334,8 +328,8 @@ struct hash_netportnet6_elem { static inline bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, - const struct hash_netportnet6_elem *ip2, - u32 *multi) + const struct hash_netportnet6_elem *ip2, + u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && @@ -364,7 +358,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) static inline void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, - struct hash_netportnet6_elem *orig) + struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } @@ -384,7 +378,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, static bool hash_netportnet6_data_list(struct sk_buff *skb, - const struct hash_netportnet6_elem *data) + const struct hash_netportnet6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; @@ -397,41 +391,39 @@ hash_netportnet6_data_list(struct sk_buff *skb, (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; - return 0; + return false; nla_put_failure: - return 1; + return true; } static inline void hash_netportnet6_data_next(struct hash_netportnet4_elem *next, - const struct hash_netportnet6_elem *d) + const struct hash_netportnet6_elem *d) { next->port = d->port; } #undef MTYPE -#undef PF #undef HOST_MASK #define MTYPE hash_netportnet6 -#define PF 6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, - const struct xt_action_param *par, - enum ipset_adt adt, struct ip_set_adt_opt *opt) + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); - e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); + e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); + e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; @@ -449,57 +441,55 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, static int hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], - enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet6_elem e = { }; + struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; - e.cidr[0] = e.cidr[1] = HOST_MASK; + if 
(tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; - if (tb[IPSET_ATTR_LINENO]) - *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); + if (ret) + return ret; - ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) || - ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) || - ip_set_get_extensions(set, tb, &ext); + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); if (ret) return ret; - if (tb[IPSET_ATTR_CIDR]) + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!e.cidr[0] || e.cidr[0] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } - if (tb[IPSET_ATTR_CIDR2]) + if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); - - if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] || - e.cidr[1] > HOST_MASK)) - return -IPSET_ERR_INVALID_CIDR; + if (!e.cidr[1] || e.cidr[1] > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); - if (tb[IPSET_ATTR_PORT]) - e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); - else - return -IPSET_ERR_PROTOCOL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if 
(tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); @@ -507,14 +497,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; - } else + } else { return -IPSET_ERR_MISSING_PROTO; + } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } @@ -538,8 +530,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret && !ip_set_eexist(ret, flags)) return ret; - else - ret = 0; + + ret = 0; } return ret; } @@ -577,7 +569,8 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -594,6 +587,7 @@ hash_netportnet_init(void) static void __exit hash_netportnet_fini(void) { + rcu_barrier(); ip_set_type_unregister(&hash_netportnet_type); } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index f8f682806e36..a1fe5377a2b3 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/ip.h> +#include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/errno.h> @@ -27,6 +28,8 @@ MODULE_ALIAS("ip_set_list:set"); /* Member elements */ struct set_elem { + struct rcu_head rcu; + struct list_head list; ip_set_id_t id; }; @@ -41,12 +44,9 @@ struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ struct net *net; /* namespace */ - 
struct set_elem members[0]; /* the set members */ + struct list_head members; /* the set members */ }; -#define list_set_elem(set, map, id) \ - (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize) - static int list_set_ktest(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -54,17 +54,14 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i, cmdflags = opt->cmdflags; + u32 cmdflags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -91,13 +88,9 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -115,13 +108,9 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, { struct list_set *map = set->data; struct set_elem *e; - u32 i; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; + list_for_each_entry(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -138,110 +127,65 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + int ret = -EINVAL; + 
rcu_read_lock(); switch (adt) { case IPSET_TEST: - return list_set_ktest(set, skb, par, opt, &ext); + ret = list_set_ktest(set, skb, par, opt, &ext); + break; case IPSET_ADD: - return list_set_kadd(set, skb, par, opt, &ext); + ret = list_set_kadd(set, skb, par, opt, &ext); + break; case IPSET_DEL: - return list_set_kdel(set, skb, par, opt, &ext); + ret = list_set_kdel(set, skb, par, opt, &ext); + break; default: break; } - return -EINVAL; -} - -static bool -id_eq(const struct ip_set *set, u32 i, ip_set_id_t id) -{ - const struct list_set *map = set->data; - const struct set_elem *e; - - if (i >= map->size) - return 0; + rcu_read_unlock(); - e = list_set_elem(set, map, i); - return !!(e->id == id && - !(SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set)))); + return ret; } -static int -list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, - const struct ip_set_ext *ext) -{ - struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); +/* Userspace interfaces: we are protected by the nfnl mutex */ - if (e->id != IPSET_INVALID_ID) { - if (i == map->size - 1) { - /* Last element replaced: e.g. 
add new,before,last */ - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - } else { - struct set_elem *x = list_set_elem(set, map, - map->size - 1); - - /* Last element pushed off */ - if (x->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, x->id); - ip_set_ext_destroy(set, x); - } - memmove(list_set_elem(set, map, i + 1), e, - set->dsize * (map->size - (i + 1))); - /* Extensions must be initialized to zero */ - memset(e, 0, set->dsize); - } - } - - e->id = d->id; - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); - return 0; -} - -static int -list_set_del(struct ip_set *set, u32 i) +static void +__list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; - struct set_elem *e = list_set_elem(set, map, i); ip_set_put_byindex(map->net, e->id); + /* We may call it, because we don't have a to be destroyed + * extension which is used by the kernel. 
+ */ ip_set_ext_destroy(set, e); + kfree_rcu(e, rcu); +} - if (i < map->size - 1) - memmove(e, list_set_elem(set, map, i + 1), - set->dsize * (map->size - (i + 1))); +static inline void +list_set_del(struct ip_set *set, struct set_elem *e) +{ + list_del_rcu(&e->list); + __list_set_del(set, e); +} - /* Last element */ - e = list_set_elem(set, map, map->size - 1); - e->id = IPSET_INVALID_ID; - return 0; +static inline void +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) +{ + list_replace_rcu(&old->list, &e->list); + __list_set_del(set, old); } static void set_cleanup_entries(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i = 0; + struct set_elem *e, *n; - while (i < map->size) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID && - ip_set_timeout_expired(ext_timeout(e, set))) - list_set_del(set, i); - /* Check element moved to position i in next loop */ - else - i++; - } + list_for_each_entry_safe(e, n, &map->members, list) + if (ip_set_timeout_expired(ext_timeout(e, set))) + list_set_del(set, e); } static int @@ -250,31 +194,46 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; + struct set_elem *e, *next, *prev = NULL; int ret; - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return 0; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return 1; - else if (d->before > 0) - ret = id_eq(set, i + 1, d->refid); - else - ret = i > 0 && id_eq(set, i - 1, d->refid); + if (d->before == 0) { + ret = 1; + } else if (d->before > 0) { + next 
= list_next_entry(e, list); + ret = !list_is_last(&e->list, &map->members) && + next->id == d->refid; + } else { + ret = prev && prev->id == d->refid; + } return ret; } return 0; } +static void +list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext, + struct set_elem *e) +{ + if (SET_WITH_COUNTER(set)) + ip_set_init_counter(ext_counter(e, set), ext); + if (SET_WITH_COMMENT(set)) + ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); + /* Update timeout last */ + if (SET_WITH_TIMEOUT(set)) + ip_set_timeout_set(ext_timeout(e, set), ext->timeout); +} static int list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -282,60 +241,78 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; + struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; - u32 i, ret = 0; if (SET_WITH_TIMEOUT(set)) set_cleanup_entries(set); - /* Check already added element */ - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto insert; - else if (e->id != d->id) + /* Find where to add the new entry */ + n = prev = next = NULL; + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - - if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) || - (d->before < 0 && - (i == 0 || !id_eq(set, i - 1, d->refid)))) - /* Before/after doesn't match */ + else if (d->id == e->id) + n = e; + else if (d->before == 0 || e->id != d->refid) + continue; + else if (d->before > 0) + next = e; + else + prev = e; + } + /* Re-add already existing element */ + if (n) { + if ((d->before > 0 && !next) || + (d->before < 0 && !prev)) return -IPSET_ERR_REF_EXIST; if (!flag_exist) - /* Can't re-add */ return -IPSET_ERR_EXIST; /* Update 
extensions */ - ip_set_ext_destroy(set, e); + ip_set_ext_destroy(set, n); + list_set_init_extensions(set, ext, n); - if (SET_WITH_TIMEOUT(set)) - ip_set_timeout_set(ext_timeout(e, set), ext->timeout); - if (SET_WITH_COUNTER(set)) - ip_set_init_counter(ext_counter(e, set), ext); - if (SET_WITH_COMMENT(set)) - ip_set_init_comment(ext_comment(e, set), ext); - if (SET_WITH_SKBINFO(set)) - ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; } -insert: - ret = -IPSET_ERR_LIST_FULL; - for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - ret = d->before != 0 ? -IPSET_ERR_REF_EXIST - : list_set_add(set, i, d, ext); - else if (e->id != d->refid) - continue; - else if (d->before > 0) - ret = list_set_add(set, i, d, ext); - else if (i + 1 < map->size) - ret = list_set_add(set, i + 1, d, ext); + /* Add new entry */ + if (d->before == 0) { + /* Append */ + n = list_empty(&map->members) ? NULL : + list_last_entry(&map->members, struct set_elem, list); + } else if (d->before > 0) { + /* Insert after next element */ + if (!list_is_last(&next->list, &map->members)) + n = list_next_entry(next, list); + } else { + /* Insert before prev element */ + if (prev->list.prev != &map->members) + n = list_prev_entry(prev, list); } + /* Can we replace a timed out entry? 
*/ + if (n && + !(SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(n, set)))) + n = NULL; + + e = kzalloc(set->dsize, GFP_KERNEL); + if (!e) + return -ENOMEM; + e->id = d->id; + INIT_LIST_HEAD(&e->list); + list_set_init_extensions(set, ext, e); + if (n) + list_set_replace(set, e, n); + else if (next) + list_add_tail_rcu(&e->list, &next->list); + else if (prev) + list_add_rcu(&e->list, &prev->list); + else + list_add_tail_rcu(&e->list, &map->members); - return ret; + return 0; } static int @@ -344,32 +321,30 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - return d->before != 0 ? -IPSET_ERR_REF_EXIST - : -IPSET_ERR_EXIST; - else if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + struct set_elem *e, *next, *prev = NULL; + + list_for_each_entry(e, &map->members, list) { + if (SET_WITH_TIMEOUT(set) && + ip_set_timeout_expired(ext_timeout(e, set))) continue; - else if (e->id != d->id) + else if (e->id != d->id) { + prev = e; continue; + } - if (d->before == 0) - return list_set_del(set, i); - else if (d->before > 0) { - if (!id_eq(set, i + 1, d->refid)) + if (d->before > 0) { + next = list_next_entry(e, list); + if (list_is_last(&e->list, &map->members) || + next->id != d->refid) return -IPSET_ERR_REF_EXIST; - return list_set_del(set, i); - } else if (i == 0 || !id_eq(set, i - 1, d->refid)) - return -IPSET_ERR_REF_EXIST; - else - return list_set_del(set, i); + } else if (d->before < 0) { + if (!prev || prev->id != d->refid) + return -IPSET_ERR_REF_EXIST; + } + list_set_del(set, e); + return 0; } - return -IPSET_ERR_EXIST; + return d->before != 0 ? 
-IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST; } static int @@ -383,19 +358,13 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], struct ip_set *s; int ret = 0; - if (unlikely(!tb[IPSET_ATTR_NAME] || - !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) - return -IPSET_ERR_PROTOCOL; - if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + if (unlikely(!tb[IPSET_ATTR_NAME] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; @@ -410,6 +379,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + e.before = f & IPSET_FLAG_BEFORE; } @@ -447,27 +417,26 @@ static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e; - u32 i; - - for (i = 0; i < map->size; i++) { - e = list_set_elem(set, map, i); - if (e->id != IPSET_INVALID_ID) { - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - e->id = IPSET_INVALID_ID; - } - } + struct set_elem *e, *n; + + list_for_each_entry_safe(e, n, &map->members, list) + list_set_del(set, e); } static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; + struct set_elem *e, *n; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); - list_set_flush(set); + list_for_each_entry_safe(e, n, &map->members, list) { + list_del(&e->list); + ip_set_put_byindex(map->net, e->id); + ip_set_ext_destroy(set, e); + kfree(e); + } kfree(map); set->data = NULL; @@ -478,6 +447,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) { const 
struct list_set *map = set->data; struct nlattr *nested; + struct set_elem *e; + u32 n = 0; + + list_for_each_entry(e, &map->members, list) + n++; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) @@ -485,7 +459,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + map->size * set->dsize))) + htonl(sizeof(*map) + n * set->dsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -502,18 +476,22 @@ list_set_list(const struct ip_set *set, { const struct list_set *map = set->data; struct nlattr *atd, *nested; - u32 i, first = cb->args[IPSET_CB_ARG0]; - const struct set_elem *e; + u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + struct set_elem *e; + int ret = 0; atd = ipset_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; - for (; cb->args[IPSET_CB_ARG0] < map->size; - cb->args[IPSET_CB_ARG0]++) { - i = cb->args[IPSET_CB_ARG0]; - e = list_set_elem(set, map, i); - if (e->id == IPSET_INVALID_ID) - goto finish; + list_for_each_entry(e, &map->members, list) { + if (i == first) + break; + i++; + } + + rcu_read_lock(); + list_for_each_entry_from(e, &map->members, list) { + i++; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -521,9 +499,10 @@ list_set_list(const struct ip_set *set, if (!nested) { if (i == first) { nla_nest_cancel(skb, atd); - return -EMSGSIZE; - } else - goto nla_put_failure; + ret = -EMSGSIZE; + goto out; + } + goto nla_put_failure; } if (nla_put_string(skb, IPSET_ATTR_NAME, ip_set_name_byindex(map->net, e->id))) @@ -532,20 +511,23 @@ list_set_list(const struct ip_set *set, goto nla_put_failure; ipset_nest_end(skb, nested); } -finish: + ipset_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; - return 0; + goto out; nla_put_failure: 
nla_nest_cancel(skb, nested); if (unlikely(i == first)) { cb->args[IPSET_CB_ARG0] = 0; - return -EMSGSIZE; + ret = -EMSGSIZE; } + cb->args[IPSET_CB_ARG0] = i - 1; ipset_nest_end(skb, atd); - return 0; +out: + rcu_read_unlock(); + return ret; } static bool @@ -577,12 +559,12 @@ static const struct ip_set_type_variant set_variant = { static void list_set_gc(unsigned long ul_set) { - struct ip_set *set = (struct ip_set *) ul_set; + struct ip_set *set = (struct ip_set *)ul_set; struct list_set *map = set->data; - write_lock_bh(&set->lock); + spin_lock_bh(&set->lock); set_cleanup_entries(set); - write_unlock_bh(&set->lock); + spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -594,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) struct list_set *map = set->data; init_timer(&map->gc); - map->gc.data = (unsigned long) set; + map->gc.data = (unsigned long)set; map->gc.function = gc; map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); @@ -606,24 +588,16 @@ static bool init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; - struct set_elem *e; - u32 i; - map = kzalloc(sizeof(*map) + - min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize, - GFP_KERNEL); + map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return false; map->size = size; map->net = net; + INIT_LIST_HEAD(&map->members); set->data = map; - for (i = 0; i < size; i++) { - e = list_set_elem(set, map, i); - e->id = IPSET_INVALID_ID; - } - return true; } @@ -678,7 +652,8 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, - [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, + .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, 
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, @@ -695,6 +670,7 @@ list_set_init(void) static void __exit list_set_fini(void) { + rcu_barrier(); ip_set_type_unregister(&list_set_type); } diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c index 04d15fdc99ee..1c8a42c1056c 100644 --- a/net/netfilter/ipset/pfxlen.c +++ b/net/netfilter/ipset/pfxlen.c @@ -1,9 +1,7 @@ #include <linux/export.h> #include <linux/netfilter/ipset/pfxlen.h> -/* - * Prefixlen maps for fast conversions, by Jan Engelhardt. - */ +/* Prefixlen maps for fast conversions, by Jan Engelhardt. */ #define E(a, b, c, d) \ {.ip6 = { \ @@ -11,8 +9,7 @@ htonl(c), htonl(d), \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_netmask_map[prefixlength].ip. */ const union nf_inet_addr ip_set_netmask_map[] = { @@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = { EXPORT_SYMBOL_GPL(ip_set_netmask_map); #undef E -#define E(a, b, c, d) \ - {.ip6 = { (__force __be32) a, (__force __be32) b, \ - (__force __be32) c, (__force __be32) d, \ +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32)a, (__force __be32)b, \ + (__force __be32)c, (__force __be32)d, \ } } -/* - * This table works for both IPv4 and IPv6; +/* This table works for both IPv4 and IPv6; * just use prefixlen_hostmask_map[prefixlength].ip. 
*/ const union nf_inet_addr ip_set_hostmask_map[] = { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b87ca32efa0b..38fbc194b9cb 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -119,24 +119,24 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); } } @@ -153,24 +153,24 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); } } @@ -183,13 +183,19 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) struct 
ip_vs_cpu_stats *s; s = this_cpu_ptr(cp->dest->stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); } @@ -313,7 +319,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -461,7 +473,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; @@ -1046,6 +1064,26 @@ static inline bool is_new_conn(const struct sk_buff *skb, } } +static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, + int conn_reuse_mode) +{ + /* Controlled (FTP DATA or persistence)? */ + if (cp->control) + return false; + + switch (cp->protocol) { + case IPPROTO_TCP: + return (cp->state == IP_VS_TCP_S_TIME_WAIT) || + ((conn_reuse_mode & 2) && + (cp->state == IP_VS_TCP_S_FIN_WAIT) && + (cp->flags & IP_VS_CONN_F_NOOUTPUT)); + case IPPROTO_SCTP: + return cp->state == IP_VS_SCTP_S_CLOSED; + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... 
*/ static unsigned int @@ -1246,8 +1284,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) */ static unsigned int ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_out(ops->hooknum, skb, AF_INET); } @@ -1258,8 +1295,7 @@ ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_out(ops->hooknum, skb, AF_INET); } @@ -1273,8 +1309,7 @@ ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_out(ops->hooknum, skb, AF_INET6); } @@ -1285,8 +1320,7 @@ ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_out(ops->hooknum, skb, AF_INET6); } @@ -1585,6 +1619,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) struct ip_vs_conn *cp; int ret, pkts; struct netns_ipvs *ipvs; + int conn_reuse_mode; /* Already marked as IPVS request or reply? 
*/ if (skb->ipvs_property) @@ -1653,10 +1688,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) */ cp = pp->conn_in_get(af, skb, &iph, 0); - if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && - unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && - is_new_conn(skb, &iph)) { - ip_vs_conn_expire_now(cp); + conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); + if (conn_reuse_mode && !iph.fragoffs && + is_new_conn(skb, &iph) && cp && + ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && + unlikely(!atomic_read(&cp->dest->weight))) || + unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) { + if (!atomic_read(&cp->n_control)) + ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); cp = NULL; } @@ -1738,9 +1777,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) */ static unsigned int ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_in(ops->hooknum, skb, AF_INET); } @@ -1751,8 +1788,7 @@ ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_in(ops->hooknum, skb, AF_INET); } @@ -1765,9 +1801,7 @@ ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_in(ops->hooknum, skb, AF_INET6); } @@ -1778,8 +1812,7 @@ ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_local_request6(const struct 
nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { return ip_vs_in(ops->hooknum, skb, AF_INET6); } @@ -1798,8 +1831,7 @@ ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb, */ static unsigned int ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { int r; struct net *net; @@ -1820,8 +1852,7 @@ ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 static unsigned int ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct nf_hook_state *state) { int r; struct net *net; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ed99448671c3..24c554201a76 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -729,9 +729,9 @@ static void ip_vs_trash_cleanup(struct net *net) } static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { -#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock_bh(&src->lock); @@ -747,13 +747,28 @@ ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) } static void +ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) +{ + dst->conns = (u32)src->conns; + dst->inpkts = (u32)src->inpkts; + dst->outpkts = (u32)src->outpkts; + dst->inbytes = src->inbytes; + dst->outbytes = src->outbytes; + dst->cps = (u32)src->cps; + dst->inpps = (u32)src->inpps; + dst->outpps = (u32)src->outpps; + dst->inbps = 
(u32)src->inbps; + dst->outbps = (u32)src->outbps; +} + +static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock_bh(&stats->lock); /* get current counters as zero point, rates are zeroed */ -#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c +#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); @@ -827,15 +842,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { ip_vs_start_estimator(svc->net, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; - if (sched->add_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { - if (sched->upd_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } @@ -1069,7 +1085,7 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); - if (sched->del_dest) + if (sched && sched->del_dest) sched->del_dest(svc, dest); } } @@ -1160,11 +1176,14 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ip_vs_use_count_inc(); /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - ret = -ENOENT; - goto out_err; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_err; + } } if (u->pe_name && *u->pe_name) { @@ -1225,10 +1244,12 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, spin_lock_init(&svc->stats.lock); 
/* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + } /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); @@ -1276,17 +1297,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { - struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* * Lookup the scheduler, by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - return -ENOENT; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } } old_sched = sched; @@ -1314,14 +1338,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { + if (old_sched) { + ip_vs_unbind_scheduler(svc, old_sched); + RCU_INIT_POINTER(svc->scheduler, NULL); + /* Wait all svc->sched_data users */ + synchronize_rcu(); + } /* Bind the new scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) { - old_sched = sched; - goto out; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + ip_vs_scheduler_put(sched); + goto out; + } } - /* Unbind the old scheduler on success */ - ip_vs_unbind_scheduler(svc, old_sched); } /* @@ -1808,6 +1838,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "conn_reuse_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ 
-1961,6 +1997,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1969,18 +2006,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - sched->name); + sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - sched->name, + sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, sched->name, + svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2044,7 +2081,7 @@ static const struct file_operations ip_vs_info_fops = { static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats_user show; + struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -2053,17 +2090,22 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, - show.inpkts, show.outpkts, - (unsigned long long) show.inbytes, - (unsigned long long) show.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)show.conns, + (unsigned long long)show.inpkts, + (unsigned long long)show.outpkts, + (unsigned long long)show.inbytes, + (unsigned long long)show.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq, "%8X %8X %8X 
%16X %16X\n", - show.cps, show.inpps, show.outpps, - show.inbps, show.outbps); + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", + (unsigned long long)show.cps, + (unsigned long long)show.inpps, + (unsigned long long)show.outpps, + (unsigned long long)show.inbps, + (unsigned long long)show.outbps); return 0; } @@ -2086,7 +2128,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; - struct ip_vs_stats_user rates; + struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ @@ -2098,41 +2140,41 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; - __u64 inbytes, outbytes; + u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin_irq(&u->syncp); - inbytes = u->ustats.inbytes; - outbytes = u->ustats.outbytes; + conns = u->cnt.conns; + inpkts = u->cnt.inpkts; + outpkts = u->cnt.outpkts; + inbytes = u->cnt.inbytes; + outbytes = u->cnt.outbytes; } while (u64_stats_fetch_retry_irq(&u->syncp, start)); - seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", - i, u->ustats.conns, u->ustats.inpkts, - u->ustats.outpkts, (__u64)inbytes, - (__u64)outbytes); + seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", + i, (u64)conns, (u64)inpkts, + (u64)outpkts, (u64)inbytes, + (u64)outbytes); } - spin_lock_bh(&tot_stats->lock); + ip_vs_copy_stats(&kstats, tot_stats); - seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", - tot_stats->ustats.conns, tot_stats->ustats.inpkts, - tot_stats->ustats.outpkts, - (unsigned long long) tot_stats->ustats.inbytes, - (unsigned long long) tot_stats->ustats.outbytes); + seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long 
long)kstats.conns, + (unsigned long long)kstats.inpkts, + (unsigned long long)kstats.outpkts, + (unsigned long long)kstats.inbytes, + (unsigned long long)kstats.outbytes); - ip_vs_read_estimator(&rates, tot_stats); - - spin_unlock_bh(&tot_stats->lock); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ +/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq, " %8X %8X %8X %16X %16X\n", - rates.cps, - rates.inpps, - rates.outpps, - rates.inbps, - rates.outbps); + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", + kstats.cps, + kstats.inpps, + kstats.outpps, + kstats.inbps, + kstats.outbps); return 0; } @@ -2400,18 +2442,22 @@ static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; + struct ip_vs_kstats kstats; + char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); + sched_name = sched ? 
sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); + ip_vs_copy_stats(&kstats, &src->stats); + ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int @@ -2485,6 +2531,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; + struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { @@ -2506,7 +2553,8 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); + ip_vs_copy_stats(&kstats, &dest->stats); + ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; @@ -2798,25 +2846,51 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, - struct ip_vs_stats *stats) + struct ip_vs_kstats *kstats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + + if (!nl_stats) + return -EMSGSIZE; + + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + 
nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) + goto nla_put_failure; + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, + struct ip_vs_kstats *kstats) { - struct ip_vs_stats_user ustats; struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) return -EMSGSIZE; - ip_vs_copy_stats(&ustats, stats); - - if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || - nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || - nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || - nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || - nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) || + nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) || + nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps)) goto nla_put_failure; 
nla_nest_end(skb, nl_stats); @@ -2835,6 +2909,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; + struct ip_vs_kstats kstats; + char *sched_name; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2853,14 +2929,18 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } sched = rcu_dereference_protected(svc->scheduler, 1); + sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + ip_vs_copy_stats(&kstats, &svc->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_service); @@ -3032,6 +3112,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; + struct ip_vs_kstats kstats; nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) @@ -3054,7 +3135,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + ip_vs_copy_stats(&kstats, &dest->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) + goto nla_put_failure; + if 
(ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); @@ -3732,6 +3816,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; tbl[idx++].data = &ipvs->sysctl_backup_only; + ipvs->sysctl_conn_reuse_mode = 1; + tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); @@ -3757,6 +3843,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(net, &ipvs->tot_stats); + + if (!net_eq(net, &init_net)) + kfree(ipvs->sysctl_tbl); } #else diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 1425e9a924c4..ef0eb0a8d552 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -45,17 +45,19 @@ NOTES. - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. - * A lot code is taken from net/sched/estimator.c + * Netlink users can see 64-bit values but sockopt users are restricted + to 32-bit values for conns, packets, bps, cps and pps. 
+ + * A lot of code is taken from net/core/gen_estimator.c */ /* * Make a summary from each cpu */ -static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, +static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, struct ip_vs_cpu_stats __percpu *stats) { int i; @@ -64,27 +66,31 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; - __u64 inbytes, outbytes; + u64 conns, inpkts, outpkts, inbytes, outbytes; + if (add) { - sum->conns += s->ustats.conns; - sum->inpkts += s->ustats.inpkts; - sum->outpkts += s->ustats.outpkts; do { start = u64_stats_fetch_begin(&s->syncp); - inbytes = s->ustats.inbytes; - outbytes = s->ustats.outbytes; + conns = s->cnt.conns; + inpkts = s->cnt.inpkts; + outpkts = s->cnt.outpkts; + inbytes = s->cnt.inbytes; + outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->conns += conns; + sum->inpkts += inpkts; + sum->outpkts += outpkts; sum->inbytes += inbytes; sum->outbytes += outbytes; } else { add = true; - sum->conns = s->ustats.conns; - sum->inpkts = s->ustats.inpkts; - sum->outpkts = s->ustats.outpkts; do { start = u64_stats_fetch_begin(&s->syncp); - sum->inbytes = s->ustats.inbytes; - sum->outbytes = s->ustats.outbytes; + sum->conns = s->cnt.conns; + sum->inpkts = s->cnt.inpkts; + sum->outpkts = s->cnt.outpkts; + sum->inbytes = s->cnt.inbytes; + sum->outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); } } @@ -95,10 +101,7 @@ static void estimation_timer(unsigned long arg) { struct ip_vs_estimator *e; struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; + u64 rate; struct net *net = (struct net *)arg; struct netns_ipvs *ipvs; @@ -108,33 +111,29 @@ static void estimation_timer(unsigned long arg) s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->ustats, s->cpustats); - 
n_conns = s->ustats.conns; - n_inpkts = s->ustats.inpkts; - n_outpkts = s->ustats.outpkts; - n_inbytes = s->ustats.inbytes; - n_outbytes = s->ustats.outbytes; + ip_vs_read_cpu_stats(&s->kstats, s->cpustats); /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns) << 9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps) >> 2; - - rate = (n_inpkts - e->last_inpkts) << 9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps) >> 2; - - rate = (n_outpkts - e->last_outpkts) << 9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps) >> 2; - - rate = (n_inbytes - e->last_inbytes) << 4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps) >> 2; - - rate = (n_outbytes - e->last_outbytes) << 4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps) >> 2; + rate = (s->kstats.conns - e->last_conns) << 9; + e->last_conns = s->kstats.conns; + e->cps += ((s64)rate - (s64)e->cps) >> 2; + + rate = (s->kstats.inpkts - e->last_inpkts) << 9; + e->last_inpkts = s->kstats.inpkts; + e->inpps += ((s64)rate - (s64)e->inpps) >> 2; + + rate = (s->kstats.outpkts - e->last_outpkts) << 9; + e->last_outpkts = s->kstats.outpkts; + e->outpps += ((s64)rate - (s64)e->outpps) >> 2; + + /* scaled by 2^5, but divided 2 seconds */ + rate = (s->kstats.inbytes - e->last_inbytes) << 4; + e->last_inbytes = s->kstats.inbytes; + e->inbps += ((s64)rate - (s64)e->inbps) >> 2; + + rate = (s->kstats.outbytes - e->last_outbytes) << 4; + e->last_outbytes = s->kstats.outbytes; + e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); @@ -166,14 +165,14 @@ void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; - struct ip_vs_stats_user *u = &stats->ustats; + struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold 
the stats->lock lock */ - est->last_inbytes = u->inbytes; - est->last_outbytes = u->outbytes; - est->last_conns = u->conns; - est->last_inpkts = u->inpkts; - est->last_outpkts = u->outpkts; + est->last_inbytes = k->inbytes; + est->last_outbytes = k->outbytes; + est->last_conns = k->conns; + est->last_inpkts = k->inpkts; + est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; @@ -182,8 +181,7 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) } /* Get decoded rates */ -void ip_vs_read_estimator(struct ip_vs_stats_user *dst, - struct ip_vs_stats *stats) +void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 199760c71f39..7e8141647943 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc, if (sched->done_service) sched->done_service(svc); - /* svc->scheduler can not be set to NULL */ + /* svc->scheduler can be set to NULL only by caller */ } @@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { - struct ip_vs_scheduler *sched; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? 
sched->name : "none"; - sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - sched->name, svc->fwmark, svc->fwmark, msg); + sched_name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d93ceeb3ef04..d99ad93eb855 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -612,7 +612,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, pkts = atomic_add_return(1, &cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); - ip_vs_sync_conn(net, cp->control, pkts); + ip_vs_sync_conn(net, cp, pkts); } } @@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, struct ip_vs_conn *cp; struct netns_ipvs *ipvs = net_ipvs(net); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { cp = ip_vs_conn_in_get(param); - else + if (cp && ((cp->dport != dport) || + !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { + if (!(flags & IP_VS_CONN_F_INACTIVE)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } else { + /* This is the expiration message for the + * connection that was already replaced, so we + * just ignore it. 
+ */ + __ip_vs_conn_put(cp); + kfree(param->pe_data); + return; + } + } + } else { cp = ip_vs_ct_in_get(param); + } if (cp) { /* Free pe_data */ @@ -1388,9 +1405,11 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) mreq.imr_ifindex = dev->ifindex; + rtnl_lock(); lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); + rtnl_unlock(); return ret; } @@ -1438,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id) struct socket *sock; int result; - /* First create a socket move it to right name space later */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1478,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1499,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. 
- */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); @@ -1535,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1673,7 +1680,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1710,7 +1717,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1835,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3aedbda7658a..258a0b0e82a2 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? 
FLOWI_FLAG_KNOWN_NH : 0; @@ -209,7 +208,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) struct sock *sk = skb->sk; struct rtable *ort = skb_rtable(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + if (!skb->dev && sk && sk_fullsock(sk)) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); } @@ -364,13 +363,16 @@ err_unreach: #ifdef CONFIG_IP_VS_IPV6 static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, - struct in6_addr *ret_saddr, int do_xfrm) + struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) { struct dst_entry *dst; struct flowi6 fl6 = { .daddr = *daddr, }; + if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + dst = ip6_route_output(net, NULL, &fl6); if (dst->error) goto out_err; @@ -427,7 +429,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, &dest_dst->dst_saddr.in6, - do_xfrm); + do_xfrm, rt_mode); if (!dst) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); @@ -435,7 +437,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_unreach; } rt = (struct rt6_info *) dst; - cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", @@ -446,7 +448,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; - dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, + rt_mode); if (!dst) goto err_unreach; rt = (struct rt6_info *) dst; @@ -501,6 +504,13 @@ err_put: return -1; err_unreach: + /* The ip6_link_failure function requires the dev field to be set + * in order to get the net (further for the sake of fwmark + * reflection). + */ + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + dst_link_failure(skb); return -1; } @@ -519,10 +529,27 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); } return ret; } +/* In the event of a remote destination, it's possible that we would have + * matches against an old socket (particularly a TIME-WAIT socket). This + * causes havoc down the line (ip_local_out et. al. expect regular sockets + * and invalid memory accesses will happen) so simply drop the association + * in this case. +*/ +static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) +{ + /* If dev is set, the packet came from the LOCAL_IN callback and + * not from a local TCP socket. 
+ */ + if (skb->dev) + skb_orphan(skb); +} + /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, struct ip_vs_conn *cp, int local) @@ -534,12 +561,23 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 1); + + /* Remove the early_demux association unless it's bound for the + * exact same port and address on this host after translation. + */ + if (!local || cp->vport != cp->dport || + !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) + ip_vs_drop_early_demux_sk(skb); + if (!local) { skb_forward_csum(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, - dst_output); + if (!skb->sk) + skb_sender_cpu_clear(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output_sk); } else ret = NF_ACCEPT; + return ret; } @@ -553,9 +591,12 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) ip_vs_notrack(skb); if (!local) { + ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); - NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, - dst_output); + if (!skb->sk) + skb_sender_cpu_clear(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output_sk); } else ret = NF_ACCEPT; return ret; @@ -781,7 +822,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? 
*/ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); @@ -841,6 +882,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, struct ipv6hdr *old_ipv6h = NULL; #endif + ip_vs_drop_early_demux_sk(skb); + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) @@ -924,7 +967,8 @@ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { - struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + struct net *net = skb_net(skb); + struct netns_ipvs *ipvs = net_ipvs(net); struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -991,7 +1035,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = ttl; - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; @@ -1163,7 +1207,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL); + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH); if (local < 0) goto tx_error; if (local) { @@ -1345,7 +1390,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? 
*/ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index a4b5e2a435ac..45da11afa785 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -47,9 +47,11 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) return 0; counter = acct->counter; - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)atomic64_read(&counter[dir].packets), - (unsigned long long)atomic64_read(&counter[dir].bytes)); + seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)atomic64_read(&counter[dir].packets), + (unsigned long long)atomic64_read(&counter[dir].bytes)); + + return 0; }; EXPORT_SYMBOL_GPL(seq_print_acct); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index b8b95f4027ca..57a26cc90c9f 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -88,7 +88,6 @@ static int amanda_help(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - struct ts_state ts; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple *tuple; unsigned int dataoff, start, stop, off, i; @@ -113,23 +112,20 @@ static int amanda_help(struct sk_buff *skb, return NF_ACCEPT; } - memset(&ts, 0, sizeof(ts)); start = skb_find_text(skb, dataoff, skb->len, - search[SEARCH_CONNECT].ts, &ts); + search[SEARCH_CONNECT].ts); if (start == UINT_MAX) goto out; start += dataoff + search[SEARCH_CONNECT].len; - memset(&ts, 0, sizeof(ts)); stop = skb_find_text(skb, start, skb->len, - search[SEARCH_NEWLINE].ts, &ts); + search[SEARCH_NEWLINE].ts); if (stop == UINT_MAX) goto out; stop += start; for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { - memset(&ts, 0, 
sizeof(ts)); - off = skb_find_text(skb, start, stop, search[i].ts, &ts); + off = skb_find_text(skb, start, stop, search[i].ts); if (off == UINT_MAX) continue; off += start + search[i].len; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 13fad8668f83..651039ad1681 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -287,6 +287,46 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) spin_unlock(&pcpu->lock); } +/* Released via destroy_conntrack() */ +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags) +{ + struct nf_conn *tmpl; + + tmpl = kzalloc(sizeof(struct nf_conn), GFP_KERNEL); + if (tmpl == NULL) + return NULL; + + tmpl->status = IPS_TEMPLATE; + write_pnet(&tmpl->ct_net, net); + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (zone) { + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC); + if (!nf_ct_zone) + goto out_free; + nf_ct_zone->id = zone; + } +#endif + atomic_set(&tmpl->ct_general.use, 0); + + return tmpl; +#ifdef CONFIG_NF_CONNTRACK_ZONES +out_free: + kfree(tmpl); + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); + +static void nf_ct_tmpl_free(struct nf_conn *tmpl) +{ + nf_ct_ext_destroy(tmpl); + nf_ct_ext_free(tmpl); + kfree(tmpl); +} + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -298,6 +338,10 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); + if (unlikely(nf_ct_is_template(ct))) { + nf_ct_tmpl_free(ct); + return; + } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -540,28 +584,6 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); -/* deletion from this larval template list happens via nf_ct_put() */ -void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) -{ - struct ct_pcpu 
*pcpu; - - __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); - - /* add this conntrack to the (per cpu) tmpl list */ - local_bh_disable(); - tmpl->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); - - spin_lock(&pcpu->lock); - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->tmpl); - spin_unlock_bh(&pcpu->lock); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); - /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net) spin_lock_init(&pcpu->lock); INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } net->ct.stat = alloc_percpu(struct ip_conntrack_stat); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 91a1837acd0e..b45a4223cb05 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -219,7 +219,8 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } - return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + nf_ct_zone(a->master) == nf_ct_zone(b->master); } static inline int expect_matches(const struct nf_conntrack_expect *a, @@ -561,7 +562,9 @@ static int exp_seq_show(struct seq_file *s, void *v) helper->expect_policy[expect->class].name); } - return seq_putc(s, '\n'); + seq_putc(s, '\n'); + + return 0; } static const struct seq_operations exp_seq_ops = { diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 1d69f5b9748f..9511af04dc81 
100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -779,8 +779,8 @@ static int callforward_do_filter(struct net *net, flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2), false)) { - if (ipv6_addr_equal(rt6_nexthop(rt1), - rt6_nexthop(rt2)) && + if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr), + rt6_nexthop(rt2, &fl2.daddr)) && rt1->dst.dev == rt2->dst.dev) ret = 1; dst_release(&rt2->dst); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d1c23940a86a..6b8b0abbfab4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2995,11 +2995,6 @@ ctnetlink_create_expect(struct net *net, u16 zone, } err = nf_ct_expect_related_report(exp, portid, report); - if (err < 0) - goto err_exp; - - return 0; -err_exp: nf_ct_expect_put(exp); err_ct: nf_ct_put(ct); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 60865f110309..2281be419a74 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -90,7 +90,13 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return nf_generic_should_process(nf_ct_protonum(ct)); + bool ret; + + ret = nf_generic_should_process(nf_ct_protonum(ct)); + if (!ret) + pr_warn_once("conntrack: generic helper won't handle protocol %d. 
Please consider loading the specific helper module.\n", + nf_ct_protonum(ct)); + return ret; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 5caa0c41bf26..70383de72054 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -202,7 +202,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ @@ -261,7 +261,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ @@ -906,6 +906,7 @@ static int tcp_packet(struct nf_conn *ct, 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; memset(&ct->proto.tcp.seen[dir], 0, @@ -923,7 +924,9 @@ static int tcp_packet(struct nf_conn *ct, * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and - * then we'll get in sync. Otherwise, the server ignores it. */ + * then we'll get in sync. Otherwise, the server potentially + * responds with a challenge ACK if implementing RFC5961. 
+ */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; @@ -939,6 +942,13 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } + /* Mark the potential for RFC5961 challenge ACK, + * this pose a special problem for LAST_ACK state + * as ACK is intrepretated as ACKing last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK) + ct->proto.tcp.last_flags |= + IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) @@ -970,6 +980,25 @@ static int tcp_packet(struct nf_conn *ct, nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; + case TCP_CONNTRACK_TIME_WAIT: + /* RFC5961 compliance cause stack to send "challenge-ACK" + * e.g. in response to spurious SYNs. Conntrack MUST + * not believe this ACK is acking last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK && + index == TCP_ACK_SET && + ct->proto.tcp.last_dir != dir && + ct->proto.tcp.last_index == TCP_SYN_SET && + (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { + /* Detected RFC5961 challenge ACK */ + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: challenge-ACK ignored "); + return NF_ACCEPT; /* Don't change state */ + } + break; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 61a3c927e63c..399210693c2a 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -14,16 +14,12 @@ /* core.c */ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - unsigned int hook, const struct net_device *indev, - const struct net_device *outdev, - struct nf_hook_ops **elemp, - int (*okfn)(struct sk_buff *), int hook_thresh); + struct 
nf_hook_state *state, struct nf_hook_ops **elemp); /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, u_int8_t pf, - unsigned int hook, struct net_device *indev, - struct net_device *outdev, int (*okfn)(struct sk_buff *), - unsigned int queuenum); +int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, + struct nf_hook_state *state, unsigned int queuenum); +void nf_queue_nf_hook_drop(struct nf_hook_ops *ops); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a2233e77cf39..a5aa5967b8e1 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -17,6 +17,7 @@ #include <net/route.h> #include <linux/netfilter.h> +#include <linux/netfilter_bridge.h> #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> @@ -133,7 +134,7 @@ EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) { - if (!sk || sk->sk_state == TCP_TIME_WAIT) + if (!sk || !sk_fullsock(sk)) return; read_lock_bh(&sk->sk_callback_lock); @@ -163,10 +164,10 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, const struct net_device *physindev; const struct net_device *physoutdev; - physindev = skb->nf_bridge->physindev; + physindev = nf_bridge_get_physindev(skb); if (physindev && in != physindev) nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = skb->nf_bridge->physoutdev; + physoutdev = nf_bridge_get_physoutdev(skb); if (physoutdev && out != physoutdev) nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 4c8b68e5fa16..8a8b2abc35ff 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -10,6 +10,7 @@ #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/netfilter.h> +#include <linux/netfilter_bridge.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include 
<net/protocol.h> @@ -47,19 +48,25 @@ EXPORT_SYMBOL(nf_unregister_queue_handler); void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { + struct nf_hook_state *state = &entry->state; + /* Release those devices we held, or Alexey will kill me. */ - if (entry->indev) - dev_put(entry->indev); - if (entry->outdev) - dev_put(entry->outdev); + if (state->in) + dev_put(state->in); + if (state->out) + dev_put(state->out); + if (state->sk) + sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { - struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + struct net_device *physdev; - if (nf_bridge->physindev) - dev_put(nf_bridge->physindev); - if (nf_bridge->physoutdev) - dev_put(nf_bridge->physoutdev); + physdev = nf_bridge_get_physindev(entry->skb); + if (physdev) + dev_put(physdev); + physdev = nf_bridge_get_physoutdev(entry->skb); + if (physdev) + dev_put(physdev); } #endif /* Drop reference to owner of hook which queued us. */ @@ -70,22 +77,25 @@ EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { + struct nf_hook_state *state = &entry->state; + if (!try_module_get(entry->elem->owner)) return false; - if (entry->indev) - dev_hold(entry->indev); - if (entry->outdev) - dev_hold(entry->outdev); + if (state->in) + dev_hold(state->in); + if (state->out) + dev_hold(state->out); + if (state->sk) + sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { - struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; struct net_device *physdev; - physdev = nf_bridge->physindev; + physdev = nf_bridge_get_physindev(entry->skb); if (physdev) dev_hold(physdev); - physdev = nf_bridge->physoutdev; + physdev = nf_bridge_get_physoutdev(entry->skb); if (physdev) dev_hold(physdev); } @@ -95,17 +105,31 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) } 
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); +void nf_queue_nf_hook_drop(struct nf_hook_ops *ops) +{ + const struct nf_queue_handler *qh; + struct net *net; + + rtnl_lock(); + rcu_read_lock(); + qh = rcu_dereference(queue_handler); + if (qh) { + for_each_net(net) { + qh->nf_hook_drop(net, ops); + } + } + rcu_read_unlock(); + rtnl_unlock(); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). */ int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, - u_int8_t pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum) + struct nf_hook_ops *elem, + struct nf_hook_state *state, + unsigned int queuenum) { int status = -ENOENT; struct nf_queue_entry *entry = NULL; @@ -121,7 +145,7 @@ int nf_queue(struct sk_buff *skb, goto err_unlock; } - afinfo = nf_get_afinfo(pf); + afinfo = nf_get_afinfo(state->pf); if (!afinfo) goto err_unlock; @@ -134,11 +158,7 @@ int nf_queue(struct sk_buff *skb, *entry = (struct nf_queue_entry) { .skb = skb, .elem = elem, - .pf = pf, - .hook = hook, - .indev = indev, - .outdev = outdev, - .okfn = okfn, + .state = *state, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -184,30 +204,29 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) } if (verdict == NF_ACCEPT) { - afinfo = nf_get_afinfo(entry->pf); + afinfo = nf_get_afinfo(entry->state.pf); if (!afinfo || afinfo->reroute(skb, entry) < 0) verdict = NF_DROP; } + entry->state.thresh = INT_MIN; + if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook], - skb, entry->hook, - entry->indev, entry->outdev, &elem, - entry->okfn, INT_MIN); + verdict = nf_iterate(entry->state.hook_list, + skb, &entry->state, &elem); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: local_bh_disable(); - entry->okfn(skb); + entry->state.okfn(entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: - err = 
nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_QBITS); + err = nf_queue(skb, elem, &entry->state, + verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ECANCELED) goto next_hook; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 52e20c9a46a5..71f1e9fdfa18 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -11,6 +11,7 @@ #include <asm/unaligned.h> #include <net/tcp.h> #include <net/netns/generic.h> +#include <linux/proc_fs.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter/x_tables.h> @@ -348,12 +349,10 @@ static void __net_exit synproxy_proc_exit(struct net *net) static int __net_init synproxy_net_init(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - struct nf_conntrack_tuple t; struct nf_conn *ct; int err = -ENOMEM; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL); if (IS_ERR(ct)) { err = PTR_ERR(ct); goto err1; @@ -364,7 +363,8 @@ static int __net_init synproxy_net_init(struct net *net) if (!nfct_synproxy_ext_add(ct)) goto err2; - nf_conntrack_tmpl_insert(net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ac1a9528dbf2..cfe636808541 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -127,13 +127,46 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +int nft_register_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) +{ + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return 0; + + return nf_register_hooks(basechain->ops, hook_nops); +} +EXPORT_SYMBOL_GPL(nft_register_basechain); + +void nft_unregister_basechain(struct nft_base_chain 
*basechain, + unsigned int hook_nops) +{ + if (basechain->flags & NFT_BASECHAIN_DISABLED) + return; + + nf_unregister_hooks(basechain->ops, hook_nops); +} +EXPORT_SYMBOL_GPL(nft_unregister_basechain); + +static int nf_tables_register_hooks(const struct nft_table *table, + struct nft_chain *chain, + unsigned int hook_nops) +{ + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return 0; + + return nft_register_basechain(nft_base_chain(chain), hook_nops); +} + static void nf_tables_unregister_hooks(const struct nft_table *table, - const struct nft_chain *chain, + struct nft_chain *chain, unsigned int hook_nops) { - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); + if (table->flags & NFT_TABLE_F_DORMANT || + !(chain->flags & NFT_BASE_CHAIN)) + return; + + nft_unregister_basechain(nft_base_chain(chain), hook_nops); } /* Internal table flags */ @@ -198,36 +231,31 @@ static int nft_delchain(struct nft_ctx *ctx) static inline bool nft_rule_is_active(struct net *net, const struct nft_rule *rule) { - return (rule->genmask & (1 << net->nft.gencursor)) == 0; -} - -static inline int gencursor_next(struct net *net) -{ - return net->nft.gencursor+1 == 1 ? 
1 : 0; + return (rule->genmask & nft_genmask_cur(net)) == 0; } static inline int nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) { - return (rule->genmask & (1 << gencursor_next(net))) == 0; + return (rule->genmask & nft_genmask_next(net)) == 0; } static inline void nft_rule_activate_next(struct net *net, struct nft_rule *rule) { /* Now inactive, will be active in the future */ - rule->genmask = (1 << net->nft.gencursor); + rule->genmask = nft_genmask_cur(net); } static inline void nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) { - rule->genmask = (1 << gencursor_next(net)); + rule->genmask = nft_genmask_next(net); } static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) { - rule->genmask &= ~(1 << gencursor_next(net)); + rule->genmask &= ~nft_genmask_next(net); } static int @@ -401,7 +429,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi, } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { - [NFTA_TABLE_NAME] = { .type = NLA_STRING }, + [NFTA_TABLE_NAME] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; @@ -564,7 +593,7 @@ static int nf_tables_table_enable(const struct nft_af_info *afi, if (!(chain->flags & NFT_BASE_CHAIN)) continue; - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); + err = nft_register_basechain(nft_base_chain(chain), afi->nops); if (err < 0) goto err; @@ -579,20 +608,20 @@ err: if (i-- <= 0) break; - nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops); + nft_unregister_basechain(nft_base_chain(chain), afi->nops); } return err; } static void nf_tables_table_disable(const struct nft_af_info *afi, - struct nft_table *table) + struct nft_table *table) { struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) { if (chain->flags & NFT_BASE_CHAIN) - nf_unregister_hooks(nft_base_chain(chain)->ops, - afi->nops); + nft_unregister_basechain(nft_base_chain(chain), + 
afi->nops); } } @@ -683,29 +712,33 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return -EINVAL; } + err = -EAFNOSUPPORT; if (!try_module_get(afi->owner)) - return -EAFNOSUPPORT; + goto err1; - table = kzalloc(sizeof(*table) + nla_len(name), GFP_KERNEL); - if (table == NULL) { - module_put(afi->owner); - return -ENOMEM; - } + err = -ENOMEM; + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (table == NULL) + goto err2; - nla_strlcpy(table->name, name, nla_len(name)); + nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); table->flags = flags; nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); - if (err < 0) { - kfree(table); - module_put(afi->owner); - return err; - } + if (err < 0) + goto err3; + list_add_tail_rcu(&table->list, &afi->tables); return 0; +err3: + kfree(table); +err2: + module_put(afi->owner); +err1: + return err; } static int nft_flush_table(struct nft_ctx *ctx) @@ -883,6 +916,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 }, [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 }, + [NFTA_HOOK_DEV] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, }; static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) @@ -956,6 +991,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority))) goto nla_put_failure; + if (basechain->dev_name[0] && + nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name)) + goto nla_put_failure; nla_nest_end(skb, nest); if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -1167,9 +1205,13 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) BUG_ON(chain->use > 0); if (chain->flags & NFT_BASE_CHAIN) { - 
module_put(nft_base_chain(chain)->type->owner); - free_percpu(nft_base_chain(chain)->stats); - kfree(nft_base_chain(chain)); + struct nft_base_chain *basechain = nft_base_chain(chain); + + module_put(basechain->type->owner); + free_percpu(basechain->stats); + if (basechain->ops[0].dev != NULL) + dev_put(basechain->ops[0].dev); + kfree(basechain); } else { kfree(chain); } @@ -1188,6 +1230,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nlattr *ha[NFTA_HOOK_MAX + 1]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct net_device *dev = NULL; u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; @@ -1327,17 +1370,43 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return -ENOENT; hookfn = type->hooks[hooknum]; + if (afi->flags & NFT_AF_NEEDS_DEV) { + char ifname[IFNAMSIZ]; + + if (!ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + + nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); + dev = dev_get_by_name(net, ifname); + if (!dev) { + module_put(type->owner); + return -ENOENT; + } + } else if (ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); if (basechain == NULL) { module_put(type->owner); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } + if (dev != NULL) + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return PTR_ERR(stats); } basechain->stats = stats; @@ -1346,11 +1415,14 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (stats == NULL) { module_put(type->owner); kfree(basechain); + if (dev != NULL) + dev_put(dev); return -ENOMEM; } rcu_assign_pointer(basechain->stats, stats); } + write_pnet(&basechain->pnet, net); basechain->type = type; chain = 
&basechain->chain; @@ -1362,6 +1434,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, ops->priority = priority; ops->priv = chain; ops->hook = afi->hooks[ops->hooknum]; + ops->dev = dev; if (hookfn) ops->hook = hookfn; if (afi->hook_ops_init) @@ -1378,16 +1451,12 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); - chain->net = net; chain->table = table; nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) { - err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops); - if (err < 0) - goto err1; - } + err = nf_tables_register_hooks(table, chain, afi->nops); + if (err < 0) + goto err1; nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); @@ -1547,6 +1616,23 @@ nla_put_failure: return -1; }; +int nft_expr_dump(struct sk_buff *skb, unsigned int attr, + const struct nft_expr *expr) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr); + if (!nest) + goto nla_put_failure; + if (nf_tables_fill_expr_info(skb, expr) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -1; +} + struct nft_expr_info { const struct nft_expr_ops *ops; struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; @@ -1624,6 +1710,39 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx, module_put(expr->ops->type->owner); } +struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, + const struct nlattr *nla) +{ + struct nft_expr_info info; + struct nft_expr *expr; + int err; + + err = nf_tables_expr_parse(ctx, nla, &info); + if (err < 0) + goto err1; + + err = -ENOMEM; + expr = kzalloc(info.ops->size, GFP_KERNEL); + if (expr == NULL) + goto err2; + + err = nf_tables_newexpr(ctx, &info, expr); + if (err < 0) + goto err2; + + return expr; +err2: + module_put(info.ops->type->owner); +err1: + 
return ERR_PTR(err); +} + +void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) +{ + nf_tables_expr_destroy(ctx, expr); + kfree(expr); +} + /* * Rules */ @@ -1705,12 +1824,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { - struct nlattr *elem = nla_nest_start(skb, NFTA_LIST_ELEM); - if (elem == NULL) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) goto nla_put_failure; - if (nf_tables_fill_expr_info(skb, expr) < 0) - goto nla_put_failure; - nla_nest_end(skb, elem); } nla_nest_end(skb, list); @@ -2161,7 +2276,7 @@ nft_select_set_ops(const struct nlattr * const nla[], features = 0; if (nla[NFTA_SET_FLAGS] != NULL) { features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - features &= NFT_SET_INTERVAL | NFT_SET_MAP; + features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT; } bops = NULL; @@ -2218,6 +2333,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_POLICY] = { .type = NLA_U32 }, [NFTA_SET_DESC] = { .type = NLA_NESTED }, [NFTA_SET_ID] = { .type = NLA_U32 }, + [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2368,6 +2485,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->timeout && + nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout))) + goto nla_put_failure; + if (set->gc_int && + nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) + goto nla_put_failure; + if (set->policy != NFT_SET_POL_PERFORMANCE) { if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy))) goto nla_put_failure; @@ -2580,7 +2704,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, char name[IFNAMSIZ]; unsigned int size; bool create; - u32 ktype, dtype, flags, policy; + u64 timeout; + u32 ktype, 
dtype, flags, policy, gc_int; struct nft_set_desc desc; int err; @@ -2600,15 +2725,20 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, } desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); - if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data)) + if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; flags = 0; if (nla[NFTA_SET_FLAGS] != NULL) { flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | - NFT_SET_INTERVAL | NFT_SET_MAP)) + NFT_SET_INTERVAL | NFT_SET_TIMEOUT | + NFT_SET_MAP | NFT_SET_EVAL)) return -EINVAL; + /* Only one of both operations is supported */ + if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) == + (NFT_SET_MAP | NFT_SET_EVAL)) + return -EOPNOTSUPP; } dtype = 0; @@ -2625,14 +2755,26 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_DATA_LEN] == NULL) return -EINVAL; desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN])); - if (desc.dlen == 0 || - desc.dlen > FIELD_SIZEOF(struct nft_data, data)) + if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN) return -EINVAL; } else - desc.dlen = sizeof(struct nft_data); + desc.dlen = sizeof(struct nft_verdict); } else if (flags & NFT_SET_MAP) return -EINVAL; + timeout = 0; + if (nla[NFTA_SET_TIMEOUT] != NULL) { + if (!(flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT])); + } + gc_int = 0; + if (nla[NFTA_SET_GC_INTERVAL] != NULL) { + if (!(flags & NFT_SET_TIMEOUT)) + return -EINVAL; + gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); + } + policy = NFT_SET_POL_PERFORMANCE; if (nla[NFTA_SET_POLICY] != NULL) policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); @@ -2692,6 +2834,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, goto err2; INIT_LIST_HEAD(&set->bindings); + write_pnet(&set->pnet, net); set->ops = ops; set->ktype = ktype; set->klen = desc.klen; @@ -2700,6 +2843,8 @@ 
static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->flags = flags; set->size = desc.size; set->policy = policy; + set->timeout = timeout; + set->gc_int = gc_int; err = ops->init(set, &desc, nla); if (err < 0) @@ -2768,12 +2913,14 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, const struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); - return nft_validate_data_load(ctx, dreg, &elem->data, - set->dtype == NFT_DATA_VERDICT ? - NFT_DATA_VERDICT : NFT_DATA_VALUE); + return nft_validate_register_store(ctx, dreg, nft_set_ext_data(ext), + set->dtype == NFT_DATA_VERDICT ? + NFT_DATA_VERDICT : NFT_DATA_VALUE, + set->dlen); } int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, @@ -2785,12 +2932,13 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS) return -EBUSY; - if (set->flags & NFT_SET_MAP) { + if (binding->flags & NFT_SET_MAP) { /* If the set is already bound to the same chain all * jumps are already validated for that chain. 
*/ list_for_each_entry(i, &set->bindings, list) { - if (i->chain == binding->chain) + if (binding->flags & NFT_SET_MAP && + i->chain == binding->chain) goto bind; } @@ -2824,6 +2972,35 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, nf_tables_set_destroy(ctx, set); } +const struct nft_set_ext_type nft_set_ext_types[] = { + [NFT_SET_EXT_KEY] = { + .align = __alignof__(u32), + }, + [NFT_SET_EXT_DATA] = { + .align = __alignof__(u32), + }, + [NFT_SET_EXT_EXPR] = { + .align = __alignof__(struct nft_expr), + }, + [NFT_SET_EXT_FLAGS] = { + .len = sizeof(u8), + .align = __alignof__(u8), + }, + [NFT_SET_EXT_TIMEOUT] = { + .len = sizeof(u64), + .align = __alignof__(u64), + }, + [NFT_SET_EXT_EXPIRATION] = { + .len = sizeof(unsigned long), + .align = __alignof__(unsigned long), + }, + [NFT_SET_EXT_USERDATA] = { + .len = sizeof(struct nft_userdata), + .align = __alignof__(struct nft_userdata), + }, +}; +EXPORT_SYMBOL_GPL(nft_set_ext_types); + /* * Set elements */ @@ -2832,6 +3009,9 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, + [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -2870,6 +3050,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -2877,20 +3058,52 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, &elem->key, NFT_DATA_VALUE, - set->klen) < 0) + if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, 
nft_set_ext_key(ext), + NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; - if (set->flags & NFT_SET_MAP && - !(elem->flags & NFT_SET_ELEM_INTERVAL_END) && - nft_data_dump(skb, NFTA_SET_ELEM_DATA, &elem->data, + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, set->dlen) < 0) goto nla_put_failure; - if (elem->flags != 0) - if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(elem->flags))) + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) && + nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) + goto nla_put_failure; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, + htonl(*nft_set_ext_flags(ext)))) + goto nla_put_failure; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + cpu_to_be64(*nft_set_ext_timeout(ext)))) + goto nla_put_failure; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + unsigned long expires, now = jiffies; + + expires = *nft_set_ext_expiration(ext); + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + cpu_to_be64(jiffies_to_msecs(expires)))) + goto nla_put_failure; + } + + if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { + struct nft_userdata *udata; + + udata = nft_set_ext_userdata(ext); + if (nla_put(skb, NFTA_SET_ELEM_USERDATA, + udata->len + 1, udata->data)) goto nla_put_failure; + } nla_nest_end(skb, nest); return 0; @@ -3111,20 +3324,65 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, return trans; } +void *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *data, + u64 timeout, gfp_t gfp) +{ + struct nft_set_ext *ext; + void *elem; + + elem = kzalloc(set->ops->elemsize + tmpl->len, gfp); + if (elem == NULL) + return NULL; + + ext = 
nft_set_elem_ext(set, elem); + nft_set_ext_init(ext, tmpl); + + memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + memcpy(nft_set_ext_data(ext), data, set->dlen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) + *nft_set_ext_expiration(ext) = + jiffies + msecs_to_jiffies(timeout); + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) + *nft_set_ext_timeout(ext) = timeout; + + return elem; +} + +void nft_set_elem_destroy(const struct nft_set *set, void *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + + nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_uninit(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); + + kfree(elem); +} +EXPORT_SYMBOL_GPL(nft_set_elem_destroy); + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc d1, d2; + struct nft_set_ext_tmpl tmpl; + struct nft_set_ext *ext; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_userdata *udata; + struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u64 timeout; + u32 flags; + u8 ulen; int err; - if (set->size && set->nelems == set->size) - return -ENFILE; - err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, nft_set_elem_policy); if (err < 0) @@ -3133,38 +3391,59 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) return -EINVAL; - elem.flags = 0; + nft_set_ext_prepare(&tmpl); + + flags = 0; if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { - elem.flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); - if (elem.flags & ~NFT_SET_ELEM_INTERVAL_END) + flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); + if (flags & ~NFT_SET_ELEM_INTERVAL_END) return -EINVAL; + if (!(set->flags & 
NFT_SET_INTERVAL) && + flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); } if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && - !(elem.flags & NFT_SET_ELEM_INTERVAL_END)) + !(flags & NFT_SET_ELEM_INTERVAL_END)) return -EINVAL; if (nla[NFTA_SET_ELEM_DATA] != NULL && - elem.flags & NFT_SET_ELEM_INTERVAL_END) + flags & NFT_SET_ELEM_INTERVAL_END) return -EINVAL; } else { if (nla[NFTA_SET_ELEM_DATA] != NULL) return -EINVAL; } - err = nft_data_init(ctx, &elem.key, &d1, nla[NFTA_SET_ELEM_KEY]); + timeout = 0; + if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT])); + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = set->timeout; + } + + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; err = -EINVAL; if (d1.type != NFT_DATA_VALUE || d1.len != set->klen) goto err2; - err = -EEXIST; - if (set->ops->get(set, &elem) == 0) - goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len); + if (timeout > 0) { + nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); + if (timeout != set->timeout) + nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); + } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &elem.data, &d2, nla[NFTA_SET_ELEM_DATA]); + err = nft_data_init(ctx, &data, sizeof(data), &d2, + nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err2; @@ -3180,32 +3459,68 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, .chain = (struct nft_chain *)binding->chain, }; - err = nft_validate_data_load(&bind_ctx, dreg, - &elem.data, d2.type); + if (!(binding->flags & NFT_SET_MAP)) + continue; + + err = nft_validate_register_store(&bind_ctx, dreg, + &data, + d2.type, d2.len); if (err < 0) goto err3; } + + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); + } + + /* The full maximum length 
of userdata can exceed the maximum + * offset value (U8_MAX) for following extensions, therefor it + * must be the last extension added. + */ + ulen = 0; + if (nla[NFTA_SET_ELEM_USERDATA] != NULL) { + ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]); + if (ulen > 0) + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA, + ulen); + } + + err = -ENOMEM; + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, + timeout, GFP_KERNEL); + if (elem.priv == NULL) + goto err3; + + ext = nft_set_elem_ext(set, elem.priv); + if (flags) + *nft_set_ext_flags(ext) = flags; + if (ulen > 0) { + udata = nft_set_ext_userdata(ext); + udata->len = ulen - 1; + nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) - goto err3; + goto err4; + ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; err = set->ops->insert(set, &elem); if (err < 0) - goto err4; + goto err5; nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err4: +err5: kfree(trans); +err4: + kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_uninit(&elem.data, d2.type); + nft_data_uninit(&data, d2.type); err2: - nft_data_uninit(&elem.key, d1.type); + nft_data_uninit(&elem.key.val, d1.type); err1: return err; } @@ -3241,11 +3556,15 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + if (set->size && + !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) + return -ENFILE; + err = nft_add_set_elem(&ctx, set, attr); - if (err < 0) + if (err < 0) { + atomic_dec(&set->nelems); break; - - set->nelems++; + } } return err; } @@ -3268,7 +3587,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; - err = nft_data_init(ctx, &elem.key, &desc, nla[NFTA_SET_ELEM_KEY]); + err = 
nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, + nla[NFTA_SET_ELEM_KEY]); if (err < 0) goto err1; @@ -3276,21 +3596,26 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) goto err2; - err = set->ops->get(set, &elem); - if (err < 0) - goto err2; - trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) { err = -ENOMEM; goto err2; } + elem.priv = set->ops->deactivate(set, &elem); + if (elem.priv == NULL) { + err = -ENOENT; + goto err3; + } + nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; + +err3: + kfree(trans); err2: - nft_data_uninit(&elem.key, desc.type); + nft_data_uninit(&elem.key.val, desc.type); err1: return err; } @@ -3322,11 +3647,36 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, if (err < 0) break; - set->nelems--; + set->ndeact++; } return err; } +void nft_set_gc_batch_release(struct rcu_head *rcu) +{ + struct nft_set_gc_batch *gcb; + unsigned int i; + + gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); + for (i = 0; i < gcb->head.cnt; i++) + nft_set_elem_destroy(gcb->head.set, gcb->elems[i]); + kfree(gcb); +} +EXPORT_SYMBOL_GPL(nft_set_gc_batch_release); + +struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, + gfp_t gfp) +{ + struct nft_set_gc_batch *gcb; + + gcb = kzalloc(sizeof(*gcb), gfp); + if (gcb == NULL) + return gcb; + gcb->head.set = set; + return gcb; +} +EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); + static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { @@ -3526,6 +3876,10 @@ static void nf_tables_commit_release(struct nft_trans *trans) case NFT_MSG_DELSET: nft_set_destroy(nft_trans_set(trans)); break; + case NFT_MSG_DELSETELEM: + nft_set_elem_destroy(nft_trans_elem_set(trans), + nft_trans_elem(trans).priv); + break; } kfree(trans); } @@ -3540,7 +3894,7 @@ static int 
nf_tables_commit(struct sk_buff *skb) while (++net->nft.base_seq == 0); /* A new generation has just started */ - net->nft.gencursor = gencursor_next(net); + net->nft.gencursor = nft_gencursor_next(net); /* Make sure all packets have left the previous generation before * purging old rules. @@ -3611,24 +3965,23 @@ static int nf_tables_commit(struct sk_buff *skb) NFT_MSG_DELSET, GFP_KERNEL); break; case NFT_MSG_NEWSETELEM: - nf_tables_setelem_notify(&trans->ctx, - nft_trans_elem_set(trans), - &nft_trans_elem(trans), + te = (struct nft_trans_elem *)trans->data; + + te->set->ops->activate(te->set, &te->elem); + nf_tables_setelem_notify(&trans->ctx, te->set, + &te->elem, NFT_MSG_NEWSETELEM, 0); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; + nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_DELSETELEM, 0); - te->set->ops->get(te->set, &te->elem); - nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); - if (te->set->flags & NFT_SET_MAP && - !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) - nft_data_uninit(&te->elem.data, te->set->dtype); te->set->ops->remove(te->set, &te->elem); - nft_trans_destroy(trans); + atomic_dec(&te->set->nelems); + te->set->ndeact--; break; } } @@ -3660,6 +4013,10 @@ static void nf_tables_abort_release(struct nft_trans *trans) case NFT_MSG_NEWSET: nft_set_destroy(nft_trans_set(trans)); break; + case NFT_MSG_NEWSETELEM: + nft_set_elem_destroy(nft_trans_elem_set(trans), + nft_trans_elem(trans).priv); + break; } kfree(trans); } @@ -3728,18 +4085,17 @@ static int nf_tables_abort(struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - nft_trans_elem_set(trans)->nelems--; te = (struct nft_trans_elem *)trans->data; - te->set->ops->get(te->set, &te->elem); - nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); - if (te->set->flags & NFT_SET_MAP && - !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) - nft_data_uninit(&te->elem.data, te->set->dtype); + 
te->set->ops->remove(te->set, &te->elem); - nft_trans_destroy(trans); + atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: - nft_trans_elem_set(trans)->nelems++; + te = (struct nft_trans_elem *)trans->data; + + te->set->ops->activate(te->set, &te->elem); + te->set->ndeact--; + nft_trans_destroy(trans); break; } @@ -3814,13 +4170,18 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, const struct nft_set_elem *elem) { - if (elem->flags & NFT_SET_ELEM_INTERVAL_END) + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_data *data; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; - switch (elem->data.verdict) { + data = nft_set_ext_data(ext); + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - return nf_tables_check_loops(ctx, elem->data.chain); + return nf_tables_check_loops(ctx, data->verdict.chain); default: return 0; } @@ -3853,10 +4214,11 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, if (data == NULL) continue; - switch (data->verdict) { + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - err = nf_tables_check_loops(ctx, data->chain); + err = nf_tables_check_loops(ctx, + data->verdict.chain); if (err < 0) return err; default: @@ -3871,7 +4233,8 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; list_for_each_entry(binding, &set->bindings, list) { - if (binding->chain != chain) + if (!(binding->flags & NFT_SET_MAP) || + binding->chain != chain) continue; iter.skip = 0; @@ -3889,85 +4252,129 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, } /** - * nft_validate_input_register - validate an expressions' input register + * nft_parse_register - parse a register value from a netlink attribute * - * @reg: the register number + * @attr: netlink attribute * - * Validate that the input register is one of the general purpose - * 
registers. + * Parse and translate a register value from a netlink attribute. + * Registers used to be 128 bit wide, these register numbers will be + * mapped to the corresponding 32 bit register numbers. */ -int nft_validate_input_register(enum nft_registers reg) +unsigned int nft_parse_register(const struct nlattr *attr) { - if (reg <= NFT_REG_VERDICT) - return -EINVAL; - if (reg > NFT_REG_MAX) - return -ERANGE; - return 0; + unsigned int reg; + + reg = ntohl(nla_get_be32(attr)); + switch (reg) { + case NFT_REG_VERDICT...NFT_REG_4: + return reg * NFT_REG_SIZE / NFT_REG32_SIZE; + default: + return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00; + } +} +EXPORT_SYMBOL_GPL(nft_parse_register); + +/** + * nft_dump_register - dump a register value to a netlink attribute + * + * @skb: socket buffer + * @attr: attribute number + * @reg: register number + * + * Construct a netlink attribute containing the register number. For + * compatibility reasons, register numbers being a multiple of 4 are + * translated to the corresponding 128 bit register numbers. + */ +int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg) +{ + if (reg % (NFT_REG_SIZE / NFT_REG32_SIZE) == 0) + reg = reg / (NFT_REG_SIZE / NFT_REG32_SIZE); + else + reg = reg - NFT_REG_SIZE / NFT_REG32_SIZE + NFT_REG32_00; + + return nla_put_be32(skb, attr, htonl(reg)); } -EXPORT_SYMBOL_GPL(nft_validate_input_register); +EXPORT_SYMBOL_GPL(nft_dump_register); /** - * nft_validate_output_register - validate an expressions' output register + * nft_validate_register_load - validate a load from a register * * @reg: the register number + * @len: the length of the data * - * Validate that the output register is one of the general purpose - * registers or the verdict register. + * Validate that the input register is one of the general purpose + * registers and that the length of the load is within the bounds. 
*/ -int nft_validate_output_register(enum nft_registers reg) +int nft_validate_register_load(enum nft_registers reg, unsigned int len) { - if (reg < NFT_REG_VERDICT) + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) + return -EINVAL; + if (len == 0) return -EINVAL; - if (reg > NFT_REG_MAX) + if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data)) return -ERANGE; + return 0; } -EXPORT_SYMBOL_GPL(nft_validate_output_register); +EXPORT_SYMBOL_GPL(nft_validate_register_load); /** - * nft_validate_data_load - validate an expressions' data load + * nft_validate_register_store - validate an expressions' register store * * @ctx: context of the expression performing the load * @reg: the destination register number * @data: the data to load * @type: the data type + * @len: the length of the data * * Validate that a data load uses the appropriate data type for - * the destination register. A value of NULL for the data means - * that its runtime gathered data, which is always of type - * NFT_DATA_VALUE. + * the destination register and the length is within the bounds. + * A value of NULL for the data means that its runtime gathered + * data. 
*/ -int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg, - const struct nft_data *data, - enum nft_data_types type) +int nft_validate_register_store(const struct nft_ctx *ctx, + enum nft_registers reg, + const struct nft_data *data, + enum nft_data_types type, unsigned int len) { int err; switch (reg) { case NFT_REG_VERDICT: - if (data == NULL || type != NFT_DATA_VERDICT) + if (type != NFT_DATA_VERDICT) return -EINVAL; - if (data->verdict == NFT_GOTO || data->verdict == NFT_JUMP) { - err = nf_tables_check_loops(ctx, data->chain); + if (data != NULL && + (data->verdict.code == NFT_GOTO || + data->verdict.code == NFT_JUMP)) { + err = nf_tables_check_loops(ctx, data->verdict.chain); if (err < 0) return err; - if (ctx->chain->level + 1 > data->chain->level) { + if (ctx->chain->level + 1 > + data->verdict.chain->level) { if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE) return -EMLINK; - data->chain->level = ctx->chain->level + 1; + data->verdict.chain->level = ctx->chain->level + 1; } } return 0; default: + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) + return -EINVAL; + if (len == 0) + return -EINVAL; + if (reg * NFT_REG32_SIZE + len > + FIELD_SIZEOF(struct nft_regs, data)) + return -ERANGE; + if (data != NULL && type != NFT_DATA_VALUE) return -EINVAL; return 0; } } -EXPORT_SYMBOL_GPL(nft_validate_data_load); +EXPORT_SYMBOL_GPL(nft_validate_register_store); static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, @@ -3988,11 +4395,11 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CODE]) return -EINVAL; - data->verdict = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); + data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); - switch (data->verdict) { + switch (data->verdict.code) { default: - switch (data->verdict & NF_VERDICT_MASK) { + switch (data->verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_DROP: 
case NF_QUEUE: @@ -4004,7 +4411,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_CONTINUE: case NFT_BREAK: case NFT_RETURN: - desc->len = sizeof(data->verdict); break; case NFT_JUMP: case NFT_GOTO: @@ -4018,21 +4424,21 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, return -EOPNOTSUPP; chain->use++; - data->chain = chain; - desc->len = sizeof(data); + data->verdict.chain = chain; break; } + desc->len = sizeof(data->verdict); desc->type = NFT_DATA_VERDICT; return 0; } static void nft_verdict_uninit(const struct nft_data *data) { - switch (data->verdict) { + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->chain->use--; + data->verdict.chain->use--; break; } } @@ -4045,13 +4451,14 @@ static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data) if (!nest) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict))) + if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict.code))) goto nla_put_failure; - switch (data->verdict) { + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - if (nla_put_string(skb, NFTA_VERDICT_CHAIN, data->chain->name)) + if (nla_put_string(skb, NFTA_VERDICT_CHAIN, + data->verdict.chain->name)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -4061,7 +4468,8 @@ nla_put_failure: return -1; } -static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, +static int nft_value_init(const struct nft_ctx *ctx, + struct nft_data *data, unsigned int size, struct nft_data_desc *desc, const struct nlattr *nla) { unsigned int len; @@ -4069,10 +4477,10 @@ static int nft_value_init(const struct nft_ctx *ctx, struct nft_data *data, len = nla_len(nla); if (len == 0) return -EINVAL; - if (len > sizeof(data->data)) + if (len > size) return -EOVERFLOW; - nla_memcpy(data->data, nla, sizeof(data->data)); + nla_memcpy(data->data, nla, len); desc->type = NFT_DATA_VALUE; desc->len = len; return 0; 
@@ -4085,8 +4493,7 @@ static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data, } static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { - [NFTA_DATA_VALUE] = { .type = NLA_BINARY, - .len = FIELD_SIZEOF(struct nft_data, data) }, + [NFTA_DATA_VALUE] = { .type = NLA_BINARY }, [NFTA_DATA_VERDICT] = { .type = NLA_NESTED }, }; @@ -4095,6 +4502,7 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * * @ctx: context of the expression using the data * @data: destination struct nft_data + * @size: maximum data length * @desc: data description * @nla: netlink attribute containing data * @@ -4104,7 +4512,8 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = { * The caller can indicate that it only wants to accept data of type * NFT_DATA_VALUE by passing NULL for the ctx argument. */ -int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, +int nft_data_init(const struct nft_ctx *ctx, + struct nft_data *data, unsigned int size, struct nft_data_desc *desc, const struct nlattr *nla) { struct nlattr *tb[NFTA_DATA_MAX + 1]; @@ -4115,7 +4524,8 @@ int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, return err; if (tb[NFTA_DATA_VALUE]) - return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]); + return nft_value_init(ctx, data, size, desc, + tb[NFTA_DATA_VALUE]); if (tb[NFTA_DATA_VERDICT] && ctx != NULL) return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]); return -EINVAL; @@ -4133,9 +4543,9 @@ EXPORT_SYMBOL_GPL(nft_data_init); */ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) { - switch (type) { - case NFT_DATA_VALUE: + if (type < NFT_DATA_VERDICT) return; + switch (type) { case NFT_DATA_VERDICT: return nft_verdict_uninit(data); default: diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 2d298dccb6dd..f77bad46ac68 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -8,6 +8,7 @@ 
* Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> @@ -21,24 +22,66 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> +enum nft_trace { + NFT_TRACE_RULE, + NFT_TRACE_RETURN, + NFT_TRACE_POLICY, +}; + +static const char *const comments[] = { + [NFT_TRACE_RULE] = "rule", + [NFT_TRACE_RETURN] = "return", + [NFT_TRACE_POLICY] = "policy", +}; + +static struct nf_loginfo trace_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = LOGLEVEL_WARNING, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static void __nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) +{ + struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); + + nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", + chain->table->name, chain->name, comments[type], + rulenum); +} + +static inline void nft_trace_packet(const struct nft_pktinfo *pkt, + const struct nft_chain *chain, + int rulenum, enum nft_trace type) +{ + if (unlikely(pkt->skb->nf_trace)) + __nft_trace_packet(pkt, chain, rulenum, type); +} + static void nft_cmp_fast_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1]) + struct nft_regs *regs) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); u32 mask = nft_cmp_fast_mask(priv->len); - if ((data[priv->sreg].data[0] & mask) == priv->data) + if ((regs->data[priv->sreg] & mask) == priv->data) return; - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static bool nft_payload_fast_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - struct nft_data *dest = &data[priv->dreg]; + 
u32 *dest = &regs->data[priv->dreg]; unsigned char *ptr; if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) @@ -51,12 +94,13 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) return false; + *dest = 0; if (priv->len == 2) - *(u16 *)dest->data = *(u16 *)ptr; + *(u16 *)dest = *(u16 *)ptr; else if (priv->len == 4) - *(u32 *)dest->data = *(u32 *)ptr; + *(u32 *)dest = *(u32 *)ptr; else - *(u8 *)dest->data = *(u8 *)ptr; + *(u8 *)dest = *(u8 *)ptr; return true; } @@ -66,62 +110,30 @@ struct nft_jumpstack { int rulenum; }; -enum nft_trace { - NFT_TRACE_RULE, - NFT_TRACE_RETURN, - NFT_TRACE_POLICY, -}; - -static const char *const comments[] = { - [NFT_TRACE_RULE] = "rule", - [NFT_TRACE_RETURN] = "return", - [NFT_TRACE_POLICY] = "policy", -}; - -static struct nf_loginfo trace_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = 4, - .logflags = NF_LOG_MASK, - }, - }, -}; - -static void nft_trace_packet(const struct nft_pktinfo *pkt, - const struct nft_chain *chain, - int rulenum, enum nft_trace type) -{ - struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - - nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, - pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", - chain->table->name, chain->name, comments[type], - rulenum); -} - unsigned int nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { const struct nft_chain *chain = ops->priv, *basechain = chain; + const struct net *chain_net = read_pnet(&nft_base_chain(basechain)->pnet); + const struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); const struct nft_rule *rule; const struct nft_expr *expr, *last; - struct nft_data data[NFT_REG_MAX + 1]; + struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; struct nft_stats *stats; int rulenum; - /* - * Cache cursor to avoid problems in case that the cursor is updated - * while traversing the ruleset. 
- */ - unsigned int gencursor = ACCESS_ONCE(chain->net->nft.gencursor); + unsigned int gencursor = nft_genmask_cur(net); + + /* Ignore chains that are not for the current network namespace */ + if (!net_eq(net, chain_net)) + return NF_ACCEPT; do_chain: rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); next_rule: - data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + regs.verdict.code = NFT_CONTINUE; list_for_each_entry_continue_rcu(rule, &chain->rules, list) { /* This rule is not active, skip. */ @@ -132,62 +144,52 @@ next_rule: nft_rule_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) - nft_cmp_fast_eval(expr, data); + nft_cmp_fast_eval(expr, &regs); else if (expr->ops != &nft_payload_fast_ops || - !nft_payload_fast_eval(expr, data, pkt)) - expr->ops->eval(expr, data, pkt); + !nft_payload_fast_eval(expr, &regs, pkt)) + expr->ops->eval(expr, &regs, pkt); - if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE) + if (regs.verdict.code != NFT_CONTINUE) break; } - switch (data[NFT_REG_VERDICT].verdict) { + switch (regs.verdict.code) { case NFT_BREAK: - data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + regs.verdict.code = NFT_CONTINUE; continue; case NFT_CONTINUE: - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); continue; } break; } - switch (data[NFT_REG_VERDICT].verdict & NF_VERDICT_MASK) { + switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_DROP: case NF_QUEUE: - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); - - return data[NFT_REG_VERDICT].verdict; + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + return regs.verdict.code; } - switch (data[NFT_REG_VERDICT].verdict) { + switch (regs.verdict.code) { case NFT_JUMP: - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); - BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); jumpstack[stackptr].chain 
= chain; jumpstack[stackptr].rule = rule; jumpstack[stackptr].rulenum = rulenum; stackptr++; - chain = data[NFT_REG_VERDICT].chain; - goto do_chain; + /* fall through */ case NFT_GOTO: - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); - chain = data[NFT_REG_VERDICT].chain; + chain = regs.verdict.chain; goto do_chain; - case NFT_RETURN: - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); - break; case NFT_CONTINUE: - if (unlikely(pkt->skb->nf_trace && !(chain->flags & NFT_BASE_CHAIN))) - nft_trace_packet(pkt, chain, ++rulenum, NFT_TRACE_RETURN); + rulenum++; + /* fall through */ + case NFT_RETURN: + nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); break; default: WARN_ON(1); @@ -201,8 +203,7 @@ next_rule: goto next_rule; } - if (unlikely(pkt->skb->nf_trace)) - nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); rcu_read_lock_bh(); stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); @@ -244,8 +245,14 @@ int __init nf_tables_core_module_init(void) if (err < 0) goto err6; + err = nft_dynset_module_init(); + if (err < 0) + goto err7; + return 0; +err7: + nft_payload_module_exit(); err6: nft_byteorder_module_exit(); err5: @@ -262,6 +269,7 @@ err1: void nf_tables_core_module_exit(void) { + nft_dynset_module_exit(); nft_payload_module_exit(); nft_byteorder_module_exit(); nft_bitwise_module_exit(); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c new file mode 100644 index 000000000000..2cae4d4a03b7 --- /dev/null +++ b/net/netfilter/nf_tables_netdev.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software 
Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <net/netfilter/nf_tables.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> + +static inline void +nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, + const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct iphdr *iph, _iph; + u32 len, thoff; + + nft_set_pktinfo(pkt, ops, skb, state); + + iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), + &_iph); + if (!iph) + return; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return; + + len = ntohs(iph->tot_len); + thoff = iph->ihl * 4; + if (skb->len < len) + return; + else if (len < thoff) + return; + + pkt->tprot = iph->protocol; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; +} + +static inline void +__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + unsigned int thoff = 0; + unsigned short frag_off; + int protohdr; + u32 pkt_len; + + ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), + &_ip6h); + if (!ip6h) + return; + + if (ip6h->version != 6) + return; + + pkt_len = ntohs(ip6h->payload_len); + if (pkt_len + sizeof(*ip6h) > skb->len) + return; + + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + if (protohdr < 0) + return; + + pkt->tprot = protohdr; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = frag_off; +#endif +} + +static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + nft_set_pktinfo(pkt, ops, skb, state); + __nft_netdev_set_pktinfo_ipv6(pkt, ops, skb, state); +} + +static unsigned int 
+nft_do_chain_netdev(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nft_netdev_set_pktinfo_ipv4(&pkt, ops, skb, state); + break; + case htons(ETH_P_IPV6): + nft_netdev_set_pktinfo_ipv6(&pkt, ops, skb, state); + break; + default: + nft_set_pktinfo(&pkt, ops, skb, state); + break; + } + + return nft_do_chain(&pkt, ops); +} + +static struct nft_af_info nft_af_netdev __read_mostly = { + .family = NFPROTO_NETDEV, + .nhooks = NF_NETDEV_NUMHOOKS, + .owner = THIS_MODULE, + .flags = NFT_AF_NEEDS_DEV, + .nops = 1, + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, +}; + +static int nf_tables_netdev_init_net(struct net *net) +{ + net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.netdev == NULL) + return -ENOMEM; + + memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev)); + + if (nft_register_afinfo(net, net->nft.netdev) < 0) + goto err; + + return 0; +err: + kfree(net->nft.netdev); + return -ENOMEM; +} + +static void nf_tables_netdev_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.netdev); + kfree(net->nft.netdev); +} + +static struct pernet_operations nf_tables_netdev_net_ops = { + .init = nf_tables_netdev_init_net, + .exit = nf_tables_netdev_exit_net, +}; + +static const struct nf_chain_type nft_filter_chain_netdev = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_NETDEV, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_NETDEV_INGRESS), +}; + +static void nft_netdev_event(unsigned long event, struct nft_af_info *afi, + struct net_device *dev, struct nft_table *table, + struct nft_base_chain *basechain) +{ + switch (event) { + case NETDEV_REGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED)); + + dev_hold(dev); + basechain->ops[0].dev = dev; + basechain->flags &= 
~NFT_BASECHAIN_DISABLED; + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_register_basechain(basechain, afi->nops); + break; + case NETDEV_UNREGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED); + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nft_unregister_basechain(basechain, afi->nops); + + dev_put(basechain->ops[0].dev); + basechain->ops[0].dev = NULL; + basechain->flags |= NFT_BASECHAIN_DISABLED; + break; + case NETDEV_CHANGENAME: + if (dev->ifindex != basechain->ops[0].dev->ifindex) + return; + + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + break; + } +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_af_info *afi; + struct nft_table *table; + struct nft_chain *chain; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { + if (afi->family != NFPROTO_NETDEV) + continue; + + list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry(chain, &table->chains, list) { + if (!(chain->flags & NFT_BASE_CHAIN)) + continue; + + nft_netdev_event(event, afi, dev, table, + nft_base_chain(chain)); + } + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_netdev_notifier = { + .notifier_call = nf_tables_netdev_event, +}; + +static int __init nf_tables_netdev_init(void) +{ + int ret; + + nft_register_chain_type(&nft_filter_chain_netdev); + ret = register_pernet_subsys(&nf_tables_netdev_net_ops); + if (ret < 0) + nft_unregister_chain_type(&nft_filter_chain_netdev); + + register_netdevice_notifier(&nf_tables_netdev_notifier); + + return ret; +} + +static void __exit nf_tables_netdev_exit(void) +{ + unregister_netdevice_notifier(&nf_tables_netdev_notifier); + unregister_pernet_subsys(&nf_tables_netdev_net_ops); + 
nft_unregister_chain_type(&nft_filter_chain_netdev); +} + +module_init(nf_tables_netdev_init); +module_exit(nf_tables_netdev_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_FAMILY(5); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 8b117c90ecd7..0c0e8ecf02ab 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -269,6 +269,12 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) } } +enum { + NFNL_BATCH_FAILURE = (1 << 0), + NFNL_BATCH_DONE = (1 << 1), + NFNL_BATCH_REPLAY = (1 << 2), +}; + static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, u_int16_t subsys_id) { @@ -276,13 +282,15 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; - bool success = true, done = false; static LIST_HEAD(err_list); + u32 status; int err; if (subsys_id >= NFNL_SUBSYS_COUNT) return netlink_ack(skb, nlh, -EINVAL); replay: + status = 0; + skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) return netlink_ack(oskb, nlh, -ENOMEM); @@ -336,10 +344,10 @@ replay: if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ nfnl_err_reset(&err_list); - success = false; + status |= NFNL_BATCH_FAILURE; goto done; } else if (type == NFNL_MSG_BATCH_END) { - done = true; + status |= NFNL_BATCH_DONE; goto done; } else if (type < NLMSG_MIN_TYPE) { err = -EINVAL; @@ -382,11 +390,8 @@ replay: * original skb. 
*/ if (err == -EAGAIN) { - nfnl_err_reset(&err_list); - ss->abort(oskb); - nfnl_unlock(subsys_id); - kfree_skb(skb); - goto replay; + status |= NFNL_BATCH_REPLAY; + goto next; } } ack: @@ -402,7 +407,7 @@ ack: */ nfnl_err_reset(&err_list); netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); - success = false; + status |= NFNL_BATCH_FAILURE; goto done; } /* We don't stop processing the batch on errors, thus, @@ -410,19 +415,26 @@ ack: * triggers. */ if (err) - success = false; + status |= NFNL_BATCH_FAILURE; } - +next: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } done: - if (success && done) + if (status & NFNL_BATCH_REPLAY) { + ss->abort(oskb); + nfnl_err_reset(&err_list); + nfnl_unlock(subsys_id); + kfree_skb(skb); + goto replay; + } else if (status == NFNL_BATCH_DONE) { ss->commit(oskb); - else + } else { ss->abort(oskb); + } nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 11d85b3813f2..4670821b569d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -23,6 +23,7 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/netfilter.h> +#include <linux/netfilter_bridge.h> #include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_log.h> @@ -62,7 +63,7 @@ struct nfulnl_instance { struct timer_list timer; struct net *net; struct user_namespace *peer_user_ns; /* User namespace of the peer process */ - int peer_portid; /* PORTID of the peer process */ + u32 peer_portid; /* PORTID of the peer process */ /* configurable parameters */ unsigned int flushtimeout; /* timeout until queue flush */ @@ -151,7 +152,7 @@ static void nfulnl_timer(unsigned long data); static struct nfulnl_instance * instance_create(struct net *net, u_int16_t group_num, - int portid, struct user_namespace *user_ns) + u32 portid, struct user_namespace *user_ns) { struct 
nfulnl_instance *inst; struct nfnl_log_net *log = nfnl_log_pernet(net); @@ -448,14 +449,18 @@ __build_packet_message(struct nfnl_log_net *log, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { + struct net_device *physindev; + /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; - if (skb->nf_bridge && skb->nf_bridge->physindev && + + physindev = nf_bridge_get_physindev(skb); + if (physindev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, - htonl(skb->nf_bridge->physindev->ifindex))) + htonl(physindev->ifindex))) goto nla_put_failure; } #endif @@ -479,14 +484,18 @@ __build_packet_message(struct nfnl_log_net *log, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { + struct net_device *physoutdev; + /* Case 2: indev is a bridge group, we need to look * for physical device (when called from ipv4) */ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; - if (skb->nf_bridge && skb->nf_bridge->physoutdev && + + physoutdev = nf_bridge_get_physoutdev(skb); + if (physoutdev && nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, - htonl(skb->nf_bridge->physoutdev->ifindex))) + htonl(physoutdev->ifindex))) goto nla_put_failure; } #endif @@ -539,7 +548,7 @@ __build_packet_message(struct nfnl_log_net *log, /* UID */ sk = skb->sk; - if (sk && sk->sk_state != TCP_TIME_WAIT) { + if (sk && sk_fullsock(sk)) { read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { struct file *file = sk->sk_socket->file; @@ -589,8 +598,6 @@ nla_put_failure: return -1; } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_ULOG, .u = { @@ -998,11 +1005,13 @@ static int seq_show(struct seq_file *s, void *v) { const struct nfulnl_instance 
*inst = v; - return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", - inst->group_num, - inst->peer_portid, inst->qlen, - inst->copy_mode, inst->copy_range, - inst->flushtimeout, atomic_read(&inst->use)); + seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n", + inst->group_num, + inst->peer_portid, inst->qlen, + inst->copy_mode, inst->copy_range, + inst->flushtimeout, atomic_read(&inst->use)); + + return 0; } static const struct seq_operations nful_seq_ops = { @@ -1062,7 +1071,13 @@ static struct pernet_operations nfnl_log_net_ops = { static int __init nfnetlink_log_init(void) { - int status = -ENOMEM; + int status; + + status = register_pernet_subsys(&nfnl_log_net_ops); + if (status < 0) { + pr_err("failed to register pernet ops\n"); + goto out; + } netlink_register_notifier(&nfulnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfulnl_subsys); @@ -1077,28 +1092,23 @@ static int __init nfnetlink_log_init(void) goto cleanup_subsys; } - status = register_pernet_subsys(&nfnl_log_net_ops); - if (status < 0) { - pr_err("failed to register pernet ops\n"); - goto cleanup_logger; - } return status; -cleanup_logger: - nf_log_unregister(&nfulnl_logger); cleanup_subsys: nfnetlink_subsys_unregister(&nfulnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfulnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_log_net_ops); +out: return status; } static void __exit nfnetlink_log_fini(void) { - unregister_pernet_subsys(&nfnl_log_net_ops); nf_log_unregister(&nfulnl_logger); nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_log_net_ops); } MODULE_DESCRIPTION("netfilter userspace logging"); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 0db8515e76da..685cc6a17163 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -25,6 +25,7 @@ #include <linux/proc_fs.h> #include <linux/netfilter_ipv4.h> #include 
<linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_queue.h> #include <linux/list.h> @@ -54,7 +55,7 @@ struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; - int peer_portid; + u32 peer_portid; unsigned int queue_maxlen; unsigned int copy_range; unsigned int queue_dropped; @@ -109,8 +110,7 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) } static struct nfqnl_instance * -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, - int portid) +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) { struct nfqnl_instance *inst; unsigned int h; @@ -257,7 +257,7 @@ static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { const struct cred *cred; - if (sk->sk_state == TCP_TIME_WAIT) + if (!sk_fullsock(sk)) return 0; read_lock_bh(&sk->sk_callback_lock); @@ -278,6 +278,23 @@ nla_put_failure: return -1; } +static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +{ + u32 seclen = 0; +#if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) + return 0; + + read_lock_bh(&skb->sk->sk_callback_lock); + + if (skb->secmark) + security_secid_to_secctx(skb->secmark, secdata, &seclen); + + read_unlock_bh(&skb->sk->sk_callback_lock); +#endif + return seclen; +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -297,6 +314,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_conn *ct = NULL; enum ip_conntrack_info uninitialized_var(ctinfo); bool csum_verify; + char *secdata = NULL; + u32 seclen = 0; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) @@ -314,13 +333,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (entskb->tstamp.tv64) size += 
nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); - if (entry->hook <= NF_INET_FORWARD || - (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) + if (entry->state.hook <= NF_INET_FORWARD || + (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); else csum_verify = false; - outdev = entry->outdev; + outdev = entry->state.out; switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: @@ -352,6 +371,12 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + nla_total_size(sizeof(u_int32_t))); /* gid */ } + if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { + seclen = nfqnl_get_sk_secctx(entskb, &secdata); + if (seclen) + size += nla_total_size(seclen); + } + skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, GFP_ATOMIC); if (!skb) { @@ -368,23 +393,23 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, return NULL; } nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->pf; + nfmsg->nfgen_family = entry->state.pf; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(queue->queue_num); nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); pmsg->hw_protocol = entskb->protocol; - pmsg->hook = entry->hook; + pmsg->hook = entry->state.hook; *packet_id_ptr = &pmsg->packet_id; - indev = entry->indev; + indev = entry->state.in; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else - if (entry->pf == PF_BRIDGE) { + if (entry->state.pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ @@ -396,14 +421,18 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { + int physinif; + /* Case 2: indev is bridge group, we 
need to look for * physical device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; - if (entskb->nf_bridge && entskb->nf_bridge->physindev && + + physinif = nf_bridge_get_physinif(entskb); + if (physinif && nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, - htonl(entskb->nf_bridge->physindev->ifindex))) + htonl(physinif))) goto nla_put_failure; } #endif @@ -414,7 +443,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else - if (entry->pf == PF_BRIDGE) { + if (entry->state.pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ @@ -426,14 +455,18 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { + int physoutif; + /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; - if (entskb->nf_bridge && entskb->nf_bridge->physoutdev && + + physoutif = nf_bridge_get_physoutif(entskb); + if (physoutif && nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, - htonl(entskb->nf_bridge->physoutdev->ifindex))) + htonl(physoutif))) goto nla_put_failure; } #endif @@ -471,6 +504,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; + if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + goto nla_put_failure; + if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0) goto nla_put_failure; @@ -633,8 +669,8 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) struct nfqnl_instance *queue; struct sk_buff *skb, *segs; int err = -ENOBUFS; - struct net *net = dev_net(entry->indev ? 
- entry->indev : entry->outdev); + struct net *net = dev_net(entry->state.in ? + entry->state.in : entry->state.out); struct nfnl_queue_net *q = nfnl_queue_pernet(net); /* rcu_read_lock()ed by nf_hook_slow() */ @@ -647,7 +683,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) skb = entry->skb; - switch (entry->pf) { + switch (entry->state.pf) { case NFPROTO_IPV4: skb->protocol = htons(ETH_P_IP); break; @@ -757,19 +793,20 @@ nfqnl_set_mode(struct nfqnl_instance *queue, static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { - if (entry->indev) - if (entry->indev->ifindex == ifindex) + if (entry->state.in) + if (entry->state.in->ifindex == ifindex) return 1; - if (entry->outdev) - if (entry->outdev->ifindex == ifindex) + if (entry->state.out) + if (entry->state.out->ifindex == ifindex) return 1; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { - if (entry->skb->nf_bridge->physindev && - entry->skb->nf_bridge->physindev->ifindex == ifindex) - return 1; - if (entry->skb->nf_bridge->physoutdev && - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + int physinif, physoutif; + + physinif = nf_bridge_get_physinif(entry->skb); + physoutif = nf_bridge_get_physoutif(entry->skb); + + if (physinif == ifindex || physoutif == ifindex) return 1; } #endif @@ -797,8 +834,6 @@ nfqnl_dev_drop(struct net *net, int ifindex) rcu_read_unlock(); } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -815,6 +850,27 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +{ + return entry->elem == (struct nf_hook_ops *)ops_ptr; +} + +static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +{ + struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int i; + + 
rcu_read_lock(); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct nfqnl_instance *inst; + struct hlist_head *head = &q->instance_table[i]; + + hlist_for_each_entry_rcu(inst, head, hlist) + nfqnl_flush(inst, nf_hook_cmp, (unsigned long)hook); + } + rcu_read_unlock(); +} + static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -860,7 +916,7 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { }; static struct nfqnl_instance * -verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid) +verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) { struct nfqnl_instance *queue; @@ -1022,7 +1078,8 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { }; static const struct nf_queue_handler nfqh = { - .outfn = &nfqnl_enqueue_packet, + .outfn = &nfqnl_enqueue_packet, + .nf_hook_drop = &nfqnl_nf_hook_drop, }; static int @@ -1133,7 +1190,12 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ret = -EOPNOTSUPP; goto err_out_unlock; } - +#if !IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (flags & mask & NFQA_CFG_F_SECCTX) { + ret = -EOPNOTSUPP; + goto err_out_unlock; + } +#endif spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; @@ -1242,13 +1304,13 @@ static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; - seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); - return seq_has_overflowed(s); + return 0; } static const struct seq_operations nfqnl_seq_ops = { @@ -1308,7 +1370,13 @@ static struct pernet_operations nfnl_queue_net_ops = { static int __init nfnetlink_queue_init(void) { - int status = -ENOMEM; + int status; + + status = register_pernet_subsys(&nfnl_queue_net_ops); + 
if (status < 0) { + pr_err("nf_queue: failed to register pernet ops\n"); + goto out; + } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); @@ -1317,19 +1385,13 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_notifier; } - status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); - goto cleanup_subsys; - } register_netdevice_notifier(&nfqnl_dev_notifier); nf_register_queue_handler(&nfqh); return status; -cleanup_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); +out: return status; } @@ -1337,9 +1399,9 @@ static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); - unregister_pernet_subsys(&nfnl_queue_net_ops); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 4fb6ee2c1106..d71cc18fa35d 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -26,18 +26,16 @@ struct nft_bitwise { }; static void nft_bitwise_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const struct nft_data *src = &data[priv->sreg]; - struct nft_data *dst = &data[priv->dreg]; + const u32 *src = &regs->data[priv->sreg]; + u32 *dst = &regs->data[priv->dreg]; unsigned int i; - for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) { - dst->data[i] = (src->data[i] & priv->mask.data[i]) ^ - priv->xor.data[i]; - } + for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++) + dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i]; } static const
struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { @@ -63,28 +61,27 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_XOR] == NULL) return -EINVAL; - priv->sreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_SREG])); - err = nft_validate_input_register(priv->sreg); + priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); + err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) return err; - priv->dreg = ntohl(nla_get_be32(tb[NFTA_BITWISE_DREG])); - err = nft_validate_output_register(priv->dreg); + priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]); + err = nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, priv->len); if (err < 0) return err; - err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); - if (err < 0) - return err; - - priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); - err = nft_data_init(NULL, &priv->mask, &d1, tb[NFTA_BITWISE_MASK]); + err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, + tb[NFTA_BITWISE_MASK]); if (err < 0) return err; if (d1.len != priv->len) return -EINVAL; - err = nft_data_init(NULL, &priv->xor, &d2, tb[NFTA_BITWISE_XOR]); + err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, + tb[NFTA_BITWISE_XOR]); if (err < 0) return err; if (d2.len != priv->len) @@ -97,9 +94,9 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_bitwise *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_BITWISE_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_BITWISE_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len))) goto nla_put_failure; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index c39ed8d29df1..fde5145f2e36 
100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -26,16 +26,17 @@ struct nft_byteorder { }; static void nft_byteorder_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_byteorder *priv = nft_expr_priv(expr); - struct nft_data *src = &data[priv->sreg], *dst = &data[priv->dreg]; + u32 *src = &regs->data[priv->sreg]; + u32 *dst = &regs->data[priv->dreg]; union { u32 u32; u16 u16; } *s, *d; unsigned int i; - s = (void *)src->data; - d = (void *)dst->data; + s = (void *)src; + d = (void *)dst; switch (priv->size) { case 4: @@ -87,19 +88,6 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, tb[NFTA_BYTEORDER_OP] == NULL) return -EINVAL; - priv->sreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SREG])); - err = nft_validate_input_register(priv->sreg); - if (err < 0) - return err; - - priv->dreg = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); - if (err < 0) - return err; - priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP])); switch (priv->op) { case NFT_BYTEORDER_NTOH: @@ -109,10 +97,6 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); - if (priv->len == 0 || priv->len > FIELD_SIZEOF(struct nft_data, data)) - return -EINVAL; - priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); switch (priv->size) { case 2: @@ -122,16 +106,24 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - return 0; + priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); + priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); + err = nft_validate_register_load(priv->sreg, priv->len); + if (err < 0) + return err; + + priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]); + return
nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, priv->len); } static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_byteorder *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_BYTEORDER_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_BYTEORDER_SREG, priv->sreg)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_BYTEORDER_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_BYTEORDER_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op))) goto nla_put_failure; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index e2b3f51c81f1..e25b35d70e4d 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -25,13 +25,13 @@ struct nft_cmp_expr { }; static void nft_cmp_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); int d; - d = nft_data_cmp(&data[priv->sreg], &priv->data, priv->len); + d = memcmp(&regs->data[priv->sreg], &priv->data, priv->len); switch (priv->op) { case NFT_CMP_EQ: if (d != 0) @@ -59,7 +59,7 @@ static void nft_cmp_eval(const struct nft_expr *expr, return; mismatch: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = { @@ -75,12 +75,16 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct nft_data_desc desc; int err; - priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); - priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); - - err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, + tb[NFTA_CMP_DATA]); BUG_ON(err < 0); + priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); + err = nft_validate_register_load(priv->sreg, desc.len); + if (err < 0) + return err; + +
priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); priv->len = desc.len; return 0; } @@ -89,7 +93,7 @@ static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op))) goto nla_put_failure; @@ -122,13 +126,18 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, u32 mask; int err; - priv->sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); - - err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, sizeof(data), &desc, + tb[NFTA_CMP_DATA]); BUG_ON(err < 0); - desc.len *= BITS_PER_BYTE; + priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); + err = nft_validate_register_load(priv->sreg, desc.len); + if (err < 0) + return err; + + desc.len *= BITS_PER_BYTE; mask = nft_cmp_fast_mask(desc.len); + priv->data = data.data[0] & mask; priv->len = desc.len; return 0; @@ -139,7 +148,7 @@ static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); struct nft_data data; - if (nla_put_be32(skb, NFTA_CMP_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ))) goto nla_put_failure; @@ -167,7 +176,6 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_data_desc desc; struct nft_data data; - enum nft_registers sreg; enum nft_cmp_ops op; int err; @@ -176,11 +184,6 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) tb[NFTA_CMP_DATA] == NULL) return ERR_PTR(-EINVAL); - sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); - err = nft_validate_input_register(sreg); - if (err < 0) - return ERR_PTR(err); - op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); switch 
(op) { case NFT_CMP_EQ: @@ -194,7 +197,8 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) return ERR_PTR(-EINVAL); } - err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); + err = nft_data_init(NULL, &data, sizeof(data), &desc, + tb[NFTA_CMP_DATA]); if (err < 0) return ERR_PTR(err); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 65f3e2b6be44..66def315eb56 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -20,6 +20,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_bridge/ebtables.h> +#include <linux/netfilter_arp/arp_tables.h> #include <net/netfilter/nf_tables.h> static int nft_compat_chain_validate_dependency(const char *tablename, @@ -42,6 +43,7 @@ union nft_entry { struct ipt_entry e4; struct ip6t_entry e6; struct ebt_entry ebt; + struct arpt_entry arp; }; static inline void @@ -53,7 +55,7 @@ nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info) } static void nft_target_eval_xt(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); @@ -70,16 +72,16 @@ static void nft_target_eval_xt(const struct nft_expr *expr, switch (ret) { case XT_CONTINUE: - data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + regs->verdict.code = NFT_CONTINUE; break; default: - data[NFT_REG_VERDICT].verdict = ret; + regs->verdict.code = ret; break; } } static void nft_target_eval_bridge(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); @@ -96,19 +98,19 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, switch (ret) { case EBT_ACCEPT: - data[NFT_REG_VERDICT].verdict = NF_ACCEPT; + regs->verdict.code = NF_ACCEPT; break; case EBT_DROP: - data[NFT_REG_VERDICT].verdict = NF_DROP; + 
regs->verdict.code = NF_DROP; break; case EBT_CONTINUE: - data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + regs->verdict.code = NFT_CONTINUE; break; case EBT_RETURN: - data[NFT_REG_VERDICT].verdict = NFT_RETURN; + regs->verdict.code = NFT_RETURN; break; default: - data[NFT_REG_VERDICT].verdict = ret; + regs->verdict.code = ret; break; } } @@ -143,6 +145,8 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; + case NFPROTO_ARP: + break; } par->entryinfo = entry; par->target = target; @@ -157,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void target_compat_from_user(struct xt_target *t, void *in, void *out) @@ -300,7 +305,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, } static void nft_match_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { void *info = nft_expr_priv(expr); @@ -313,16 +318,16 @@ static void nft_match_eval(const struct nft_expr *expr, ret = match->match(skb, (struct xt_action_param *)&pkt->xt); if (pkt->xt.hotdrop) { - data[NFT_REG_VERDICT].verdict = NF_DROP; + regs->verdict.code = NF_DROP; return; } - switch(ret) { - case true: - data[NFT_REG_VERDICT].verdict = NFT_CONTINUE; + switch (ret ? 1 : 0) { + case 1: + regs->verdict.code = NFT_CONTINUE; break; - case false: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + case 0: + regs->verdict.code = NFT_BREAK; break; } } @@ -357,6 +362,8 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? 
EBT_IPROTO : 0; break; + case NFPROTO_ARP: + break; } par->entryinfo = entry; par->match = match; @@ -371,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, par->hook_mask = 0; } par->family = ctx->afi->family; + par->nft_compat = true; } static void match_compat_from_user(struct xt_match *m, void *in, void *out) @@ -543,6 +551,9 @@ nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, case NFPROTO_BRIDGE: fmt = "ebt_%s"; break; + case NFPROTO_ARP: + fmt = "arpt_%s"; + break; default: pr_err("nft_compat: unsupported protocol %d\n", nfmsg->nfgen_family); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c89ee486ce54..17591239229f 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -24,7 +24,7 @@ struct nft_counter { }; static void nft_counter_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_counter *priv = nft_expr_priv(expr); @@ -92,6 +92,7 @@ static struct nft_expr_type nft_counter_type __read_mostly = { .ops = &nft_counter_ops, .policy = nft_counter_policy, .maxattr = NFTA_COUNTER_MAX, + .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index cc5603016242..8cbca3432f90 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -31,11 +31,11 @@ struct nft_ct { }; static void nft_ct_get_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); - struct nft_data *dest = &data[priv->dreg]; + u32 *dest = &regs->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; @@ -54,8 +54,10 @@ static void nft_ct_get_eval(const struct nft_expr *expr, state = NF_CT_STATE_UNTRACKED_BIT; else state = NF_CT_STATE_BIT(ctinfo); - dest->data[0] = state; +
*dest = state; return; + default: + break; } if (ct == NULL) @@ -63,26 +65,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_CT_DIRECTION: - dest->data[0] = CTINFO2DIR(ctinfo); + *dest = CTINFO2DIR(ctinfo); return; case NFT_CT_STATUS: - dest->data[0] = ct->status; + *dest = ct->status; return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: - dest->data[0] = ct->mark; + *dest = ct->mark; return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: - dest->data[0] = ct->secmark; + *dest = ct->secmark; return; #endif case NFT_CT_EXPIRATION: diff = (long)jiffies - (long)ct->timeout.expires; if (diff < 0) diff = 0; - dest->data[0] = jiffies_to_msecs(diff); + *dest = jiffies_to_msecs(diff); return; case NFT_CT_HELPER: if (ct->master == NULL) @@ -93,9 +95,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, helper = rcu_dereference(help->helper); if (helper == NULL) goto err; - if (strlen(helper->name) >= sizeof(dest->data)) - goto err; - strncpy((char *)dest->data, helper->name, sizeof(dest->data)); + strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { @@ -103,58 +103,60 @@ static void nft_ct_get_eval(const struct nft_expr *expr, unsigned int size; if (!labels) { - memset(dest->data, 0, sizeof(dest->data)); + memset(dest, 0, NF_CT_LABELS_MAX_SIZE); return; } - BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > sizeof(dest->data)); size = labels->words * sizeof(long); - - memcpy(dest->data, labels->bits, size); - if (size < sizeof(dest->data)) - memset(((char *) dest->data) + size, 0, - sizeof(dest->data) - size); + memcpy(dest, labels->bits, size); + if (size < NF_CT_LABELS_MAX_SIZE) + memset(((char *) dest) + size, 0, + NF_CT_LABELS_MAX_SIZE - size); return; } #endif + default: + break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { case NFT_CT_L3PROTOCOL: - dest->data[0] = nf_ct_l3num(ct); + *dest = nf_ct_l3num(ct); return; case 
NFT_CT_SRC: - memcpy(dest->data, tuple->src.u3.all, + memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_DST: - memcpy(dest->data, tuple->dst.u3.all, + memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTOCOL: - dest->data[0] = nf_ct_protonum(ct); + *dest = nf_ct_protonum(ct); return; case NFT_CT_PROTO_SRC: - dest->data[0] = (__force __u16)tuple->src.u.all; + *dest = (__force __u16)tuple->src.u.all; return; case NFT_CT_PROTO_DST: - dest->data[0] = (__force __u16)tuple->dst.u.all; + *dest = (__force __u16)tuple->dst.u.all; return; + default: + break; } return; err: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static void nft_ct_set_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #ifdef CONFIG_NF_CONNTRACK_MARK - u32 value = data[priv->sreg].data[0]; + u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -172,6 +174,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif + default: + break; } } @@ -220,12 +224,17 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { - case NFT_CT_STATE: case NFT_CT_DIRECTION: + if (tb[NFTA_CT_DIRECTION] != NULL) + return -EINVAL; + len = sizeof(u8); + break; + case NFT_CT_STATE: case NFT_CT_STATUS: #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: @@ -233,22 +242,54 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif + case NFT_CT_EXPIRATION: + if (tb[NFTA_CT_DIRECTION] != NULL) + return -EINVAL; + len = sizeof(u32); + break; #ifdef 
CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: + if (tb[NFTA_CT_DIRECTION] != NULL) + return -EINVAL; + len = NF_CT_LABELS_MAX_SIZE; + break; #endif - case NFT_CT_EXPIRATION: case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; + len = NF_CT_HELPER_NAME_LEN; break; + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + len = sizeof(u8); + break; case NFT_CT_SRC: case NFT_CT_DST: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + switch (ctx->afi->family) { + case NFPROTO_IPV4: + len = FIELD_SIZEOF(struct nf_conntrack_tuple, + src.u3.ip); + break; + case NFPROTO_IPV6: + case NFPROTO_INET: + len = FIELD_SIZEOF(struct nf_conntrack_tuple, + src.u3.ip6); + break; + default: + return -EAFNOSUPPORT; + } + break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all); break; default: return -EOPNOTSUPP; @@ -265,12 +306,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, } } - priv->dreg = ntohl(nla_get_be32(tb[NFTA_CT_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]); + err = nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); if (err < 0) return err; @@ -286,20 +324,22 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); + unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: + len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif default: return -EOPNOTSUPP; } - priv->sreg = ntohl(nla_get_be32(tb[NFTA_CT_SREG])); - err = nft_validate_input_register(priv->sreg); + priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); + err = 
nft_validate_register_load(priv->sreg, len); if (err < 0) return err; @@ -320,7 +360,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_CT_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_CT_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; @@ -347,7 +387,7 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_CT_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_CT_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c new file mode 100644 index 000000000000..513a8ef60a59 --- /dev/null +++ b/net/netfilter/nft_dynset.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2015 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> + +struct nft_dynset { + struct nft_set *set; + struct nft_set_ext_tmpl tmpl; + enum nft_dynset_ops op:8; + enum nft_registers sreg_key:8; + enum nft_registers sreg_data:8; + u64 timeout; + struct nft_expr *expr; + struct nft_set_binding binding; +}; + +static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, + struct nft_regs *regs) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set_ext *ext; + u64 timeout; + void *elem; + + if (set->size && !atomic_add_unless(&set->nelems, 1, set->size)) + return NULL; + + timeout = priv->timeout ? : set->timeout; + elem = nft_set_elem_init(set, &priv->tmpl, + &regs->data[priv->sreg_key], + &regs->data[priv->sreg_data], + timeout, GFP_ATOMIC); + if (elem == NULL) { + if (set->size) + atomic_dec(&set->nelems); + return NULL; + } + + ext = nft_set_elem_ext(set, elem); + if (priv->expr != NULL) + nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + + return elem; +} + +static void nft_dynset_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set *set = priv->set; + const struct nft_set_ext *ext; + const struct nft_expr *sexpr; + u64 timeout; + + if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, + expr, regs, &ext)) { + sexpr = NULL; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + sexpr = nft_set_ext_expr(ext); + + if (priv->op == NFT_DYNSET_OP_UPDATE && + nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + timeout = priv->timeout ?
: set->timeout; + *nft_set_ext_expiration(ext) = jiffies + timeout; + } else if (sexpr == NULL) + goto out; + + if (sexpr != NULL) + sexpr->ops->eval(sexpr, regs, pkt); + return; + } +out: + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { + [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING }, + [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, + [NFTA_DYNSET_OP] = { .type = NLA_U32 }, + [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, + [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, + [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, +}; + +static int nft_dynset_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + struct nft_set *set; + u64 timeout; + int err; + + if (tb[NFTA_DYNSET_SET_NAME] == NULL || + tb[NFTA_DYNSET_OP] == NULL || + tb[NFTA_DYNSET_SREG_KEY] == NULL) + return -EINVAL; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]); + if (IS_ERR(set)) { + if (tb[NFTA_DYNSET_SET_ID]) + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_DYNSET_SET_ID]); + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (set->flags & NFT_SET_CONSTANT) + return -EBUSY; + + priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); + switch (priv->op) { + case NFT_DYNSET_OP_ADD: + break; + case NFT_DYNSET_OP_UPDATE: + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; + break; + default: + return -EOPNOTSUPP; + } + + timeout = 0; + if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + timeout = be64_to_cpu(nla_get_be64(tb[NFTA_DYNSET_TIMEOUT])); + } + + priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); + err = nft_validate_register_load(priv->sreg_key, set->klen);; + if (err < 0) + return err; + + if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { + if (!(set->flags & NFT_SET_MAP)) + return -EINVAL; + if (set->dtype == 
NFT_DATA_VERDICT) + return -EOPNOTSUPP; + + priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]); + err = nft_validate_register_load(priv->sreg_data, set->dlen); + if (err < 0) + return err; + } else if (set->flags & NFT_SET_MAP) + return -EINVAL; + + if (tb[NFTA_DYNSET_EXPR] != NULL) { + if (!(set->flags & NFT_SET_EVAL)) + return -EINVAL; + if (!(set->flags & NFT_SET_ANONYMOUS)) + return -EOPNOTSUPP; + + priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); + if (IS_ERR(priv->expr)) + return PTR_ERR(priv->expr); + + err = -EOPNOTSUPP; + if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) + goto err1; + } else if (set->flags & NFT_SET_EVAL) + return -EINVAL; + + nft_set_ext_prepare(&priv->tmpl); + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); + if (set->flags & NFT_SET_MAP) + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); + if (priv->expr != NULL) + nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR, + priv->expr->ops->size); + if (set->flags & NFT_SET_TIMEOUT) { + if (timeout || set->timeout) + nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION); + } + + priv->timeout = timeout; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + goto err1; + + priv->set = set; + return 0; + +err1: + if (priv->expr != NULL) + nft_expr_destroy(ctx, priv->expr); + return err; +} + +static void nft_dynset_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); + if (priv->expr != NULL) + nft_expr_destroy(ctx, priv->expr); +} + +static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_dynset *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) + goto nla_put_failure; + if (priv->set->flags & NFT_SET_MAP && + nft_dump_register(skb, NFTA_DYNSET_SREG_DATA, priv->sreg_data)) + goto nla_put_failure; + if 
(nla_put_be32(skb, NFTA_DYNSET_OP, htonl(priv->op))) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) + goto nla_put_failure; + if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) + goto nla_put_failure; + if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dynset_type; +static const struct nft_expr_ops nft_dynset_ops = { + .type = &nft_dynset_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)), + .eval = nft_dynset_eval, + .init = nft_dynset_init, + .destroy = nft_dynset_destroy, + .dump = nft_dynset_dump, +}; + +static struct nft_expr_type nft_dynset_type __read_mostly = { + .name = "dynset", + .ops = &nft_dynset_ops, + .policy = nft_dynset_policy, + .maxattr = NFTA_DYNSET_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_dynset_module_init(void) +{ + return nft_register_expr(&nft_dynset_type); +} + +void nft_dynset_module_exit(void) +{ + nft_unregister_expr(&nft_dynset_type); +} diff --git a/net/netfilter/nft_expr_template.c b/net/netfilter/nft_expr_template.c deleted file mode 100644 index b6eed4d5a096..000000000000 --- a/net/netfilter/nft_expr_template.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> - -struct nft_template { - -}; - -static void nft_template_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) -{ - struct nft_template *priv = nft_expr_priv(expr); - -} - -static const struct nla_policy nft_template_policy[NFTA_TEMPLATE_MAX + 1] = { - [NFTA_TEMPLATE_ATTR] = { .type = NLA_U32 }, -}; - -static int nft_template_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_template *priv = nft_expr_priv(expr); - - return 0; -} - -static void nft_template_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) -{ - struct nft_template *priv = nft_expr_priv(expr); - -} - -static int nft_template_dump(struct sk_buff *skb, const struct nft_expr *expr) -{ - const struct nft_template *priv = nft_expr_priv(expr); - - NLA_PUT_BE32(skb, NFTA_TEMPLATE_ATTR, priv->field); - return 0; - -nla_put_failure: - return -1; -} - -static struct nft_expr_type nft_template_type; -static const struct nft_expr_ops nft_template_ops = { - .type = &nft_template_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_template)), - .eval = nft_template_eval, - .init = nft_template_init, - .destroy = nft_template_destroy, - .dump = nft_template_dump, -}; - -static struct nft_expr_type nft_template_type __read_mostly = { - .name = "template", - .ops = &nft_template_ops, - .policy = nft_template_policy, - .maxattr = NFTA_TEMPLATE_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_template_module_init(void) -{ - return nft_register_expr(&nft_template_type); -} - -static void __exit nft_template_module_exit(void) -{ - nft_unregister_expr(&nft_template_type); -} - 
-module_init(nft_template_module_init); -module_exit(nft_template_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("template"); diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 55c939f5371f..ba7aed13e174 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -26,11 +26,11 @@ struct nft_exthdr { }; static void nft_exthdr_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); - struct nft_data *dest = &data[priv->dreg]; + u32 *dest = &regs->data[priv->dreg]; unsigned int offset = 0; int err; @@ -39,11 +39,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr, goto err; offset += priv->offset; - if (skb_copy_bits(pkt->skb, offset, dest->data, priv->len) < 0) + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { @@ -58,7 +59,6 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - int err; if (tb[NFTA_EXTHDR_DREG] == NULL || tb[NFTA_EXTHDR_TYPE] == NULL || @@ -69,22 +69,17 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); - if (priv->len == 0 || - priv->len > FIELD_SIZEOF(struct nft_data, data)) - return -EINVAL; + priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); - priv->dreg = ntohl(nla_get_be32(tb[NFTA_EXTHDR_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - return nft_validate_data_load(ctx, priv->dreg, NULL,
NFT_DATA_VALUE); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, priv->len); } static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_exthdr *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_EXTHDR_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_EXTHDR_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) goto nla_put_failure; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 37c15e674884..3f9d45d3d9b7 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -15,6 +15,7 @@ #include <linux/log2.h> #include <linux/jhash.h> #include <linux/netlink.h> +#include <linux/workqueue.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -23,119 +24,175 @@ /* We target a hash table size of 4, element hint is 75% of final size */ #define NFT_HASH_ELEMENT_HINT 3 +struct nft_hash { + struct rhashtable ht; + struct delayed_work gc_work; +}; + struct nft_hash_elem { struct rhash_head node; - struct nft_data key; - struct nft_data data[]; + struct nft_set_ext ext; +}; + +struct nft_hash_cmp_arg { + const struct nft_set *set; + const u32 *key; + u8 genmask; }; -static bool nft_hash_lookup(const struct nft_set *set, - const struct nft_data *key, - struct nft_data *data) +static const struct rhashtable_params nft_hash_params; + +static inline u32 nft_hash_key(const void *data, u32 len, u32 seed) +{ + const struct nft_hash_cmp_arg *arg = data; + + return jhash(arg->key, len, seed); +} + +static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed) { - struct rhashtable *priv = nft_set_priv(set); + const struct nft_hash_elem *he = data; + + return jhash(nft_set_ext_key(&he->ext), len, seed); +} + +static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct nft_hash_cmp_arg *x = arg->key; + const struct nft_hash_elem *he = 
ptr; + + if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) + return 1; + if (nft_set_elem_expired(&he->ext)) + return 1; + if (!nft_set_elem_active(&he->ext, x->genmask)) + return 1; + return 0; +} + +static bool nft_hash_lookup(const struct nft_set *set, const u32 *key, + const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); const struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = nft_genmask_cur(read_pnet(&set->pnet)), + .set = set, + .key = key, + }; - he = rhashtable_lookup(priv, key); - if (he && set->flags & NFT_SET_MAP) - nft_data_copy(data, he->data); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) + *ext = &he->ext; return !!he; } -static int nft_hash_insert(const struct nft_set *set, - const struct nft_set_elem *elem) +static bool nft_hash_update(struct nft_set *set, const u32 *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_regs *regs), + const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_set_ext **ext) { - struct rhashtable *priv = nft_set_priv(set); + struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; - unsigned int size; - - if (elem->flags != 0) - return -EINVAL; + struct nft_hash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; - size = sizeof(*he); - if (set->flags & NFT_SET_MAP) - size += sizeof(he->data[0]); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) + goto out; - he = kzalloc(size, GFP_KERNEL); + he = new(set, expr, regs); if (he == NULL) - return -ENOMEM; - - nft_data_copy(&he->key, &elem->key); - if (set->flags & NFT_SET_MAP) - nft_data_copy(he->data, &elem->data); - - rhashtable_insert(priv, &he->node); + goto err1; + if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params)) + goto err2; +out: + *ext = &he->ext; + return true; - return 0; +err2: + nft_set_elem_destroy(set, he); +err1: + 
return false; } -static void nft_hash_elem_destroy(const struct nft_set *set, - struct nft_hash_elem *he) +static int nft_hash_insert(const struct nft_set *set, + const struct nft_set_elem *elem) { - nft_data_uninit(&he->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) - nft_data_uninit(he->data, set->dtype); - kfree(he); + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he = elem->priv; + struct nft_hash_cmp_arg arg = { + .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .set = set, + .key = elem->key.val.data, + }; + + return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params); } -static void nft_hash_remove(const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_hash_activate(const struct nft_set *set, + const struct nft_set_elem *elem) { - struct rhashtable *priv = nft_set_priv(set); + struct nft_hash_elem *he = elem->priv; - rhashtable_remove(priv, elem->cookie); - synchronize_rcu(); - kfree(elem->cookie); + nft_set_elem_change_active(set, &he->ext); + nft_set_elem_clear_busy(&he->ext); } -struct nft_compare_arg { - const struct nft_set *set; - struct nft_set_elem *elem; -}; - -static bool nft_hash_compare(void *ptr, void *arg) +static void *nft_hash_deactivate(const struct nft_set *set, + const struct nft_set_elem *elem) { - struct nft_hash_elem *he = ptr; - struct nft_compare_arg *x = arg; - - if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) { - x->elem->cookie = he; - x->elem->flags = 0; - if (x->set->flags & NFT_SET_MAP) - nft_data_copy(&x->elem->data, he->data); + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = nft_genmask_next(read_pnet(&set->pnet)), + .set = set, + .key = elem->key.val.data, + }; - return true; + rcu_read_lock(); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) { + if (!nft_set_elem_mark_busy(&he->ext)) + nft_set_elem_change_active(set, &he->ext); + 
else + he = NULL; } + rcu_read_unlock(); - return false; + return he; } -static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) +static void nft_hash_remove(const struct nft_set *set, + const struct nft_set_elem *elem) { - struct rhashtable *priv = nft_set_priv(set); - struct nft_compare_arg arg = { - .set = set, - .elem = elem, - }; + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he = elem->priv; - if (rhashtable_lookup_compare(priv, &elem->key, - &nft_hash_compare, &arg)) - return 0; - - return -ENOENT; + rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); } static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_iter *iter) { - struct rhashtable *priv = nft_set_priv(set); - const struct nft_hash_elem *he; + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; struct rhashtable_iter hti; struct nft_set_elem elem; + u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int err; - err = rhashtable_walk_init(priv, &hti); + err = rhashtable_walk_init(&priv->ht, &hti); iter->err = err; if (err) return; @@ -159,11 +216,12 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, if (iter->count < iter->skip) goto cont; + if (nft_set_elem_expired(&he->ext)) + goto cont; + if (!nft_set_elem_active(&he->ext, genmask)) + goto cont; - memcpy(&elem.key, &he->key, sizeof(elem.key)); - if (set->flags & NFT_SET_MAP) - memcpy(&elem.data, he->data, sizeof(elem.data)); - elem.flags = 0; + elem.priv = he; iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) @@ -178,47 +236,102 @@ out: rhashtable_walk_exit(&hti); } +static void nft_hash_gc(struct work_struct *work) +{ + struct nft_set *set; + struct nft_hash_elem *he; + struct nft_hash *priv; + struct nft_set_gc_batch *gcb = NULL; + struct rhashtable_iter hti; + int err; + + priv = container_of(work, struct nft_hash, gc_work.work); + set = nft_set_container_of(priv); + + err = 
rhashtable_walk_init(&priv->ht, &hti); + if (err) + goto schedule; + + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) + goto out; + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) + goto out; + continue; + } + + if (!nft_set_elem_expired(&he->ext)) + continue; + if (nft_set_elem_mark_busy(&he->ext)) + continue; + + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (gcb == NULL) + goto out; + rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); + atomic_dec(&set->nelems); + nft_set_gc_batch_add(gcb, he); + } +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + nft_set_gc_batch_complete(gcb); +schedule: + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); +} + static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) { - return sizeof(struct rhashtable); + return sizeof(struct nft_hash); } +static const struct rhashtable_params nft_hash_params = { + .head_offset = offsetof(struct nft_hash_elem, node), + .hashfn = nft_hash_key, + .obj_hashfn = nft_hash_obj, + .obj_cmpfn = nft_hash_cmp, + .automatic_shrinking = true, +}; + static int nft_hash_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const tb[]) { - struct rhashtable *priv = nft_set_priv(set); - struct rhashtable_params params = { - .nelem_hint = desc->size ? 
: NFT_HASH_ELEMENT_HINT, - .head_offset = offsetof(struct nft_hash_elem, node), - .key_offset = offsetof(struct nft_hash_elem, key), - .key_len = set->klen, - .hashfn = jhash, - }; + struct nft_hash *priv = nft_set_priv(set); + struct rhashtable_params params = nft_hash_params; + int err; - return rhashtable_init(priv, &params); + params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT; + params.key_len = set->klen; + + err = rhashtable_init(&priv->ht, &params); + if (err < 0) + return err; + + INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc); + if (set->flags & NFT_SET_TIMEOUT) + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); + return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_elem_destroy(void *ptr, void *arg) { - struct rhashtable *priv = nft_set_priv(set); - const struct bucket_table *tbl; - struct nft_hash_elem *he; - struct rhash_head *pos, *next; - unsigned int i; - - /* Stop an eventual async resizing */ - priv->being_destroyed = true; - mutex_lock(&priv->mutex); + nft_set_elem_destroy((const struct nft_set *)arg, ptr); +} - tbl = rht_dereference(priv->tbl, priv); - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_safe(he, pos, next, tbl, i, node) - nft_hash_elem_destroy(set, he); - } - mutex_unlock(&priv->mutex); +static void nft_hash_destroy(const struct nft_set *set) +{ + struct nft_hash *priv = nft_set_priv(set); - rhashtable_destroy(priv); + cancel_delayed_work_sync(&priv->gc_work); + rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy, + (void *)set); } static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, @@ -227,11 +340,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, unsigned int esize; esize = sizeof(struct nft_hash_elem); - if (features & NFT_SET_MAP) - esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); - if (desc->size) { - est->size = sizeof(struct rhashtable) + + est->size = sizeof(struct
nft_hash) + roundup_pow_of_two(desc->size * 4 / 3) * sizeof(struct nft_hash_elem *) + desc->size * esize; @@ -251,15 +361,18 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, static struct nft_set_ops nft_hash_ops __read_mostly = { .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), .estimate = nft_hash_estimate, .init = nft_hash_init, .destroy = nft_hash_destroy, - .get = nft_hash_get, .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, .remove = nft_hash_remove, .lookup = nft_hash_lookup, + .update = nft_hash_update, .walk = nft_hash_walk, - .features = NFT_SET_MAP, + .features = NFT_SET_MAP | NFT_SET_TIMEOUT, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 810385eb7249..db3b746858e3 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -24,12 +24,12 @@ struct nft_immediate_expr { }; static void nft_immediate_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - nft_data_copy(&data[priv->dreg], &priv->data); + nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen); } static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { @@ -49,17 +49,15 @@ static int nft_immediate_init(const struct nft_ctx *ctx, tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; - priv->dreg = ntohl(nla_get_be32(tb[NFTA_IMMEDIATE_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); + err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc, + tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; priv->dlen = desc.len; - err = nft_validate_data_load(ctx, priv->dreg, &priv->data, desc.type); + priv->dreg =
nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); + err = nft_validate_register_store(ctx, priv->dreg, &priv->data, + desc.type, desc.len); if (err < 0) goto err1; @@ -81,7 +79,7 @@ static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_IMMEDIATE_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_IMMEDIATE_DREG, priv->dreg)) goto nla_put_failure; return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data, diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 85da5bd02f64..435c1ccd6c0e 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -27,7 +27,7 @@ struct nft_limit { }; static void nft_limit_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit *priv = nft_expr_priv(expr); @@ -45,7 +45,7 @@ static void nft_limit_eval(const struct nft_expr *expr, } spin_unlock_bh(&limit_lock); - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { @@ -98,6 +98,7 @@ static struct nft_expr_type nft_limit_type __read_mostly = { .ops = &nft_limit_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, + .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index bde05f28cf14..a13d6a386d63 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -27,7 +27,7 @@ struct nft_log { }; static void nft_log_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); @@ -78,7 +78,7 @@ static int nft_log_init(const struct nft_ctx *ctx, li->u.log.level = ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL])); } else { - li->u.log.level = 4; + li->u.log.level = 
LOGLEVEL_WARNING; } if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 9615b8b9fb37..b3c31ef8015d 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -26,15 +26,20 @@ struct nft_lookup { }; static void nft_lookup_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; + const struct nft_set_ext *ext; - if (set->ops->lookup(set, &data[priv->sreg], &data[priv->dreg])) + if (set->ops->lookup(set, &regs->data[priv->sreg], &ext)) { + if (set->flags & NFT_SET_MAP) + nft_data_copy(&regs->data[priv->dreg], + nft_set_ext_data(ext), set->dlen); return; - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + } + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { @@ -66,8 +71,11 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return PTR_ERR(set); } - priv->sreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_SREG])); - err = nft_validate_input_register(priv->sreg); + if (set->flags & NFT_SET_EVAL) + return -EOPNOTSUPP; + + priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]); + err = nft_validate_register_load(priv->sreg, set->klen); if (err < 0) return err; @@ -75,19 +83,16 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_MAP)) return -EINVAL; - priv->dreg = ntohl(nla_get_be32(tb[NFTA_LOOKUP_DREG])); - err = nft_validate_output_register(priv->dreg); + priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]); + err = nft_validate_register_store(ctx, priv->dreg, NULL, + set->dtype, set->dlen); if (err < 0) return err; - - if (priv->dreg == NFT_REG_VERDICT) { - if (set->dtype != NFT_DATA_VERDICT) - return -EINVAL; - } else if (set->dtype == NFT_DATA_VERDICT) - return -EINVAL; } else if (set->flags & NFT_SET_MAP) return -EINVAL; +
priv->binding.flags = set->flags & NFT_SET_MAP; + err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; @@ -110,10 +115,10 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_LOOKUP_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; if (priv->set->flags & NFT_SET_MAP) - if (nla_put_be32(skb, NFTA_LOOKUP_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e99911eda915..52561e1c31e2 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -25,65 +25,68 @@ #include <net/netfilter/nft_meta.h> void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = pkt->in, *out = pkt->out; - struct nft_data *dest = &data[priv->dreg]; + u32 *dest = &regs->data[priv->dreg]; switch (priv->key) { case NFT_META_LEN: - dest->data[0] = skb->len; + *dest = skb->len; break; case NFT_META_PROTOCOL: - *(__be16 *)dest->data = skb->protocol; + *dest = 0; + *(__be16 *)dest = skb->protocol; break; case NFT_META_NFPROTO: - dest->data[0] = pkt->ops->pf; + *dest = pkt->ops->pf; break; case NFT_META_L4PROTO: - dest->data[0] = pkt->tprot; + *dest = pkt->tprot; break; case NFT_META_PRIORITY: - dest->data[0] = skb->priority; + *dest = skb->priority; break; case NFT_META_MARK: - dest->data[0] = skb->mark; + *dest = skb->mark; break; case NFT_META_IIF: if (in == NULL) goto err; - dest->data[0] = in->ifindex; + *dest = in->ifindex; break; case NFT_META_OIF: if (out == NULL) goto err; - dest->data[0] = out->ifindex; + *dest = out->ifindex;
break; case NFT_META_IIFNAME: if (in == NULL) goto err; - strncpy((char *)dest->data, in->name, sizeof(dest->data)); + strncpy((char *)dest, in->name, IFNAMSIZ); break; case NFT_META_OIFNAME: if (out == NULL) goto err; - strncpy((char *)dest->data, out->name, sizeof(dest->data)); + strncpy((char *)dest, out->name, IFNAMSIZ); break; case NFT_META_IIFTYPE: if (in == NULL) goto err; - *(u16 *)dest->data = in->type; + *dest = 0; + *(u16 *)dest = in->type; break; case NFT_META_OIFTYPE: if (out == NULL) goto err; - *(u16 *)dest->data = out->type; + *dest = 0; + *(u16 *)dest = out->type; break; case NFT_META_SKUID: - if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + if (skb->sk == NULL || !sk_fullsock(skb->sk)) goto err; read_lock_bh(&skb->sk->sk_callback_lock); @@ -93,13 +96,12 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; } - dest->data[0] = - from_kuid_munged(&init_user_ns, + *dest = from_kuid_munged(&init_user_ns, skb->sk->sk_socket->file->f_cred->fsuid); read_unlock_bh(&skb->sk->sk_callback_lock); break; case NFT_META_SKGID: - if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT) + if (skb->sk == NULL || !sk_fullsock(skb->sk)) goto err; read_lock_bh(&skb->sk->sk_callback_lock); @@ -108,8 +110,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, read_unlock_bh(&skb->sk->sk_callback_lock); goto err; } - dest->data[0] = - from_kgid_munged(&init_user_ns, + *dest = from_kgid_munged(&init_user_ns, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); break; @@ -119,33 +120,33 @@ void nft_meta_get_eval(const struct nft_expr *expr, if (dst == NULL) goto err; - dest->data[0] = dst->tclassid; + *dest = dst->tclassid; break; } #endif #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: - dest->data[0] = skb->secmark; + *dest = skb->secmark; break; #endif case NFT_META_PKTTYPE: if (skb->pkt_type != PACKET_LOOPBACK) { - dest->data[0] = skb->pkt_type; + *dest = skb->pkt_type; break; } switch (pkt->ops->pf) { 
case NFPROTO_IPV4: if (ipv4_is_multicast(ip_hdr(skb)->daddr)) - dest->data[0] = PACKET_MULTICAST; + *dest = PACKET_MULTICAST; else - dest->data[0] = PACKET_BROADCAST; + *dest = PACKET_BROADCAST; break; case NFPROTO_IPV6: if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) - dest->data[0] = PACKET_MULTICAST; + *dest = PACKET_MULTICAST; else - dest->data[0] = PACKET_BROADCAST; + *dest = PACKET_BROADCAST; break; default: WARN_ON(1); @@ -153,23 +154,22 @@ void nft_meta_get_eval(const struct nft_expr *expr, } break; case NFT_META_CPU: - dest->data[0] = smp_processor_id(); + *dest = raw_smp_processor_id(); break; case NFT_META_IIFGROUP: if (in == NULL) goto err; - dest->data[0] = in->group; + *dest = in->group; break; case NFT_META_OIFGROUP: if (out == NULL) goto err; - dest->data[0] = out->group; + *dest = out->group; break; case NFT_META_CGROUP: - if (skb->sk == NULL) - break; - - dest->data[0] = skb->sk->sk_classid; + if (skb->sk == NULL || !sk_fullsock(skb->sk)) + goto err; + *dest = skb->sk->sk_classid; break; default: WARN_ON(1); @@ -178,17 +178,17 @@ void nft_meta_get_eval(const struct nft_expr *expr, return; err: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } EXPORT_SYMBOL_GPL(nft_meta_get_eval); void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; - u32 value = data[meta->sreg].data[0]; + u32 value = regs->data[meta->sreg]; switch (meta->key) { case NFT_META_MARK: @@ -218,22 +218,22 @@ int nft_meta_get_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); - int err; + unsigned int len; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { - case NFT_META_LEN: case NFT_META_PROTOCOL: + case NFT_META_IIFTYPE: + case NFT_META_OIFTYPE: + len = sizeof(u16); + break; case 
NFT_META_NFPROTO: case NFT_META_L4PROTO: + case NFT_META_LEN: case NFT_META_PRIORITY: case NFT_META_MARK: case NFT_META_IIF: case NFT_META_OIF: - case NFT_META_IIFNAME: - case NFT_META_OIFNAME: - case NFT_META_IIFTYPE: - case NFT_META_OIFTYPE: case NFT_META_SKUID: case NFT_META_SKGID: #ifdef CONFIG_IP_ROUTE_CLASSID @@ -247,21 +247,19 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: case NFT_META_CGROUP: + len = sizeof(u32); + break; + case NFT_META_IIFNAME: + case NFT_META_OIFNAME: + len = IFNAMSIZ; break; default: return -EOPNOTSUPP; } - priv->dreg = ntohl(nla_get_be32(tb[NFTA_META_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - - err = nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); - if (err < 0) - return err; - - return 0; + priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); } EXPORT_SYMBOL_GPL(nft_meta_get_init); @@ -270,20 +268,24 @@ int nft_meta_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_MARK: case NFT_META_PRIORITY: + len = sizeof(u32); + break; case NFT_META_NFTRACE: + len = sizeof(u8); break; default: return -EOPNOTSUPP; } - priv->sreg = ntohl(nla_get_be32(tb[NFTA_META_SREG])); - err = nft_validate_input_register(priv->sreg); + priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); + err = nft_validate_register_load(priv->sreg, len); if (err < 0) return err; @@ -298,7 +300,7 @@ int nft_meta_get_dump(struct sk_buff *skb, if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_META_DREG, htonl(priv->dreg))) + if (nft_dump_register(skb, NFTA_META_DREG, priv->dreg)) goto nla_put_failure; return 0; @@ -314,7 +316,7 @@ int 
nft_meta_set_dump(struct sk_buff *skb, if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key))) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_META_SREG, htonl(priv->sreg))) + if (nft_dump_register(skb, NFTA_META_SREG, priv->sreg)) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index a0837c6c9283..ee2d71753746 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -37,7 +37,7 @@ struct nft_nat { }; static void nft_nat_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); @@ -49,33 +49,32 @@ static void nft_nat_eval(const struct nft_expr *expr, if (priv->sreg_addr_min) { if (priv->family == AF_INET) { range.min_addr.ip = (__force __be32) - data[priv->sreg_addr_min].data[0]; + regs->data[priv->sreg_addr_min]; range.max_addr.ip = (__force __be32) - data[priv->sreg_addr_max].data[0]; + regs->data[priv->sreg_addr_max]; } else { memcpy(range.min_addr.ip6, - data[priv->sreg_addr_min].data, - sizeof(struct nft_data)); + &regs->data[priv->sreg_addr_min], + sizeof(range.min_addr.ip6)); memcpy(range.max_addr.ip6, - data[priv->sreg_addr_max].data, - sizeof(struct nft_data)); + &regs->data[priv->sreg_addr_max], + sizeof(range.max_addr.ip6)); } range.flags |= NF_NAT_RANGE_MAP_IPS; } if (priv->sreg_proto_min) { range.min_proto.all = - *(__be16 *)&data[priv->sreg_proto_min].data[0]; + *(__be16 *)&regs->data[priv->sreg_proto_min]; range.max_proto.all = - *(__be16 *)&data[priv->sreg_proto_max].data[0]; + *(__be16 *)&regs->data[priv->sreg_proto_max]; range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } range.flags |= priv->flags; - data[NFT_REG_VERDICT].verdict = - nf_nat_setup_info(ct, &range, priv->type); + regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { @@ -119,6 +118,7 @@ static int nft_nat_init(const struct nft_ctx *ctx,
const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); + unsigned int alen, plen; u32 family; int err; @@ -146,25 +146,34 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (family != AF_INET && family != AF_INET6) - return -EAFNOSUPPORT; if (family != ctx->afi->family) return -EOPNOTSUPP; + + switch (family) { + case NFPROTO_IPV4: + alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip); + break; + case NFPROTO_IPV6: + alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6); + break; + default: + return -EAFNOSUPPORT; + } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { priv->sreg_addr_min = - ntohl(nla_get_be32(tb[NFTA_NAT_REG_ADDR_MIN])); - - err = nft_validate_input_register(priv->sreg_addr_min); + nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]); + err = nft_validate_register_load(priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { priv->sreg_addr_max = - ntohl(nla_get_be32(tb[NFTA_NAT_REG_ADDR_MAX])); + nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]); - err = nft_validate_input_register(priv->sreg_addr_max); + err = nft_validate_register_load(priv->sreg_addr_max, + alen); if (err < 0) return err; } else { @@ -172,19 +181,21 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } } + plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { priv->sreg_proto_min = - ntohl(nla_get_be32(tb[NFTA_NAT_REG_PROTO_MIN])); + nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]); - err = nft_validate_input_register(priv->sreg_proto_min); + err = nft_validate_register_load(priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { priv->sreg_proto_max = - ntohl(nla_get_be32(tb[NFTA_NAT_REG_PROTO_MAX])); + nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]); - err = 
nft_validate_input_register(priv->sreg_proto_max); + err = nft_validate_register_load(priv->sreg_proto_max, + plen); if (err < 0) return err; } else { @@ -220,18 +231,18 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (priv->sreg_addr_min) { - if (nla_put_be32(skb, NFTA_NAT_REG_ADDR_MIN, - htonl(priv->sreg_addr_min)) || - nla_put_be32(skb, NFTA_NAT_REG_ADDR_MAX, - htonl(priv->sreg_addr_max))) + if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, + priv->sreg_addr_min) || + nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, + priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { - if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, - htonl(priv->sreg_proto_min)) || - nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, - htonl(priv->sreg_proto_max))) + if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, + priv->sreg_proto_min) || + nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, + priv->sreg_proto_max)) goto nla_put_failure; } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 85daa84bfdfe..94fb3b27a2c5 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -18,12 +18,12 @@ #include <net/netfilter/nf_tables.h> static void nft_payload_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; - struct nft_data *dest = &data[priv->dreg]; + u32 *dest = &regs->data[priv->dreg]; int offset; switch (priv->base) { @@ -43,11 +43,12 @@ static void nft_payload_eval(const struct nft_expr *expr, } offset += priv->offset; - if (skb_copy_bits(skb, offset, dest->data, priv->len) < 0) + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; err: - data[NFT_REG_VERDICT].verdict = NFT_BREAK; + regs->verdict.code = NFT_BREAK; } static const struct nla_policy
nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { @@ -62,24 +63,21 @@ static int nft_payload_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_payload *priv = nft_expr_priv(expr); - int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]); - priv->dreg = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_DREG])); - err = nft_validate_output_register(priv->dreg); - if (err < 0) - return err; - return nft_validate_data_load(ctx, priv->dreg, NULL, NFT_DATA_VALUE); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, priv->len); } static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_payload *priv = nft_expr_priv(expr); - if (nla_put_be32(skb, NFTA_PAYLOAD_DREG, htonl(priv->dreg)) || + if (nft_dump_register(skb, NFTA_PAYLOAD_DREG, priv->dreg) || nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len))) @@ -131,9 +129,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, } offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); - if (len == 0 || len > FIELD_SIZEOF(struct nft_data, data)) - return ERR_PTR(-EINVAL); + len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index e8ae2f6bf232..96805d21d618 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -28,7 +28,7 @@ struct nft_queue { }; static void nft_queue_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_queue *priv = 
nft_expr_priv(expr); @@ -51,7 +51,7 @@ static void nft_queue_eval(const struct nft_expr *expr, if (priv->flags & NFT_QUEUE_FLAG_BYPASS) ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; - data[NFT_REG_VERDICT].verdict = ret; + regs->verdict.code = ret; } static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 46214f245665..1c30f41cff5b 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -26,25 +26,25 @@ struct nft_rbtree { struct nft_rbtree_elem { struct rb_node node; - u16 flags; - struct nft_data key; - struct nft_data data[]; + struct nft_set_ext ext; }; -static bool nft_rbtree_lookup(const struct nft_set *set, - const struct nft_data *key, - struct nft_data *data) + +static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, + const struct nft_set_ext **ext) { const struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; - const struct rb_node *parent = priv->root.rb_node; + const struct rb_node *parent; + u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; spin_lock_bh(&nft_rbtree_lock); + parent = priv->root.rb_node; while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = nft_data_cmp(&rbe->key, key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = parent->rb_left; interval = rbe; @@ -52,12 +52,17 @@ static bool nft_rbtree_lookup(const struct nft_set *set, parent = parent->rb_right; else { found: - if (rbe->flags & NFT_SET_ELEM_INTERVAL_END) + if (!nft_set_elem_active(&rbe->ext, genmask)) { + parent = parent->rb_left; + continue; + } + if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(&rbe->ext) & + NFT_SET_ELEM_INTERVAL_END) goto out; - if (set->flags & NFT_SET_MAP) - nft_data_copy(data, rbe->data); - spin_unlock_bh(&nft_rbtree_lock); + + *ext = &rbe->ext; return true; } } @@ -71,23 +76,13 @@ out: 
return false; } -static void nft_rbtree_elem_destroy(const struct nft_set *set, - struct nft_rbtree_elem *rbe) -{ - nft_data_uninit(&rbe->key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP && - !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) - nft_data_uninit(rbe->data, set->dtype); - - kfree(rbe); -} - static int __nft_rbtree_insert(const struct nft_set *set, struct nft_rbtree_elem *new) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; + u8 genmask = nft_genmask_next(read_pnet(&set->pnet)); int d; parent = NULL; @@ -95,13 +90,18 @@ static int __nft_rbtree_insert(const struct nft_set *set, while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = nft_data_cmp(&rbe->key, &new->key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), + nft_set_ext_key(&new->ext), + set->klen); if (d < 0) p = &parent->rb_left; else if (d > 0) p = &parent->rb_right; - else - return -EEXIST; + else { + if (nft_set_elem_active(&rbe->ext, genmask)) + return -EEXIST; + p = &parent->rb_left; + } } rb_link_node(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); @@ -111,31 +111,13 @@ static int __nft_rbtree_insert(const struct nft_set *set, static int nft_rbtree_insert(const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_rbtree_elem *rbe; - unsigned int size; + struct nft_rbtree_elem *rbe = elem->priv; int err; - size = sizeof(*rbe); - if (set->flags & NFT_SET_MAP && - !(elem->flags & NFT_SET_ELEM_INTERVAL_END)) - size += sizeof(rbe->data[0]); - - rbe = kzalloc(size, GFP_KERNEL); - if (rbe == NULL) - return -ENOMEM; - - rbe->flags = elem->flags; - nft_data_copy(&rbe->key, &elem->key); - if (set->flags & NFT_SET_MAP && - !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) - nft_data_copy(rbe->data, &elem->data); - spin_lock_bh(&nft_rbtree_lock); err = __nft_rbtree_insert(set, rbe); - if (err < 0) - kfree(rbe); - spin_unlock_bh(&nft_rbtree_lock); + return err; } @@ -143,42 
+125,49 @@ static void nft_rbtree_remove(const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->cookie; + struct nft_rbtree_elem *rbe = elem->priv; spin_lock_bh(&nft_rbtree_lock); rb_erase(&rbe->node, &priv->root); spin_unlock_bh(&nft_rbtree_lock); - kfree(rbe); } -static int nft_rbtree_get(const struct nft_set *set, struct nft_set_elem *elem) +static void nft_rbtree_activate(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_rbtree_elem *rbe = elem->priv; + + nft_set_elem_change_active(set, &rbe->ext); +} + +static void *nft_rbtree_deactivate(const struct nft_set *set, + const struct nft_set_elem *elem) { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; struct nft_rbtree_elem *rbe; + u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; - spin_lock_bh(&nft_rbtree_lock); while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = nft_data_cmp(&rbe->key, &elem->key, set->klen); + d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val, + set->klen); if (d < 0) parent = parent->rb_left; else if (d > 0) parent = parent->rb_right; else { - elem->cookie = rbe; - if (set->flags & NFT_SET_MAP && - !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) - nft_data_copy(&elem->data, rbe->data); - elem->flags = rbe->flags; - spin_unlock_bh(&nft_rbtree_lock); - return 0; + if (!nft_set_elem_active(&rbe->ext, genmask)) { + parent = parent->rb_left; + continue; + } + nft_set_elem_change_active(set, &rbe->ext); + return rbe; } } - spin_unlock_bh(&nft_rbtree_lock); - return -ENOENT; + return NULL; } static void nft_rbtree_walk(const struct nft_ctx *ctx, @@ -186,21 +175,21 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_set_iter *iter) { const struct nft_rbtree *priv = nft_set_priv(set); - const struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe; struct nft_set_elem 
elem; struct rb_node *node; + u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); spin_lock_bh(&nft_rbtree_lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (iter->count < iter->skip) goto cont; + if (!nft_set_elem_active(&rbe->ext, genmask)) + goto cont; - rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_data_copy(&elem.key, &rbe->key); - if (set->flags & NFT_SET_MAP && - !(rbe->flags & NFT_SET_ELEM_INTERVAL_END)) - nft_data_copy(&elem.data, rbe->data); - elem.flags = rbe->flags; + elem.priv = rbe; iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) { @@ -237,7 +226,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_rbtree_elem_destroy(set, rbe); + nft_set_elem_destroy(set, rbe); } } @@ -247,9 +236,6 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, unsigned int nsize; nsize = sizeof(struct nft_rbtree_elem); - if (features & NFT_SET_MAP) - nsize += FIELD_SIZEOF(struct nft_rbtree_elem, data[0]); - if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * nsize; else @@ -262,12 +248,14 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, static struct nft_set_ops nft_rbtree_ops __read_mostly = { .privsize = nft_rbtree_privsize, + .elemsize = offsetof(struct nft_rbtree_elem, ext), .estimate = nft_rbtree_estimate, .init = nft_rbtree_init, .destroy = nft_rbtree_destroy, .insert = nft_rbtree_insert, .remove = nft_rbtree_remove, - .get = nft_rbtree_get, + .deactivate = nft_rbtree_deactivate, + .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .features = NFT_SET_INTERVAL | NFT_SET_MAP, diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index d7e9e93a4e90..03f7bf40ae75 100644 --- 
a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -44,25 +44,28 @@ int nft_redir_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_redir *priv = nft_expr_priv(expr); + unsigned int plen; int err; err = nft_redir_validate(ctx, expr, NULL); if (err < 0) return err; + plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { priv->sreg_proto_min = - ntohl(nla_get_be32(tb[NFTA_REDIR_REG_PROTO_MIN])); + nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]); - err = nft_validate_input_register(priv->sreg_proto_min); + err = nft_validate_register_load(priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_REDIR_REG_PROTO_MAX]) { priv->sreg_proto_max = - ntohl(nla_get_be32(tb[NFTA_REDIR_REG_PROTO_MAX])); + nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]); - err = nft_validate_input_register(priv->sreg_proto_max); + err = nft_validate_register_load(priv->sreg_proto_max, + plen); if (err < 0) return err; } else { @@ -85,11 +88,11 @@ int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) const struct nft_redir *priv = nft_expr_priv(expr); if (priv->sreg_proto_min) { - if (nla_put_be32(skb, NFTA_REDIR_REG_PROTO_MIN, - htonl(priv->sreg_proto_min))) + if (nft_dump_register(skb, NFTA_REDIR_REG_PROTO_MIN, + priv->sreg_proto_min)) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_REDIR_REG_PROTO_MAX, - htonl(priv->sreg_proto_max))) + if (nft_dump_register(skb, NFTA_REDIR_REG_PROTO_MAX, + priv->sreg_proto_max)) goto nla_put_failure; } diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 57d3e1af5630..0522fc9bfb0a 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netfilter/nft_reject_inet.c 
b/net/netfilter/nft_reject_inet.c index 7b5f9d58680a..635dbba93d01 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -18,7 +18,7 @@ #include <net/netfilter/ipv6/nf_reject.h> static void nft_reject_inet_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], + struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -28,14 +28,16 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, case NFPROTO_IPV4: switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code); + nf_send_unreach(pkt->skb, priv->icmp_code, + pkt->ops->hooknum); break; case NFT_REJECT_TCP_RST: nf_send_reset(pkt->skb, pkt->ops->hooknum); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, - nft_reject_icmp_code(priv->icmp_code)); + nft_reject_icmp_code(priv->icmp_code), + pkt->ops->hooknum); break; } break; @@ -56,7 +58,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, } break; } - data[NFT_REG_VERDICT].verdict = NF_DROP; + + regs->verdict.code = NF_DROP; } static int nft_reject_inet_init(const struct nft_ctx *ctx, @@ -105,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb, if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 51a459c3c649..d324fe71260c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -658,35 +658,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user); struct xt_table_info *xt_alloc_table_info(unsigned int size) { - struct xt_table_info *newinfo; - int cpu; + struct xt_table_info *info = NULL; + size_t sz = sizeof(*info) + size; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); - if (!newinfo) - return 
NULL; - - newinfo->size = size; - - for_each_possible_cpu(cpu) { - if (size <= PAGE_SIZE) - newinfo->entries[cpu] = kmalloc_node(size, - GFP_KERNEL, - cpu_to_node(cpu)); - else - newinfo->entries[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - - if (newinfo->entries[cpu] == NULL) { - xt_free_table_info(newinfo); + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!info) { + info = vmalloc(sz); + if (!info) return NULL; - } } - - return newinfo; + memset(info, 0, sizeof(*info)); + info->size = size; + return info; } EXPORT_SYMBOL(xt_alloc_table_info); @@ -694,9 +682,6 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) - kvfree(info->entries[cpu]); - if (info->jumpstack != NULL) { for_each_possible_cpu(cpu) kvfree(info->jumpstack[cpu]); @@ -705,7 +690,7 @@ void xt_free_table_info(struct xt_table_info *info) free_percpu(info->stackptr); - kfree(info); + kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); @@ -947,11 +932,9 @@ static int xt_table_seq_show(struct seq_file *seq, void *v) { struct xt_table *table = list_entry(v, struct xt_table, list); - if (strlen(table->name)) { + if (*table->name) seq_printf(seq, "%s\n", table->name); - return seq_has_overflowed(seq); - } else - return 0; + return 0; } static const struct seq_operations xt_table_seq_ops = { @@ -1087,10 +1070,8 @@ static int xt_match_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; match = list_entry(trav->curr, struct xt_match, list); - if (*match->name == '\0') - return 0; - seq_printf(seq, "%s\n", match->name); - return seq_has_overflowed(seq); + if (*match->name) + seq_printf(seq, "%s\n", match->name); } return 0; } @@ -1142,10 +1123,8 @@ static int xt_target_seq_show(struct seq_file *seq, void *v) if (trav->curr == trav->head) return 0; target = list_entry(trav->curr, struct xt_target, list); - if (*target->name == '\0') - return 0; - seq_printf(seq, 
"%s\n", target->name); - return seq_has_overflowed(seq); + if (*target->name) + seq_printf(seq, "%s\n", target->name); } return 0; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 75747aecdebe..c6630030c912 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -184,7 +184,6 @@ out: static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { - struct nf_conntrack_tuple t; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -202,8 +201,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err1; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); + ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL); ret = PTR_ERR(ct); if (IS_ERR(ct)) goto err2; @@ -227,8 +225,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err3; } - - nf_conntrack_tmpl_insert(par->net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index f407ebc13481..29d2c31f406c 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e762de5ee89b..8c3190e2fc6a 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) "FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; @@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par) "FORWARD, 
OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } + if (par->nft_compat) + return 0; + xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 292934d23482..a747eb475b68 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -152,6 +152,7 @@ tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) fl6.daddr = info->gw.in6; fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 50e1e5aaf4ce..cca96cec1b68 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -42,15 +42,21 @@ enum nf_tproxy_lookup_t { static bool tproxy_sk_is_transparent(struct sock *sk) { - if (sk->sk_state != TCP_TIME_WAIT) { - if (inet_sk(sk)->transparent) - return true; - sock_put(sk); - } else { + switch (sk->sk_state) { + case TCP_TIME_WAIT: if (inet_twsk(sk)->tw_transparent) return true; - inet_twsk_put(inet_twsk(sk)); + break; + case TCP_NEW_SYN_RECV: + if (inet_rsk(inet_reqsk(sk))->no_srccheck) + return true; + break; + default: + if (inet_sk(sk)->transparent) + return true; } + + sock_gen_put(sk); return false; } @@ -266,7 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; } @@ -431,7 +437,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, tgi->lport ? 
tgi->lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; } diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index fab6eea1bf38..5b4743cc0436 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -73,7 +73,7 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; - if (rt->rt6i_flags & RTF_ANYCAST) + if (ipv6_anycast_destination((struct dst_entry *)rt, addr)) ret |= XT_ADDRTYPE_ANYCAST; dst_release(&rt->dst); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 7198d660b4de..a1d126f29463 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -39,7 +39,7 @@ cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info *info = par->matchinfo; - if (skb->sk == NULL) + if (skb->sk == NULL || !sk_fullsock(skb->sk)) return false; return (info->id == skb->sk->sk_classid) ^ info->invert; diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 23345238711b..ebd41dc501e5 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -23,6 +23,7 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); +MODULE_ALIAS("arpt_MARK"); static unsigned int mark_tg(struct sk_buff *skb, const struct xt_action_param *par) diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index f440f57a452f..1caaccbc306c 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -25,16 +25,15 @@ MODULE_ALIAS("ip6t_physdev"); static bool physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) { - static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct xt_physdev_info *info = par->matchinfo; + const 
struct net_device *physdev; unsigned long ret; const char *indev, *outdev; - const struct nf_bridge_info *nf_bridge; /* Not a bridged IP packet or no info available yet: * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if * the destination device will be a bridge. */ - if (!(nf_bridge = skb->nf_bridge)) { + if (!skb->nf_bridge) { /* Return MATCH if the invert flags of the used options are on */ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && !(info->invert & XT_PHYSDEV_OP_BRIDGED)) @@ -54,31 +53,41 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } + physdev = nf_bridge_get_physoutdev(skb); + outdev = physdev ? physdev->name : NULL; + /* This only makes sense in the FORWARD and POSTROUTING chains */ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && - (!!(nf_bridge->mask & BRNF_BRIDGED) ^ - !(info->invert & XT_PHYSDEV_OP_BRIDGED))) + (!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED))) return false; + physdev = nf_bridge_get_physindev(skb); + indev = physdev ? physdev->name : NULL; + if ((info->bitmask & XT_PHYSDEV_OP_ISIN && - (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) || + (!indev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) || (info->bitmask & XT_PHYSDEV_OP_ISOUT && - (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT)))) + (!outdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT)))) return false; if (!(info->bitmask & XT_PHYSDEV_OP_IN)) goto match_outdev; - indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; - ret = ifname_compare_aligned(indev, info->physindev, info->in_mask); - if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) - return false; + if (indev) { + ret = ifname_compare_aligned(indev, info->physindev, + info->in_mask); + + if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) + return false; + } match_outdev: if (!(info->bitmask & XT_PHYSDEV_OP_OUT)) return true; - outdev = nf_bridge->physoutdev ? 
- nf_bridge->physoutdev->name : nulldevname; + + if (!outdev) + return false; + ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask); return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 0d47afea9682..5669e5b453f4 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -9,14 +9,16 @@ */ /* Kernel module which implements the set match and SET target - * for netfilter/iptables. */ + * for netfilter/iptables. + */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> -#include <linux/netfilter/xt_set.h> +#include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_timeout.h> +#include <uapi/linux/netfilter/xt_set.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -52,6 +54,7 @@ static bool set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v0 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.u.compat.dim, info->match_set.u.compat.flags, 0, UINT_MAX); @@ -68,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info) info->u.compat.dim = IPSET_DIM_ZERO; if (info->u.flags[0] & IPSET_MATCH_INV) info->u.compat.flags |= IPSET_INV_MATCH; - for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) { info->u.compat.dim++; if (info->u.flags[i] & IPSET_SRC) - info->u.compat.flags |= (1<<info->u.compat.dim); + info->u.compat.flags |= (1 << info->u.compat.dim); } } @@ -88,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) info->match_set.index); return -ENOENT; } - if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; @@ -114,6 +117,7 @@ static bool 
set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, 0, UINT_MAX); @@ -178,9 +182,10 @@ static bool set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v3 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -193,7 +198,7 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) return ret; if (!match_counter0(opt.ext.packets, &info->packets)) - return 0; + return false; return match_counter0(opt.ext.bytes, &info->bytes); } @@ -224,9 +229,10 @@ static bool set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_set_info_match_v4 *info = par->matchinfo; + int ret; + ADT_OPT(opt, par->family, info->match_set.dim, info->match_set.flags, info->flags, UINT_MAX); - int ret; if (info->packets.op != IPSET_COUNTER_NONE || info->bytes.op != IPSET_COUNTER_NONE) @@ -239,7 +245,7 @@ set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) return ret; if (!match_counter(opt.ext.packets, &info->packets)) - return 0; + return false; return match_counter(opt.ext.bytes, &info->bytes); } @@ -252,6 +258,7 @@ static unsigned int set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v0 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, info->add_set.u.compat.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, @@ -290,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) return -ENOENT; } } - if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || - info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 
|| + info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) { pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); @@ -324,6 +331,7 @@ static unsigned int set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v1 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, 0, UINT_MAX); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -392,6 +400,7 @@ static unsigned int set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v2 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -399,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -418,6 +427,8 @@ static unsigned int set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; + int ret; + ADT_OPT(add_opt, par->family, info->add_set.dim, info->add_set.flags, info->flags, info->timeout); ADT_OPT(del_opt, par->family, info->del_set.dim, @@ -425,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) ADT_OPT(map_opt, par->family, info->map_set.dim, info->map_set.flags, 0, UINT_MAX); - int ret; - /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) 
- add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -456,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } - static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -496,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) !(par->hook_mask & (1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { - pr_warn("mapping of prio or/and queue is allowed only" - "from OUTPUT/FORWARD/POSTROUTING chains\n"); + pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); return -EINVAL; } index = ip_set_nfnl_get_byindex(par->net, @@ -518,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { - pr_warn("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -545,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par) ip_set_nfnl_put(par->net, info->map_set.index); } - static struct xt_match set_matches[] __read_mostly = { { .name = "set", diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 13332dbf291d..43e26c881100 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -129,13 +129,24 @@ xt_socket_get_sock_v4(struct net *net, const u8 protocol, return NULL; } -static bool -socket_match(const struct sk_buff *skb, struct xt_action_param *par, - const struct xt_socket_mtinfo1 *info) +static bool 
xt_socket_sk_is_transparent(struct sock *sk) +{ + switch (sk->sk_state) { + case TCP_TIME_WAIT: + return inet_twsk(sk)->tw_transparent; + + case TCP_NEW_SYN_RECV: + return inet_rsk(inet_reqsk(sk))->no_srccheck; + + default: + return inet_sk(sk)->transparent; + } +} + +static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb, + const struct net_device *indev) { const struct iphdr *iph = ip_hdr(skb); - struct udphdr _hdr, *hp = NULL; - struct sock *sk = skb->sk; __be32 uninitialized_var(daddr), uninitialized_var(saddr); __be16 uninitialized_var(dport), uninitialized_var(sport); u8 uninitialized_var(protocol); @@ -145,10 +156,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, #endif if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + struct udphdr _hdr, *hp; + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (hp == NULL) - return false; + return NULL; protocol = iph->protocol; saddr = iph->saddr; @@ -158,16 +171,17 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, - &sport, &dport)) - return false; + &sport, &dport)) + return NULL; } else { - return false; + return NULL; } #ifdef XT_SOCKET_HAVE_CONNTRACK - /* Do the lookup with the original socket address in case this is a - * reply packet of an established SNAT-ted connection. */ - + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. 
+ */ ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct) && ((iph->protocol != IPPROTO_ICMP && @@ -183,10 +197,19 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, } #endif + return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr, + sport, dport, indev); +} + +static bool +socket_match(const struct sk_buff *skb, struct xt_action_param *par, + const struct xt_socket_mtinfo1 *info) +{ + struct sk_buff *pskb = (struct sk_buff *)skb; + struct sock *sk = skb->sk; + if (!sk) - sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, - par->in); + sk = xt_socket_lookup_slow_v4(skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -195,16 +218,18 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, * unless XT_SOCKET_NOWILDCARD is set */ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && - sk->sk_state != TCP_TIME_WAIT && + sk_fullsock(sk) && inet_sk(sk)->inet_rcv_saddr == 0); /* Ignore non-transparent sockets, - if XT_SOCKET_TRANSPARENT is used */ + * if XT_SOCKET_TRANSPARENT is used + */ if (info->flags & XT_SOCKET_TRANSPARENT) - transparent = ((sk->sk_state != TCP_TIME_WAIT && - inet_sk(sk)->transparent) || - (sk->sk_state == TCP_TIME_WAIT && - inet_twsk(sk)->tw_transparent)); + transparent = xt_socket_sk_is_transparent(sk); + + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; if (sk != skb->sk) sock_gen_put(sk); @@ -213,12 +238,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, sk = NULL; } - pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n", - protocol, &saddr, ntohs(sport), - &daddr, ntohs(dport), - &iph->daddr, hp ? 
ntohs(hp->dest) : 0, sk); - - return (sk != NULL); + return sk != NULL; } static bool @@ -232,7 +252,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) } static bool -socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } @@ -315,28 +335,26 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol, return NULL; } -static bool -socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) +static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb, + const struct net_device *indev) { - struct ipv6hdr ipv6_var, *iph = ipv6_hdr(skb); - struct udphdr _hdr, *hp = NULL; - struct sock *sk = skb->sk; - const struct in6_addr *daddr = NULL, *saddr = NULL; __be16 uninitialized_var(dport), uninitialized_var(sport); - int thoff = 0, uninitialized_var(tproto); - const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + const struct in6_addr *daddr = NULL, *saddr = NULL; + struct ipv6hdr *iph = ipv6_hdr(skb); + int thoff = 0, tproto; tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { pr_debug("unable to find transport header in IPv6 packet, dropping\n"); - return NF_DROP; + return NULL; } if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) { - hp = skb_header_pointer(skb, thoff, - sizeof(_hdr), &_hdr); + struct udphdr _hdr, *hp; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); if (hp == NULL) - return false; + return NULL; saddr = &iph->saddr; sport = hp->source; @@ -344,17 +362,28 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) dport = hp->dest; } else if (tproto == IPPROTO_ICMPV6) { + struct ipv6hdr ipv6_var; + if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, &sport, &dport, &ipv6_var)) - return false; + return NULL; } else { - return false; + return NULL; } + return 
xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr, + sport, dport, indev); +} + +static bool +socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + struct sk_buff *pskb = (struct sk_buff *)skb; + struct sock *sk = skb->sk; + if (!sk) - sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto, - saddr, daddr, sport, dport, - par->in); + sk = xt_socket_lookup_slow_v6(skb, par->in); if (sk) { bool wildcard; bool transparent = true; @@ -363,16 +392,18 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) * unless XT_SOCKET_NOWILDCARD is set */ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && - sk->sk_state != TCP_TIME_WAIT && + sk_fullsock(sk) && ipv6_addr_any(&sk->sk_v6_rcv_saddr)); /* Ignore non-transparent sockets, - if XT_SOCKET_TRANSPARENT is used */ + * if XT_SOCKET_TRANSPARENT is used + */ if (info->flags & XT_SOCKET_TRANSPARENT) - transparent = ((sk->sk_state != TCP_TIME_WAIT && - inet_sk(sk)->transparent) || - (sk->sk_state == TCP_TIME_WAIT && - inet_twsk(sk)->tw_transparent)); + transparent = xt_socket_sk_is_transparent(sk); + + if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && + transparent) + pskb->mark = sk->sk_mark; if (sk != skb->sk) sock_gen_put(sk); @@ -381,13 +412,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) sk = NULL; } - pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu " - "(orig %pI6:%hu) sock %p\n", - tproto, saddr, ntohs(sport), - daddr, ntohs(dport), - &iph->daddr, hp ? 
ntohs(hp->dest) : 0, sk); - - return (sk != NULL); + return sk != NULL; } #endif @@ -413,6 +438,19 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par) return 0; } +static int socket_mt_v3_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo3 *info = + (struct xt_socket_mtinfo3 *)par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V3) { + pr_info("unknown flags 0x%x\n", + info->flags & ~XT_SOCKET_FLAGS_V3); + return -EINVAL; + } + return 0; +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -427,7 +465,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -439,7 +477,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -451,7 +489,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV4, - .match = socket_mt4_v1_v2, + .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -463,7 +501,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 2, .family = NFPROTO_IPV6, - .match = socket_mt6_v1_v2, + .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -471,6 +509,30 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .me = THIS_MODULE, }, #endif + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV4, + 
.match = socket_mt4_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 3, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2_v3, + .checkentry = socket_mt_v3_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif }; static int __init socket_mt_init(void) diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 5699adb97652..0bc3460319c8 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -26,13 +26,12 @@ static bool string_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_string_info *conf = par->matchinfo; - struct ts_state state; bool invert; invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, - conf->to_offset, conf->config, &state) + conf->to_offset, conf->config) != UINT_MAX) ^ invert; } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 70440748fe5c..13f777f20995 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -293,15 +293,13 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, return -ENOMEM; addr_struct.s_addr = iter4->addr; - ret_val = nla_put(skb, NLBL_MGMT_A_IPV4ADDR, - sizeof(struct in_addr), - &addr_struct); + ret_val = nla_put_in_addr(skb, NLBL_MGMT_A_IPV4ADDR, + addr_struct.s_addr); if (ret_val != 0) return ret_val; addr_struct.s_addr = iter4->mask; - ret_val = nla_put(skb, NLBL_MGMT_A_IPV4MASK, - sizeof(struct in_addr), - &addr_struct); + ret_val = nla_put_in_addr(skb, NLBL_MGMT_A_IPV4MASK, + addr_struct.s_addr); if (ret_val != 0) return ret_val; map4 = netlbl_domhsh_addr4_entry(iter4); @@ -328,14 +326,12 @@ static int netlbl_mgmt_listentry(struct sk_buff 
*skb, if (nla_b == NULL) return -ENOMEM; - ret_val = nla_put(skb, NLBL_MGMT_A_IPV6ADDR, - sizeof(struct in6_addr), - &iter6->addr); + ret_val = nla_put_in6_addr(skb, NLBL_MGMT_A_IPV6ADDR, + &iter6->addr); if (ret_val != 0) return ret_val; - ret_val = nla_put(skb, NLBL_MGMT_A_IPV6MASK, - sizeof(struct in6_addr), - &iter6->mask); + ret_val = nla_put_in6_addr(skb, NLBL_MGMT_A_IPV6MASK, + &iter6->mask); if (ret_val != 0) return ret_val; map6 = netlbl_domhsh_addr6_entry(iter6); diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index aec7994f78cf..b0380927f05f 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1117,34 +1117,30 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, struct in_addr addr_struct; addr_struct.s_addr = addr4->list.addr; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV4ADDR, - sizeof(struct in_addr), - &addr_struct); + ret_val = nla_put_in_addr(cb_arg->skb, + NLBL_UNLABEL_A_IPV4ADDR, + addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; addr_struct.s_addr = addr4->list.mask; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV4MASK, - sizeof(struct in_addr), - &addr_struct); + ret_val = nla_put_in_addr(cb_arg->skb, + NLBL_UNLABEL_A_IPV4MASK, + addr_struct.s_addr); if (ret_val != 0) goto list_cb_failure; secid = addr4->secid; } else { - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV6ADDR, - sizeof(struct in6_addr), - &addr6->list.addr); + ret_val = nla_put_in6_addr(cb_arg->skb, + NLBL_UNLABEL_A_IPV6ADDR, + &addr6->list.addr); if (ret_val != 0) goto list_cb_failure; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV6MASK, - sizeof(struct in6_addr), - &addr6->list.mask); + ret_val = nla_put_in6_addr(cb_arg->skb, + NLBL_UNLABEL_A_IPV6MASK, + &addr6->list.mask); if (ret_val != 0) goto list_cb_failure; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 05919bf3f670..d8e2e3918ce2 100644 --- a/net/netlink/af_netlink.c +++ 
b/net/netlink/af_netlink.c @@ -76,20 +76,21 @@ struct listeners { }; /* state bits */ -#define NETLINK_CONGESTED 0x0 +#define NETLINK_S_CONGESTED 0x0 /* flags */ -#define NETLINK_KERNEL_SOCKET 0x1 -#define NETLINK_RECV_PKTINFO 0x2 -#define NETLINK_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET; + return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; } -struct netlink_table *nl_table; +struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); @@ -116,6 +117,8 @@ static ATOMIC_NOTIFIER_HEAD(netlink_chain); static DEFINE_SPINLOCK(netlink_tap_lock); static struct list_head netlink_tap_all __read_mostly; +static const struct rhashtable_params netlink_rhashtable_params; + static inline u32 netlink_group_mask(u32 group) { return group ? 1 << (group - 1) : 0; @@ -155,7 +158,7 @@ static int __netlink_remove_tap(struct netlink_tap *nt) out: spin_unlock(&netlink_tap_lock); - if (found && nt->module) + if (found) module_put(nt->module); return found ? 
0 : -ENODEV; @@ -254,8 +257,9 @@ static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_S_CONGESTED, + &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); } @@ -268,8 +272,8 @@ static void netlink_rcv_wake(struct sock *sk) struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(NETLINK_CONGESTED, &nlk->state); - if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + clear_bit(NETLINK_S_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } @@ -353,25 +357,52 @@ err1: return NULL; } + +static void +__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, + unsigned int order) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct sk_buff_head *queue; + struct netlink_ring *ring; + + queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); +} + static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, - bool closing, bool tx_ring) + bool tx_ring) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ring *ring; - struct sk_buff_head *queue; void **pg_vec = NULL; unsigned int order = 0; - int err; ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; - queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; - if (!closing) { - if (atomic_read(&nlk->mapped)) - return -EBUSY; - if (atomic_read(&ring->pending)) - return -EBUSY; - } + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; if (req->nm_block_nr) { if (ring->pg_vec != NULL) @@ -403,31 +434,19 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, return -EINVAL; } - err = -EBUSY; mutex_lock(&nlk->pg_vec_lock); - if (closing || atomic_read(&nlk->mapped) == 0) { - err = 0; - spin_lock_bh(&queue->lock); - - ring->frame_max = req->nm_frame_nr - 1; - ring->head = 0; - ring->frame_size = req->nm_frame_size; - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; - - swap(ring->pg_vec_len, req->nm_block_nr); - swap(ring->pg_vec_order, order); - swap(ring->pg_vec, pg_vec); - - __skb_queue_purge(queue); - spin_unlock_bh(&queue->lock); - - WARN_ON(atomic_read(&nlk->mapped)); + if (atomic_read(&nlk->mapped) == 0) { + __netlink_set_ring(sk, req, tx_ring, pg_vec, order); + mutex_unlock(&nlk->pg_vec_lock); + return 0; } + mutex_unlock(&nlk->pg_vec_lock); if (pg_vec) free_pg_vec(pg_vec, order, req->nm_block_nr); - return err; + + return -EBUSY; } static void netlink_mm_open(struct vm_area_struct *vma) @@ -896,10 +915,10 @@ static void netlink_sock_destruct(struct sock *sk) memset(&req, 0, sizeof(req)); if (nlk->rx_ring.pg_vec) - netlink_set_ring(sk, &req, true, false); + __netlink_set_ring(sk, &req, false, NULL, 0); memset(&req, 0, sizeof(req)); if (nlk->tx_ring.pg_vec) - netlink_set_ring(sk, &req, true, true); + __netlink_set_ring(sk, &req, true, NULL, 0); } #endif /* CONFIG_NETLINK_MMAP */ @@ -970,41 +989,50 @@ netlink_unlock_table(void) struct netlink_compare_arg { - struct net *net; + possible_net_t pnet; u32 portid; }; -static bool netlink_compare(void *ptr, void *arg) +/* Doing sizeof directly may yield 4 extra bytes on 64-bit. 
*/ +#define netlink_compare_arg_len \ + (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) + +static inline int netlink_compare(struct rhashtable_compare_arg *arg, + const void *ptr) { - struct netlink_compare_arg *x = arg; - struct sock *sk = ptr; + const struct netlink_compare_arg *x = arg->key; + const struct netlink_sock *nlk = ptr; + + return nlk->portid != x->portid || + !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); +} - return nlk_sk(sk)->portid == x->portid && - net_eq(sock_net(sk), x->net); +static void netlink_compare_arg_init(struct netlink_compare_arg *arg, + struct net *net, u32 portid) +{ + memset(arg, 0, sizeof(*arg)); + write_pnet(&arg->pnet, net); + arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { - struct netlink_compare_arg arg = { - .net = net, - .portid = portid, - }; + struct netlink_compare_arg arg; - return rhashtable_lookup_compare(&table->hash, &portid, - &netlink_compare, &arg); + netlink_compare_arg_init(&arg, net, portid); + return rhashtable_lookup_fast(&table->hash, &arg, + netlink_rhashtable_params); } -static bool __netlink_insert(struct netlink_table *table, struct sock *sk) +static int __netlink_insert(struct netlink_table *table, struct sock *sk) { - struct netlink_compare_arg arg = { - .net = sock_net(sk), - .portid = nlk_sk(sk)->portid, - }; + struct netlink_compare_arg arg; - return rhashtable_lookup_compare_insert(&table->hash, - &nlk_sk(sk)->node, - &netlink_compare, &arg); + netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); + return rhashtable_lookup_insert_key(&table->hash, &arg, + &nlk_sk(sk)->node, + netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) @@ -1066,9 +1094,11 @@ static int netlink_insert(struct sock *sk, u32 portid) nlk_sk(sk)->portid = portid; sock_hold(sk); - err = 0; - if (!__netlink_insert(table, sk)) { - err = -EADDRINUSE; + err = 
__netlink_insert(table, sk); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + nlk_sk(sk)->portid = 0; sock_put(sk); } @@ -1082,7 +1112,8 @@ static void netlink_remove(struct sock *sk) struct netlink_table *table; table = &nl_table[sk->sk_protocol]; - if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node)) { + if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, + netlink_rhashtable_params)) { WARN_ON(atomic_read(&sk->sk_refcnt) == 1); __sock_put(sk); } @@ -1104,14 +1135,15 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) + struct mutex *cb_mutex, int protocol, + int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; @@ -1173,7 +1205,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol); + err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; @@ -1283,20 +1315,24 @@ static int netlink_autobind(struct socket *sock) struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; - static s32 rover = -4097; + s32 rover = -4096; + bool ok; retry: cond_resched(); rcu_read_lock(); - if (__netlink_lookup(table, portid, net)) { + ok = !__netlink_lookup(table, portid, net); + rcu_read_unlock(); + if (!ok) { /* Bind collision, search negative portid values. 
*/ - portid = rover--; - if (rover > -4097) + if (rover == -4096) + /* rover will be in range [S32_MIN, -4097] */ + rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); + else if (rover >= -4096) rover = -4097; - rcu_read_unlock(); + portid = rover--; goto retry; } - rcu_read_unlock(); err = netlink_insert(sk, portid); if (err == -EADDRINUSE) @@ -1616,13 +1652,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size, if (data == NULL) return NULL; - skb = build_skb(data, size); + skb = __build_skb(data, size); if (skb == NULL) vfree(data); - else { - skb->head_frag = 0; + else skb->destructor = netlink_skb_destructor; - } return skb; } @@ -1645,7 +1679,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { @@ -1660,7 +1694,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) && + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); @@ -1884,7 +1918,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && - !test_bit(NETLINK_CONGESTED, &nlk->state)) { + !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); @@ -1920,8 +1954,17 @@ static void do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) return; - if (!net_eq(sock_net(sk), p->net)) - return; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if 
(!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); @@ -1945,23 +1988,33 @@ static void do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR) + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } +out: sock_put(sk); } @@ -2046,7 +2099,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { ret = 1; goto out; } @@ -2065,7 +2118,7 @@ out: * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the - * NETLINK_RECV_NO_ENOBUFS socket option. + * NETLINK_NO_ENOBUFS socket option. 
*/ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { @@ -2125,9 +2178,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: if (val) - nlk->flags |= NETLINK_RECV_PKTINFO; + nlk->flags |= NETLINK_F_RECV_PKTINFO; else - nlk->flags &= ~NETLINK_RECV_PKTINFO; + nlk->flags &= ~NETLINK_F_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: @@ -2156,18 +2209,18 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } case NETLINK_BROADCAST_ERROR: if (val) - nlk->flags |= NETLINK_BROADCAST_SEND_ERROR; + nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; else - nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; + nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; err = 0; break; case NETLINK_NO_ENOBUFS: if (val) { - nlk->flags |= NETLINK_RECV_NO_ENOBUFS; - clear_bit(NETLINK_CONGESTED, &nlk->state); + nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; + clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { - nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS; + nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } err = 0; break; @@ -2185,11 +2238,21 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; if (copy_from_user(&req, optval, sizeof(req))) return -EFAULT; - err = netlink_set_ring(sk, &req, false, + err = netlink_set_ring(sk, &req, optname == NETLINK_TX_RING); break; } #endif /* CONFIG_NETLINK_MMAP */ + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) + return -EPERM; + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2216,7 +2279,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 
1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2226,7 +2289,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0; + val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; @@ -2236,12 +2299,34 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) return -EINVAL; len = sizeof(int); - val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0; + val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; + case NETLINK_LIST_MEMBERSHIPS: { + int pos, idx, shift; + + err = 0; + netlink_table_grab(); + for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { + if (len - pos < sizeof(u32)) + break; + + idx = pos / sizeof(unsigned long); + shift = (pos % sizeof(unsigned long)) * 8; + if (put_user((u32)(nlk->groups[idx] >> shift), + (u32 __user *)(optval + pos))) { + err = -EFAULT; + break; + } + } + if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + err = -EFAULT; + netlink_table_ungrab(); + break; + } default: err = -ENOPROTOOPT; } @@ -2256,8 +2341,17 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } -static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + +static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); @@ 
-2346,8 +2440,7 @@ out: return err; } -static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len, +static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; @@ -2409,8 +2502,10 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_RECV_PKTINFO) + if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); @@ -2464,17 +2559,10 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* - * We have to just have a reference on the net from sk, but don't - * get_net it. Besides, we cannot get and then put the net here. - * So we create one inside init_net and the move it to net. 
- */ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; - sk_change_net(sk, net); if (!cfg || cfg->groups < 32) groups = 32; @@ -2493,7 +2581,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_KERNEL_SOCKET; + nlk->flags |= NETLINK_F_KERNEL_SOCKET; netlink_table_grab(); if (!nl_table[unit].registered) { @@ -2530,7 +2618,10 @@ EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { - sk_release_kernel(sk); + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); @@ -3116,17 +3207,27 @@ static struct pernet_operations __net_initdata netlink_net_ops = { .exit = netlink_net_exit, }; +static inline u32 netlink_hash(const void *data, u32 len, u32 seed) +{ + const struct netlink_sock *nlk = data; + struct netlink_compare_arg arg; + + netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); + return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); +} + +static const struct rhashtable_params netlink_rhashtable_params = { + .head_offset = offsetof(struct netlink_sock, node), + .key_len = netlink_compare_arg_len, + .obj_hashfn = netlink_hash, + .obj_cmpfn = netlink_compare, + .automatic_shrinking = true, +}; + static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); - struct rhashtable_params ht_params = { - .head_offset = offsetof(struct netlink_sock, node), - .key_offset = offsetof(struct netlink_sock, portid), - .key_len = sizeof(u32), /* portid */ - .hashfn = jhash, - .max_shift = 16, /* 64K */ - }; if (err != 0) goto out; @@ -3138,7 +3239,8 @@ static int __init netlink_proto_init(void) goto panic; for (i = 0; i < MAX_LINKS; i++) { - if (rhashtable_init(&nl_table[i].hash, &ht_params) < 0) { + if 
(rhashtable_init(&nl_table[i].hash, + &netlink_rhashtable_params) < 0) { while (--i > 0) rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 69f1d5e9959f..ed212ffc1d9d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto); + sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; @@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; @@ -1023,8 +1023,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) return 1; } -static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int nr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -1133,8 +1132,8 @@ out: return err; } -static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int nr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name); diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 6ae063cebf7d..988f542481a8 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -65,36 +65,6 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) return 1; } -#ifdef CONFIG_INET - -static int nr_rebuild_header(struct sk_buff *skb) -{ - unsigned char *bp = skb->data; - - if (arp_find(bp + 7, skb)) - return 1; - - bp[6] &= ~AX25_CBIT; - bp[6] &= 
~AX25_EBIT; - bp[6] |= AX25_SSSID_SPARE; - bp += AX25_ADDR_LEN; - - bp[6] &= ~AX25_CBIT; - bp[6] |= AX25_EBIT; - bp[6] |= AX25_SSSID_SPARE; - - return 0; -} - -#else - -static int nr_rebuild_header(struct sk_buff *skb) -{ - return 1; -} - -#endif - static int nr_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) @@ -188,7 +158,6 @@ static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev) static const struct header_ops nr_header_ops = { .create = nr_header, - .rebuild= nr_rebuild_header, }; static const struct net_device_ops nr_netdev_ops = { diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 96b64d2f6dbf..d72a4f1558f2 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -31,7 +31,6 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> -#include <linux/netfilter.h> #include <linux/init.h> #include <linux/spinlock.h> #include <net/netrom.h> diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 2277276f52bc..54e40fa47822 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { - rc = proto_tab[proto]->create(net, sock, proto_tab[proto]); + rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index de1789e3cc82..1f68724d44d3 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction); /* Sock API */ -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp); +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern); void nfc_llcp_sock_free(struct nfc_llcp_sock *sock); 
void nfc_llcp_accept_unlink(struct sock *sk); void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index b18f07ccb504..98876274a1ee 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, sock->ssap = ssap; } - new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC); + new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index e181e290427c..b7de0da46acd 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -750,8 +750,8 @@ error: return ret; } -static int llcp_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -793,8 +793,8 @@ static int llcp_sock_sendmsg(struct kiocb *iocb, struct socket *sock, return nfc_llcp_send_i_frame(llcp_sock, msg, len); } -static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk) } } -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp) +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; - sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto); + sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; @@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) } static int llcp_sock_create(struct 
net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock, else sock->ops = &llcp_sock_ops; - sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC); + sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/nfc/nci/Kconfig b/net/nfc/nci/Kconfig index a4f1e42e3481..901c1ddba841 100644 --- a/net/nfc/nci/Kconfig +++ b/net/nfc/nci/Kconfig @@ -19,3 +19,10 @@ config NFC_NCI_SPI an NFC Controller (NFCC) and a Device Host (DH). Say yes if you use an NCI driver that requires SPI link layer. + +config NFC_NCI_UART + depends on NFC_NCI && TTY + tristate "NCI over UART protocol support" + default n + help + Say yes if you use an NCI driver that requires UART link layer. diff --git a/net/nfc/nci/Makefile b/net/nfc/nci/Makefile index 7ed8949266cc..b4b85b82e988 100644 --- a/net/nfc/nci/Makefile +++ b/net/nfc/nci/Makefile @@ -7,3 +7,6 @@ obj-$(CONFIG_NFC_NCI) += nci.o nci-objs := core.o data.o lib.o ntf.o rsp.o hci.o nci-$(CONFIG_NFC_NCI_SPI) += spi.o + +nci_uart-y += uart.o +obj-$(CONFIG_NFC_NCI_UART) += nci_uart.o diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 9575a1892607..95af2d24d5be 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -28,6 +28,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/module.h> +#include <linux/kernel.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/completion.h> @@ -73,6 +74,7 @@ void nci_req_complete(struct nci_dev *ndev, int result) complete(&ndev->req_completion); } } +EXPORT_SYMBOL(nci_req_complete); static void nci_req_cancel(struct nci_dev *ndev, int err) { @@ -323,6 +325,32 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) sizeof(struct nci_rf_deactivate_cmd), &cmd); } +struct nci_prop_cmd_param { + __u16 opcode; + size_t 
len; + __u8 *payload; +}; + +static void nci_prop_cmd_req(struct nci_dev *ndev, unsigned long opt) +{ + struct nci_prop_cmd_param *param = (struct nci_prop_cmd_param *)opt; + + nci_send_cmd(ndev, param->opcode, param->len, param->payload); +} + +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) +{ + struct nci_prop_cmd_param param; + + param.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY, oid); + param.len = len; + param.payload = payload; + + return __nci_request(ndev, nci_prop_cmd_req, (unsigned long)&param, + msecs_to_jiffies(NCI_CMD_TIMEOUT)); +} +EXPORT_SYMBOL(nci_prop_cmd); + static int nci_open_device(struct nci_dev *ndev) { int rc = 0; @@ -343,11 +371,17 @@ static int nci_open_device(struct nci_dev *ndev) set_bit(NCI_INIT, &ndev->flags); - rc = __nci_request(ndev, nci_reset_req, 0, - msecs_to_jiffies(NCI_RESET_TIMEOUT)); + if (ndev->ops->init) + rc = ndev->ops->init(ndev); + + if (!rc) { + rc = __nci_request(ndev, nci_reset_req, 0, + msecs_to_jiffies(NCI_RESET_TIMEOUT)); + } - if (ndev->ops->setup) - ndev->ops->setup(ndev); + if (!rc && ndev->ops->setup) { + rc = ndev->ops->setup(ndev); + } if (!rc) { rc = __nci_request(ndev, nci_init_req, 0, @@ -407,6 +441,12 @@ static int nci_close_device(struct nci_dev *ndev) set_bit(NCI_INIT, &ndev->flags); __nci_request(ndev, nci_reset_req, 0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); + + /* After this point our queues are empty + * and no works are scheduled. + */ + ndev->ops->close(ndev); + clear_bit(NCI_INIT, &ndev->flags); del_timer_sync(&ndev->cmd_timer); @@ -414,10 +454,6 @@ static int nci_close_device(struct nci_dev *ndev) /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); - /* After this point our queues are empty - * and no works are scheduled. 
*/ - ndev->ops->close(ndev); - /* Clear flags */ ndev->flags = 0; @@ -762,7 +798,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) { nci_request(ndev, nci_rf_deactivate_req, - NCI_DEACTIVATE_TYPE_SLEEP_MODE, + NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } } @@ -907,6 +943,16 @@ static int nci_se_io(struct nfc_dev *nfc_dev, u32 se_idx, return 0; } +static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) +{ + struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); + + if (!ndev->ops->fw_download) + return -ENOTSUPP; + + return ndev->ops->fw_download(ndev, firmware_name); +} + static struct nfc_ops nci_nfc_ops = { .dev_up = nci_dev_up, .dev_down = nci_dev_down, @@ -922,6 +968,7 @@ static struct nfc_ops nci_nfc_ops = { .disable_se = nci_disable_se, .discover_se = nci_discover_se, .se_io = nci_se_io, + .fw_download = nci_fw_download, }; /* ---- Interface to NCI drivers ---- */ @@ -950,6 +997,14 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, return NULL; ndev->ops = ops; + + if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) { + pr_err("Too many proprietary commands: %zd\n", + ops->n_prop_ops); + ops->prop_ops = NULL; + ops->n_prop_ops = 0; + } + ndev->tx_headroom = tx_headroom; ndev->tx_tailroom = tx_tailroom; init_completion(&ndev->req_completion); @@ -1154,6 +1209,49 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) return 0; } +/* Proprietary commands API */ +static struct nci_prop_ops *prop_cmd_lookup(struct nci_dev *ndev, + __u16 opcode) +{ + size_t i; + struct nci_prop_ops *prop_op; + + if (!ndev->ops->prop_ops || !ndev->ops->n_prop_ops) + return NULL; + + for (i = 0; i < ndev->ops->n_prop_ops; i++) { + prop_op = &ndev->ops->prop_ops[i]; + if (prop_op->opcode == opcode) + return prop_op; + } + + return NULL; +} + +int nci_prop_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, + struct sk_buff *skb) +{ + struct 
nci_prop_ops *prop_op; + + prop_op = prop_cmd_lookup(ndev, rsp_opcode); + if (!prop_op || !prop_op->rsp) + return -ENOTSUPP; + + return prop_op->rsp(ndev, skb); +} + +int nci_prop_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, + struct sk_buff *skb) +{ + struct nci_prop_ops *prop_op; + + prop_op = prop_cmd_lookup(ndev, ntf_opcode); + if (!prop_op || !prop_op->ntf) + return -ENOTSUPP; + + return prop_op->ntf(ndev, skb); +} + /* ---- NCI TX Data worker thread ---- */ static void nci_tx_work(struct work_struct *work) diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index ed54ec533836..af002df640c7 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -639,22 +639,19 @@ int nci_hci_dev_session_init(struct nci_dev *ndev) ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) - goto exit; + return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) - goto exit; + return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && - memcmp(ndev->hci_dev->init_data.session_id, - skb->data, skb->len) == 0 && + !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. 
*/ r = ndev->ops->hci_load_session(ndev); - if (r < 0) - goto exit; } else { r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, @@ -667,8 +664,6 @@ int nci_hci_dev_session_init(struct nci_dev *ndev) ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } - if (r == 0) - goto exit; exit: kfree_skb(skb); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 3218071072ac..5d1c2e391c56 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -758,6 +758,15 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); + if (nci_opcode_gid(ntf_opcode) == NCI_GID_PROPRIETARY) { + if (nci_prop_ntf_packet(ndev, ntf_opcode, skb)) { + pr_err("unsupported ntf opcode 0x%x\n", + ntf_opcode); + } + + goto end; + } + switch (ntf_opcode) { case NCI_OP_CORE_CONN_CREDITS_NTF: nci_core_conn_credits_ntf_packet(ndev, skb); @@ -796,5 +805,6 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) break; } +end: kfree_skb(skb); } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index 02486bc2ceea..408bd8f857ab 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -296,6 +296,15 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci control header */ skb_pull(skb, NCI_CTRL_HDR_SIZE); + if (nci_opcode_gid(rsp_opcode) == NCI_GID_PROPRIETARY) { + if (nci_prop_rsp_packet(ndev, rsp_opcode, skb) == -ENOTSUPP) { + pr_err("unsupported rsp opcode 0x%x\n", + rsp_opcode); + } + + goto end; + } + switch (rsp_opcode) { case NCI_OP_CORE_RESET_RSP: nci_core_reset_rsp_packet(ndev, skb); @@ -346,6 +355,7 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) break; } +end: kfree_skb(skb); /* trigger the next cmd */ diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c new file mode 100644 index 000000000000..21d8875673a4 --- /dev/null +++ b/net/nfc/nci/uart.c @@ -0,0 +1,494 @@ +/* + * Copyright (C) 2015, Marvell International 
Ltd. + * + * This software file (the "File") is distributed by Marvell International + * Ltd. under the terms of the GNU General Public License Version 2, June 1991 + * (the "License"). You may use, redistribute and/or modify this File in + * accordance with the terms and conditions of the License, a copy of which + * is available on the worldwide web at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. + * + * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE + * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY DISCLAIMED. The License provides additional details about + * this warranty disclaimer. + */ + +/* Inspired (hugely) by HCI LDISC implementation in Bluetooth. + * + * Copyright (C) 2000-2001 Qualcomm Incorporated + * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> + * Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org> + */ + +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/interrupt.h> +#include <linux/ptrace.h> +#include <linux/poll.h> + +#include <linux/slab.h> +#include <linux/tty.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/signal.h> +#include <linux/ioctl.h> +#include <linux/skbuff.h> + +#include <net/nfc/nci.h> +#include <net/nfc/nci_core.h> + +/* TX states */ +#define NCI_UART_SENDING 1 +#define NCI_UART_TX_WAKEUP 2 + +static struct nci_uart *nci_uart_drivers[NCI_UART_DRIVER_MAX]; + +static inline struct sk_buff *nci_uart_dequeue(struct nci_uart *nu) +{ + struct sk_buff *skb = nu->tx_skb; + + if (!skb) + skb = skb_dequeue(&nu->tx_q); + else + nu->tx_skb = NULL; + + return skb; +} + +static inline int nci_uart_queue_empty(struct nci_uart *nu) +{ + if (nu->tx_skb) + return 0; + + return skb_queue_empty(&nu->tx_q); +} + +static int nci_uart_tx_wakeup(struct nci_uart *nu) +{ + if (test_and_set_bit(NCI_UART_SENDING, &nu->tx_state)) { + 
set_bit(NCI_UART_TX_WAKEUP, &nu->tx_state); + return 0; + } + + schedule_work(&nu->write_work); + + return 0; +} + +static void nci_uart_write_work(struct work_struct *work) +{ + struct nci_uart *nu = container_of(work, struct nci_uart, write_work); + struct tty_struct *tty = nu->tty; + struct sk_buff *skb; + +restart: + clear_bit(NCI_UART_TX_WAKEUP, &nu->tx_state); + + if (nu->ops.tx_start) + nu->ops.tx_start(nu); + + while ((skb = nci_uart_dequeue(nu))) { + int len; + + set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); + len = tty->ops->write(tty, skb->data, skb->len); + skb_pull(skb, len); + if (skb->len) { + nu->tx_skb = skb; + break; + } + kfree_skb(skb); + } + + if (test_bit(NCI_UART_TX_WAKEUP, &nu->tx_state)) + goto restart; + + if (nu->ops.tx_done && nci_uart_queue_empty(nu)) + nu->ops.tx_done(nu); + + clear_bit(NCI_UART_SENDING, &nu->tx_state); +} + +static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver) +{ + struct nci_uart *nu = NULL; + int ret; + + if (driver >= NCI_UART_DRIVER_MAX) + return -EINVAL; + + if (!nci_uart_drivers[driver]) + return -ENOENT; + + nu = kzalloc(sizeof(*nu), GFP_KERNEL); + if (!nu) + return -ENOMEM; + + memcpy(nu, nci_uart_drivers[driver], sizeof(struct nci_uart)); + nu->tty = tty; + tty->disc_data = nu; + skb_queue_head_init(&nu->tx_q); + INIT_WORK(&nu->write_work, nci_uart_write_work); + spin_lock_init(&nu->rx_lock); + + ret = nu->ops.open(nu); + if (ret) { + tty->disc_data = NULL; + kfree(nu); + } else if (!try_module_get(nu->owner)) { + nu->ops.close(nu); + tty->disc_data = NULL; + kfree(nu); + return -ENOENT; + } + return ret; +} + +/* ------ LDISC part ------ */ + +/* nci_uart_tty_open + * + * Called when line discipline changed to NCI_UART. 
+ * + * Arguments: + * tty pointer to tty info structure + * Return Value: + * 0 if success, otherwise error code + */ +static int nci_uart_tty_open(struct tty_struct *tty) +{ + /* Error if the tty has no write op instead of leaving an exploitable + * hole + */ + if (!tty->ops->write) + return -EOPNOTSUPP; + + tty->disc_data = NULL; + tty->receive_room = 65536; + + /* Flush any pending characters in the driver and line discipline. */ + + /* FIXME: why is this needed. Note don't use ldisc_ref here as the + * open path is before the ldisc is referencable. + */ + + if (tty->ldisc->ops->flush_buffer) + tty->ldisc->ops->flush_buffer(tty); + tty_driver_flush_buffer(tty); + + return 0; +} + +/* nci_uart_tty_close() + * + * Called when the line discipline is changed to something + * else, the tty is closed, or the tty detects a hangup. + */ +static void nci_uart_tty_close(struct tty_struct *tty) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + /* Detach from the tty */ + tty->disc_data = NULL; + + if (!nu) + return; + + if (nu->tx_skb) + kfree_skb(nu->tx_skb); + if (nu->rx_skb) + kfree_skb(nu->rx_skb); + + skb_queue_purge(&nu->tx_q); + + nu->ops.close(nu); + nu->tty = NULL; + module_put(nu->owner); + + cancel_work_sync(&nu->write_work); + + kfree(nu); +} + +/* nci_uart_tty_wakeup() + * + * Callback for transmit wakeup. Called when low level + * device driver can accept more send data. + * + * Arguments: tty pointer to associated tty instance data + * Return Value: None + */ +static void nci_uart_tty_wakeup(struct tty_struct *tty) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + if (!nu) + return; + + clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); + + if (tty != nu->tty) + return; + + nci_uart_tx_wakeup(nu); +} + +/* nci_uart_tty_receive() + * + * Called by tty low level driver when receive data is + * available. 
+ * + * Arguments: tty pointer to tty isntance data + * data pointer to received data + * flags pointer to flags for data + * count count of received data in bytes + * + * Return Value: None + */ +static void nci_uart_tty_receive(struct tty_struct *tty, const u8 *data, + char *flags, int count) +{ + struct nci_uart *nu = (void *)tty->disc_data; + + if (!nu || tty != nu->tty) + return; + + spin_lock(&nu->rx_lock); + nu->ops.recv_buf(nu, (void *)data, flags, count); + spin_unlock(&nu->rx_lock); + + tty_unthrottle(tty); +} + +/* nci_uart_tty_ioctl() + * + * Process IOCTL system call for the tty device. + * + * Arguments: + * + * tty pointer to tty instance data + * file pointer to open file object for device + * cmd IOCTL command code + * arg argument for IOCTL call (cmd dependent) + * + * Return Value: Command dependent + */ +static int nci_uart_tty_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct nci_uart *nu = (void *)tty->disc_data; + int err = 0; + + switch (cmd) { + case NCIUARTSETDRIVER: + if (!nu) + return nci_uart_set_driver(tty, (unsigned int)arg); + else + return -EBUSY; + break; + default: + err = n_tty_ioctl_helper(tty, file, cmd, arg); + break; + } + + return err; +} + +/* We don't provide read/write/poll interface for user space. 
*/ +static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file, + unsigned char __user *buf, size_t nr) +{ + return 0; +} + +static ssize_t nci_uart_tty_write(struct tty_struct *tty, struct file *file, + const unsigned char *data, size_t count) +{ + return 0; +} + +static unsigned int nci_uart_tty_poll(struct tty_struct *tty, + struct file *filp, poll_table *wait) +{ + return 0; +} + +static int nci_uart_send(struct nci_uart *nu, struct sk_buff *skb) +{ + /* Queue TX packet */ + skb_queue_tail(&nu->tx_q, skb); + + /* Try to start TX (if possible) */ + nci_uart_tx_wakeup(nu); + + return 0; +} + +/* -- Default recv_buf handler -- + * + * This handler supposes that NCI frames are sent over UART link without any + * framing. It reads NCI header, retrieve the packet size and once all packet + * bytes are received it passes it to nci_uart driver for processing. + */ +static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, + char *flags, int count) +{ + int chunk_len; + + if (!nu->ndev) { + nfc_err(nu->tty->dev, + "receive data from tty but no NCI dev is attached yet, drop buffer\n"); + return 0; + } + + /* Decode all incoming data in packets + * and enqueue then for processing. + */ + while (count > 0) { + /* If this is the first data of a packet, allocate a buffer */ + if (!nu->rx_skb) { + nu->rx_packet_len = -1; + nu->rx_skb = nci_skb_alloc(nu->ndev, + NCI_MAX_PACKET_SIZE, + GFP_KERNEL); + if (!nu->rx_skb) + return -ENOMEM; + } + + /* Eat byte after byte till full packet header is received */ + if (nu->rx_skb->len < NCI_CTRL_HDR_SIZE) { + *skb_put(nu->rx_skb, 1) = *data++; + --count; + continue; + } + + /* Header was received but packet len was not read */ + if (nu->rx_packet_len < 0) + nu->rx_packet_len = NCI_CTRL_HDR_SIZE + + nci_plen(nu->rx_skb->data); + + /* Compute how many bytes are missing and how many bytes can + * be consumed. 
+ */ + chunk_len = nu->rx_packet_len - nu->rx_skb->len; + if (count < chunk_len) + chunk_len = count; + memcpy(skb_put(nu->rx_skb, chunk_len), data, chunk_len); + data += chunk_len; + count -= chunk_len; + + /* Chcek if packet is fully received */ + if (nu->rx_packet_len == nu->rx_skb->len) { + /* Pass RX packet to driver */ + if (nu->ops.recv(nu, nu->rx_skb) != 0) + nfc_err(nu->tty->dev, "corrupted RX packet\n"); + /* Next packet will be a new one */ + nu->rx_skb = NULL; + } + } + + return 0; +} + +/* -- Default recv handler -- */ +static int nci_uart_default_recv(struct nci_uart *nu, struct sk_buff *skb) +{ + return nci_recv_frame(nu->ndev, skb); +} + +int nci_uart_register(struct nci_uart *nu) +{ + if (!nu || !nu->ops.open || + !nu->ops.recv || !nu->ops.close) + return -EINVAL; + + /* Set the send callback */ + nu->ops.send = nci_uart_send; + + /* Install default handlers if not overridden */ + if (!nu->ops.recv_buf) + nu->ops.recv_buf = nci_uart_default_recv_buf; + if (!nu->ops.recv) + nu->ops.recv = nci_uart_default_recv; + + /* Add this driver in the driver list */ + if (nci_uart_drivers[nu->driver]) { + pr_err("driver %d is already registered\n", nu->driver); + return -EBUSY; + } + nci_uart_drivers[nu->driver] = nu; + + pr_info("NCI uart driver '%s [%d]' registered\n", nu->name, nu->driver); + + return 0; +} +EXPORT_SYMBOL_GPL(nci_uart_register); + +void nci_uart_unregister(struct nci_uart *nu) +{ + pr_info("NCI uart driver '%s [%d]' unregistered\n", nu->name, + nu->driver); + + /* Remove this driver from the driver list */ + nci_uart_drivers[nu->driver] = NULL; +} +EXPORT_SYMBOL_GPL(nci_uart_unregister); + +void nci_uart_set_config(struct nci_uart *nu, int baudrate, int flow_ctrl) +{ + struct ktermios new_termios; + + if (!nu->tty) + return; + + down_read(&nu->tty->termios_rwsem); + new_termios = nu->tty->termios; + up_read(&nu->tty->termios_rwsem); + tty_termios_encode_baud_rate(&new_termios, baudrate, baudrate); + + if (flow_ctrl) + new_termios.c_cflag |= 
CRTSCTS; + else + new_termios.c_cflag &= ~CRTSCTS; + + tty_set_termios(nu->tty, &new_termios); +} +EXPORT_SYMBOL_GPL(nci_uart_set_config); + +static struct tty_ldisc_ops nci_uart_ldisc = { + .magic = TTY_LDISC_MAGIC, + .owner = THIS_MODULE, + .name = "n_nci", + .open = nci_uart_tty_open, + .close = nci_uart_tty_close, + .read = nci_uart_tty_read, + .write = nci_uart_tty_write, + .poll = nci_uart_tty_poll, + .receive_buf = nci_uart_tty_receive, + .write_wakeup = nci_uart_tty_wakeup, + .ioctl = nci_uart_tty_ioctl, +}; + +static int __init nci_uart_init(void) +{ + memset(nci_uart_drivers, 0, sizeof(nci_uart_drivers)); + return tty_register_ldisc(N_NCI, &nci_uart_ldisc); +} + +static void __exit nci_uart_exit(void) +{ + tty_unregister_ldisc(N_NCI); +} + +module_init(nci_uart_init); +module_exit(nci_uart_exit); + +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_DESCRIPTION("NFC NCI UART driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_LDISC(N_NCI); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 14a2d11581da..f85f37ed19b2 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -5,6 +5,12 @@ * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * + * Vendor commands implementation based on net/wireless/nl80211.c + * which is: + * + * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -1489,6 +1495,50 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); } +static int nfc_genl_vendor_cmd(struct sk_buff *skb, + struct genl_info *info) +{ + struct nfc_dev *dev; + struct nfc_vendor_cmd *cmd; + u32 dev_idx, vid, subcmd; + u8 *data; + size_t 
data_len; + int i; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || + !info->attrs[NFC_ATTR_VENDOR_ID] || + !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) + return -EINVAL; + + dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); + subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); + + dev = nfc_get_device(dev_idx); + if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) + return -ENODEV; + + data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); + if (data) { + data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); + if (data_len == 0) + return -EINVAL; + } else { + data_len = 0; + } + + for (i = 0; i < dev->n_vendor_cmds; i++) { + cmd = &dev->vendor_cmds[i]; + + if (cmd->vendor_id != vid || cmd->subcmd != subcmd) + continue; + + return cmd->doit(dev, data, data_len); + } + + return -EOPNOTSUPP; +} + static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -1579,12 +1629,17 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_activate_target, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_VENDOR, + .doit = nfc_genl_vendor_cmd, + .policy = nfc_genl_policy, + }, }; struct urelease_work { struct work_struct w; - int portid; + u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index a8ce80b47720..5c93e8412a26 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -30,7 +30,7 @@ struct nfc_protocol { struct proto *proto; struct module *owner; int (*create)(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto); + const struct nfc_protocol *nfc_proto, int kern); }; struct nfc_rawsock { diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 373e138c0ab6..e9a91488fe3d 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -211,8 +211,7 @@ static void rawsock_tx_work(struct work_struct *work) } } -static int rawsock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, 
size_t len) +static int rawsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_dev *dev = nfc_rawsock(sk)->dev; @@ -248,8 +247,8 @@ static int rawsock_sendmsg(struct kiocb *iocb, struct socket *sock, return len; } -static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int rawsock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -335,7 +334,7 @@ static void rawsock_destruct(struct sock *sk) } static int rawsock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -349,7 +348,7 @@ static int rawsock_create(struct net *net, struct socket *sock, else sock->ops = &rawsock_ops; - sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto); + sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index b7d818c59423..15840401a2ce 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -6,6 +6,7 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET select LIBCRC32C + select MPLS select NET_MPLS_GSO ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized @@ -58,7 +59,7 @@ config OPENVSWITCH_VXLAN config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH - depends on GENEVE + depends on GENEVE_CORE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b491c1c296fe..8a8c0b8b4f63 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -608,17 +608,16 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) } static int output_userspace(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { struct ovs_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.userdata = NULL; - upcall.portid = 0; - upcall.egress_tun_info = NULL; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -647,6 +646,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; } + case OVS_USERSPACE_ATTR_ACTIONS: { + /* Include actions. */ + upcall.actions = actions; + upcall.actions_len = actions_len; + break; + } + } /* End of switch. 
*/ } @@ -654,7 +660,8 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } static int sample(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -688,7 +695,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, */ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a); + return output_userspace(dp, skb, key, a, actions, actions_len); skb = skb_clone(skb, GFP_ATOMIC); if (!skb) @@ -872,7 +879,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a); + output_userspace(dp, skb, key, a, attr, len); break; case OVS_ACTION_ATTR_HASH: @@ -916,7 +923,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, key, a); + err = sample(dp, skb, key, a, attr, len); break; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 5bae7243c577..ff8c4a4c1609 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -203,7 +203,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu) ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); - release_net(ovs_dp_get_net(dp)); kfree(dp->ports); kfree(dp); } @@ -273,10 +272,9 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_upcall_info upcall; int error; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); - upcall.egress_tun_info = NULL; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -398,6 +396,10 @@ static size_t upcall_msg_size(const struct 
dp_upcall_info *upcall_info, if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); + /* OVS_PACKET_ATTR_ACTIONS */ + if (upcall_info->actions_len) + size += nla_total_size(upcall_info->actions_len); + return size; } @@ -479,6 +481,17 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_nest_end(user_skb, nla); } + if (upcall_info->actions_len) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + err = ovs_nla_put_actions(upcall_info->actions, + upcall_info->actions_len, + user_skb); + if (!err) + nla_nest_end(user_skb, nla); + else + nla_nest_cancel(user_skb, nla); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -546,7 +559,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. */ - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(eth->h_proto)) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); @@ -1501,7 +1514,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (dp == NULL) goto err_free_reply; - ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); + ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); @@ -1575,7 +1588,6 @@ err_destroy_percpu: err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_free_dp: - release_net(ovs_dp_get_net(dp)); kfree(dp); err_free_reply: kfree_skb(reply); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 3ece94563079..cd691e935e08 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -84,10 +84,8 @@ struct datapath { /* Stats. 
*/ struct dp_stats_percpu __percpu *stats_percpu; -#ifdef CONFIG_NET_NS /* Network namespace ref. */ - struct net *net; -#endif + possible_net_t net; u32 user_features; }; @@ -118,6 +116,8 @@ struct ovs_skb_cb { struct dp_upcall_info { const struct ovs_tunnel_info *egress_tun_info; const struct nlattr *userdata; + const struct nlattr *actions; + int actions_len; u32 portid; u8 cmd; }; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 50ec42f170a0..bc7b0aba994a 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -100,7 +100,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, new_stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_THISNODE | + GFP_NOWAIT | + __GFP_THISNODE | + __GFP_NOWARN | __GFP_NOMEMALLOC, node); if (likely(new_stats)) { @@ -330,7 +332,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(proto)) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -347,7 +349,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) __skb_pull(skb, sizeof(struct llc_snap_hdr)); - if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(llc->ethertype)) return llc->ethertype; return htons(ETH_P_802_2); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 22b18c145c92..624e41c4267f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -535,11 +535,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, - nla_get_be32(a), is_mask); + nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, - nla_get_be32(a), is_mask); + nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, @@ -648,10 +648,12 @@ static int 
__ipv4_tun_to_nlattr(struct sk_buff *skb, nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->ipv4_src)) return -EMSGSIZE; if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->ipv4_dst)) return -EMSGSIZE; if (output->ipv4_tos && nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) @@ -814,7 +816,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, if (is_mask) { /* Always exact match EtherType. */ eth_type = htons(0xffff); - } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + } else if (!eth_proto_is_802_3(eth_type)) { OVS_NLERR(log, "EtherType %x is less than min %x", ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 4613df8c8290..65523948fb95 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -752,7 +752,7 @@ int ovs_flow_init(void) BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (num_possible_nodes() + + (nr_node_ids * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index bf02fd5808c9..208c576bd1b6 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -46,11 +46,6 @@ static inline struct geneve_port *geneve_vport(const struct vport *vport) return vport_priv(vport); } -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - /* Convert 64 bit tunnel ID to 24 bit VNI. 
*/ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 4776282c6417..33e6d6e2908f 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -125,6 +125,7 @@ static struct vport *netdev_create(const struct vport_parms *parms) if (err) goto error_master_upper_dev_unlink; + dev_disable_lro(netdev_vport->dev); dev_set_promiscuity(netdev_vport->dev, 1); netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 3277a7520e31..6d39766e7828 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -222,7 +222,8 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct sock *sk = vxlan_port->vs->sock->sk; + __be16 dst_port = inet_sk(sk)->inet_sport; const struct ovs_key_ipv4_tunnel *tun_key; struct vxlan_metadata md = {0}; struct rtable *rt; @@ -255,7 +256,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) vxflags = vxlan_port->exts | (tun_key->tun_flags & TUNNEL_CSUM ? 
VXLAN_F_UDP_CSUM : 0); - err = vxlan_xmit_skb(rt, skb, fl.saddr, tun_key->ipv4_dst, + err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, df, src_port, dst_port, &md, false, vxflags); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f8db7064d81c..ed458b315ef4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -216,10 +216,16 @@ static void prb_fill_vlan_info(struct tpacket_kbdq_core *, static void packet_flush_mclist(struct sock *sk); struct packet_skb_cb { - unsigned int origlen; union { struct sockaddr_pkt pkt; - struct sockaddr_ll ll; + union { + /* Trick: alias skb original length with + * ll.sll_family and ll.protocol in order + * to save room. + */ + unsigned int origlen; + struct sockaddr_ll ll; + }; } sa; }; @@ -537,15 +543,11 @@ static void prb_init_blk_timer(struct packet_sock *po, pkc->retire_blk_timer.expires = jiffies; } -static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) +static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; - if (tx_ring) - BUG(); - - pkc = tx_ring ? 
GET_PBDQC_FROM_RB(&po->tx_ring) : - GET_PBDQC_FROM_RB(&po->rx_ring); + pkc = GET_PBDQC_FROM_RB(&po->rx_ring); prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); } @@ -601,7 +603,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, - union tpacket_req_u *req_u, int tx_ring) + union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; @@ -628,7 +630,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po, tx_ring); + prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } @@ -1228,27 +1230,81 @@ static void packet_free_pending(struct packet_sock *po) free_percpu(po->tx_ring.pending_refcnt); } -static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +#define ROOM_POW_OFF 2 +#define ROOM_NONE 0x0 +#define ROOM_LOW 0x1 +#define ROOM_NORMAL 0x2 + +static bool __tpacket_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.frame_max + 1; + idx = po->rx_ring.head; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.prb_bdqc.knum_blocks; + idx = po->rx_ring.prb_bdqc.kactive_blk_num; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; - bool has_room; + int ret = ROOM_NONE; + + if (po->prot_hook.func != tpacket_rcv) { + int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - (skb ? 
skb->truesize : 0); + if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + return ROOM_NORMAL; + else if (avail > 0) + return ROOM_LOW; + else + return ROOM_NONE; + } - if (po->prot_hook.func != tpacket_rcv) - return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) - <= sk->sk_rcvbuf; + if (po->tp_version == TPACKET_V3) { + if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_v3_has_room(po, 0)) + ret = ROOM_LOW; + } else { + if (__tpacket_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_has_room(po, 0)) + ret = ROOM_LOW; + } - spin_lock(&sk->sk_receive_queue.lock); - if (po->tp_version == TPACKET_V3) - has_room = prb_lookup_block(po, &po->rx_ring, - po->rx_ring.prb_bdqc.kactive_blk_num, - TP_STATUS_KERNEL); - else - has_room = packet_lookup_frame(po, &po->rx_ring, - po->rx_ring.head, - TP_STATUS_KERNEL); - spin_unlock(&sk->sk_receive_queue.lock); + return ret; +} + +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + int ret; + bool has_room; + + spin_lock_bh(&po->sk.sk_receive_queue.lock); + ret = __packet_rcv_has_room(po, skb); + has_room = ret == ROOM_NORMAL; + if (po->pressure == has_room) + po->pressure = !has_room; + spin_unlock_bh(&po->sk.sk_receive_queue.lock); - return has_room; + return ret; } static void packet_sock_destruct(struct sock *sk) @@ -1266,14 +1322,18 @@ static void packet_sock_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static int fanout_rr_next(struct packet_fanout *f, unsigned int num) +static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { - int x = atomic_read(&f->rr_cur) + 1; + u32 rxhash; + int i, count = 0; - if (x >= num) - x = 0; + rxhash = skb_get_hash(skb); + for (i = 0; i < ROLLOVER_HLEN; i++) + if (po->rollover->history[i] == rxhash) + count++; - return x; + po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout 
*f, @@ -1287,13 +1347,9 @@ static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { - int cur, old; + unsigned int val = atomic_inc_return(&f->rr_cur); - cur = atomic_read(&f->rr_cur); - while ((old = atomic_cmpxchg(&f->rr_cur, cur, - fanout_rr_next(f, num))) != cur) - cur = old; - return cur; + return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, @@ -1312,22 +1368,40 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, - unsigned int idx, unsigned int skip, + unsigned int idx, bool try_self, unsigned int num) { - unsigned int i, j; + struct packet_sock *po, *po_next, *po_skip = NULL; + unsigned int i, j, room = ROOM_NONE; + + po = pkt_sk(f->arr[idx]); - i = j = min_t(int, f->next[idx], num - 1); + if (try_self) { + room = packet_rcv_has_room(po, skb); + if (room == ROOM_NORMAL || + (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) + return idx; + po_skip = po; + } + + i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + po_next = pkt_sk(f->arr[i]); + if (po_next != po_skip && !po_next->pressure && + packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) - f->next[idx] = i; + po->rollover->sock = i; + atomic_long_inc(&po->rollover->num); + if (room == ROOM_LOW) + atomic_long_inc(&po->rollover->num_huge); return i; } + if (++i == num) i = 0; } while (i != j); + atomic_long_inc(&po->rollover->num_failed); return idx; } @@ -1347,7 +1421,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; - unsigned int num = f->num_members; + unsigned int num = READ_ONCE(f->num_members); struct packet_sock *po; unsigned int idx; @@ -1380,17 +1454,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct 
net_device *dev, idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: - idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); + idx = fanout_demux_rollover(f, skb, 0, false, num); break; } - po = pkt_sk(f->arr[idx]); - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && - unlikely(!packet_rcv_has_room(po, skb))) { - idx = fanout_demux_rollover(f, skb, idx, idx, num); - po = pkt_sk(f->arr[idx]); - } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) + idx = fanout_demux_rollover(f, skb, idx, true, num); + po = pkt_sk(f->arr[idx]); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } @@ -1461,6 +1532,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (po->fanout) return -EALREADY; + if (type == PACKET_FANOUT_ROLLOVER || + (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { + po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); + if (!po->rollover) + return -ENOMEM; + atomic_long_set(&po->rollover->num, 0); + atomic_long_set(&po->rollover->num_huge, 0); + atomic_long_set(&po->rollover->num_failed, 0); + } + mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { @@ -1509,6 +1590,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: mutex_unlock(&fanout_mutex); + if (err) { + kfree(po->rollover); + po->rollover = NULL; + } return err; } @@ -1530,6 +1615,9 @@ static void fanout_release(struct sock *sk) kfree(f); } mutex_unlock(&fanout_mutex); + + if (po->rollover) + kfree_rcu(po->rollover, rcu); } static const struct proto_ops packet_ops; @@ -1608,8 +1696,8 @@ oom: * protocol layers and you must therefore supply it with a complete frame */ -static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); @@ -1818,13 +1906,10 @@ static 
int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb = nskb; } - BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > - sizeof(skb->cb)); + sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; - sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; - sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(po->origdev)) sll->sll_ifindex = orig_dev->ifindex; @@ -1833,7 +1918,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, sll->sll_halen = dev_parse_header(skb, sll->sll_addr); - PACKET_SKB_CB(skb)->origlen = skb->len; + /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). + * Use their space for storing the original skb length. + */ + PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; @@ -1847,7 +1935,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; - skb->dropcount = atomic_read(&sk->sk_drops); + sock_skb_set_dropcount(sk, skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); @@ -1910,14 +1998,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } } - if (skb->ip_summed == CHECKSUM_PARTIAL) - status |= TP_STATUS_CSUMNOTREADY; - snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + status |= TP_STATUS_CSUMNOTREADY; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + status |= TP_STATUS_CSUM_VALID; + if (snaplen > res) snaplen = res; @@ -2300,14 +2393,18 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll), - 0, &err); + !need_wait, &err); - if 
(unlikely(skb == NULL)) + if (unlikely(skb == NULL)) { + /* we assume the socket was initially writeable ... */ + if (likely(len_sum > 0)) + err = len_sum; goto out_status; - + } tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); - if (tp_len > dev->mtu + dev->hard_header_len) { + if (likely(tp_len >= 0) && + tp_len > dev->mtu + dev->hard_header_len) { struct ethhdr *ehdr; /* Earlier code assumed this would be a VLAN pkt, * double-check this now that we have the actual @@ -2603,8 +2700,7 @@ out: return err; } -static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); @@ -2689,7 +2785,7 @@ static int packet_release(struct socket *sock) static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) { struct packet_sock *po = pkt_sk(sk); - const struct net_device *dev_curr; + struct net_device *dev_curr; __be16 proto_curr; bool need_rehook; @@ -2713,15 +2809,13 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) po->num = proto; po->prot_hook.type = proto; - - if (po->prot_hook.dev) - dev_put(po->prot_hook.dev); - po->prot_hook.dev = dev; po->ifindex = dev ? 
dev->ifindex : 0; packet_cached_dev_assign(po, dev); } + if (dev_curr) + dev_put(dev_curr); if (proto == 0 || !need_rehook) goto out_unlock; @@ -2822,7 +2916,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; err = -ENOBUFS; - sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; @@ -2852,6 +2946,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); + po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) @@ -2884,13 +2979,14 @@ out: * If necessary we block. */ -static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = 0; + unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) @@ -2928,6 +3024,9 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb == NULL) goto out; + if (pkt_sk(sk)->pressure) + packet_rcv_has_room(pkt_sk(sk), NULL); + if (pkt_sk(sk)->has_vnet_hdr) { struct virtio_net_hdr vnet_hdr = { 0 }; @@ -2990,6 +3089,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; + if (sock->type != SOCK_PACKET) { + struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; + + /* Original length was stored in sockaddr_ll fields */ + origlen = PACKET_SKB_CB(skb)->sa.origlen; + sll->sll_family = AF_PACKET; + sll->sll_protocol = skb->protocol; + } + sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { @@ -3001,6 +3109,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = sizeof(struct sockaddr_pkt); } 
else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; + msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); } @@ -3014,7 +3123,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; - aux.tp_len = PACKET_SKB_CB(skb)->origlen; + else if (skb->pkt_type != PACKET_OUTGOING && + (skb->ip_summed == CHECKSUM_COMPLETE || + skb_csum_unnecessary(skb))) + aux.tp_status |= TP_STATUS_CSUM_VALID; + + aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); @@ -3456,6 +3570,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3531,6 +3646,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_ROLLOVER_STATS: + if (!po->rollover) + return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); + break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; @@ -3668,6 +3792,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } + if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + po->pressure = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { @@ -3857,7 +3983,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, * it above but just being paranoid */ if (!tx_ring) - init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); + init_prb_bdqc(po, rb, pg_vec, req_u); 
break; default: break; diff --git a/net/packet/internal.h b/net/packet/internal.h index cdddf6a30399..e20b3e8829b8 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -74,9 +74,7 @@ extern struct mutex fanout_mutex; #define PACKET_FANOUT_MAX 256 struct packet_fanout { -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; unsigned int num_members; u16 id; u8 type; @@ -84,12 +82,21 @@ struct packet_fanout { atomic_t rr_cur; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; - int next[PACKET_FANOUT_MAX]; spinlock_t lock; atomic_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; }; +struct packet_rollover { + int sock; + struct rcu_head rcu; + atomic_long_t num; + atomic_long_t num_huge; + atomic_long_t num_failed; +#define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) + u32 history[ROLLOVER_HLEN] ____cacheline_aligned; +} ____cacheline_aligned_in_smp; + struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; @@ -104,8 +111,10 @@ struct packet_sock { auxdata:1, origdev:1, has_vnet_hdr:1; + int pressure; int ifindex; /* bound device */ __be16 num; + struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_t mapped; enum tpacket_versions tp_version; diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 32ab87d34828..10d42f3220ab 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, goto out; } - sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot); + sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 26054b4b467c..5e710435ffa9 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -83,8 +83,7 @@ static int pn_init(struct sock *sk) return 0; } -static int pn_sendmsg(struct kiocb *iocb, struct sock *sk, - struct 
msghdr *msg, size_t len) +static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name); struct sk_buff *skb; @@ -125,9 +124,8 @@ static int pn_sendmsg(struct kiocb *iocb, struct sock *sk, return (err >= 0) ? len : err; } -static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, - int flags, int *addr_len) +static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct sk_buff *skb = NULL; struct sockaddr_pn sa; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 5d3f2b7507d4..850a86cde0b3 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) } /* Create a new to-be-accepted sock */ - newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; @@ -1118,8 +1118,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) } -static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len) +static int pep_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct pep_sock *pn = pep_sk(sk); struct sk_buff *skb; @@ -1246,9 +1245,8 @@ struct sk_buff *pep_read(struct sock *sk) return skb; } -static int pep_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, - int flags, int *addr_len) +static int pep_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct sk_buff *skb; int err; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 008214a3d5eb..d575ef4e9aa6 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -425,15 +425,15 @@ out: return err; } -static int pn_socket_sendmsg(struct 
kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int pn_socket_sendmsg(struct socket *sock, struct msghdr *m, + size_t total_len) { struct sock *sk = sock->sk; if (pn_socket_autobind(sock)) return -EAGAIN; - return sk->sk_prot->sendmsg(iocb, sk, m, total_len); + return sk->sk_prot->sendmsg(sk, m, total_len); } const struct proto_ops phonet_dgram_ops = { diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 10443377fb9d..896834cd3b9a 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -40,15 +40,6 @@ #include "rds.h" -char *rds_str_array(char **array, size_t elements, size_t index) -{ - if ((index < elements) && array[index]) - return array[index]; - else - return "unknown"; -} -EXPORT_SYMBOL(rds_str_array); - /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; @@ -270,6 +261,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } +static int rds_set_transport(struct rds_sock *rs, char __user *optval, + int optlen) +{ + int t_type; + + if (rs->rs_transport) + return -EOPNOTSUPP; /* previously attached to transport */ + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type))) + return -EFAULT; + + if (t_type < 0 || t_type >= RDS_TRANS_COUNT) + return -EINVAL; + + rs->rs_transport = rds_trans_get(t_type); + + return rs->rs_transport ? 
0 : -ENOPROTOOPT; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -300,6 +313,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; + case SO_RDS_TRANSPORT: + lock_sock(sock->sk); + ret = rds_set_transport(rs, optval, optlen); + release_sock(sock->sk); + break; default: ret = -ENOPROTOOPT; } @@ -312,6 +330,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret = -ENOPROTOOPT, len; + int trans; if (level != SOL_RDS) goto out; @@ -337,6 +356,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, else ret = 0; break; + case SO_RDS_TRANSPORT: + if (len < sizeof(int)) { + ret = -EINVAL; + break; + } + trans = (rs->rs_transport ? rs->rs_transport->t_type : + RDS_TRANS_NONE); /* unbound */ + if (put_user(trans, (int __user *)optval) || + put_user(sizeof(int), optlen)) + ret = -EFAULT; + else + ret = 0; + break; default: break; } @@ -440,7 +472,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/rds/bind.c b/net/rds/bind.c index a2e6562da751..4ebd29c128b6 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -181,6 +181,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (ret) goto out; + if (rs->rs_transport) { /* previously bound */ + ret = 0; + goto out; + } trans = rds_trans_get_preferred(sin->sin_addr.s_addr); if (!trans) { ret = -EADDRNOTAVAIL; diff --git a/net/rds/connection.c b/net/rds/connection.c index 378c3a6acf84..da6da57e5f36 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -126,11 +126,14 @@ static 
struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *loop_trans; unsigned long flags; int ret; + struct rds_transport *otrans = trans; + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + goto new_conn; rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && - !is_outgoing) { + laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we @@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, if (conn) goto out; +new_conn: conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); @@ -193,6 +197,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, } atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_send_gen = 0; conn->c_reconnect_jiffies = 0; INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); @@ -229,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(head, laddr, faddr, trans); + if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) + found = NULL; + else + found = rds_conn_lookup(head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head_rcu(&conn->c_hash_node, head); + if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) || + (otrans->t_type != RDS_TRANS_TCP)) { + /* Only the active side should be added to + * reconnect list for TCP. 
+ */ + hlist_add_head_rcu(&conn->c_hash_node, head); + } rds_cong_add_conn(conn); rds_conn_count++; } diff --git a/net/rds/ib.h b/net/rds/ib.h index c36d713229e0..86d88ec5d556 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -235,28 +235,34 @@ extern struct workqueue_struct *rds_ib_wq; * doesn't define it. */ static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) { + struct scatterlist *sg; unsigned int i; - for (i = 0; i < sg_dma_len; ++i) { + for_each_sg(sglist, sg, sg_dma_len, i) { ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), direction); } } #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) + struct scatterlist *sglist, + unsigned int sg_dma_len, + int direction) { + struct scatterlist *sg; unsigned int i; - for (i = 0; i < sg_dma_len; ++i) { + for_each_sg(sglist, sg, sg_dma_len, i) { ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), + ib_sg_dma_address(dev, sg), + ib_sg_dma_len(dev, sg), direction); } } @@ -339,7 +345,6 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ -char *rds_ib_wc_status_str(enum ib_wc_status status); void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 31b74f5e61ad..0da2a45b33bd 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -39,36 +39,6 @@ #include "rds.h" #include "ib.h" -static char 
*rds_ib_event_type_strings[] = { -#define RDS_IB_EVENT_STRING(foo) \ - [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) - RDS_IB_EVENT_STRING(CQ_ERR), - RDS_IB_EVENT_STRING(QP_FATAL), - RDS_IB_EVENT_STRING(QP_REQ_ERR), - RDS_IB_EVENT_STRING(QP_ACCESS_ERR), - RDS_IB_EVENT_STRING(COMM_EST), - RDS_IB_EVENT_STRING(SQ_DRAINED), - RDS_IB_EVENT_STRING(PATH_MIG), - RDS_IB_EVENT_STRING(PATH_MIG_ERR), - RDS_IB_EVENT_STRING(DEVICE_FATAL), - RDS_IB_EVENT_STRING(PORT_ACTIVE), - RDS_IB_EVENT_STRING(PORT_ERR), - RDS_IB_EVENT_STRING(LID_CHANGE), - RDS_IB_EVENT_STRING(PKEY_CHANGE), - RDS_IB_EVENT_STRING(SM_CHANGE), - RDS_IB_EVENT_STRING(SRQ_ERR), - RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), - RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), - RDS_IB_EVENT_STRING(CLIENT_REREGISTER), -#undef RDS_IB_EVENT_STRING -}; - -static char *rds_ib_event_str(enum ib_event_type type) -{ - return rds_str_array(rds_ib_event_type_strings, - ARRAY_SIZE(rds_ib_event_type_strings), type); -}; - /* * Set the selected protocol version */ @@ -183,8 +153,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ - if (dp && dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (dp) { + /* dp structure start is not guaranteed to be 8 bytes aligned. + * Since dp_ack_seq is 64-bit extended load operations can be + * used so go through get_unaligned to avoid unaligned errors. 
+ */ + __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); + + if (dp_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + NULL); + } rds_connect_complete(conn); } @@ -234,7 +213,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cq_event_handler(struct ib_event *event, void *data) { rdsdebug("event %u (%s) data %p\n", - event->event, rds_ib_event_str(event->event), data); + event->event, ib_event_msg(event->event), data); } static void rds_ib_qp_event_handler(struct ib_event *event, void *data) @@ -243,7 +222,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) struct rds_ib_connection *ic = conn->c_transport_data; rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, - rds_ib_event_str(event->event)); + ib_event_msg(event->event)); switch (event->event) { case IB_EVENT_COMM_EST: @@ -252,7 +231,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) default: rdsdebug("Fatal QP Event %u (%s) " "- connection %pI4->%pI4, reconnecting\n", - event->event, rds_ib_event_str(event->event), + event->event, ib_event_msg(event->event), &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; @@ -268,6 +247,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_connection *ic = conn->c_transport_data; struct ib_device *dev = ic->i_cm_id->device; struct ib_qp_init_attr attr; + struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; int ret; @@ -291,9 +271,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_pd = rds_ibdev->pd; ic->i_mr = rds_ibdev->mr; + cq_attr.cqe = ic->i_send_ring.w_nr + 1; ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, rds_ib_cq_event_handler, conn, - ic->i_send_ring.w_nr + 1, 0); + &cq_attr); if (IS_ERR(ic->i_send_cq)) { ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; @@ -301,9 +282,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto out; } + cq_attr.cqe = 
ic->i_recv_ring.w_nr; ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, rds_ib_cq_event_handler, conn, - ic->i_recv_ring.w_nr, 0); + &cq_attr); if (IS_ERR(ic->i_recv_cq)) { ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bff6ba4..657ba9f5d308 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 1b981a4e42c2..cac5b4506ee3 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -956,7 +956,7 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, + ib_wc_status_msg(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_rx_cq_event); @@ -978,7 +978,7 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, "status %u (%s), disconnecting and " "reconnecting\n", &conn->c_faddr, wc.status, - rds_ib_wc_status_str(wc.status)); + ib_wc_status_msg(wc.status)); } /* diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index bd3825d38abc..5d0a704fa039 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -39,40 +39,6 @@ #include "rds.h" #include "ib.h" -static char *rds_ib_wc_status_strings[] = { -#define RDS_IB_WC_STATUS_STR(foo) \ - [IB_WC_##foo] = __stringify(IB_WC_##foo) - RDS_IB_WC_STATUS_STR(SUCCESS), - RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), - RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), - RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), - 
RDS_IB_WC_STATUS_STR(MW_BIND_ERR), - RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), - RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_OP_ERR), - RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), - RDS_IB_WC_STATUS_STR(INV_EECN_ERR), - RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), - RDS_IB_WC_STATUS_STR(FATAL_ERR), - RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), - RDS_IB_WC_STATUS_STR(GENERAL_ERR), -#undef RDS_IB_WC_STATUS_STR -}; - -char *rds_ib_wc_status_str(enum ib_wc_status status) -{ - return rds_str_array(rds_ib_wc_status_strings, - ARRAY_SIZE(rds_ib_wc_status_strings), status); -} - /* * Convert IB-specific error message to RDS error message and call core * completion handler. @@ -293,7 +259,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) while (ib_poll_cq(cq, 1, &wc) > 0) { rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, + ib_wc_status_msg(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_tx_cq_event); @@ -344,7 +310,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rds_ib_conn_error(conn, "send completion on %pI4 had status " "%u (%s), disconnecting and reconnecting\n", &conn->c_faddr, wc.status, - rds_ib_wc_status_str(wc.status)); + ib_wc_status_msg(wc.status)); } } } @@ -605,6 +571,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } rds_message_addref(rm); + rm->data.op_dmasg = 0; + rm->data.op_dmaoff = 0; ic->i_data_op = &rm->data; /* Finalize the header */ @@ -658,7 +626,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = 
&ic->i_data_op->op_sg[sg]; + scat = &ic->i_data_op->op_sg[rm->data.op_dmasg]; i = 0; do { unsigned int len = 0; @@ -680,17 +648,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* Set up the data, if present */ if (i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]) { - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); send->s_wr.num_sge = 2; - send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].addr = ib_sg_dma_address(dev, scat); + send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; bytes_sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm->data.op_dmaoff += len; + if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm->data.op_dmasg++; + rm->data.op_dmaoff = 0; } } diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index a6c2bea9f8f9..8f486fa32079 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -179,6 +179,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, void *context) { struct ib_device *dev = rds_iwdev->dev; + struct ib_cq_init_attr cq_attr = {}; unsigned int send_size, recv_size; int ret; @@ -198,9 +199,10 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, attr->sq_sig_type = IB_SIGNAL_REQ_WR; attr->qp_type = IB_QPT_RC; + cq_attr.cqe = send_size; attr->send_cq = ib_create_cq(dev, send_cq_handler, rds_iw_cq_event_handler, - context, send_size, 0); + context, &cq_attr); if (IS_ERR(attr->send_cq)) { ret = PTR_ERR(attr->send_cq); attr->send_cq = NULL; @@ -208,9 +210,10 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, goto out; } + cq_attr.cqe = recv_size; attr->recv_cq = ib_create_cq(dev, recv_cq_handler, rds_iw_cq_event_handler, - context, recv_size, 0); + context, &cq_attr); if (IS_ERR(attr->recv_cq)) { ret = PTR_ERR(attr->recv_cq); attr->recv_cq = NULL; diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c 
index 13834780a308..334fe98c5084 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -581,6 +581,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; rds_message_addref(rm); + rm->data.op_dmasg = 0; + rm->data.op_dmaoff = 0; ic->i_rm = rm; /* Finalize the header */ @@ -622,7 +624,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->data.op_sg[sg]; + scat = &rm->data.op_sg[rm->data.op_dmasg]; sent = 0; i = 0; @@ -656,10 +658,11 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); rds_iw_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + off, len, - send_flags); + ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len, + send_flags); /* * We want to delay signaling completions just enough to get @@ -687,10 +690,11 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm->data.op_dmaoff += len; + if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm->data.op_dmaoff = 0; + rm->data.op_dmasg++; } add_header: diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 6cd9d1deafc3..208240836043 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -37,34 +37,6 @@ static struct rdma_cm_id *rds_rdma_listen_id; -static char *rds_cm_event_strings[] = { -#define RDS_CM_EVENT_STRING(foo) \ - [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) - RDS_CM_EVENT_STRING(ADDR_RESOLVED), - RDS_CM_EVENT_STRING(ADDR_ERROR), - RDS_CM_EVENT_STRING(ROUTE_RESOLVED), - 
RDS_CM_EVENT_STRING(ROUTE_ERROR), - RDS_CM_EVENT_STRING(CONNECT_REQUEST), - RDS_CM_EVENT_STRING(CONNECT_RESPONSE), - RDS_CM_EVENT_STRING(CONNECT_ERROR), - RDS_CM_EVENT_STRING(UNREACHABLE), - RDS_CM_EVENT_STRING(REJECTED), - RDS_CM_EVENT_STRING(ESTABLISHED), - RDS_CM_EVENT_STRING(DISCONNECTED), - RDS_CM_EVENT_STRING(DEVICE_REMOVAL), - RDS_CM_EVENT_STRING(MULTICAST_JOIN), - RDS_CM_EVENT_STRING(MULTICAST_ERROR), - RDS_CM_EVENT_STRING(ADDR_CHANGE), - RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), -#undef RDS_CM_EVENT_STRING -}; - -static char *rds_cm_event_str(enum rdma_cm_event_type type) -{ - return rds_str_array(rds_cm_event_strings, - ARRAY_SIZE(rds_cm_event_strings), type); -}; - int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -74,7 +46,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, int ret = 0; rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, - event->event, rds_cm_event_str(event->event)); + event->event, rdma_event_msg(event->event)); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -139,7 +111,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? 
*/ printk(KERN_ERR "RDS: unknown event %u (%s)!\n", - event->event, rds_cm_event_str(event->event)); + event->event, rdma_event_msg(event->event)); break; } @@ -148,7 +120,7 @@ out: mutex_unlock(&conn->c_cm_lock); rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, - rds_cm_event_str(event->event), ret); + rdma_event_msg(event->event), ret); return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index c2a5eef41343..2260c1e434b1 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -110,6 +110,7 @@ struct rds_connection { void *c_transport_data; atomic_t c_state; + unsigned long c_send_gen; unsigned long c_flags; unsigned long c_reconnect_jiffies; struct delayed_work c_send_w; @@ -362,6 +363,8 @@ struct rds_message { unsigned int op_active:1; unsigned int op_nents; unsigned int op_count; + unsigned int op_dmasg; + unsigned int op_dmaoff; struct scatterlist *op_sg; } data; }; @@ -407,11 +410,6 @@ struct rds_notifier { * should try hard not to block. */ -#define RDS_TRANS_IB 0 -#define RDS_TRANS_IWARP 1 -#define RDS_TRANS_TCP 2 -#define RDS_TRANS_COUNT 3 - struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; @@ -574,7 +572,6 @@ struct rds_statistics { }; /* af_rds.c */ -char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); @@ -702,8 +699,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_incoming *inc, gfp_t gfp); -int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, int msg_flags); +int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void 
rds_inc_info_copy(struct rds_incoming *inc, @@ -711,8 +708,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, __be32 saddr, __be32 daddr, int flip); /* send.c */ -int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t payload_len); +int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_reset(struct rds_connection *conn); int rds_send_xmit(struct rds_connection *conn); struct sockaddr_in; @@ -803,6 +799,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); +struct rds_transport *rds_trans_get(int t_type); int rds_trans_init(void); void rds_trans_exit(void); diff --git a/net/rds/recv.c b/net/rds/recv.c index f9ec1acd801c..a00462b0d01d 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -395,8 +395,8 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) return 0; } -int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, int msg_flags) +int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int msg_flags) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); diff --git a/net/rds/send.c b/net/rds/send.c index 42f65d4305c8..e9430f537f9c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -140,8 +140,11 @@ int rds_send_xmit(struct rds_connection *conn) struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); + int batch_count; + unsigned long send_gen = 0; restart: + batch_count = 0; /* * sendmsg calls here after having queued its message on the send @@ -157,6 +160,17 @@ restart: } /* + * we record the send generation after doing the xmit acquire. + * if someone else manages to jump in and do some work, we'll use + * this to avoid a goto restart farther down. 
+ * + * The acquire_in_xmit() check above ensures that only one + * caller can increment c_send_gen at any time. + */ + conn->c_send_gen++; + send_gen = conn->c_send_gen; + + /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ @@ -202,6 +216,16 @@ restart: if (!rm) { unsigned int len; + batch_count++; + + /* we want to process as big a batch as we can, but + * we also want to avoid softlockups. If we've been + * through a lot of messages, lets back off and see + * if anyone else jumps in + */ + if (batch_count >= 1024) + goto over_batch; + spin_lock_irqsave(&conn->c_lock, flags); if (!list_empty(&conn->c_send_queue)) { @@ -357,9 +381,9 @@ restart: } } +over_batch: if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - release_in_xmit(conn); /* Nuke any messages we decided not to retransmit. */ @@ -380,10 +404,15 @@ restart: * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. 
+ * + * We have an extra generation check here so that if someone manages + * to jump in after our release_in_xmit, we'll see that they have done + * some work and we will skip our goto */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue)) { + if (!list_empty(&conn->c_send_queue) && + send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); goto restart; } @@ -920,8 +949,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } -int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t payload_len) +int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f9f564a6c960..973109c7b8e8 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: rds_connect_complete(conn); break; + case TCP_CLOSE_WAIT: case TCP_CLOSE: rds_conn_drop(conn); default: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 23ab4dcd1d9f..0da49e34495f 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work); static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); static struct socket *rds_tcp_listen_sock; +static int rds_tcp_keepalive(struct socket *sock) +{ + /* values below based on xs_udp_default_timeout */ + int keepidle = 5; /* send a probe 'keepidle' secs after last data */ + int keepcnt = 5; /* number of unack'ed probes before declaring dead */ + int keepalive = 1; + int ret = 0; + + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&keepcnt, sizeof(keepcnt)); + if (ret < 0) + goto bail; + + ret = kernel_setsockopt(sock, 
IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&keepidle, sizeof(keepidle)); + if (ret < 0) + goto bail; + + /* KEEPINTVL is the interval between successive probes. We follow + * the model in xs_tcp_finish_connecting() and re-use keepidle. + */ + ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&keepidle, sizeof(keepidle)); +bail: + return ret; +} + static int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; int ret; struct inet_sock *inet; + struct rds_tcp_connection *rs_tcp; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); @@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock) if (ret < 0) goto out; + ret = rds_tcp_keepalive(new_sock); + if (ret < 0) + goto out; + rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); @@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock) ret = PTR_ERR(conn); goto out; } + /* An incoming SYN request came in, and TCP just accepted it. + * We always create a new conn for listen side of TCP, and do not + * add it to the c_hash_list. + * + * If the client reboots, this conn will need to be cleaned up. 
+ * rds_tcp_state_change() will do that cleanup + */ + rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; + WARN_ON(!rs_tcp || rs_tcp->t_sock); /* * see the comment above rds_queue_delayed_reconnect() diff --git a/net/rds/transport.c b/net/rds/transport.c index 7f2ac4fec367..83498e1c75b8 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -73,7 +73,7 @@ EXPORT_SYMBOL_GPL(rds_trans_unregister); void rds_trans_put(struct rds_transport *trans) { - if (trans && trans->t_owner) + if (trans) module_put(trans->t_owner); } @@ -101,6 +101,27 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr) return ret; } +struct rds_transport *rds_trans_get(int t_type) +{ + struct rds_transport *ret = NULL; + struct rds_transport *trans; + unsigned int i; + + down_read(&rds_trans_sem); + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && trans->t_type == t_type && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; + break; + } + } + up_read(&rds_trans_sem); + + return ret; +} + /* * This returns the number of stats entries in the snapshot and only * copies them using the iter if there is enough space for them. 
The diff --git a/net/rfkill/core.c b/net/rfkill/core.c index fa7cd792791c..f12149a29cb1 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -794,7 +794,8 @@ void rfkill_resume_polling(struct rfkill *rfkill) } EXPORT_SYMBOL(rfkill_resume_polling); -static int rfkill_suspend(struct device *dev, pm_message_t state) +#ifdef CONFIG_PM_SLEEP +static int rfkill_suspend(struct device *dev) { struct rfkill *rfkill = to_rfkill(dev); @@ -818,13 +819,18 @@ static int rfkill_resume(struct device *dev) return 0; } +static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume); +#define RFKILL_PM_OPS (&rfkill_pm_ops) +#else +#define RFKILL_PM_OPS NULL +#endif + static struct class rfkill_class = { .name = "rfkill", .dev_release = rfkill_release, .dev_groups = rfkill_dev_groups, .dev_uevent = rfkill_dev_uevent, - .suspend = rfkill_suspend, - .resume = rfkill_resume, + .pm = RFKILL_PM_OPS, }; bool rfkill_blocked(struct rfkill *rfkill) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index d978f2f46ff3..d5d58d919552 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -112,21 +112,17 @@ static int rfkill_gpio_probe(struct platform_device *pdev) rfkill->clk = devm_clk_get(&pdev->dev, NULL); - gpio = devm_gpiod_get(&pdev->dev, "reset"); - if (!IS_ERR(gpio)) { - ret = gpiod_direction_output(gpio, 0); - if (ret) - return ret; - rfkill->reset_gpio = gpio; - } + gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); - gpio = devm_gpiod_get(&pdev->dev, "shutdown"); - if (!IS_ERR(gpio)) { - ret = gpiod_direction_output(gpio, 0); - if (ret) - return ret; - rfkill->shutdown_gpio = gpio; - } + rfkill->reset_gpio = gpio; + + gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_OUT_LOW); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); + + rfkill->shutdown_gpio = gpio; /* Make sure at-least one of the GPIO is defined and that * a name is specified for this instance diff --git 
a/net/rose/af_rose.c b/net/rose/af_rose.c index 43bac7c4dd9e..129d357d2722 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -192,7 +192,8 @@ static void rose_kill_by_device(struct net_device *dev) if (rose->device == dev) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + if (rose->neighbour) + rose->neighbour->use--; rose->device = NULL; } } @@ -520,7 +521,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; @@ -559,7 +560,7 @@ static struct sock *rose_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; @@ -1046,8 +1047,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros return 1; } -static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int rose_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -1211,8 +1211,8 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, } -static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static int rose_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 50005888be57..369ca81a8c5d 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -41,6 +41,9 @@ static int rose_header(struct sk_buff *skb, struct net_device *dev, { unsigned char *buff = skb_push(skb, 
ROSE_MIN_LEN + 2); + if (daddr) + memcpy(buff + 7, daddr, dev->addr_len); + *buff++ = ROSE_GFI | ROSE_Q_BIT; *buff++ = 0x00; *buff++ = ROSE_DATA; @@ -53,43 +56,6 @@ static int rose_header(struct sk_buff *skb, struct net_device *dev, return -37; } -static int rose_rebuild_header(struct sk_buff *skb) -{ -#ifdef CONFIG_INET - struct net_device *dev = skb->dev; - struct net_device_stats *stats = &dev->stats; - unsigned char *bp = (unsigned char *)skb->data; - struct sk_buff *skbn; - unsigned int len; - - if (arp_find(bp + 7, skb)) { - return 1; - } - - if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { - kfree_skb(skb); - return 1; - } - - if (skb->sk != NULL) - skb_set_owner_w(skbn, skb->sk); - - kfree_skb(skb); - - len = skbn->len; - - if (!rose_route_frame(skbn, NULL)) { - kfree_skb(skbn); - stats->tx_errors++; - return 1; - } - - stats->tx_packets++; - stats->tx_bytes += len; -#endif - return 1; -} - static int rose_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; @@ -134,19 +100,26 @@ static int rose_close(struct net_device *dev) static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; + unsigned int len = skb->len; if (!netif_running(dev)) { printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); return NETDEV_TX_BUSY; } - dev_kfree_skb(skb); - stats->tx_errors++; + + if (!rose_route_frame(skb, NULL)) { + dev_kfree_skb(skb); + stats->tx_errors++; + return NETDEV_TX_OK; + } + + stats->tx_packets++; + stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops rose_header_ops = { .create = rose_header, - .rebuild = rose_rebuild_header, }; static const struct net_device_ops rose_netdev_ops = { diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index e873d7d9f857..c76638cc2cd5 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -25,7 +25,6 @@ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> 
-#include <linux/netfilter.h> #include <net/rose.h> static void rose_ftimer_expiry(unsigned long); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 40148932c8a4..0fc76d845103 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -31,7 +31,6 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> -#include <linux/netfilter.h> #include <linux/init.h> #include <net/rose.h> #include <linux/seq_file.h> diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7b1670489638..25d60ed15284 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -441,8 +441,7 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, * - sends a call data packet * - may send an abort (abort code in control data) */ -static int rxrpc_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t len) +static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { struct rxrpc_transport *trans; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); @@ -482,7 +481,7 @@ static int rxrpc_sendmsg(struct kiocb *iocb, struct socket *sock, switch (rx->sk.sk_state) { case RXRPC_SERVER_LISTENING: if (!m->msg_name) { - ret = rxrpc_server_sendmsg(iocb, rx, m, len); + ret = rxrpc_server_sendmsg(rx, m, len); break; } case RXRPC_SERVER_BOUND: @@ -492,7 +491,7 @@ static int rxrpc_sendmsg(struct kiocb *iocb, struct socket *sock, break; } case RXRPC_CLIENT_CONNECTED: - ret = rxrpc_client_sendmsg(iocb, rx, trans, m, len); + ret = rxrpc_client_sendmsg(rx, trans, m, len); break; default: ret = -ENOTCONN; @@ -633,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; - sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto); + sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 481f89f93789..4505a691d88c 100644 --- 
a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -28,7 +28,7 @@ const char *rxrpc_pkts[] = { "?00", "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", - "?09", "?10", "?11", "?12", "?13", "?14", "?15" + "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" }; /* @@ -593,6 +593,20 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn); } +/* + * post endpoint-level events to the local endpoint + * - this includes debug and version messages + */ +static void rxrpc_post_packet_to_local(struct rxrpc_local *local, + struct sk_buff *skb) +{ + _enter("%p,%p", local, skb); + + atomic_inc(&local->usage); + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_work(&local->event_processor); +} + static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct sk_buff *skb, struct rxrpc_skb_priv *sp) @@ -699,6 +713,11 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } + if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + rxrpc_post_packet_to_local(local, skb); + goto out; + } + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; @@ -731,6 +750,8 @@ void rxrpc_data_ready(struct sock *sk) else goto cant_route_call; } + +out: rxrpc_put_local(local); return; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ba9fd36d3f15..aef1bd294e17 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -152,11 +152,13 @@ struct rxrpc_local { struct work_struct destroyer; /* endpoint destroyer */ struct work_struct acceptor; /* incoming call processor */ struct work_struct rejecter; /* packet reject writer */ + struct work_struct event_processor; /* endpoint event processor */ struct list_head services; /* services listening on this endpoint */ struct list_head link; /* link in endpoint list */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head accept_queue; /* incoming 
calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ + struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ atomic_t usage; @@ -548,10 +550,9 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, extern unsigned rxrpc_resend_timeout; int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *); -int rxrpc_client_sendmsg(struct kiocb *, struct rxrpc_sock *, - struct rxrpc_transport *, struct msghdr *, size_t); -int rxrpc_server_sendmsg(struct kiocb *, struct rxrpc_sock *, struct msghdr *, - size_t); +int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *, + struct msghdr *, size_t); +int rxrpc_server_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); /* * ar-peer.c @@ -572,8 +573,7 @@ extern const struct file_operations rxrpc_connection_seq_fops; * ar-recvmsg.c */ void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); -int rxrpc_recvmsg(struct kiocb *, struct socket *, struct msghdr *, size_t, - int); +int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * ar-security.c diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 87f7135d238b..78483b4602bf 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -13,16 +13,22 @@ #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> +#include <linux/udp.h> +#include <linux/ip.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#include <generated/utsrelease.h> #include "ar-internal.h" +static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; + static LIST_HEAD(rxrpc_locals); DEFINE_RWLOCK(rxrpc_local_lock); static DECLARE_RWSEM(rxrpc_local_sem); static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); static void rxrpc_destroy_local(struct work_struct *work); +static void rxrpc_process_local_events(struct work_struct *work); /* * allocate 
a new local @@ -37,11 +43,13 @@ struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) INIT_WORK(&local->destroyer, &rxrpc_destroy_local); INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); INIT_WORK(&local->rejecter, &rxrpc_reject_packets); + INIT_WORK(&local->event_processor, &rxrpc_process_local_events); INIT_LIST_HEAD(&local->services); INIT_LIST_HEAD(&local->link); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); + skb_queue_head_init(&local->event_queue); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); atomic_set(&local->usage, 1); @@ -65,8 +73,8 @@ static int rxrpc_create_local(struct rxrpc_local *local) _enter("%p{%d}", local, local->srx.transport_type); /* create a socket to represent the local endpoint */ - ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP, - &local->socket); + ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type, + IPPROTO_UDP, &local->socket); if (ret < 0) { _leave(" = %d [socket]", ret); return ret; @@ -264,10 +272,12 @@ static void rxrpc_destroy_local(struct work_struct *work) ASSERT(list_empty(&local->services)); ASSERT(!work_pending(&local->acceptor)); ASSERT(!work_pending(&local->rejecter)); + ASSERT(!work_pending(&local->event_processor)); /* finish cleaning up the local descriptor */ rxrpc_purge_queue(&local->accept_queue); rxrpc_purge_queue(&local->reject_queue); + rxrpc_purge_queue(&local->event_queue); kernel_sock_shutdown(local->socket, SHUT_RDWR); sock_release(local->socket); @@ -308,3 +318,91 @@ void __exit rxrpc_destroy_all_locals(void) _leave(""); } + +/* + * Reply to a version request + */ +static void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_header *hdr, + struct sk_buff *skb) +{ + struct sockaddr_in sin; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + sin.sin_family = AF_INET; + sin.sin_port = 
udp_hdr(skb)->source; + sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->seq = 0; + hdr->serial = 0; + hdr->type = RXRPC_PACKET_TYPE_VERSION; + hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + iov[1].iov_base = (char *)rxrpc_version_string; + iov[1].iov_len = sizeof(rxrpc_version_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (reply)"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + _leave(""); +} + +/* + * Process event packets targetted at a local endpoint. + */ +static void rxrpc_process_local_events(struct work_struct *work) +{ + struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); + struct sk_buff *skb; + char v; + + _enter(""); + + atomic_inc(&local->usage); + + while ((skb = skb_dequeue(&local->event_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + kdebug("{%d},{%u}", local->debug_id, sp->hdr.type); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (skb_copy_bits(skb, 0, &v, 1) < 0) + return; + _proto("Rx VERSION { %02x }", v); + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); + break; + + default: + /* Just ignore anything we don't understand */ + break; + } + + rxrpc_put_local(local); + rxrpc_free_skb(skb); + } + + rxrpc_put_local(local); + _leave(""); +} diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 8331c95e1522..c0042807bfc6 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -23,8 +23,7 @@ */ unsigned rxrpc_resend_timeout = 4 * HZ; -static int rxrpc_send_data(struct kiocb *iocb, - struct rxrpc_sock *rx, +static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr 
*msg, size_t len); @@ -129,9 +128,8 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) * - caller holds the socket locked * - the socket may be either a client socket or a server socket */ -int rxrpc_client_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, - struct rxrpc_transport *trans, struct msghdr *msg, - size_t len) +int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans, + struct msghdr *msg, size_t len) { struct rxrpc_conn_bundle *bundle; enum rxrpc_command cmd; @@ -191,7 +189,7 @@ int rxrpc_client_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, /* request phase complete for this client call */ ret = -EPROTO; } else { - ret = rxrpc_send_data(iocb, rx, call, msg, len); + ret = rxrpc_send_data(rx, call, msg, len); } rxrpc_put_call(call); @@ -232,7 +230,7 @@ int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, call->state != RXRPC_CALL_SERVER_SEND_REPLY) { ret = -EPROTO; /* request phase complete for this client call */ } else { - ret = rxrpc_send_data(NULL, call->socket, call, msg, len); + ret = rxrpc_send_data(call->socket, call, msg, len); } release_sock(&call->socket->sk); @@ -271,8 +269,7 @@ EXPORT_SYMBOL(rxrpc_kernel_abort_call); * send a message through a server socket * - caller holds the socket locked */ -int rxrpc_server_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, - struct msghdr *msg, size_t len) +int rxrpc_server_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) { enum rxrpc_command cmd; struct rxrpc_call *call; @@ -313,7 +310,7 @@ int rxrpc_server_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx, break; } - ret = rxrpc_send_data(iocb, rx, call, msg, len); + ret = rxrpc_send_data(rx, call, msg, len); break; case RXRPC_CMD_SEND_ABORT: @@ -520,8 +517,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, * - must be called in process context * - caller holds the socket locked */ -static int rxrpc_send_data(struct kiocb *iocb, - struct 
rxrpc_sock *rx, +static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len) { @@ -546,11 +542,7 @@ static int rxrpc_send_data(struct kiocb *iocb, call->tx_pending = NULL; copied = 0; - if (len > iov_iter_count(&msg->msg_iter)) - len = iov_iter_count(&msg->msg_iter); - while (len) { - int copy; - + do { if (!skb) { size_t size, chunk, max, space; @@ -572,8 +564,8 @@ static int rxrpc_send_data(struct kiocb *iocb, max &= ~(call->conn->size_align - 1UL); chunk = max; - if (chunk > len && !more) - chunk = len; + if (chunk > msg_data_left(msg) && !more) + chunk = msg_data_left(msg); space = chunk + call->conn->size_align; space &= ~(call->conn->size_align - 1UL); @@ -616,23 +608,23 @@ static int rxrpc_send_data(struct kiocb *iocb, sp = rxrpc_skb(skb); /* append next segment of data to the current buffer */ - copy = skb_tailroom(skb); - ASSERTCMP(copy, >, 0); - if (copy > len) - copy = len; - if (copy > sp->remain) - copy = sp->remain; - - _debug("add"); - ret = skb_add_data(skb, &msg->msg_iter, copy); - _debug("added"); - if (ret < 0) - goto efault; - sp->remain -= copy; - skb->mark += copy; - copied += copy; - - len -= copy; + if (msg_data_left(msg) > 0) { + int copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret = skb_add_data(skb, &msg->msg_iter, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; + } /* check for the far side aborting the call or a network error * occurring */ @@ -640,7 +632,8 @@ static int rxrpc_send_data(struct kiocb *iocb, goto call_aborted; /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || (!len && !more)) { + if (sp->remain <= 0 || + (msg_data_left(msg) == 0 && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; size_t pad; @@ -670,7 +663,7 @@ static int 
rxrpc_send_data(struct kiocb *iocb, sp->hdr.serviceId = conn->service_id; sp->hdr.flags = conn->out_clientflag; - if (len == 0 && !more) + if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; else if (CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz) > 1) @@ -686,10 +679,10 @@ static int rxrpc_send_data(struct kiocb *iocb, memcpy(skb->head, &sp->hdr, sizeof(struct rxrpc_header)); - rxrpc_queue_packet(call, skb, !iov_iter_count(&msg->msg_iter) && !more); + rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } - } + } while (msg_data_left(msg) > 0); success: ret = copied; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 19a560626dc4..b92beded7459 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -43,8 +43,8 @@ void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) * - we need to be careful about two or more threads calling recvmsg * simultaneously */ -int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct rxrpc_skb_priv *sp; struct rxrpc_call *call = NULL, *continue_call = NULL; @@ -150,7 +150,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, &call->conn->trans->peer->srx, len); msg->msg_namelen = len; } - sock_recv_ts_and_drops(msg, &rx->sk, skb); + sock_recv_timestamp(msg, &rx->sk, skb); } /* receive the message */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2274e723a3df..daa33432b716 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -477,6 +478,16 @@ config NET_CLS_BPF To compile this code as a module, choose M here: the module will be called cls_bpf. 
+config NET_CLS_FLOWER + tristate "Flower classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys and masks. + + To compile this code as a module, choose M here: the module will + be called cls_flower. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 7ca7f4c1b8c2..690c1689e090 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o +obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3d43e4979f27..43ec92680ae8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -45,7 +45,7 @@ void tcf_hash_destroy(struct tc_action *a) } EXPORT_SYMBOL(tcf_hash_destroy); -int tcf_hash_release(struct tc_action *a, int bind) +int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) { struct tcf_common *p = a->priv; int ret = 0; @@ -53,7 +53,7 @@ int tcf_hash_release(struct tc_action *a, int bind) if (p) { if (bind) p->tcfc_bindcnt--; - else if (p->tcfc_bindcnt > 0) + else if (strict && p->tcfc_bindcnt > 0) return -EPERM; p->tcfc_refcnt--; @@ -64,9 +64,10 @@ int tcf_hash_release(struct tc_action *a, int bind) ret = 1; } } + return ret; } -EXPORT_SYMBOL(tcf_hash_release); +EXPORT_SYMBOL(__tcf_hash_release); static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, struct tc_action *a) @@ -136,7 +137,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a) head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; hlist_for_each_entry_safe(p, n, head, tcfc_head) { a->priv = p; - ret = tcf_hash_release(a, 0); + ret = 
__tcf_hash_release(a, false, true); if (ret == ACT_P_DELETED) { module_put(a->ops->owner); n_i++; @@ -392,11 +393,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, list_for_each_entry(a, actions, list) { repeat: ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) @@ -413,7 +409,7 @@ int tcf_action_destroy(struct list_head *actions, int bind) int ret = 0; list_for_each_entry_safe(a, tmp, actions, list) { - ret = tcf_hash_release(a, bind); + ret = __tcf_hash_release(a, bind, true); if (ret == ACT_P_DELETED) module_put(a->ops->owner); else if (ret < 0) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5f6288fa3f12..d0edeb7a1950 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -13,26 +13,51 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/filter.h> +#include <linux/bpf.h> + #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_bpf.h> #include <net/tc_act/tc_bpf.h> -#define BPF_TAB_MASK 15 +#define BPF_TAB_MASK 15 +#define ACT_BPF_NAME_LEN 256 + +struct tcf_bpf_cfg { + struct bpf_prog *filter; + struct sock_filter *bpf_ops; + const char *bpf_name; + u32 bpf_fd; + u16 bpf_num_ops; + bool is_ebpf; +}; -static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, +static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { - struct tcf_bpf *b = a->priv; + struct tcf_bpf *prog = act->priv; int action, filter_res; + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; - spin_lock(&b->tcf_lock); + if (unlikely(!skb_mac_header_was_set(skb))) + return TC_ACT_UNSPEC; - b->tcf_tm.lastuse = jiffies; - bstats_update(&b->tcf_bstats, skb); + spin_lock(&prog->tcf_lock); - filter_res = 
BPF_PROG_RUN(b->filter, skb); + prog->tcf_tm.lastuse = jiffies; + bstats_update(&prog->tcf_bstats, skb); + + /* Needed here for accessing maps. */ + rcu_read_lock(); + if (at_ingress) { + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } + rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the @@ -52,52 +77,87 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, break; case TC_ACT_SHOT: action = filter_res; - b->tcf_qstats.drops++; + prog->tcf_qstats.drops++; break; case TC_ACT_UNSPEC: - action = b->tcf_action; + action = prog->tcf_action; break; default: action = TC_ACT_UNSPEC; break; } - spin_unlock(&b->tcf_lock); + spin_unlock(&prog->tcf_lock); return action; } -static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *a, +static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog) +{ + return !prog->bpf_ops; +} + +static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + +static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); - struct tcf_bpf *b = a->priv; + struct tcf_bpf *prog = act->priv; struct tc_act_bpf opt = { - .index = b->tcf_index, - 
.refcnt = b->tcf_refcnt - ref, - .bindcnt = b->tcf_bindcnt - bind, - .action = b->tcf_action, + .index = prog->tcf_index, + .refcnt = prog->tcf_refcnt - ref, + .bindcnt = prog->tcf_bindcnt - bind, + .action = prog->tcf_action, }; - struct tcf_t t; - struct nlattr *nla; + struct tcf_t tm; + int ret; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, b->bpf_num_ops)) - goto nla_put_failure; - - nla = nla_reserve(skb, TCA_ACT_BPF_OPS, b->bpf_num_ops * - sizeof(struct sock_filter)); - if (!nla) + if (tcf_bpf_is_ebpf(prog)) + ret = tcf_bpf_dump_ebpf_info(prog, skb); + else + ret = tcf_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), b->bpf_ops, nla_len(nla)); + tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); + tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - t.install = jiffies_to_clock_t(jiffies - b->tcf_tm.install); - t.lastuse = jiffies_to_clock_t(jiffies - b->tcf_tm.lastuse); - t.expires = jiffies_to_clock_t(b->tcf_tm.expires); - if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(t), &t)) + if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm)) goto nla_put_failure; + return skb->len; nla_put_failure: @@ -107,36 +167,21 @@ nla_put_failure: static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, + [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, + [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; -static int tcf_bpf_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action *a, - int ovr, int bind) +static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { - struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; - struct 
tc_act_bpf *parm; - struct tcf_bpf *b; - u16 bpf_size, bpf_num_ops; struct sock_filter *bpf_ops; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; + u16 bpf_size, bpf_num_ops; int ret; - if (!nla) - return -EINVAL; - - ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); - if (ret < 0) - return ret; - - if (!tb[TCA_ACT_BPF_PARMS] || - !tb[TCA_ACT_BPF_OPS_LEN] || !tb[TCA_ACT_BPF_OPS]) - return -EINVAL; - parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; @@ -146,68 +191,184 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (!bpf_ops) + if (bpf_ops == NULL) return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto free_bpf_ops; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } + + cfg->bpf_ops = bpf_ops; + cfg->bpf_num_ops = bpf_num_ops; + cfg->filter = fp; + cfg->is_ebpf = false; + + return 0; +} + +static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_ACT) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_ACT_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]), + nla_len(tb[TCA_ACT_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + cfg->bpf_fd = bpf_fd; + cfg->bpf_name = name; + cfg->filter = fp; + cfg->is_ebpf = true; + + return 0; +} + +static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg) +{ + if 
(cfg->is_ebpf) + bpf_prog_put(cfg->filter); + else + bpf_prog_destroy(cfg->filter); + + kfree(cfg->bpf_ops); + kfree(cfg->bpf_name); +} + +static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, + struct tcf_bpf_cfg *cfg) +{ + cfg->is_ebpf = tcf_bpf_is_ebpf(prog); + cfg->filter = prog->filter; + + cfg->bpf_ops = prog->bpf_ops; + cfg->bpf_name = prog->bpf_name; +} + +static int tcf_bpf_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *act, + int replace, int bind) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + struct tcf_bpf_cfg cfg, old; + struct tc_act_bpf *parm; + struct tcf_bpf *prog; + bool is_bpf, is_ebpf; + int ret; + + if (!nla) + return -EINVAL; + + ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); + if (ret < 0) + return ret; + + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; + is_ebpf = tb[TCA_ACT_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_ACT_BPF_PARMS]) + return -EINVAL; - if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*b), bind); - if (ret) + parm = nla_data(tb[TCA_ACT_BPF_PARMS]); + + memset(&cfg, 0, sizeof(cfg)); + + ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : + tcf_bpf_init_from_efd(tb, &cfg); + if (ret < 0) + return ret; + + if (!tcf_hash_check(parm->index, act, bind)) { + ret = tcf_hash_create(parm->index, est, act, + sizeof(*prog), bind); + if (ret < 0) goto destroy_fp; ret = ACT_P_CREATED; } else { + /* Don't override defaults. 
*/ if (bind) goto destroy_fp; - tcf_hash_release(a, bind); - if (!ovr) { + + tcf_hash_release(act, bind); + if (!replace) { ret = -EEXIST; goto destroy_fp; } } - b = to_bpf(a); - spin_lock_bh(&b->tcf_lock); - b->tcf_action = parm->action; - b->bpf_num_ops = bpf_num_ops; - b->bpf_ops = bpf_ops; - b->filter = fp; - spin_unlock_bh(&b->tcf_lock); + prog = to_bpf(act); + spin_lock_bh(&prog->tcf_lock); + + if (ret != ACT_P_CREATED) + tcf_bpf_prog_fill_cfg(prog, &old); + + prog->bpf_ops = cfg.bpf_ops; + prog->bpf_name = cfg.bpf_name; + + if (cfg.bpf_num_ops) + prog->bpf_num_ops = cfg.bpf_num_ops; + if (cfg.bpf_fd) + prog->bpf_fd = cfg.bpf_fd; + + prog->tcf_action = parm->action; + prog->filter = cfg.filter; + + spin_unlock_bh(&prog->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(a); + tcf_hash_insert(act); + else + tcf_bpf_cfg_cleanup(&old); + return ret; destroy_fp: - bpf_prog_destroy(fp); -free_bpf_ops: - kfree(bpf_ops); + tcf_bpf_cfg_cleanup(&cfg); return ret; } -static void tcf_bpf_cleanup(struct tc_action *a, int bind) +static void tcf_bpf_cleanup(struct tc_action *act, int bind) { - struct tcf_bpf *b = a->priv; + struct tcf_bpf_cfg tmp; - bpf_prog_destroy(b->filter); + tcf_bpf_prog_fill_cfg(act->priv, &tmp); + tcf_bpf_cfg_cleanup(&tmp); } -static struct tc_action_ops act_bpf_ops = { - .kind = "bpf", - .type = TCA_ACT_BPF, - .owner = THIS_MODULE, - .act = tcf_bpf, - .dump = tcf_bpf_dump, - .cleanup = tcf_bpf_cleanup, - .init = tcf_bpf_init, +static struct tc_action_ops act_bpf_ops __read_mostly = { + .kind = "bpf", + .type = TCA_ACT_BPF, + .owner = THIS_MODULE, + .act = tcf_bpf, + .dump = tcf_bpf_dump, + .cleanup = tcf_bpf_cleanup, + .init = tcf_bpf_init, }; static int __init bpf_init_module(void) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 8e472518f9f6..295d14bd6c67 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -63,7 +63,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, skb->mark 
= c->mark; /* using overlimits stats to count how many packets marked */ ca->tcf_qstats.overlimits++; - nf_ct_put(c); goto out; } @@ -82,7 +81,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, nf_ct_put(c); out: - skb->nfct = NULL; spin_unlock(&ca->tcf_lock); return ca->tcf_action; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5953517ec059..a42a3b257226 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -151,13 +151,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } at = G_TC_AT(skb->tc_verd); - skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); + skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 == NULL) goto out; if (!(at & AT_EGRESS)) { if (m->tcfm_ok_push) - skb_push(skb2, skb2->dev->hard_header_len); + skb_push(skb2, skb->mac_len); } /* mirror is always swallowed */ diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 59649d588d79..ff8b466a73f6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -68,13 +68,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; } else { - p = to_pedit(a); - tcf_hash_release(a, bind); if (bind) return 0; + tcf_hash_release(a, bind); if (!ovr) return -EEXIST; - + p = to_pedit(a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) @@ -108,7 +107,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = a->priv; - int i, munged = 0; + int i; unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) @@ -156,11 +155,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, *ptr = ((*ptr & tkey->mask) ^ tkey->val); if (ptr == &_data) skb_store_bits(skb, off + offset, ptr, 4); - munged++; } - if (munged) - skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; } else WARN(1, "pedit BUG: index %d\n", p->tcf_index); diff --git a/net/sched/cls_api.c 
b/net/sched/cls_api.c index baef987fe2c0..a75864d93142 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -81,6 +81,11 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) struct tcf_proto_ops *t; int rc = -ENOENT; + /* Wait for outstanding call_rcu()s, if any, from a + * tcf_proto_ops's destroy() handler. + */ + rcu_barrier(); + write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (t == ops) { @@ -286,7 +291,7 @@ replay: RCU_INIT_POINTER(*back, next); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); - tcf_destroy(tp); + tcf_destroy(tp, true); err = 0; goto errout; } @@ -301,14 +306,19 @@ replay: err = -EEXIST; if (n->nlmsg_flags & NLM_F_EXCL) { if (tp_created) - tcf_destroy(tp); + tcf_destroy(tp, true); goto errout; } break; case RTM_DELTFILTER: err = tp->ops->delete(tp, fh); - if (err == 0) + if (err == 0) { + struct tcf_proto *next = rtnl_dereference(tp->next); + tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); + if (tcf_destroy(tp, false)) + RCU_INIT_POINTER(*back, next); + } goto errout; case RTM_GETTFILTER: err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); @@ -329,7 +339,7 @@ replay: tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); } else { if (tp_created) - tcf_destroy(tp); + tcf_destroy(tp, true); } errout: diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index fc399db86f11..0b8c3ace671f 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -96,11 +96,14 @@ static void basic_delete_filter(struct rcu_head *head) kfree(f); } -static void basic_destroy(struct tcf_proto *tp) +static bool basic_destroy(struct tcf_proto *tp, bool force) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; + if (!force && !list_empty(&head->flist)) + return false; + list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); @@ -108,6 +111,7 @@ static void basic_destroy(struct tcf_proto *tp) } 
RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); + return true; } static int basic_delete(struct tcf_proto *tp, unsigned long arg) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5f3ee9e4b5bf..e5168f8b9640 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -16,6 +16,8 @@ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/filter.h> +#include <linux/bpf.h> + #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> @@ -24,6 +26,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_DESCRIPTION("TC BPF based classifier"); +#define CLS_BPF_NAME_LEN 256 + struct cls_bpf_head { struct list_head plist; u32 hgen; @@ -32,18 +36,24 @@ struct cls_bpf_head { struct cls_bpf_prog { struct bpf_prog *filter; - struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct tcf_result res; struct list_head link; + struct tcf_result res; + struct tcf_exts exts; u32 handle; - u16 bpf_num_ops; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; + struct sock_filter *bpf_ops; + const char *bpf_name; struct tcf_proto *tp; struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FD] = { .type = NLA_U32 }, + [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -54,10 +64,29 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; - int ret; - +#ifdef CONFIG_NET_CLS_ACT + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; +#else + bool at_ingress = false; +#endif + int ret = -1; + + if (unlikely(!skb_mac_header_was_set(skb))) + return -1; + + /* Needed here for accessing maps. 
*/ + rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { - int filter_res = BPF_PROG_RUN(prog->filter, skb); + int filter_res; + + if (at_ingress) { + /* It is safe to push/pull even if skb_shared() */ + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } if (filter_res == 0) continue; @@ -70,10 +99,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (ret < 0) continue; - return ret; + break; } + rcu_read_unlock(); - return -1; + return ret; +} + +static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) +{ + return !prog->bpf_ops; } static int cls_bpf_init(struct tcf_proto *tp) @@ -94,8 +129,12 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); - bpf_prog_destroy(prog->filter); + if (cls_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); + kfree(prog->bpf_name); kfree(prog->bpf_ops); kfree(prog); } @@ -114,14 +153,18 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); + return 0; } -static void cls_bpf_destroy(struct tcf_proto *tp) +static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; + if (!force && !list_empty(&head->plist)) + return false; + list_for_each_entry_safe(prog, tmp, &head->plist, link) { list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); @@ -130,6 +173,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); + return true; } static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) @@ -151,69 +195,121 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return 
ret; } -static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, - struct cls_bpf_prog *prog, - unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) +static int cls_bpf_prog_from_ops(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) { struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; - u32 classid; int ret; - if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) - return -EINVAL; - - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); - if (ret < 0) - return ret; - - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); - if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) { - ret = -EINVAL; - goto errout; - } + if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) + return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); - if (bpf_size != nla_len(tb[TCA_BPF_OPS])) { - ret = -EINVAL; - goto errout; - } + if (bpf_size != nla_len(tb[TCA_BPF_OPS])) + return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (bpf_ops == NULL) { - ret = -ENOMEM; - goto errout; - } + if (bpf_ops == NULL) + return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto errout_free; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - prog->bpf_num_ops = bpf_num_ops; prog->bpf_ops = bpf_ops; + prog->bpf_num_ops = bpf_num_ops; + prog->bpf_name = NULL; + + prog->filter = fp; + prog->res.classid = classid; + + return 0; +} + +static int cls_bpf_prog_from_efd(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = 
nla_get_u32(tb[TCA_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_BPF_NAME]), + nla_len(tb[TCA_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + prog->bpf_ops = NULL; + prog->bpf_fd = bpf_fd; + prog->bpf_name = name; + prog->filter = fp; prog->res.classid = classid; + return 0; +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts exts; + bool is_bpf, is_ebpf; + u32 classid; + int ret; + + is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; + is_ebpf = tb[TCA_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + + ret = is_bpf ? 
cls_bpf_prog_from_ops(tb, prog, classid) : + cls_bpf_prog_from_efd(tb, prog, classid); + if (ret < 0) { + tcf_exts_destroy(&exts); + return ret; + } + tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); return 0; -errout_free: - kfree(bpf_ops); -errout: - tcf_exts_destroy(&exts); - return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -282,7 +378,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; if (oldprog) { - list_replace_rcu(&prog->link, &oldprog->link); + list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); call_rcu(&oldprog->rcu, __cls_bpf_delete_prog); } else { @@ -297,11 +393,43 @@ errout: return ret; } +static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *tm) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; - struct nlattr *nest, *nla; + struct nlattr *nest; + int ret; if (prog == NULL) return skb->len; @@ -314,16 +442,14 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) - goto nla_put_failure; - nla = nla_reserve(skb, TCA_BPF_OPS, 
prog->bpf_num_ops * - sizeof(struct sock_filter)); - if (nla == NULL) + if (cls_bpf_is_ebpf(prog)) + ret = cls_bpf_dump_ebpf_info(prog, skb); + else + ret = cls_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 221697ab0247..ea611b216412 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -143,14 +143,18 @@ errout: return err; } -static void cls_cgroup_destroy(struct tcf_proto *tp) +static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); + if (!force) + return false; + if (head) { RCU_INIT_POINTER(tp->root, NULL); call_rcu(&head->rcu, cls_cgroup_destroy_rcu); } + return true; } static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 461410394d08..bb2a0f529c1f 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -26,7 +26,7 @@ #include <net/pkt_cls.h> #include <net/ip.h> #include <net/route.h> -#include <net/flow_keys.h> +#include <net/flow_dissector.h> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> @@ -68,35 +68,41 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->src) - return ntohl(flow->src); + __be32 src = flow_get_u32_src(flow); + + if (src) + return ntohl(src); + return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->dst) - return ntohl(flow->dst); + __be32 dst = flow_get_u32_dst(flow); + + if (dst) + return ntohl(dst); + return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { 
- return flow->ip_proto; + return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[0]); + if (flow->ports.ports) + return ntohs(flow->ports.src); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[1]); + if (flow->ports.ports) + return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } @@ -295,7 +301,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; @@ -419,6 +425,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (!fnew) goto err2; + tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + fold = (struct flow_filter *)*arg; if (fold) { err = -EINVAL; @@ -480,7 +488,6 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, fnew->mask = ~0U; fnew->tp = tp; get_random_bytes(&fnew->hashrnd, 4); - tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); } fnew->perturb_timer.function = flow_perturbation; @@ -520,7 +527,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (*arg == 0) list_add_tail_rcu(&fnew->list, &head->filters); else - list_replace_rcu(&fnew->list, &fold->list); + list_replace_rcu(&fold->list, &fnew->list); *arg = (unsigned long)fnew; @@ -557,17 +564,21 @@ static int flow_init(struct tcf_proto *tp) return 0; } -static void flow_destroy(struct tcf_proto *tp) +static bool flow_destroy(struct tcf_proto *tp, bool force) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; + if (!force && !list_empty(&head->filters)) + return false; + list_for_each_entry_safe(f, next, &head->filters, 
list) { list_del_rcu(&f->list); call_rcu(&f->rcu, flow_destroy_filter); } RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); + return true; } static unsigned long flow_get(struct tcf_proto *tp, u32 handle) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c new file mode 100644 index 000000000000..2f3d03f99487 --- /dev/null +++ b/net/sched/cls_flower.c @@ -0,0 +1,691 @@ +/* + * net/sched/cls_flower.c Flower classifier + * + * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/rhashtable.h> + +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/ip.h> + +#include <net/sch_generic.h> +#include <net/pkt_cls.h> +#include <net/ip.h> +#include <net/flow_dissector.h> + +struct fl_flow_key { + int indev_ifindex; + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; + struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_addrs ipaddrs; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ + +struct fl_flow_mask_range { + unsigned short int start; + unsigned short int end; +}; + +struct fl_flow_mask { + struct fl_flow_key key; + struct fl_flow_mask_range range; + struct rcu_head rcu; +}; + +struct cls_fl_head { + struct rhashtable ht; + struct fl_flow_mask mask; + struct flow_dissector dissector; + u32 hgen; + bool mask_assigned; + struct list_head filters; + struct rhashtable_params ht_params; + struct rcu_head rcu; +}; + +struct cls_fl_filter { + struct rhash_head ht_node; + struct fl_flow_key mkey; + struct tcf_exts exts; + struct tcf_result res; + struct fl_flow_key key; + struct list_head list; + u32 handle; + struct rcu_head rcu; +}; + +static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) +{ + return mask->range.end - mask->range.start; +} + +static void fl_mask_update_range(struct fl_flow_mask *mask) +{ + const u8 *bytes = (const u8 *) &mask->key; + size_t size = sizeof(mask->key); + size_t i, first = 0, last = size - 1; + + for (i = 0; i < sizeof(mask->key); i++) { + if (bytes[i]) { + if (!first && i) + first = i; + last = i; + } + } + mask->range.start = rounddown(first, sizeof(long)); + mask->range.end = roundup(last + 1, sizeof(long)); +} + +static void *fl_key_get_start(struct fl_flow_key *key, + const struct fl_flow_mask *mask) +{ + return (u8 *) key + mask->range.start; +} + +static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + const long *lkey = fl_key_get_start(key, mask); + const long *lmask = fl_key_get_start(&mask->key, mask); + long *lmkey = fl_key_get_start(mkey, mask); + int i; + + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) + *lmkey++ = *lkey++ & *lmask++; +} + +static void fl_clear_masked_range(struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); +} + +static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct 
cls_fl_head *head = rcu_dereference_bh(tp->root); + struct cls_fl_filter *f; + struct fl_flow_key skb_key; + struct fl_flow_key skb_mkey; + + fl_clear_masked_range(&skb_key, &head->mask); + skb_key.indev_ifindex = skb->skb_iif; + /* skb_flow_dissect() does not set n_proto in case an unknown protocol, + * so do it rather here. + */ + skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect(skb, &head->dissector, &skb_key); + + fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); + + f = rhashtable_lookup_fast(&head->ht, + fl_key_get_start(&skb_mkey, &head->mask), + head->ht_params); + if (f) { + *res = f->res; + return tcf_exts_exec(skb, &f->exts, res); + } + return -1; +} + +static int fl_init(struct tcf_proto *tp) +{ + struct cls_fl_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + INIT_LIST_HEAD_RCU(&head->filters); + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void fl_destroy_filter(struct rcu_head *head) +{ + struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + + tcf_exts_destroy(&f->exts); + kfree(f); +} + +static bool fl_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f, *next; + + if (!force && !list_empty(&head->filters)) + return false; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del_rcu(&f->list); + call_rcu(&f->rcu, fl_destroy_filter); + } + RCU_INIT_POINTER(tp->root, NULL); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long fl_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { + [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + 
[TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, + [TCA_FLOWER_INDEV] = { .type = NLA_STRING, + .len = IFNAMSIZ }, + [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, +}; + +static void fl_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + if (!tb[val_type]) + return; + memcpy(val, nla_data(tb[val_type]), len); + if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + memcpy(mask, nla_data(tb[mask_type]), len); +} + +static int fl_set_key(struct net *net, struct nlattr **tb, + struct fl_flow_key *key, struct fl_flow_key *mask) +{ +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FLOWER_INDEV]) { + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + if (err < 0) + return err; + key->indev_ifindex = err; + mask->indev_ifindex = 0xffffffff; + } +#endif + + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)); + fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, 
TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)); + fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + if (key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto)); + } + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)); + fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)); + } else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)); + fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)); + } + if (key->basic.ip_proto == IPPROTO_TCP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } else if (key->basic.ip_proto == IPPROTO_UDP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } + + return 0; +} + +static bool fl_mask_eq(struct fl_flow_mask *mask1, + struct fl_flow_mask *mask2) +{ + const long *lmask1 = fl_key_get_start(&mask1->key, mask1); + const long *lmask2 = fl_key_get_start(&mask2->key, mask2); + + return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && 
+ !memcmp(lmask1, lmask2, fl_mask_range(mask1)); +} + +static const struct rhashtable_params fl_ht_params = { + .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */ + .head_offset = offsetof(struct cls_fl_filter, ht_node), + .automatic_shrinking = true, +}; + +static int fl_init_hashtable(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + head->ht_params = fl_ht_params; + head->ht_params.key_len = fl_mask_range(mask); + head->ht_params.key_offset += mask->range.start; + + return rhashtable_init(&head->ht, &head->ht_params); +} + +#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) +#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_END_OFFSET(member) \ + (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) + +#define FL_KEY_IN_RANGE(mask, member) \ + (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ + FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) + +#define FL_KEY_SET(keys, cnt, id, member) \ + do { \ + keys[cnt].key_id = id; \ + keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \ + cnt++; \ + } while(0); + +#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ + do { \ + if (FL_KEY_IN_RANGE(mask, member)) \ + FL_KEY_SET(keys, cnt, id, member); \ + } while(0); + +static void fl_init_dissector(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; + size_t cnt = 0; + + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + + skb_flow_dissector_init(&head->dissector, keys, cnt); +} + +static int 
fl_check_assign_mask(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + int err; + + if (head->mask_assigned) { + if (!fl_mask_eq(&head->mask, mask)) + return -EINVAL; + else + return 0; + } + + /* Mask is not assigned yet. So assign it and init hashtable + * according to that. + */ + err = fl_init_hashtable(head, mask); + if (err) + return err; + memcpy(&head->mask, mask, sizeof(head->mask)); + head->mask_assigned = true; + + fl_init_dissector(head, mask); + + return 0; +} + +static int fl_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_fl_filter *f, struct fl_flow_mask *mask, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_FLOWER_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_FLOWER_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + err = fl_set_key(net, tb, &f->key, &mask->key); + if (err) + goto errout; + + fl_mask_update_range(mask); + fl_set_masked_key(&f->mkey, &f->key, mask); + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(&e); + return err; +} + +static u32 fl_grab_new_handle(struct tcf_proto *tp, + struct cls_fl_head *head) +{ + unsigned int i = 0x80000000; + u32 handle; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && fl_get(tp, head->hgen)); + + if (unlikely(i == 0)) { + pr_err("Insufficient number of handles\n"); + handle = 0; + } else { + handle = head->hgen; + } + + return handle; +} + +static int fl_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; + struct cls_fl_filter *fnew; + struct nlattr *tb[TCA_FLOWER_MAX + 1]; + struct 
fl_flow_mask mask = {}; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); + if (err < 0) + return err; + + if (fold && handle && fold->handle != handle) + return -EINVAL; + + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + + if (!handle) { + handle = fl_grab_new_handle(tp, head); + if (!handle) { + err = -EINVAL; + goto errout; + } + } + fnew->handle = handle; + + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + err = fl_check_assign_mask(head, &mask); + if (err) + goto errout; + + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + if (fold) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); + + *arg = (unsigned long) fnew; + + if (fold) { + list_replace_rcu(&fold->list, &fnew->list); + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, fl_destroy_filter); + } else { + list_add_tail_rcu(&fnew->list, &head->filters); + } + + return 0; + +errout: + kfree(fnew); + return err; +} + +static int fl_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) arg; + + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); + list_del_rcu(&f->list); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fl_destroy_filter); + return 0; +} + +static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry_rcu(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + 
void *mask, int mask_type, int len) +{ + int err; + + if (!memchr_inv(mask, 0, len)) + return 0; + err = nla_put(skb, val_type, len, val); + if (err) + return err; + if (mask_type != TCA_FLOWER_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + return 0; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &head->mask.key; + + if (mask->indev_ifindex) { + struct net_device *dev; + + dev = __dev_get_by_index(net, key->indev_ifindex); + if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) + goto nla_put_failure; + } + + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)) || + fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)) || + fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto))) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) && + fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto))) + goto nla_put_failure; + + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)) || + 
fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)))) + goto nla_put_failure; + else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)) || + fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)))) + goto nla_put_failure; + + if (key->basic.ip_proto == IPPROTO_TCP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_UDP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .kind = "flower", + .classify = fl_classify, + .init = fl_init, + .destroy = fl_destroy, + .get = fl_get, + .change = fl_change, + .delete = fl_delete, + .walk = fl_walk, + .dump = fl_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_fl_init(void) +{ + return register_tcf_proto_ops(&cls_fl_ops); +} + +static void __exit cls_fl_exit(void) +{ + unregister_tcf_proto_ops(&cls_fl_ops); +} + +module_init(cls_fl_init); +module_exit(cls_fl_exit); + +MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); 
+MODULE_DESCRIPTION("Flower classifier"); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index a5269f76004c..715e01e5910a 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,6 +33,7 @@ struct fw_head { u32 mask; + bool mask_set; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; @@ -113,6 +114,14 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle) static int fw_init(struct tcf_proto *tp) { + struct fw_head *head; + + head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + head->mask_set = false; + rcu_assign_pointer(tp->root, head); return 0; } @@ -124,14 +133,20 @@ static void fw_delete_filter(struct rcu_head *head) kfree(f); } -static void fw_destroy(struct tcf_proto *tp) +static bool fw_destroy(struct tcf_proto *tp, bool force) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; int h; if (head == NULL) - return; + return true; + + if (!force) { + for (h = 0; h < HTSIZE; h++) + if (rcu_access_pointer(head->ht[h])) + return false; + } for (h = 0; h < HTSIZE; h++) { while ((f = rtnl_dereference(head->ht[h])) != NULL) { @@ -143,6 +158,7 @@ static void fw_destroy(struct tcf_proto *tp) } RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); + return true; } static int fw_delete(struct tcf_proto *tp, unsigned long arg) @@ -286,17 +302,11 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (!handle) return -EINVAL; - if (head == NULL) { - u32 mask = 0xFFFFFFFF; + if (!head->mask_set) { + head->mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) - mask = nla_get_u32(tb[TCA_FW_MASK]); - - head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; - head->mask = mask; - - rcu_assign_pointer(tp->root, head); + head->mask = nla_get_u32(tb[TCA_FW_MASK]); + head->mask_set = true; } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 
2ecd24688554..08a3b0a6f5ab 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -258,6 +258,13 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) static int route4_init(struct tcf_proto *tp) { + struct route4_head *head; + + head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + + rcu_assign_pointer(tp->root, head); return 0; } @@ -270,13 +277,20 @@ route4_delete_filter(struct rcu_head *head) kfree(f); } -static void route4_destroy(struct tcf_proto *tp) +static bool route4_destroy(struct tcf_proto *tp, bool force) { struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; if (head == NULL) - return; + return true; + + if (!force) { + for (h1 = 0; h1 <= 256; h1++) { + if (rcu_access_pointer(head->table[h1])) + return false; + } + } for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; @@ -301,6 +315,7 @@ static void route4_destroy(struct tcf_proto *tp) } RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); + return true; } static int route4_delete(struct tcf_proto *tp, unsigned long arg) @@ -484,13 +499,6 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; err = -ENOBUFS; - if (head == NULL) { - head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); - if (head == NULL) - goto errout; - rcu_assign_pointer(tp->root, head); - } - f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); if (!f) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index edd8ade3fbc1..02fa82792dab 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -291,13 +291,20 @@ rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) kfree_rcu(f, rcu); } -static void rsvp_destroy(struct tcf_proto *tp) +static bool rsvp_destroy(struct tcf_proto *tp, bool force) { struct rsvp_head *data = rtnl_dereference(tp->root); int h1, h2; if (data == NULL) - return; + return true; + + if (!force) { + for (h1 = 0; h1 < 256; h1++) { + if 
(rcu_access_pointer(data->ht[h1])) + return false; + } + } RCU_INIT_POINTER(tp->root, NULL); @@ -319,6 +326,7 @@ static void rsvp_destroy(struct tcf_proto *tp) } } kfree_rcu(data, rcu); + return true; } static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index bd49bf547a47..a557dbaf5afe 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -468,11 +468,14 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } -static void tcindex_destroy(struct tcf_proto *tp) +static bool tcindex_destroy(struct tcf_proto *tp, bool force) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; + if (!force) + return false; + pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; @@ -481,6 +484,7 @@ static void tcindex_destroy(struct tcf_proto *tp) RCU_INIT_POINTER(tp->root, NULL); call_rcu(&p->rcu, __tcindex_destroy); + return true; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 95fdf4e40051..cab9e9b43967 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -463,13 +463,35 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) return -ENOENT; } -static void u32_destroy(struct tcf_proto *tp) +static bool ht_empty(struct tc_u_hnode *ht) +{ + unsigned int h; + + for (h = 0; h <= ht->divisor; h++) + if (rcu_access_pointer(ht->ht[h])) + return false; + + return true; +} + +static bool u32_destroy(struct tcf_proto *tp, bool force) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); WARN_ON(root_ht == NULL); + if (!force) { + if (root_ht) { + if (root_ht->refcnt > 1) + return false; + if (root_ht->refcnt == 1) { + if (!ht_empty(root_ht)) + return false; + } + } + } + if (root_ht && --root_ht->refcnt == 0) u32_destroy_hnode(tp, root_ht); @@ -494,6 +516,7 @@ static void u32_destroy(struct tcf_proto *tp) } tp->data = NULL; + return 
true; } static int u32_delete(struct tcf_proto *tp, unsigned long arg) diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index a3d79c8bf3b8..df0328ba6a48 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -92,8 +92,8 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, rcu_read_lock(); - if (dev && skb->skb_iif) - indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); acpar.in = indev ? indev : dev; acpar.out = dev; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index f03c3de16c27..73e2ed576ceb 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -34,7 +34,6 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, { struct text_match *tm = EM_TEXT_PRIV(m); int from, to; - struct ts_state state; from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; from += tm->from_offset; @@ -42,7 +41,7 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; to += tm->to_offset; - return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; + return skb_find_text(skb, from, to, tm->config) != UINT_MAX; } static int em_text_change(struct net *net, void *data, int len, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 243b7d169d61..f06aa01d60fd 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -815,10 +815,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (dev->flags & IFF_UP) dev_deactivate(dev); - if (new && new->ops->attach) { - new->ops->attach(new); - num_q = 0; - } + if (new && new->ops->attach) + goto skip; for (i = 0; i < num_q; i++) { struct netdev_queue *dev_queue = dev_ingress_queue(dev); @@ -834,12 +832,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, qdisc_destroy(old); } +skip: if (!ingress) { notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && 
!new->ops->attach) atomic_inc(&new->refcnt); dev->qdisc = new ? : &noop_qdisc; + + if (new && new->ops->attach) + new->ops->attach(new); } else { notify_and_destroy(net, skb, n, classid, old, new); } @@ -1816,13 +1818,8 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, continue; err = tp->classify(skb, tp, res); - if (err >= 0) { -#ifdef CONFIG_NET_CLS_ACT - if (err != TC_ACT_RECLASSIFY && skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); -#endif + if (err >= 0) return err; - } } return -1; } @@ -1834,23 +1831,22 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, int err = 0; #ifdef CONFIG_NET_CLS_ACT const struct tcf_proto *otp = tp; + int limit = 0; reclassify: #endif err = tc_classify_compat(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT if (err == TC_ACT_RECLASSIFY) { - u32 verd = G_TC_VERD(skb->tc_verd); tp = otp; - if (verd++ >= MAX_REC_LOOP) { + if (unlikely(limit++ >= MAX_REC_LOOP)) { net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", tp->q->ops->id, tp->prio & 0xffff, ntohs(tp->protocol)); return TC_ACT_SHOT; } - skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); goto reclassify; } #endif @@ -1858,11 +1854,15 @@ reclassify: } EXPORT_SYMBOL(tc_classify); -void tcf_destroy(struct tcf_proto *tp) +bool tcf_destroy(struct tcf_proto *tp, bool force) { - tp->ops->destroy(tp); - module_put(tp->ops->owner); - kfree_rcu(tp, rcu); + if (tp->ops->destroy(tp, force)) { + module_put(tp->ops->owner); + kfree_rcu(tp, rcu); + return true; + } + + return false; } void tcf_destroy_chain(struct tcf_proto __rcu **fl) @@ -1871,7 +1871,7 @@ void tcf_destroy_chain(struct tcf_proto __rcu **fl) while ((tp = rtnl_dereference(*fl)) != NULL) { RCU_INIT_POINTER(*fl, tp->next); - tcf_destroy(tp); + tcf_destroy(tp, true); } } EXPORT_SYMBOL(tcf_destroy_chain); @@ -1879,13 +1879,10 @@ EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { - struct timespec ts; 
- - hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, - (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); + (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index c009eb9045ce..6a783afe4960 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -18,7 +18,7 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/red.h> -#include <net/flow_keys.h> +#include <net/flow_dissector.h> /* CHOKe stateless AQM for fair bandwidth allocation @@ -133,16 +133,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* private part of skb->cb[] that a qdisc is allowed to use - * is limited to QDISC_CB_PRIV_LEN bytes. - * As a flow key might be too large, we store a part of it only. - */ -#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) - struct choke_skb_cb { u16 classid; u8 keys_valid; - u8 keys[QDISC_CB_PRIV_LEN - 3]; + struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -176,19 +170,19 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &temp); - memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); + skb_flow_dissect_flow_keys(skb1, &temp); + make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &temp); - memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); + skb_flow_dissect_flow_keys(skb2, &temp); + make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - CHOKE_K_LEN); + sizeof(choke_skb_cb(skb1)->keys)); } /* @@ -391,6 +385,19 @@ static void choke_reset(struct Qdisc *sch) { struct 
choke_sched_data *q = qdisc_priv(sch); + while (q->head != q->tail) { + struct sk_buff *skb = q->tab[q->head]; + + q->head = (q->head + 1) & q->tab_mask; + if (!skb) + continue; + qdisc_qstats_backlog_dec(sch, skb); + --sch->q.qlen; + qdisc_drop(skb, sch); + } + + memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); + q->head = q->tail = 0; red_restart(&q->vars); } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index de28f8e968e8..535007d5f0b5 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -6,7 +6,7 @@ * * Implemented on linux by : * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> - * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, [TCA_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 }, }; static int codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); + + q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); @@ -164,7 +171,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = DEFAULT_CODEL_LIMIT; - codel_params_init(&q->params); + codel_params_init(&q->params, sch); codel_vars_init(&q->vars); codel_stats_init(&q->stats); @@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_CODEL_ECN, 
q->params.ecn)) goto nla_put_failure; - + if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, + codel_time_to_us(q->params.ce_threshold))) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: @@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ldelay = codel_time_to_us(q->vars.ldelay), .dropping = q->vars.dropping, .ecn_mark = q->stats.ecn_mark, + .ce_mark = q->stats.ce_mark, }; if (q->vars.dropping) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index dfcea20e3171..f377702d4b91 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Meant to be mostly used for localy generated traffic : + * Meant to be mostly used for locally generated traffic : * Fast classification depends on skb->sk being set before reaching us. * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. * All packets belonging to a socket are considered as a 'flow'. @@ -63,7 +63,7 @@ struct fq_flow { struct sk_buff *tail; /* last skb in the list */ unsigned long age; /* jiffies when flow was emptied, for gc */ }; - struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; int qlen; /* number of packets in flow queue */ int credit; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1e52decb7b59..21ca33c9f036 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> */ #include <linux/module.h> @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> -#include <net/flow_keys.h> #include <net/codel.h> /* Fair Queue CoDel. @@ -68,15 +67,9 @@ struct fq_codel_sched_data { }; static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, - const struct sk_buff *skb) + struct sk_buff *skb) { - struct flow_keys keys; - unsigned int hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); + u32 hash = skb_get_hash_perturb(skb, q->perturbation); return reciprocal_scale(hash, q->flows_cnt); } @@ -162,14 +155,23 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) skb = dequeue_head(flow); len = qdisc_pkt_len(skb); q->backlogs[idx] -= len; - kfree_skb(skb); sch->q.qlen--; qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); flow->dropped++; return idx; } +static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) +{ + unsigned int prev_backlog; + + prev_backlog = sch->qstats.backlog; + fq_codel_drop(sch); + return prev_backlog - sch->qstats.backlog; +} + static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -299,6 +301,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -329,6 +332,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { + u64 val = 
nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); + + q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -391,7 +400,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); - codel_params_init(&q->cparams); + codel_params_init(&q->cparams, sch); codel_stats_init(&q->cstats); q->cparams.ecn = true; @@ -448,6 +457,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -466,6 +480,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; + st.qdisc_stats.ce_mark = q->cstats.ce_mark; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; @@ -598,7 +613,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .enqueue = fq_codel_enqueue, .dequeue = fq_codel_dequeue, .peek = qdisc_peek_dequeued, - .drop = fq_codel_drop, + .drop = fq_codel_qdisc_drop, .init = fq_codel_init, .reset = fq_codel_reset, .destroy = fq_codel_destroy, diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a4ca4517cdc8..abb9f2fec28f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -165,7 +165,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) * if no default DP has been configured. This * allows for DP flows to be left untouched. 
*/ - if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; @@ -229,7 +230,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; } - if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { + if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) { q->backlog += qdisc_pkt_len(skb); return qdisc_enqueue_tail(skb, sch); } @@ -397,7 +398,10 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, q->DP = dp; q->prio = prio; - q->limit = ctl->limit; + if (ctl->limit > sch->limit) + q->limit = sch->limit; + else + q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); @@ -414,6 +418,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, + [TCA_GRED_LIMIT] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -433,11 +438,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { + if (tb[TCA_GRED_LIMIT] != NULL) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, opt); + } if (tb[TCA_GRED_PARMS] == NULL || - tb[TCA_GRED_STAB] == NULL) + tb[TCA_GRED_STAB] == NULL || + tb[TCA_GRED_LIMIT] != NULL) return -EINVAL; max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; @@ -501,6 +510,14 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; + if (tb[TCA_GRED_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); + else { + u32 qlen = qdisc_dev(sch)->tx_queue_len ? 
: 1; + + sch->limit = qlen * psched_mtu(qdisc_dev(sch)); + } + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } @@ -531,6 +548,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) + goto nla_put_failure; + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; @@ -553,7 +573,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.limit = q->limit; opt.DP = q->DP; - opt.backlog = q->backlog; + opt.backlog = gred_backlog(table, q, sch); opt.prio = q->prio; opt.qth_min = q->parms.qth_min >> q->parms.Wlog; opt.qth_max = q->parms.qth_max >> q->parms.Wlog; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 15d3aabfe250..9d15cb6b8cb1 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -9,7 +9,6 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <net/flow_keys.h> #include <net/pkt_sched.h> #include <net/sock.h> @@ -176,22 +175,6 @@ static u32 hhf_time_stamp(void) return jiffies; } -static unsigned int skb_hash(const struct hhf_sched_data *q, - const struct sk_buff *skb) -{ - struct flow_keys keys; - unsigned int hash; - - if (skb->sk && skb->sk->sk_hash) - return skb->sk->sk_hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); - return hash; -} - /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, @@ -280,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_hash(q, skb); + hash = skb_get_hash_perturb(skb, q->perturbation); /* Check if this packet belongs to an already established HH flow. 
*/ flow_pos = hash & HHF_BIT_MASK; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index eb5b8445fef9..e7c648fa9dc3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -12,16 +12,10 @@ #include <linux/list.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> + #include <net/netlink.h> #include <net/pkt_sched.h> - -struct ingress_qdisc_data { - struct tcf_proto __rcu *filter_list; -}; - -/* ------------------------- Class/flow operations ------------------------- */ - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -49,50 +43,25 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - return &p->filter_list; + return &dev->ingress_cl_list; } -/* --------------------------- Qdisc operations ---------------------------- */ - -static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { - struct ingress_qdisc_data *p = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result; - - result = tc_classify(skb, fl, &res); - - qdisc_bstats_update(sch, skb); - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - qdisc_qstats_drop(sch); - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; - } - - return result; -} + net_inc_ingress_queue(); + sch->flags |= TCQ_F_CPUSTATS; -/* ------------------------------------------------------------- */ + return 0; +} static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - 
tcf_destroy_chain(&p->filter_list); + tcf_destroy_chain(&dev->ingress_cl_list); + net_dec_ingress_queue(); } static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -102,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; + return nla_nest_end(skb, nest); nla_put_failure: @@ -122,8 +92,7 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", - .priv_size = sizeof(struct ingress_qdisc_data), - .enqueue = ingress_enqueue, + .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, .owner = THIS_MODULE, @@ -139,6 +108,7 @@ static void __exit ingress_module_exit(void) unregister_qdisc(&ingress_qdisc_ops); } -module_init(ingress_module_init) -module_exit(ingress_module_exit) +module_init(ingress_module_init); +module_exit(ingress_module_exit); + MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 179f1c8c0d8b..5abd1d9de989 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... 
*/ - q->duplicate = 0; - qdisc_enqueue_root(skb2, rootq); + q->duplicate = 0; + rootq->enqueue(skb2, rootq); q->duplicate = dupsave; } @@ -560,8 +560,8 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { -deliver: qdisc_qstats_backlog_dec(sch, skb); +deliver: qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); return skb; @@ -578,6 +578,7 @@ deliver: rb_erase(p, &q->t_root); sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; skb->tstamp = netem_skb_cb(skb)->tstamp_save; diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 89f8fcf73f18..ade9445a55ab 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -216,6 +216,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .peek = qdisc_peek_head, .init = plug_init, .change = plug_change, + .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3ec7e88a43ca..b8d73bca683c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -339,8 +339,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - if (!hlist_unhashed(&agg->nonfull_next)) - hlist_del_init(&agg->nonfull_next); + hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 5819dd82630d..4b815193326c 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -26,7 +26,6 @@ #include <net/ip.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#include <net/flow_keys.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -285,9 +284,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) int i; u32 p_min = ~0; u32 minqlen = ~0; - u32 r, slot, salt, sfbhash; + u32 r, sfbhash; + u32 slot = q->slot; int ret = 
NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); @@ -309,22 +308,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) fl = rcu_dereference_bh(q->filter_list); if (fl) { + u32 salt; + /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - keys.src = salt; - keys.dst = 0; - keys.ports = 0; + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); } else { - skb_flow_dissect(skb, &keys); + sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); } - slot = q->slot; - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -356,10 +350,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b877140beda5..52f75a5473e1 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> -#include <net/flow_keys.h> #include <net/red.h> @@ -156,30 +155,10 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_MAX_FLOWS]; } -/* - * In order to be able to quickly rehash our queue when timer changes - * q->perturbation, we store flow_keys in skb->cb[] - */ -struct sfq_skb_cb { - struct flow_keys keys; -}; - -static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) -{ - 
qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; -} - static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; - unsigned int hash; - - hash = jhash_3words((__force u32)keys->dst, - (__force u32)keys->src ^ keys->ip_proto, - (__force u32)keys->ports, q->perturbation); - return hash & (q->divisor - 1); + return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -196,10 +175,8 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); - if (!fl) { - skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + if (!fl) return sfq_hash(q, skb) + 1; - } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tc_classify(skb, fl, &res); @@ -329,10 +306,10 @@ drop: len = qdisc_pkt_len(skb); slot->backlog -= len; sfq_dec(q, x); - kfree_skb(skb); sch->q.qlen--; qdisc_qstats_drop(sch); qdisc_qstats_backlog_dec(sch, skb); + kfree_skb(skb); return len; } diff --git a/net/sctp/auth.c b/net/sctp/auth.c index fb7976aee61c..4f15b7d730e1 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -381,13 +381,14 @@ nomem: } -/* Public interface to creat the association shared key. +/* Public interface to create the association shared key. * See code above for the algorithm. */ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) { struct sctp_auth_bytes *secret; struct sctp_shared_key *ep_key; + struct sctp_chunk *chunk; /* If we don't support AUTH, or peer is not capable * we don't need to do anything. 
@@ -410,6 +411,14 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) sctp_auth_key_put(asoc->asoc_shared_key); asoc->asoc_shared_key = secret; + /* Update send queue in case any chunk already in there now + * needs authenticating + */ + list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) { + if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) + chunk->auth = 1; + } + return 0; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0e4198ee2370..e917d27328ea 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -331,8 +331,9 @@ out: rt = (struct rt6_info *)dst; t->dst = dst; - t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; - pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr, + t->dst_cookie = rt6_get_cookie(rt); + pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl6->saddr); } else { t->dst = NULL; @@ -635,7 +636,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) goto out; diff --git a/net/sctp/output.c b/net/sctp/output.c index fc5e45b8a832..abe7c2db2412 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -599,7 +599,9 @@ out: return err; no_route: kfree_skb(nskb); - IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); + + if (asoc) + IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); /* FIXME: Returning the 'err' will effect all the associations * associated with a socket, although only one of the paths of the diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8f34b27d5775..59e80356672b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -550,7 +550,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { struct sock *newsk = 
sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot); + sk->sk_prot, 0); struct inet_sock *newinet; if (!newsk) @@ -1322,8 +1322,7 @@ static __init int sctp_init(void) int max_share; int order; - BUILD_BUG_ON(sizeof(struct sctp_ulpevent) > - sizeof(((struct sk_buff *) 0)->cb)); + sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); /* Allocate bind_bucket and chunk caches. */ status = -ENOBUFS; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aafe94bf292e..17bef01b9aa3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -102,11 +102,6 @@ static int sctp_autobind(struct sock *sk); static void sctp_sock_migrate(struct sock *, struct sock *, struct sctp_association *, sctp_socket_type_t); -extern struct kmem_cache *sctp_bucket_cachep; -extern long sysctl_sctp_mem[3]; -extern int sysctl_sctp_rmem[3]; -extern int sysctl_sctp_wmem[3]; - static int sctp_memory_pressure; static atomic_long_t sctp_memory_allocated; struct percpu_counter sctp_sockets_allocated; @@ -1533,8 +1528,10 @@ static void sctp_close(struct sock *sk, long timeout) /* Supposedly, no process has access to the socket, but * the net layers still may. + * Also, sctp_destroy_sock() needs to be called with addr_wq_lock + * held and that should be grabbed before socket lock. 
*/ - local_bh_disable(); + spin_lock_bh(&net->sctp.addr_wq_lock); bh_lock_sock(sk); /* Hold the sock, since sk_common_release() will put sock_put() @@ -1544,7 +1541,7 @@ static void sctp_close(struct sock *sk, long timeout) sk_common_release(sk); bh_unlock_sock(sk); - local_bh_enable(); + spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); @@ -1586,8 +1583,7 @@ static int sctp_error(struct sock *sk, int flags, int err) static int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *); -static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t msg_len) +static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct net *net = sock_net(sk); struct sctp_sock *sp; @@ -2066,9 +2062,8 @@ static int sctp_skb_pull(struct sk_buff *skb, int len) * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. */ -static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, - int flags, int *addr_len) +static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); @@ -2126,12 +2121,6 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, if (sp->subscribe.sctp_data_io_event) sctp_ulpevent_read_sndrcvinfo(event, msg); -#if 0 - /* FIXME: we should be calling IP/IPv6 layers. 
*/ - if (sk->sk_protinfo.af_inet.cmsg_flags) - ip_cmsg_recv(msg, skb); -#endif - err = copied; /* If skb's length exceeds the user's buffer, update the skb and @@ -2211,12 +2200,6 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) return -EFAULT; - if (sctp_sk(sk)->subscribe.sctp_data_io_event) - pr_warn_ratelimited(DEPRECATED "%s (pid %d) " - "Requested SCTP_SNDRCVINFO event.\n" - "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n", - current->comm, task_pid_nr(current)); - /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. @@ -3587,6 +3570,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf)) return 0; + spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); if (val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; @@ -3595,6 +3579,7 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; } + spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); return 0; } @@ -4128,18 +4113,28 @@ static int sctp_init_sock(struct sock *sk) local_bh_disable(); percpu_counter_inc(&sctp_sockets_allocated); sock_prot_inuse_add(net, sk->sk_prot, 1); + + /* Nothing can fail after this block, otherwise + * sctp_destroy_sock() will be called without addr_wq_lock held + */ if (net->sctp.default_auto_asconf) { + spin_lock(&sock_net(sk)->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; - } else + spin_unlock(&sock_net(sk)->sctp.addr_wq_lock); + } else { sp->do_auto_asconf = 0; + } + local_bh_enable(); return 0; } -/* Cleanup any SCTP per socket resources. */ +/* Cleanup any SCTP per socket resources. 
Must be called with + * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true + */ static void sctp_destroy_sock(struct sock *sk) { struct sctp_sock *sp; @@ -7202,6 +7197,19 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_list = NULL; } +static inline void sctp_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + int ancestor_size = sizeof(struct inet_sock) + + sizeof(struct sctp_sock) - + offsetof(struct sctp_sock, auto_asconf_list); + + if (sk_from->sk_family == PF_INET6) + ancestor_size += sizeof(struct ipv6_pinfo); + + __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); +} + /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ @@ -7216,7 +7224,6 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; struct sctp_bind_hashbucket *head; - struct list_head tmplist; /* Migrate socket buffer sizes and all the socket level options to the * new socket. @@ -7224,12 +7231,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, newsk->sk_sndbuf = oldsk->sk_sndbuf; newsk->sk_rcvbuf = oldsk->sk_rcvbuf; /* Brute force copy old sctp opt. */ - if (oldsp->do_auto_asconf) { - memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist)); - inet_sk_copy_descendant(newsk, oldsk); - memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist)); - } else - inet_sk_copy_descendant(newsk, oldsk); + sctp_copy_descendant(newsk, oldsk); /* Restore the ep value that was overwritten with the above structure * copy. diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 2e9ada10fd84..26d50c565f54 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -58,10 +58,6 @@ static unsigned long max_autoclose_max = (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) ? 
UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; -extern long sysctl_sctp_mem[3]; -extern int sysctl_sctp_rmem[3]; -extern int sysctl_sctp_wmem[3]; - static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/net/socket.c b/net/socket.c index 245330ca0015..9963a0b53a64 100644 --- a/net/socket.c +++ b/net/socket.c @@ -140,8 +140,6 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, static const struct file_operations socket_file_ops = { .owner = THIS_MODULE, .llseek = no_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = sock_read_iter, .write_iter = sock_write_iter, .poll = sock_poll, @@ -314,7 +312,7 @@ static const struct super_operations sockfs_ops = { static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { @@ -377,7 +375,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) &socket_file_ops); if (unlikely(IS_ERR(file))) { /* drop dentry, keep inode */ - ihold(path.dentry->d_inode); + ihold(d_inode(path.dentry)); path_put(&path); return file; } @@ -499,7 +497,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, ssize_t len; ssize_t used = 0; - len = security_inode_listsecurity(dentry->d_inode, buffer, size); + len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; used += len; @@ -578,9 +576,6 @@ void sock_release(struct socket *sock) if (rcu_dereference_protected(sock->wq, 1)->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); - if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags)) - return; - this_cpu_sub(sockets_in_use, 1); if (!sock->file) { iput(SOCK_INODE(sock)); @@ -610,60 +605,27 @@ void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) } 
EXPORT_SYMBOL(__sock_tx_timestamp); -static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size) -{ - return sock->ops->sendmsg(iocb, sock, msg, size); -} - -static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size) -{ - int err = security_socket_sendmsg(sock, msg, size); - - return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size); -} - -static int do_sock_sendmsg(struct socket *sock, struct msghdr *msg, - size_t size, bool nosec) +static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - struct kiocb iocb; - int ret; - - init_sync_kiocb(&iocb, NULL); - ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) : - __sock_sendmsg(&iocb, sock, msg, size); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); + int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); + BUG_ON(ret == -EIOCBQUEUED); return ret; } -int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - return do_sock_sendmsg(sock, msg, size, false); -} -EXPORT_SYMBOL(sock_sendmsg); + int err = security_socket_sendmsg(sock, msg, + msg_data_left(msg)); -static int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size) -{ - return do_sock_sendmsg(sock, msg, size, true); + return err ?: sock_sendmsg_nosec(sock, msg); } +EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - mm_segment_t oldfs = get_fs(); - int result; - - set_fs(KERNEL_DS); - /* - * the following is safe, since for compiler definitions of kvec and - * iovec are identical, yielding the same in-core layout and alignment - */ - iov_iter_init(&msg->msg_iter, WRITE, (struct iovec *)vec, num, size); - result = sock_sendmsg(sock, msg, size); - set_fs(oldfs); - return result; + iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, 
num, size); + return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -731,9 +693,9 @@ EXPORT_SYMBOL_GPL(__sock_recv_wifi_status); static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount) + if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, - sizeof(__u32), &skb->dropcount); + sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount); } void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, @@ -744,47 +706,21 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, } EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); -static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { - return sock->ops->recvmsg(iocb, sock, msg, size, flags); + return sock->ops->recvmsg(sock, msg, size, flags); } -static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, int flags) +int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { int err = security_socket_recvmsg(sock, msg, size, flags); - return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); -} - -int sock_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) -{ - struct kiocb iocb; - int ret; - - init_sync_kiocb(&iocb, NULL); - ret = __sock_recvmsg(&iocb, sock, msg, size, flags); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); - return ret; + return err ?: sock_recvmsg_nosec(sock, msg, size, flags); } EXPORT_SYMBOL(sock_recvmsg); -static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, - size_t size, int flags) -{ - struct kiocb iocb; - int ret; - - init_sync_kiocb(&iocb, NULL); - ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags); - if 
(-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); - return ret; -} - /** * kernel_recvmsg - Receive a message from a socket (kernel space) * @sock: The socket to receive the message from @@ -806,12 +742,8 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, mm_segment_t oldfs = get_fs(); int result; + iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); set_fs(KERNEL_DS); - /* - * the following is safe, since for compiler definitions of kvec and - * iovec are identical, yielding the same in-core layout and alignment - */ - iov_iter_init(&msg->msg_iter, READ, (struct iovec *)vec, num, size); result = sock_recvmsg(sock, msg, size, flags); set_fs(oldfs); return result; @@ -849,7 +781,8 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; - struct msghdr msg = {.msg_iter = *to}; + struct msghdr msg = {.msg_iter = *to, + .msg_iocb = iocb}; ssize_t res; if (file->f_flags & O_NONBLOCK) @@ -858,11 +791,10 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_pos != 0) return -ESPIPE; - if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ + if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; - res = __sock_recvmsg(iocb, sock, &msg, - iocb->ki_nbytes, msg.msg_flags); + res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags); *to = msg.msg_iter; return res; } @@ -871,7 +803,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct socket *sock = file->private_data; - struct msghdr msg = {.msg_iter = *from}; + struct msghdr msg = {.msg_iter = *from, + .msg_iocb = iocb}; ssize_t res; if (iocb->ki_pos != 0) @@ -883,7 +816,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = __sock_sendmsg(iocb, sock, &msg, iocb->ki_nbytes); + res = 
sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -1277,9 +1210,9 @@ int sock_create(int family, int type, int protocol, struct socket **res) } EXPORT_SYMBOL(sock_create); -int sock_create_kern(int family, int type, int protocol, struct socket **res) +int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { - return __sock_create(&init_net, family, type, protocol, res, 1); + return __sock_create(net, family, type, protocol, res, 1); } EXPORT_SYMBOL(sock_create_kern); @@ -1700,18 +1633,14 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, struct iovec iov; int fput_needed; - if (len > INT_MAX) - len = INT_MAX; - if (unlikely(!access_ok(VERIFY_READ, buff, len))) - return -EFAULT; + err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - iov.iov_base = buff; - iov.iov_len = len; msg.msg_name = NULL; - iov_iter_init(&msg.msg_iter, WRITE, &iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; @@ -1725,7 +1654,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg, len); + err = sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -1760,26 +1689,22 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, int err, err2; int fput_needed; - if (size > INT_MAX) - size = INT_MAX; - if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) - return -EFAULT; + err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_control = NULL; msg.msg_controllen = 0; - iov.iov_len = size; - iov.iov_base = ubuf; - iov_iter_init(&msg.msg_iter, READ, &iov, 1, size); /* Save some cycles and 
don't copy the address if not needed */ msg.msg_name = addr ? (struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = sock_recvmsg(sock, &msg, size, flags); + err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, @@ -1899,10 +1824,10 @@ struct used_address { unsigned int name_len; }; -static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +static int copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) { struct sockaddr __user *uaddr; struct iovec __user *uiov; @@ -1946,13 +1871,10 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, if (nr_segs > UIO_MAXIOV) return -EMSGSIZE; - err = rw_copy_check_uvector(save_addr ? READ : WRITE, - uiov, nr_segs, - UIO_FASTIOV, *iov, iov); - if (err >= 0) - iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, - *iov, nr_segs, err); - return err; + kmsg->msg_iocb = NULL; + + return import_iovec(save_addr ? 
READ : WRITE, uiov, nr_segs, + UIO_FASTIOV, iov, &kmsg->msg_iter); } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, @@ -1967,7 +1889,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, __attribute__ ((aligned(sizeof(__kernel_size_t)))); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; - int ctl_len, total_len; + int ctl_len; ssize_t err; msg_sys->msg_name = &address; @@ -1977,8 +1899,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, else err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); if (err < 0) - goto out_freeiov; - total_len = err; + return err; err = -ENOBUFS; @@ -2025,10 +1946,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { - err = sock_sendmsg_nosec(sock, msg_sys, total_len); + err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys, total_len); + err = sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. 
@@ -2044,8 +1965,7 @@ out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out_freeiov: - if (iov != iovstack) - kfree(iov); + kfree(iov); return err; } @@ -2170,8 +2090,8 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, else err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); if (err < 0) - goto out_freeiov; - total_len = err; + return err; + total_len = iov_iter_count(&msg_sys->msg_iter); cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@ -2209,8 +2129,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, err = len; out_freeiov: - if (iov != iovstack) - kfree(iov); + kfree(iov); return err; } diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index fb78117b896c..04ce2c0b660e 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -1,9 +1,11 @@ config SUNRPC tristate + depends on MULTIUSER config SUNRPC_GSS tristate select OID_REGISTRY + depends on MULTIUSER config SUNRPC_BACKCHANNEL bool @@ -46,28 +48,16 @@ config SUNRPC_DEBUG If unsure, say Y. -config SUNRPC_XPRT_RDMA_CLIENT - tristate "RPC over RDMA Client Support" +config SUNRPC_XPRT_RDMA + tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND help - This option allows the NFS client to support an RDMA-enabled - transport. + This option allows the NFS client and server to use RDMA + transports (InfiniBand, iWARP, or RoCE). - To compile RPC client RDMA transport support as a module, - choose M here: the module will be called xprtrdma. + To compile this support as a module, choose M. The module + will be called rpcrdma.ko. - If unsure, say N. - -config SUNRPC_XPRT_RDMA_SERVER - tristate "RPC over RDMA Server Support" - depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS - default SUNRPC && INFINIBAND - help - This option allows the NFS server to support an RDMA-enabled - transport. 
- - To compile RPC server RDMA transport support as a module, - choose M here: the module will be called svcrdma. - - If unsure, say N. + If unsure, or you know there is no RDMA capability on your + hardware platform, say N. diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 15e6f6c23c5d..b512fbd9d79a 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -5,8 +5,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ - -obj-y += xprtrdma/ +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o auth_generic.o \ @@ -15,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ sunrpc_syms.o cache.o rpc_pipe.o \ svc_xprt.o sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o -sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o +sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 47f38be4155f..02f53674dc39 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -72,7 +72,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); -static struct kernel_param_ops param_ops_hashtbl_sz = { +static const struct kernel_param_ops param_ops_hashtbl_sz = { .set = param_set_hashtbl_sz, .get = param_get_hashtbl_sz, }; diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index b5408e8a37f2..fee3c15a4b52 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -881,9 +881,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, if (err) goto out_err; - sg_init_table(sg, 1); - sg_set_buf(sg, &zeroconstant, 4); - + sg_init_one(sg, &zeroconstant, 4); err = crypto_hash_digest(&desc, sg, 4, Kseq); if (err) goto out_err; @@ 
-951,9 +949,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher, if (err) goto out_err; - sg_init_table(sg, 1); - sg_set_buf(sg, zeroconstant, 4); - + sg_init_one(sg, zeroconstant, 4); err = crypto_hash_digest(&desc, sg, 4, Kcrypt); if (err) goto out_err; diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 1ec19f6f0c2b..eeeba5adee6d 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -793,20 +793,26 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, { u32 value_follows; int err; + struct page *scratch; + + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + return -ENOMEM; + xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); /* res->status */ err = gssx_dec_status(xdr, &res->status); if (err) - return err; + goto out_free; /* res->context_handle */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { err = gssx_dec_ctx(xdr, res->context_handle); if (err) - return err; + goto out_free; } else { res->context_handle = NULL; } @@ -814,11 +820,11 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, /* res->output_token */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { err = gssx_dec_buffer(xdr, res->output_token); if (err) - return err; + goto out_free; } else { res->output_token = NULL; } @@ -826,14 +832,17 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, /* res->delegated_cred_handle */ err = gssx_dec_bool(xdr, &value_follows); if (err) - return err; + goto out_free; if (value_follows) { /* we do not support upcall servers sending this data. 
*/ - return -EINVAL; + err = -EINVAL; + goto out_free; } /* res->options */ err = gssx_dec_option_array(xdr, &res->options); +out_free: + __free_page(scratch); return err; } diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 9dd0ea8db463..6255d141133b 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static inline int xprt_need_to_requeue(struct rpc_xprt *xprt) { - return xprt->bc_alloc_count > 0; + return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots); } static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n) { + atomic_add(n, &xprt->bc_free_slots); xprt->bc_alloc_count += n; } static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n) { + atomic_sub(n, &xprt->bc_free_slots); return xprt->bc_alloc_count -= n; } @@ -60,13 +62,62 @@ static void xprt_free_allocation(struct rpc_rqst *req) dprintk("RPC: free allocations for req= %p\n", req); WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); - xbufp = &req->rq_private_buf; + xbufp = &req->rq_rcv_buf; free_page((unsigned long)xbufp->head[0].iov_base); xbufp = &req->rq_snd_buf; free_page((unsigned long)xbufp->head[0].iov_base); kfree(req); } +static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) +{ + struct page *page; + /* Preallocate one XDR receive buffer */ + page = alloc_page(gfp_flags); + if (page == NULL) + return -ENOMEM; + buf->head[0].iov_base = page_address(page); + buf->head[0].iov_len = PAGE_SIZE; + buf->tail[0].iov_base = NULL; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->len = 0; + buf->buflen = PAGE_SIZE; + return 0; +} + +static +struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) +{ + struct rpc_rqst *req; + + /* Pre-allocate one backchannel rpc_rqst */ + req = kzalloc(sizeof(*req), gfp_flags); + if (req == NULL) + return NULL; + + req->rq_xprt = xprt; 
+ INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_bc_list); + + /* Preallocate one XDR receive buffer */ + if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) { + printk(KERN_ERR "Failed to create bc receive xbuf\n"); + goto out_free; + } + req->rq_rcv_buf.len = PAGE_SIZE; + + /* Preallocate one XDR send buffer */ + if (xprt_alloc_xdr_buf(&req->rq_snd_buf, gfp_flags) < 0) { + printk(KERN_ERR "Failed to create bc snd xbuf\n"); + goto out_free; + } + return req; +out_free: + xprt_free_allocation(req); + return NULL; +} + /* * Preallocate up to min_reqs structures and related buffers for use * by the backchannel. This function can be called multiple times @@ -87,9 +138,7 @@ static void xprt_free_allocation(struct rpc_rqst *req) */ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) { - struct page *page_rcv = NULL, *page_snd = NULL; - struct xdr_buf *xbufp = NULL; - struct rpc_rqst *req, *tmp; + struct rpc_rqst *req; struct list_head tmp_list; int i; @@ -106,7 +155,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) INIT_LIST_HEAD(&tmp_list); for (i = 0; i < min_reqs; i++) { /* Pre-allocate one backchannel rpc_rqst */ - req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); + req = xprt_alloc_bc_req(xprt, GFP_KERNEL); if (req == NULL) { printk(KERN_ERR "Failed to create bc rpc_rqst\n"); goto out_free; @@ -115,41 +164,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) /* Add the allocated buffer to the tmp list */ dprintk("RPC: adding req= %p\n", req); list_add(&req->rq_bc_pa_list, &tmp_list); - - req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_list); - INIT_LIST_HEAD(&req->rq_bc_list); - - /* Preallocate one XDR receive buffer */ - page_rcv = alloc_page(GFP_KERNEL); - if (page_rcv == NULL) { - printk(KERN_ERR "Failed to create bc receive xbuf\n"); - goto out_free; - } - xbufp = &req->rq_rcv_buf; - xbufp->head[0].iov_base = page_address(page_rcv); - xbufp->head[0].iov_len = PAGE_SIZE; - 
xbufp->tail[0].iov_base = NULL; - xbufp->tail[0].iov_len = 0; - xbufp->page_len = 0; - xbufp->len = PAGE_SIZE; - xbufp->buflen = PAGE_SIZE; - - /* Preallocate one XDR send buffer */ - page_snd = alloc_page(GFP_KERNEL); - if (page_snd == NULL) { - printk(KERN_ERR "Failed to create bc snd xbuf\n"); - goto out_free; - } - - xbufp = &req->rq_snd_buf; - xbufp->head[0].iov_base = page_address(page_snd); - xbufp->head[0].iov_len = 0; - xbufp->tail[0].iov_base = NULL; - xbufp->tail[0].iov_len = 0; - xbufp->page_len = 0; - xbufp->len = 0; - xbufp->buflen = PAGE_SIZE; } /* @@ -167,7 +181,10 @@ out_free: /* * Memory allocation failed, free the temporary list */ - list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) { + while (!list_empty(&tmp_list)) { + req = list_first_entry(&tmp_list, + struct rpc_rqst, + rq_bc_pa_list); list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); } @@ -217,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid) struct rpc_rqst *req = NULL; dprintk("RPC: allocate a backchannel request\n"); - if (list_empty(&xprt->bc_pa_list)) + if (atomic_read(&xprt->bc_free_slots) <= 0) goto not_found; - + if (list_empty(&xprt->bc_pa_list)) { + req = xprt_alloc_bc_req(xprt, GFP_ATOMIC); + if (!req) + goto not_found; + list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); + xprt->bc_alloc_count++; + } req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, rq_bc_pa_list); req->rq_reply_bytes_recvd = 0; @@ -245,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req) req->rq_connect_cookie = xprt->connect_cookie - 1; smp_mb__before_atomic(); - WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); smp_mb__after_atomic(); - if (!xprt_need_to_requeue(xprt)) { + /* + * Return it to the list of preallocations so that it + * may be reused by a new callback request. 
+ */ + spin_lock_bh(&xprt->bc_pa_lock); + if (xprt_need_to_requeue(xprt)) { + list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); + xprt->bc_alloc_count++; + req = NULL; + } + spin_unlock_bh(&xprt->bc_pa_lock); + if (req != NULL) { /* * The last remaining session was destroyed while this * entry was in use. Free the entry and don't attempt @@ -260,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req) xprt_free_allocation(req); return; } - - /* - * Return it to the list of preallocations so that it - * may be reused by a new callback request. - */ - spin_lock_bh(&xprt->bc_pa_lock); - list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); } /* @@ -311,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) spin_lock(&xprt->bc_pa_lock); list_del(&req->rq_bc_pa_list); + xprt_dec_alloc_count(xprt, 1); spin_unlock(&xprt->bc_pa_lock); req->rq_private_buf.len = copied; diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c deleted file mode 100644 index 15c7a8a1c24f..000000000000 --- a/net/sunrpc/bc_svc.c +++ /dev/null @@ -1,63 +0,0 @@ -/****************************************************************************** - -(c) 2007 Network Appliance, Inc. All Rights Reserved. -(c) 2009 NetApp. All Rights Reserved. - -NetApp provides this source code under the GPL v2 License. -The GPL v2 license is available at -http://opensource.org/licenses/gpl-license.php. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -******************************************************************************/ - -/* - * The NFSv4.1 callback service helper routines. - * They implement the transport level processing required to send the - * reply over an existing open connection previously established by the client. - */ - -#include <linux/module.h> - -#include <linux/sunrpc/xprt.h> -#include <linux/sunrpc/sched.h> -#include <linux/sunrpc/bc_xprt.h> - -#define RPCDBG_FACILITY RPCDBG_SVCDSP - -/* Empty callback ops */ -static const struct rpc_call_ops nfs41_callback_ops = { -}; - - -/* - * Send the callback reply - */ -int bc_send(struct rpc_rqst *req) -{ - struct rpc_task *task; - int ret; - - dprintk("RPC: bc_send req= %p\n", req); - task = rpc_run_bc_task(req, &nfs41_callback_ops); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); - ret = task->tk_status; - rpc_put_task(task); - } - dprintk("RPC: bc_send ret= %d\n", ret); - return ret; -} - diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5199bb1a017e..2928afffbb81 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1072,10 +1072,12 @@ void qword_add(char **bpp, int *lp, char *str) if (len < 0) return; - ret = string_escape_str(str, &bp, len, ESCAPE_OCTAL, "\\ \n\t"); - if (ret < 0 || ret == len) + ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t"); + if (ret >= len) { + bp += len; len = -1; - else { + } else { + bp += ret; len -= ret; *bp++ = ' 
'; len--; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e6ce1517367f..23608eb0ded2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - if (sk_memalloc_socks()) { - struct rpc_xprt *xprt; - - rcu_read_lock(); - xprt = rcu_dereference(clnt->cl_xprt); - if (xprt->swapper) - task->tk_flags |= RPC_TASK_SWAPPER; - rcu_read_unlock(); - } + if (atomic_read(&clnt->cl_swapper)) + task->tk_flags |= RPC_TASK_SWAPPER; /* Add to the client's list of all tasks */ spin_lock(&clnt->cl_lock); list_add_tail(&task->tk_task, &clnt->cl_tasks); @@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async); * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it * @req: RPC request - * @tk_ops: RPC call ops */ -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, - const struct rpc_call_ops *tk_ops) +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; struct xdr_buf *xbufp = &req->rq_snd_buf; struct rpc_task_setup task_setup_data = { - .callback_ops = tk_ops, + .callback_ops = &rpc_default_ops, + .flags = RPC_TASK_SOFTCONN, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); @@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task) req->rq_callsize + req->rq_rcvsize); if (req->rq_buffer != NULL) return; + xprt_inject_disconnect(xprt); dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); @@ -1909,6 +1902,7 @@ call_transmit_status(struct rpc_task *task) switch (task->tk_status) { case -EAGAIN: + case -ENOBUFS: break; default: dprint_status(task); @@ -1935,7 +1929,6 @@ call_transmit_status(struct rpc_task *task) case -ECONNABORTED: case -EADDRINUSE: case -ENOTCONN: - case -ENOBUFS: case -EPIPE: rpc_task_force_reencode(task); } @@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task) { struct 
rpc_rqst *req = task->tk_rqstp; - if (!xprt_prepare_transmit(task)) { - /* - * Could not reserve the transport. Try again after the - * transport is released. - */ - task->tk_status = 0; - task->tk_action = call_bc_transmit; - return; - } + if (!xprt_prepare_transmit(task)) + goto out_retry; - task->tk_action = rpc_exit_task; if (task->tk_status < 0) { printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); - return; + goto out_done; } + if (req->rq_connect_cookie != req->rq_xprt->connect_cookie) + req->rq_bytes_sent = 0; xprt_transmit(task); + + if (task->tk_status == -EAGAIN) + goto out_nospace; + xprt_end_transmit(task); dprint_status(task); switch (task->tk_status) { case 0: /* Success */ - break; case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -ECONNRESET: + case -ECONNREFUSED: + case -EADDRINUSE: + case -ENOTCONN: + case -EPIPE: + break; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the @@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task) break; } rpc_wake_up_queued_task(&req->rq_xprt->pending, task); +out_done: + task->tk_action = rpc_exit_task; + return; +out_nospace: + req->rq_connect_cookie = req->rq_xprt->connect_cookie; +out_retry: + task->tk_status = 0; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ @@ -2054,12 +2057,13 @@ call_status(struct rpc_task *task) case -ECONNABORTED: rpc_force_rebind(clnt); case -EADDRINUSE: - case -ENOBUFS: rpc_delay(task, 3*HZ); case -EPIPE: case -ENOTCONN: task->tk_action = call_bind; break; + case -ENOBUFS: + rpc_delay(task, HZ>>2); case -EAGAIN: task->tk_action = call_transmit; break; @@ -2476,3 +2480,59 @@ void rpc_show_tasks(struct net *net) spin_unlock(&sn->rpc_client_lock); } #endif + +#if IS_ENABLED(CONFIG_SUNRPC_SWAP) +int +rpc_clnt_swap_activate(struct rpc_clnt *clnt) +{ + int ret = 0; + struct rpc_xprt *xprt; + + if (atomic_inc_return(&clnt->cl_swapper) == 1) { +retry: + rcu_read_lock(); + xprt = 
xprt_get(rcu_dereference(clnt->cl_xprt)); + rcu_read_unlock(); + if (!xprt) { + /* + * If we didn't get a reference, then we likely are + * racing with a migration event. Wait for a grace + * period and try again. + */ + synchronize_rcu(); + goto retry; + } + + ret = xprt_enable_swap(xprt); + xprt_put(xprt); + } + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate); + +void +rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + + if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) { +retry: + rcu_read_lock(); + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + rcu_read_unlock(); + if (!xprt) { + /* + * If we didn't get a reference, then we likely are + * racing with a migration event. Wait for a grace + * period and try again. + */ + synchronize_rcu(); + goto retry; + } + + xprt_disable_swap(xprt); + xprt_put(xprt); + } +} +EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate); +#endif /* CONFIG_SUNRPC_SWAP */ diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 82962f7e6e88..e7b4d93566df 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -10,9 +10,12 @@ #include "netns.h" static struct dentry *topdir; +static struct dentry *rpc_fault_dir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; +unsigned int rpc_inject_disconnect; + struct rpc_clnt_iter { struct rpc_clnt *clnt; loff_t pos; @@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; } + + atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } +static int +fault_open(struct inode *inode, struct file *filp) +{ + filp->private_data = kmalloc(128, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + return 0; +} + +static int +fault_release(struct inode *inode, struct file *filp) +{ + kfree(filp->private_data); + return 0; +} + +static ssize_t 
+fault_disconnect_read(struct file *filp, char __user *user_buf, + size_t len, loff_t *offset) +{ + char *buffer = (char *)filp->private_data; + size_t size; + + size = sprintf(buffer, "%u\n", rpc_inject_disconnect); + return simple_read_from_buffer(user_buf, len, offset, buffer, size); +} + +static ssize_t +fault_disconnect_write(struct file *filp, const char __user *user_buf, + size_t len, loff_t *offset) +{ + char buffer[16]; + + if (len >= sizeof(buffer)) + len = sizeof(buffer) - 1; + if (copy_from_user(buffer, user_buf, len)) + return -EFAULT; + buffer[len] = '\0'; + if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) + return -EINVAL; + return len; +} + +static const struct file_operations fault_disconnect_fops = { + .owner = THIS_MODULE, + .open = fault_open, + .read = fault_disconnect_read, + .write = fault_disconnect_write, + .release = fault_release, +}; + +static struct dentry * +inject_fault_dir(struct dentry *topdir) +{ + struct dentry *faultdir; + + faultdir = debugfs_create_dir("inject_fault", topdir); + if (!faultdir) + return NULL; + + if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir, + NULL, &fault_disconnect_fops)) + return NULL; + + return faultdir; +} + void __exit sunrpc_debugfs_exit(void) { debugfs_remove_recursive(topdir); topdir = NULL; + rpc_fault_dir = NULL; rpc_clnt_dir = NULL; rpc_xprt_dir = NULL; } @@ -282,6 +355,10 @@ sunrpc_debugfs_init(void) if (!topdir) return; + rpc_fault_dir = inject_fault_dir(topdir); + if (!rpc_fault_dir) + goto out_remove; + rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); if (!rpc_clnt_dir) goto out_remove; @@ -294,5 +371,6 @@ sunrpc_debugfs_init(void) out_remove: debugfs_remove_recursive(topdir); topdir = NULL; + rpc_fault_dir = NULL; rpc_clnt_dir = NULL; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 2d12b76b5a64..d81186d34558 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -94,7 +94,7 @@ rpc_timeout_upcall_queue(struct work_struct *work) } 
dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); - rpc_purge_list(dentry ? &RPC_I(dentry->d_inode)->waitq : NULL, + rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL, &free_list, destroy_msg, -ETIMEDOUT); dput(dentry); } @@ -152,7 +152,7 @@ rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg) dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); if (dentry) { - wake_up(&RPC_I(dentry->d_inode)->waitq); + wake_up(&RPC_I(d_inode(dentry))->waitq); dput(dentry); } return res; @@ -591,7 +591,7 @@ static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); if (err) return err; - rpci = RPC_I(dentry->d_inode); + rpci = RPC_I(d_inode(dentry)); rpci->private = private; rpci->pipe = pipe; fsnotify_create(dir, dentry); @@ -616,7 +616,7 @@ int rpc_rmdir(struct dentry *dentry) int error; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); mutex_unlock(&dir->i_mutex); @@ -638,7 +638,7 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry) static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); rpc_close_pipes(inode); return __rpc_unlink(dir, dentry); @@ -654,7 +654,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, if (!dentry) return ERR_PTR(-ENOMEM); } - if (dentry->d_inode == NULL) + if (d_really_is_negative(dentry)) return dentry; dput(dentry); return ERR_PTR(-EEXIST); @@ -667,7 +667,7 @@ static void __rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct dentry *dentry; struct qstr name; int i; @@ -679,9 +679,9 @@ static void __rpc_depopulate(struct dentry *parent, if (dentry == NULL) 
continue; - if (dentry->d_inode == NULL) + if (d_really_is_negative(dentry)) goto next; - switch (dentry->d_inode->i_mode & S_IFMT) { + switch (d_inode(dentry)->i_mode & S_IFMT) { default: BUG(); case S_IFREG: @@ -699,7 +699,7 @@ static void rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); @@ -711,7 +711,7 @@ static int rpc_populate(struct dentry *parent, int start, int eof, void *private) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct dentry *dentry; int i, err; @@ -754,7 +754,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, int (*populate)(struct dentry *, void *), void *args_populate) { struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); int error; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); @@ -787,7 +787,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, int error; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); @@ -819,7 +819,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; int err; @@ -864,7 +864,7 @@ rpc_unlink(struct dentry *dentry) int error = 0; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); mutex_unlock(&dir->i_mutex); @@ -1375,7 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) struct dentry *clnt_dir = pipe_dentry->d_parent; struct dentry *gssd_dir = 
clnt_dir->d_parent; - __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry); + __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); dput(pipe_dentry); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b91fd9c597b4..337ca851a350 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) if (!task->tk_timeout) return; - dprintk("RPC: %5u setting alarm for %lu ms\n", - task->tk_pid, task->tk_timeout * 1000 / HZ); + dprintk("RPC: %5u setting alarm for %u ms\n", + task->tk_pid, jiffies_to_msecs(task->tk_timeout)); task->u.tk_wait.expires = jiffies + task->tk_timeout; if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 78974e4d9ad2..5a16d8d8c831 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1290,7 +1290,6 @@ err_bad: svc_putnl(resv, ntohl(rpc_stat)); goto sendit; } -EXPORT_SYMBOL_GPL(svc_process); /* * Process the RPC request. 
@@ -1338,6 +1337,7 @@ out_drop: svc_drop(rqstp); return 0; } +EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* @@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; + struct rpc_task *task; + int proc_error; + int error; + + dprintk("svc: %s(%p)\n", __func__, req); /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xprt = serv->sv_bc_xprt; @@ -1372,21 +1377,36 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, /* * Skip the next two words because they've already been - * processed in the trasport + * processed in the transport */ svc_getu32(argv); /* XID */ svc_getnl(argv); /* CALLDIR */ - /* Returns 1 for send, 0 for drop */ - if (svc_process_common(rqstp, argv, resv)) { - memcpy(&req->rq_snd_buf, &rqstp->rq_res, - sizeof(req->rq_snd_buf)); - return bc_send(req); - } else { - /* drop request */ + /* Parse and execute the bc call */ + proc_error = svc_process_common(rqstp, argv, resv); + + atomic_inc(&req->rq_xprt->bc_free_slots); + if (!proc_error) { + /* Processing error: drop the request */ xprt_free_bc_request(req); return 0; } + + /* Finally, send the reply synchronously */ + memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); + task = rpc_run_bc_task(req); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; + } + + WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); + error = task->tk_status; + rpc_put_task(task); + +out: + dprintk("svc: %s(), error=%d\n", __func__, error); + return error; } EXPORT_SYMBOL_GPL(bc_svc_process); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index cc331b6cf573..0c8120229a03 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -257,7 +257,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) svc_set_cmsg_data(rqstp, cmh); - if (sock_sendmsg(sock, &msg, 0) < 0) 
+ if (sock_sendmsg(sock, &msg) < 0) goto out; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9949722d99ce..ab5dd621ae0c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -68,6 +68,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); +static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) } xprt_clear_locked(xprt); out_sleep: + if (req) + __xprt_put_cong(xprt, req); dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; @@ -326,6 +329,15 @@ out_unlock: xprt_clear_locked(xprt); } +static void xprt_task_clear_bytes_sent(struct rpc_task *task) +{ + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } +} + /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -336,11 +348,7 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -358,11 +366,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -607,8 
+611,8 @@ static void xprt_autoclose(struct work_struct *work) struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); - xprt->ops->close(xprt); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + xprt->ops->close(xprt); xprt_release_write(xprt, NULL); } @@ -700,6 +704,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: @@ -965,6 +970,7 @@ void xprt_transmit(struct rpc_task *task) task->tk_status = status; return; } + xprt_inject_disconnect(xprt); dprintk("RPC: %5u xmit complete\n", task->tk_pid); task->tk_flags |= RPC_TASK_SENT; @@ -1283,6 +1289,7 @@ void xprt_release(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(req->rq_buffer); + xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); task->tk_rqstp = NULL; diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index da5136fd5694..48913de240bd 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,8 +1,7 @@ -obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o - -obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o - -svcrdma-y := svc_rdma.o svc_rdma_transport.o \ - svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o +rpcrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o \ + svc_rdma.o svc_rdma_transport.o \ + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ + module.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 000000000000..f1e8dafbd507 --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
+ */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using the + * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is + * finished, the Memory Region is unmapped using the ib_unmap_fmr + * verb (fmr_op_unmap). + */ + +/* Transport recovery + * + * After a transport reconnect, fmr_op_map re-uses the MR already + * allocated for the RPC, but generates a fresh rkey then maps the + * MR again. This process is synchronous. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + /* ro_open method for FMR mode: reads and writes none of ia, ep or cdata and always returns 0. */ +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. 
+ */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); +} + /* ro_init method for FMR mode. Initializes rb_mwlock and the rb_mws and rb_all lists, then allocates (max(RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1) + 2) * rb_max_requests MWs, each with a physaddrs array and an FMR, linking every MW on both lists. Returns 0 on success, -ENOMEM if kzalloc or kmalloc fails, or the PTR_ERR value from ib_alloc_fmr. On failure only the MW under construction is freed here; MWs already linked stay on the lists (presumably released later through fmr_op_destroy -- confirm with the caller). */ +static int +fmr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int i, rc; + + spin_lock_init(&buf->rb_mwlock); + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); + i += 2; /* head + tail */ + i *= buf->rb_max_requests; /* one set for each RPC slot */ + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + rc = -ENOMEM; /* status for the two allocation-failure gotos below; out_fmr_err overwrites it */ + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + goto out; + + r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * + sizeof(u64), GFP_KERNEL); + if (!r->r.fmr.physaddrs) + goto out_free; + + r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr.fmr)) + goto out_fmr_err; + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + /* Error unwind: each label frees only what the failing iteration had allocated so far. */ +out_fmr_err: + rc = PTR_ERR(r->r.fmr.fmr); + dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r->r.fmr.physaddrs); +out_free: + kfree(r); +out: + return rc; +} + /* Unmap one FMR: put it on a single-entry local list and pass that list to ib_unmap_fmr(), returning the ib_unmap_fmr() status. */ +static int +__fmr_unmap(struct rpcrdma_mw *r) +{ + LIST_HEAD(l); + + list_add(&r->r.fmr.fmr->list, &l); + return ib_unmap_fmr(&l); +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. 
+ */ +static int +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + int len, pageoff, i, rc; + struct rpcrdma_mw *mw; + /* Detach any MW already recorded on the first segment; it is re-attached only after a successful map. */ + mw = seg1->rl_mw; + seg1->rl_mw = NULL; + if (!mw) { + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOMEM; + } else { + /* this is a retransmit; generate a fresh rkey */ + rc = __fmr_unmap(mw); + if (rc) + return rc; + } + /* Round the first segment down to its page start; len starts at -pageoff so that the per-segment sums below do not count the added leading offset. */ + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + mw->r.fmr.physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + /* i is now the number of segments that were DMA-mapped and recorded in physaddrs. */ + rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs, + i, seg1->mr_dma); + if (rc) + goto out_maperr; + /* Success: record the MW, rkey, base address, segment count and byte length on the first segment and return the segment count. */ + seg1->rl_mw = mw; + seg1->mr_rkey = mw->r.fmr.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + /* Failure: undo the DMA mappings in reverse order. NOTE(review): neither this path nor the __fmr_unmap failure above calls rpcrdma_put_mw, and rl_mw has already been cleared -- confirm the MW is intentionally left to be reclaimed only through rb_all. */ +out_maperr: + dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + __func__, len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Use the ib_unmap_fmr() verb to prevent further remote + * access via RDMA READ or RDMA WRITE. 
+ */ +static int +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + int rc, nsegs = seg->mr_nsegs; /* saved now because the loop below counts mr_nsegs down */ + + dprintk("RPC: %s: FMR %p\n", __func__, mw); + + seg1->rl_mw = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia->ri_device, seg++); + rc = __fmr_unmap(mw); + if (rc) + goto out_err; + rpcrdma_put_mw(r_xprt, mw); + return nsegs; + /* Both the success and the error path return the saved segment count; rc is only logged. */ +out_err: + /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy + * will attempt to release it when the transport is destroyed. + */ + dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); + return nsegs; +} + /* ro_destroy method for FMR mode: pops every MW off rb_all, frees its physaddrs array, deallocates its FMR (a failure is only logged) and frees the MW itself. */ +static void +fmr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + kfree(r->r.fmr.physaddrs); + + rc = ib_dealloc_fmr(r->r.fmr.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + + kfree(r); + } +} + /* Method table that exposes the FMR functions in this file through struct rpcrdma_memreg_ops. */ +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_unmap = fmr_op_unmap, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init = fmr_op_init, + .ro_destroy = fmr_op_destroy, + .ro_displayname = "fmr", +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 000000000000..04ea914201b2 --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). Also referred to sometimes as FRMR mode. + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. 
This is the fastest and safest + * but most complex memory registration mode. + */ + +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * Work Request (frwr_op_map). When the RDMA operation is finished, this + * Memory Region is invalidated using a LOCAL_INV Work Request + * (frwr_op_unmap). + * + * Typically these Work Requests are not signaled, and neither are RDMA + * SEND Work Requests (with the exception of signaling occasionally to + * prevent provider work queue overflows). This greatly reduces HCA + * interrupt workload. + * + * As an optimization, frwr_op_unmap marks MRs INVALID before the + * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on + * rb_mws immediately so that no work (like managing a linked list + * under a spinlock) is needed in the completion upcall. + * + * But this means that frwr_op_map() can occasionally encounter an MR + * that is INVALID but the LOCAL_INV WR has not completed. Work Queue + * ordering prevents a subsequent FAST_REG WR from executing against + * that MR while it is still being invalidated. + */ + +/* Transport recovery + * + * ->ro_map and the transport connect worker cannot run at the same + * time, but ->ro_unmap can fire while the transport connect worker + * is running. Thus MR recovery is handled in ->ro_map, to guarantee + * that recovered MRs are owned by a sending RPC, and not one where + * ->ro_unmap could fire at the same time transport reconnect is + * being done. + * + * When the underlying transport disconnects, MRs are left in one of + * three states: + * + * INVALID: The MR was not in use before the QP entered ERROR state. + * (Or, the LOCAL_INV WR has not completed or flushed yet). + * + * STALE: The MR was being registered or unregistered when the QP + * entered ERROR state, and the pending WR was flushed. + * + * VALID: The MR was registered before the QP entered ERROR state. 
+ * + * When frwr_op_map encounters STALE and VALID MRs, they are recovered + * with ib_dereg_mr and then are re-initialized. Because MR recovery + * allocates fresh resources, it is deferred to a workqueue, and the + * recovered MRs are placed back on the rb_mws list when recovery is + * complete. frwr_op_map allocates another MR for the current RPC while + * the broken MR is reset. + * + * To ensure that frwr_op_map doesn't encounter an MR that is marked + * INVALID but that is about to be flushed due to a previous transport + * disconnect, the transport connect worker attempts to drain all + * pending send queue WRs before the transport is reconnected. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static struct workqueue_struct *frwr_recovery_wq; + +#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) + /* Create the file-scope recovery workqueue with FRWR_RECOVERY_WQ_FLAGS. Returns 0 on success, or -ENOMEM when alloc_workqueue() returns NULL. */ +int +frwr_alloc_recovery_wq(void) +{ + frwr_recovery_wq = alloc_workqueue("frwr_recovery", + FRWR_RECOVERY_WQ_FLAGS, 0); + return !frwr_recovery_wq ? -ENOMEM : 0; +} + /* Destroy the recovery workqueue if one was created. The file-scope pointer is cleared before destroy_workqueue() is called on the saved copy. */ +void +frwr_destroy_recovery_wq(void) +{ + struct workqueue_struct *wq; + + if (!frwr_recovery_wq) + return; + + wq = frwr_recovery_wq; + frwr_recovery_wq = NULL; + destroy_workqueue(wq); +} + +/* Deferred reset of a single FRMR. Generate a fresh rkey by + * replacing the MR. + * + * There's no recovery if this fails. The FRMR is abandoned, but + * remains in rb_all. It will be cleaned up when the transport is + * destroyed. 
+ */ +static void +__frwr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, + r.frmr.fr_work); + struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + + if (ib_dereg_mr(r->r.frmr.fr_mr)) + goto out_fail; + + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(r->r.frmr.fr_mr)) + goto out_fail; + + dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); + r->r.frmr.fr_state = FRMR_IS_INVALID; + rpcrdma_put_mw(r_xprt, r); + return; + /* NOTE(review): when ib_alloc_fast_reg_mr fails, fr_mr is left holding the error pointer, and __frwr_release later passes fr_mr to ib_dereg_mr for every MW on rb_all -- confirm that path tolerates it. */ +out_fail: + pr_warn("RPC: %s: FRMR %p unrecovered\n", + __func__, r); +} + +/* A broken MR was discovered in a context that can't sleep. + * Defer recovery to the recovery worker. + */ +static void +__frwr_queue_recovery(struct rpcrdma_mw *r) +{ + INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker); + queue_work(frwr_recovery_wq, &r->r.frmr.fr_work); +} + /* Allocate the fast-registration MR and page list for one MW, both sized by depth. Returns 0 on success or the PTR_ERR value of the allocation that failed; if the page list allocation fails, the MR allocated just before is deregistered. */ +static int +__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, + unsigned int depth) +{ + struct rpcrdma_frmr *f = &r->r.frmr; + int rc; + + f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); + if (IS_ERR(f->fr_pgl)) + goto out_list_err; + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", + __func__, rc); + ib_dereg_mr(f->fr_mr); + return rc; +} + /* Release the resources of one MW: deregister its MR (a failure is only logged) and free its fast-registration page list. */ +static void +__frwr_release(struct rpcrdma_mw *r) +{ + int rc; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr status %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +} + /* ro_open method for FRWR mode. Sets ia->ri_max_frmr_depth from the device attributes, multiplies the endpoint max_send_wr by the number of work requests one RPC may need, and, if that exceeds the device max_qp_wr, lowers cdata->max_requests to fit. Returns 0, or -EINVAL when not even one request fits. */ +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr 
*devattr = &ia->ri_devattr; + int depth, delta; + + ia->ri_max_frmr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + devattr->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frmr_depth); + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; /* NOTE(review): if ri_max_frmr_depth were 0, delta would never shrink in the loop below -- confirm a zero depth is ruled out before this point. */ + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + } + /* Scale the send queue by the per-RPC WR count; if the device cannot hold that many, derive the request limit from max_qp_wr instead. */ + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. + */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); +} + +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. 
*/ +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r; + + if (likely(wc->status == IB_WC_SUCCESS)) + return; + + /* WARNING: Only wr_id and status are reliable at this point */ + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; /* wr_id is cast back to the MW pointer; frwr_op_map and frwr_op_unmap store the MW pointer in wr_id when they build their WRs */ + pr_warn("RPC: %s: frmr %p flushed, status %s (%d)\n", + __func__, r, ib_wc_status_msg(wc->status), wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; +} + /* ro_init method for FRWR mode. Initializes rb_mwlock and the rb_mws and rb_all lists, then allocates (max(RPCRDMA_MAX_DATA_SEGS / depth, 1) + 2) * rb_max_requests MWs through __frwr_init, linking each on both lists and setting its send-completion handler and owning transport. Returns 0, -ENOMEM, or the __frwr_init status; MWs linked before a failure stay on the lists. NOTE(review): depth comes from ri_max_frmr_depth and is used as a divisor -- confirm it cannot be 0 here. */ +static int +frwr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + int i; + + spin_lock_init(&buf->rb_mwlock); + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); + i += 2; /* head + tail */ + i *= buf->rb_max_requests; /* one set for each RPC slot */ + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + struct rpcrdma_mw *r; + int rc; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + rc = __frwr_init(r, pd, device, depth); + if (rc) { + kfree(r); + return rc; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + r->mw_sendcompletion = frwr_sendcompletion; + r->r.frmr.fr_xprt = r_xprt; + } + + return 0; +} + +/* Post a FAST_REG Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. 
+ */ +static int +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw; + struct rpcrdma_frmr *frmr; + struct ib_mr *mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + /* An MW still recorded on the first segment, and any MW taken from the pool whose state is not INVALID, is queued for recovery; keep taking MWs until an INVALID one is found or the pool is empty. */ + mw = seg1->rl_mw; + seg1->rl_mw = NULL; + do { + if (mw) + __frwr_queue_recovery(mw); + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOMEM; + } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); + frmr = &mw->r.frmr; + frmr->fr_state = FRMR_IS_VALID; + /* Round the first segment down to its page start; len starts at -pageoff so that the per-segment sums below do not count the added leading offset. */ + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > ia->ri_max_frmr_depth) + nsegs = ia->ri_max_frmr_depth; + /* DMA-map each segment and append one page_list entry per PAGE_SIZE step of its length. */ + for (page_no = i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); + /* Build the FAST_REG work request; wr_id carries the MW pointer so the completion handler can find it. */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.length = len; + fastreg_wr.wr.fast_reg.access_flags = writing ? 
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; /* next: increment the low byte of the rkey and use the updated rkey in the WR */ + mr = frmr->fr_mr; + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) + goto out_senderr; + /* Success: record the MW, rkey, base address, segment count and byte length on the first segment and return the segment count. */ + seg1->rl_mw = mw; + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + /* The post failed: undo the DMA mappings in reverse order and hand the MW to the recovery worker. */ +out_senderr: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + while (i--) + rpcrdma_unmap_one(device, --seg); + __frwr_queue_recovery(mw); + return rc; +} + +/* Post a LOCAL_INV Work Request to prevent further remote access + * via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc, nsegs = seg->mr_nsegs; /* saved now because the unmap loop below counts mr_nsegs down */ + + dprintk("RPC: %s: FRMR %p\n", __func__, mw); + /* The MW is marked INVALID before the LOCAL_INV WR is posted. */ + seg1->rl_mw = NULL; + mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia->ri_device, seg++); + read_lock(&ia->ri_qplock); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + + rpcrdma_put_mw(r_xprt, mw); + return nsegs; + /* The post failed: the MW goes to the recovery worker instead of back to the pool; the saved segment count is still returned. */ +out_err: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + __frwr_queue_recovery(mw); + return nsegs; +} + /* ro_destroy method for FRWR mode: flush the recovery workqueue, then pop every MW off rb_all, release its MR and page list, and free it. */ +static void +frwr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + + /* Ensure stale MWs for "buf" are no longer in flight */ + flush_workqueue(frwr_recovery_wq); + + while 
(!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + __frwr_release(r); + kfree(r); + } +} + /* Method table that exposes the FRWR functions in this file through struct rpcrdma_memreg_ops. */ +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_unmap = frwr_op_unmap, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init = frwr_op_init, + .ro_destroy = frwr_op_destroy, + .ro_displayname = "frwr", +}; diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c new file mode 100644 index 000000000000..560712bd9fa2 --- /dev/null +++ b/net/sunrpc/xprtrdma/module.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + */ + +/* rpcrdma.ko module initialization + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); +MODULE_DESCRIPTION("RPC/RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("svcrdma"); +MODULE_ALIAS("xprtrdma"); + /* Module exit: runs the xprt_rdma cleanup first, then the svc_rdma cleanup (the reverse of the init order below). */ +static void __exit rpc_rdma_cleanup(void) +{ + xprt_rdma_cleanup(); + svc_rdma_cleanup(); +} + /* Module init: runs svc_rdma_init() and then xprt_rdma_init(). If the second call fails, the first is undone with svc_rdma_cleanup(). Returns 0 or the status of the call that failed. */ +static int __init rpc_rdma_init(void) +{ + int rc; + + rc = svc_rdma_init(); + if (rc) + goto out; + + rc = xprt_rdma_init(); + if (rc) + svc_rdma_cleanup(); + +out: + return rc; +} + +module_init(rpc_rdma_init); +module_exit(rpc_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c new file mode 100644 index 000000000000..41985d07fdb7 --- /dev/null +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* No-op chunk preparation. All client memory is pre-registered. + * Sometimes referred to as ALLPHYSICAL mode. 
+ * + * Physical registration is simple because all client memory is + * pre-registered and never deregistered. This mode is good for + * adapter bring up, but is considered not safe: the server is + * trusted not to abuse its access to client memory not involved + * in RDMA I/O. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + /* ro_open method for PHYSICAL mode: reads and writes none of ia, ep or cdata and always returns 0. */ +static int +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* PHYSICAL memory registration conveys one page per chunk segment. + */ +static size_t +physical_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt)); +} + /* ro_init method for PHYSICAL mode: allocates nothing and always returns 0. */ +static int +physical_op_init(struct rpcrdma_xprt *r_xprt) +{ + return 0; +} + +/* The client's physical memory is already exposed for + * remote access via RDMA READ or RDMA WRITE. + */ +static int +physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + /* nsegs is not consulted: exactly one segment is DMA-mapped per call, tagged with the rkey of ia->ri_bind_mem, and 1 is returned. */ + rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + return 1; +} + +/* Unmap a memory region, but leave it registered. 
+ */ +static int +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + /* Only the DMA mapping of this one segment is undone; the return value is always 1. */ + rpcrdma_unmap_one(ia->ri_device, seg); + return 1; +} + /* ro_destroy method for PHYSICAL mode: intentionally empty, buf is not touched. */ +static void +physical_op_destroy(struct rpcrdma_buffer *buf) +{ +} + /* Method table that exposes the PHYSICAL functions in this file through struct rpcrdma_memreg_ops. */ +const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_map = physical_op_map, + .ro_unmap = physical_op_unmap, + .ro_open = physical_op_open, + .ro_maxpages = physical_op_maxpages, + .ro_init = physical_op_init, + .ro_destroy = physical_op_destroy, + .ro_displayname = "physical", +}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 91ffde82fa0c..84ea37daef36 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,6 +53,14 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_write_array *warray = NULL; struct rpcrdma_write_chunk *cur_wchunk = NULL; __be32 *iptr = headerp->rm_body.rm_chunks; + int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); if (type == rpcrdma_readch || type == rpcrdma_areadch) { /* a read chunk - server will RDMA Read our memory */ @@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (nsegs < 0) return nsegs; + map = r_xprt->rx_ia.ri_ops->ro_map; do { - n = rpcrdma_register_external(seg, nsegs, - cur_wchunk != NULL, r_xprt); + n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); if (n <= 0) goto out; if (cur_rchunk) { /* read */ @@ -275,37 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy != 
RPCRDMA_FRMR) { - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); - } + for (pos = 0; nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); return n; } /* - * Marshal chunks. This routine returns the header length - * consumed by marshaling. - * - * Returns positive RPC/RDMA header size, or negative errno. - */ - -ssize_t -rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); - - if (req->rl_rtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, req->rl_rtype); - else if (req->rl_wtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, req->rl_wtype); - return result; -} - -/* * Copy write data inline. * This function is used for "small" requests. Data which is passed * to RPC via iovecs (or page list) is copied directly into the @@ -397,6 +382,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -433,13 +419,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - req->rl_wtype = rpcrdma_noch; + wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - req->rl_wtype = rpcrdma_writech; + wtype = rpcrdma_writech; else - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; /* * Chunks needed for arguments? 
@@ -456,16 +442,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - req->rl_rtype = rpcrdma_noch; + rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - req->rl_rtype = rpcrdma_areadch; + rtype = rpcrdma_areadch; else - req->rl_rtype = rpcrdma_readch; + rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) - req->rl_wtype = rpcrdma_noch; - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -479,7 +465,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (req->rl_rtype == rpcrdma_noch) { + if (rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -494,7 +480,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (req->rl_wtype != rpcrdma_noch) { + if (wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -515,18 +501,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. 
*/ - if (req->rl_wtype == rpcrdma_noch) - req->rl_wtype = rpcrdma_replych; + if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; } } - hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, wtype); + } if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* @@ -735,8 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpcrdma_msg *headerp; struct rpcrdma_req *req; struct rpc_rqst *rqst; - struct rpc_xprt *xprt = rep->rr_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; __be32 *iptr; int rdmalen, status; unsigned long cwnd; @@ -773,7 +767,6 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_len); repost: r_xprt->rx_stats.bad_reply_count++; - rep->rr_func = rpcrdma_reply_handler; if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) rpcrdma_recv_buffer_put(rep); diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index c1b6270262c2..2cd252f023a5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -38,8 +38,7 @@ * * Author: Tom Tucker <tom@opengridcomputing.com> */ -#include <linux/module.h> -#include <linux/init.h> + #include <linux/slab.h> #include <linux/fs.h> #include <linux/sysctl.h> @@ -295,8 +294,3 @@ int svc_rdma_init(void) destroy_workqueue(svc_rdma_wq); return -ENOMEM; } -MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); -MODULE_DESCRIPTION("SVC RDMA Transport"); -MODULE_LICENSE("Dual 
BSD/GPL"); -module_init(svc_rdma_init); -module_exit(svc_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index b681855cf970..e2fca7617242 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -50,12 +50,12 @@ /* * Decodes a read chunk list. The expected format is as follows: * descrim : xdr_one - * position : u32 offset into XDR stream - * handle : u32 RKEY + * position : __be32 offset into XDR stream + * handle : __be32 RKEY * . . . * end-of-list: xdr_zero */ -static u32 *decode_read_list(u32 *va, u32 *vaend) +static __be32 *decode_read_list(__be32 *va, __be32 *vaend) { struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; @@ -67,20 +67,20 @@ static u32 *decode_read_list(u32 *va, u32 *vaend) } ch++; } - return (u32 *)&ch->rc_position; + return &ch->rc_position; } /* * Decodes a write chunk list. The expected format is as follows: * descrim : xdr_one * nchunks : <count> - * handle : u32 RKEY ---+ - * length : u32 <len of segment> | + * handle : __be32 RKEY ---+ + * length : __be32 <len of segment> | * offset : remove va + <count> * . . . 
| * ---+ */ -static u32 *decode_write_list(u32 *va, u32 *vaend) +static __be32 *decode_write_list(__be32 *va, __be32 *vaend) { unsigned long start, end; int nchunks; @@ -90,14 +90,14 @@ static u32 *decode_write_list(u32 *va, u32 *vaend) /* Check for not write-array */ if (ary->wc_discrim == xdr_zero) - return (u32 *)&ary->wc_nchunks; + return &ary->wc_nchunks; if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > (unsigned long)vaend) { dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); return NULL; } - nchunks = ntohl(ary->wc_nchunks); + nchunks = be32_to_cpu(ary->wc_nchunks); start = (unsigned long)&ary->wc_array[0]; end = (unsigned long)vaend; @@ -112,10 +112,10 @@ static u32 *decode_write_list(u32 *va, u32 *vaend) * rs_length is the 2nd 4B field in wc_target and taking its * address skips the list terminator */ - return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length; + return &ary->wc_array[nchunks].wc_target.rs_length; } -static u32 *decode_reply_array(u32 *va, u32 *vaend) +static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) { unsigned long start, end; int nchunks; @@ -124,14 +124,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend) /* Check for no reply-array */ if (ary->wc_discrim == xdr_zero) - return (u32 *)&ary->wc_nchunks; + return &ary->wc_nchunks; if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > (unsigned long)vaend) { dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); return NULL; } - nchunks = ntohl(ary->wc_nchunks); + nchunks = be32_to_cpu(ary->wc_nchunks); start = (unsigned long)&ary->wc_array[0]; end = (unsigned long)vaend; @@ -142,15 +142,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend) ary, nchunks, vaend); return NULL; } - return (u32 *)&ary->wc_array[nchunks]; + return (__be32 *)&ary->wc_array[nchunks]; } int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, struct svc_rqst *rqstp) { struct rpcrdma_msg *rmsgp = NULL; - u32 *va; - u32 *vaend; + __be32 *va, *vaend; u32 hdr_len; rmsgp = 
(struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; @@ -162,22 +161,17 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, return -EINVAL; } - /* Decode the header */ - rmsgp->rm_xid = ntohl(rmsgp->rm_xid); - rmsgp->rm_vers = ntohl(rmsgp->rm_vers); - rmsgp->rm_credit = ntohl(rmsgp->rm_credit); - rmsgp->rm_type = ntohl(rmsgp->rm_type); - - if (rmsgp->rm_vers != RPCRDMA_VERSION) + if (rmsgp->rm_vers != rpcrdma_version) return -ENOSYS; /* Pull in the extra for the padded case and bump our pointer */ - if (rmsgp->rm_type == RDMA_MSGP) { + if (rmsgp->rm_type == rdma_msgp) { int hdrlen; + rmsgp->rm_body.rm_padded.rm_align = - ntohl(rmsgp->rm_body.rm_padded.rm_align); + be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); rmsgp->rm_body.rm_padded.rm_thresh = - ntohl(rmsgp->rm_body.rm_padded.rm_thresh); + be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; rqstp->rq_arg.head[0].iov_base = va; @@ -192,7 +186,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, * chunk list and a reply chunk list. 
*/ va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); va = decode_read_list(va, vaend); if (!va) return -EINVAL; @@ -211,76 +205,20 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, return hdr_len; } -int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) -{ - struct rpcrdma_msg *rmsgp = NULL; - struct rpcrdma_read_chunk *ch; - struct rpcrdma_write_array *ary; - u32 *va; - u32 hdrlen; - - dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", - rqstp); - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - - /* Pull in the extra for the padded case and bump our pointer */ - if (rmsgp->rm_type == RDMA_MSGP) { - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rqstp->rq_arg.head[0].iov_base = va; - hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); - rqstp->rq_arg.head[0].iov_len -= hdrlen; - return hdrlen; - } - - /* - * Skip all chunks to find RPC msg. 
These were previously processed - */ - va = &rmsgp->rm_body.rm_chunks[0]; - - /* Skip read-list */ - for (ch = (struct rpcrdma_read_chunk *)va; - ch->rc_discrim != xdr_zero; ch++); - va = (u32 *)&ch->rc_position; - - /* Skip write-list */ - ary = (struct rpcrdma_write_array *)va; - if (ary->wc_discrim == xdr_zero) - va = (u32 *)&ary->wc_nchunks; - else - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; - - /* Skip reply-array */ - ary = (struct rpcrdma_write_array *)va; - if (ary->wc_discrim == xdr_zero) - va = (u32 *)&ary->wc_nchunks; - else - va = (u32 *)&ary->wc_array[ary->wc_nchunks]; - - rqstp->rq_arg.head[0].iov_base = va; - hdrlen = (unsigned long)va - (unsigned long)rmsgp; - rqstp->rq_arg.head[0].iov_len -= hdrlen; - - return hdrlen; -} - int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err, u32 *va) + enum rpcrdma_errcode err, __be32 *va) { - u32 *startp = va; + __be32 *startp = va; - *va++ = htonl(rmsgp->rm_xid); - *va++ = htonl(rmsgp->rm_vers); - *va++ = htonl(xprt->sc_max_requests); - *va++ = htonl(RDMA_ERROR); - *va++ = htonl(err); + *va++ = rmsgp->rm_xid; + *va++ = rmsgp->rm_vers; + *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = rdma_error; + *va++ = cpu_to_be32(err); if (err == ERR_VERS) { - *va++ = htonl(RPCRDMA_VERSION); - *va++ = htonl(RPCRDMA_VERSION); + *va++ = rpcrdma_version; + *va++ = rpcrdma_version; } return (int)((unsigned long)va - (unsigned long)startp); @@ -297,7 +235,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) &rmsgp->rm_body.rm_chunks[1]; if (wr_ary->wc_discrim) wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. + &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. 
wc_target.rs_length; else wr_ary = (struct rpcrdma_write_array *) @@ -306,7 +244,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) /* skip reply array */ if (wr_ary->wc_discrim) wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; + &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; else wr_ary = (struct rpcrdma_write_array *) &wr_ary->wc_nchunks; @@ -325,7 +263,7 @@ void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) ary = (struct rpcrdma_write_array *) &rmsgp->rm_body.rm_chunks[1]; ary->wc_discrim = xdr_one; - ary->wc_nchunks = htonl(chunks); + ary->wc_nchunks = cpu_to_be32(chunks); /* write-list terminator */ ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; @@ -338,7 +276,7 @@ void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, int chunks) { ary->wc_discrim = xdr_one; - ary->wc_nchunks = htonl(chunks); + ary->wc_nchunks = cpu_to_be32(chunks); } void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, @@ -350,7 +288,7 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; seg->rs_handle = rs_handle; seg->rs_offset = rs_offset; - seg->rs_length = htonl(write_len); + seg->rs_length = cpu_to_be32(write_len); } void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, @@ -358,10 +296,10 @@ void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rdma_resp, enum rpcrdma_proc rdma_type) { - rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); - rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); - rdma_resp->rm_credit = htonl(xprt->sc_max_requests); - rdma_resp->rm_type = htonl(rdma_type); + rdma_resp->rm_xid = rdma_argp->rm_xid; + rdma_resp->rm_vers = rdma_argp->rm_vers; + rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); + rdma_resp->rm_type = cpu_to_be32(rdma_type); /* Encode <nul> chunks lists */ rdma_resp->rm_body.rm_chunks[0] = xdr_zero; 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f9f13a32ddb8..2e1348bde325 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -85,7 +85,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG) + if (rmsgp->rm_type == rdma_nomsg) rqstp->rq_arg.pages = &rqstp->rq_pages[0]; else rqstp->rq_arg.pages = &rqstp->rq_pages[1]; @@ -117,8 +117,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { - if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == - RDMA_TRANSPORT_IWARP) + if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device, + xprt->sc_cm_id->port_num)) return 1; else return min_t(int, sge_count, xprt->sc_max_sge); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 7de33d1af9b6..d25cd430f9ff 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -240,6 +240,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, u32 xdr_off; int chunk_off; int chunk_no; + int nchunks; struct rpcrdma_write_array *arg_ary; struct rpcrdma_write_array *res_ary; int ret; @@ -251,14 +252,15 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, &rdma_resp->rm_body.rm_chunks[1]; /* Write chunks start at the pagelist */ + nchunks = be32_to_cpu(arg_ary->wc_nchunks); for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; - xfer_len && chunk_no < arg_ary->wc_nchunks; + xfer_len && chunk_no < nchunks; chunk_no++) { struct rpcrdma_segment *arg_ch; u64 rs_offset; arg_ch = &arg_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, ntohl(arg_ch->rs_length)); + write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); /* 
Prepare the response chunk given the length actually * written */ @@ -270,7 +272,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, chunk_off = 0; while (write_len) { ret = send_write(xprt, rqstp, - ntohl(arg_ch->rs_handle), + be32_to_cpu(arg_ch->rs_handle), rs_offset + chunk_off, xdr_off, write_len, @@ -318,13 +320,13 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, &rdma_resp->rm_body.rm_chunks[2]; /* xdr offset starts at RPC message */ - nchunks = ntohl(arg_ary->wc_nchunks); + nchunks = be32_to_cpu(arg_ary->wc_nchunks); for (xdr_off = 0, chunk_no = 0; xfer_len && chunk_no < nchunks; chunk_no++) { u64 rs_offset; ch = &arg_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, htonl(ch->rs_length)); + write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); /* Prepare the reply chunk given the length actually * written */ @@ -335,7 +337,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, chunk_off = 0; while (write_len) { ret = send_write(xprt, rqstp, - ntohl(ch->rs_handle), + be32_to_cpu(ch->rs_handle), rs_offset + chunk_off, xdr_off, write_len, @@ -515,7 +517,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = svc_rdma_get_page(); + res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f609c1c2d38d..6b36279e4288 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -91,7 +91,7 @@ struct svc_xprt_class svc_rdma_class = { .xcl_name = "rdma", .xcl_owner = THIS_MODULE, .xcl_ops = &svc_rdma_ops, - .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, + .xcl_max_payload = RPCRDMA_MAXPAYLOAD, .xcl_ident = XPRT_TRANSPORT_RDMA, }; @@ -99,12 +99,8 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { 
struct svc_rdma_op_ctxt *ctxt; - while (1) { - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL); - if (ctxt) - break; - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - } + ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, + GFP_KERNEL | __GFP_NOFAIL); ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; @@ -156,12 +152,8 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) struct svc_rdma_req_map *svc_rdma_get_req_map(void) { struct svc_rdma_req_map *map; - while (1) { - map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL); - if (map) - break; - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - } + map = kmem_cache_alloc(svc_rdma_map_cachep, + GFP_KERNEL | __GFP_NOFAIL); map->count = 0; return map; } @@ -175,8 +167,8 @@ void svc_rdma_put_req_map(struct svc_rdma_req_map *map) static void cq_event_handler(struct ib_event *event, void *context) { struct svc_xprt *xprt = context; - dprintk("svcrdma: received CQ event id=%d, context=%p\n", - event->event, context); + dprintk("svcrdma: received CQ event %s (%d), context=%p\n", + ib_event_msg(event->event), event->event, context); set_bit(XPT_CLOSE, &xprt->xpt_flags); } @@ -191,8 +183,9 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_COMM_EST: case IB_EVENT_SQ_DRAINED: case IB_EVENT_QP_LAST_WQE_REACHED: - dprintk("svcrdma: QP event %d received for QP=%p\n", - event->event, event->element.qp); + dprintk("svcrdma: QP event %s (%d) received for QP=%p\n", + ib_event_msg(event->event), event->event, + event->element.qp); break; /* These are considered fatal events */ case IB_EVENT_PATH_MIG_ERR: @@ -201,9 +194,10 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_QP_ACCESS_ERR: case IB_EVENT_DEVICE_FATAL: default: - dprintk("svcrdma: QP ERROR event %d received for QP=%p, " + dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, " "closing transport\n", - event->event, event->element.qp); 
+ ib_event_msg(event->event), event->event, + event->element.qp); set_bit(XPT_CLOSE, &xprt->xpt_flags); break; } @@ -402,7 +396,8 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) for (i = 0; i < ret; i++) { wc = &wc_a[i]; if (wc->status != IB_WC_SUCCESS) { - dprintk("svcrdma: sq wc err status %d\n", + dprintk("svcrdma: sq wc err status %s (%d)\n", + ib_wc_status_msg(wc->status), wc->status); /* Close the transport */ @@ -490,18 +485,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -struct page *svc_rdma_get_page(void) -{ - struct page *page; - - while ((page = alloc_page(GFP_KERNEL)) == NULL) { - /* If we can't get memory, wait a bit and try again */ - printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n"); - schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); - } - return page; -} - int svc_rdma_post_recv(struct svcxprt_rdma *xprt) { struct ib_recv_wr recv_wr, *bad_recv_wr; @@ -520,7 +503,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = svc_rdma_get_page(); + page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -616,7 +599,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " - "event=%d\n", cma_id, cma_id->context, event->event); + "event = %s (%d)\n", cma_id, cma_id->context, + rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, event->param.conn.initiator_depth); break; @@ -636,7 +620,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, default: dprintk("svcrdma: Unexpected event on listening endpoint %p, " - "event=%d\n", cma_id, event->event); + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); break; } @@ -669,7 +654,8 @@ static int 
rdma_cma_handler(struct rdma_cm_id *cma_id, break; case RDMA_CM_EVENT_DEVICE_REMOVAL: dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " - "event=%d\n", cma_id, xprt, event->event); + "event = %s (%d)\n", cma_id, xprt, + rdma_event_msg(event->event), event->event); if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); @@ -677,7 +663,8 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, break; default: dprintk("svcrdma: Unexpected event on DTO endpoint %p, " - "event=%d\n", cma_id, event->event); + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); break; } return 0; @@ -848,10 +835,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; + struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; int uninitialized_var(dma_mr_acc); - int need_dma_mr; + int need_dma_mr = 0; int ret; int i; @@ -900,22 +888,22 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } + cq_attr.cqe = newxprt->sc_sq_depth; newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, sq_comp_handler, cq_event_handler, newxprt, - newxprt->sc_sq_depth, - 0); + &cq_attr); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } + cq_attr.cqe = newxprt->sc_max_requests; newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, rq_comp_handler, cq_event_handler, newxprt, - newxprt->sc_max_requests, - 0); + &cq_attr); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -985,35 +973,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - switch 
(rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { - case RDMA_TRANSPORT_IWARP: - newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { - need_dma_mr = 1; - dma_mr_acc = - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - } else - need_dma_mr = 0; - break; - case RDMA_TRANSPORT_IB: - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - } else if (!(devattr.device_cap_flags & - IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - } else - need_dma_mr = 0; - break; - default: + if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num)) goto errout; + + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || + !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num) && + !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) + dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; } + if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, + newxprt->sc_cm_id->port_num)) + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ if (need_dma_mr) { /* Register all of physical memory */ @@ -1319,11 +1298,11 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct ib_send_wr err_wr; struct page *p; struct svc_rdma_op_ctxt *ctxt; - u32 *va; + __be32 *va; int length; int ret; - p = svc_rdma_get_page(); + p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); va = page_address(p); /* XDR encode error */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c 
index 2e192baa59f3..680f888a9ddd 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -48,7 +48,6 @@ */ #include <linux/module.h> -#include <linux/init.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/sunrpc/addr.h> @@ -59,11 +58,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -MODULE_LICENSE("Dual BSD/GPL"); - -MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS"); -MODULE_AUTHOR("Network Appliance, Inc."); - /* * tunables */ @@ -157,12 +151,47 @@ static struct ctl_table sunrpc_table[] = { static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +static void xprt_rdma_format_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = (struct sockaddr *) &rpcx_to_rdmad(xprt).addr; - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - char buf[64]; + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); @@ -170,16 +199,10 @@ 
xprt_rdma_format_addresses(struct rpc_xprt *xprt) snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); - xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; - - snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); - xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); - snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - /* netid */ - xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } static void @@ -217,6 +240,16 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt_clear_connecting(xprt); } +static void +xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, + rx_xprt); + + pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt); + rdma_disconnect(r_xprt->rx_ia.ri_id); +} + /* * xprt_rdma_destroy * @@ -377,7 +410,10 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_connect_worker); xprt_rdma_format_addresses(xprt); - xprt->max_payload = rpcrdma_max_payload(new_xprt); + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", __func__, xprt->max_payload); @@ -552,8 +588,8 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt); + i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[i]); } rpcrdma_buffer_put(req); @@ -579,22 +615,13 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; - if (req->rl_niovs == 0) - rc = rpcrdma_marshal_req(rqst); - else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL) - rc = 
rpcrdma_marshal_chunks(rqst, 0); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; if (req->rl_reply == NULL) /* e.g. reconnection */ rpcrdma_recv_buffer_get(req); - if (req->rl_reply) { - req->rl_reply->rr_func = rpcrdma_reply_handler; - /* this need only be done once, but... */ - req->rl_reply->rr_xprt = xprt; - } - /* Must suppress retransmit to maintain credits */ if (req->rl_connect_cookie == xprt->connect_cookie) goto drop_connection; @@ -653,6 +680,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count); } +static int +xprt_rdma_enable_swap(struct rpc_xprt *xprt) +{ + return -EINVAL; +} + +static void +xprt_rdma_disable_swap(struct rpc_xprt *xprt) +{ +} + /* * Plumbing for rpc transport switch and kernel module */ @@ -671,7 +709,10 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, - .print_stats = xprt_rdma_print_stats + .print_stats = xprt_rdma_print_stats, + .enable_swap = xprt_rdma_enable_swap, + .disable_swap = xprt_rdma_disable_swap, + .inject_disconnect = xprt_rdma_inject_disconnect }; static struct xprt_class xprt_rdma = { @@ -682,7 +723,7 @@ static struct xprt_class xprt_rdma = { .setup = xprt_setup_rdma, }; -static void __exit xprt_rdma_cleanup(void) +void xprt_rdma_cleanup(void) { int rc; @@ -697,17 +738,24 @@ static void __exit xprt_rdma_cleanup(void) if (rc) dprintk("RPC: %s: xprt_unregister returned %i\n", __func__, rc); + + frwr_destroy_recovery_wq(); } -static int __init xprt_rdma_init(void) +int xprt_rdma_init(void) { int rc; - rc = xprt_register_transport(&xprt_rdma); - + rc = frwr_alloc_recovery_wq(); if (rc) return rc; + rc = xprt_register_transport(&xprt_rdma); + if (rc) { + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); @@ -724,6 +772,3 @@ static int __init xprt_rdma_init(void) 
#endif return 0; } - -module_init(xprt_rdma_init); -module_exit(xprt_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 124676c13780..891c4ede2c20 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/prefetch.h> +#include <linux/sunrpc/addr.h> #include <asm/bitops.h> #include "xprt_rdma.h" @@ -62,9 +63,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); -static void rpcrdma_reset_fmrs(struct rpcrdma_ia *); - /* * internal functions */ @@ -82,7 +80,6 @@ static void rpcrdma_run_tasklet(unsigned long data) { struct rpcrdma_rep *rep; - void (*func)(struct rpcrdma_rep *); unsigned long flags; data = data; @@ -91,14 +88,9 @@ rpcrdma_run_tasklet(unsigned long data) rep = list_entry(rpcrdma_tasklets_g.next, struct rpcrdma_rep, rr_list); list_del(&rep->rr_list); - func = rep->rr_func; - rep->rr_func = NULL; spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - if (func) - func(rep); - else - rpcrdma_recv_buffer_put(rep); + rpcrdma_reply_handler(rep); spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); } @@ -107,32 +99,6 @@ rpcrdma_run_tasklet(unsigned long data) static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); -static const char * const async_event[] = { - "CQ error", - "QP fatal error", - "QP request error", - "QP access error", - "communication established", - "send queue drained", - "path migration successful", - "path mig error", - "device fatal error", - "port active", - "port error", - "LID change", - "P_key change", - "SM change", - "SRQ error", - "SRQ limit reached", - "last WQE reached", - "client reregister", - "GID change", -}; - -#define ASYNC_MSG(status) \ - ((status) < ARRAY_SIZE(async_event) ? 
\ - async_event[(status)] : "unknown async error") - static void rpcrdma_schedule_tasklet(struct list_head *sched_list) { @@ -150,7 +116,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) struct rpcrdma_ep *ep = context; pr_err("RPC: %s: %s on device %s ep %p\n", - __func__, ASYNC_MSG(event->event), + __func__, ib_event_msg(event->event), event->device->name, context); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; @@ -165,7 +131,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) struct rpcrdma_ep *ep = context; pr_err("RPC: %s: %s on device %s ep %p\n", - __func__, ASYNC_MSG(event->event), + __func__, ib_event_msg(event->event), event->device->name, context); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; @@ -174,53 +140,20 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) } } -static const char * const wc_status[] = { - "success", - "local length error", - "local QP operation error", - "local EE context operation error", - "local protection error", - "WR flushed", - "memory management operation error", - "bad response error", - "local access error", - "remote invalid request error", - "remote access error", - "remote operation error", - "transport retry counter exceeded", - "RNR retrycounter exceeded", - "local RDD violation error", - "remove invalid RD request", - "operation aborted", - "invalid EE context number", - "invalid EE context state", - "fatal error", - "response timeout error", - "general error", -}; - -#define COMPLETION_MSG(status) \ - ((status) < ARRAY_SIZE(wc_status) ? 
\ - wc_status[(status)] : "unexpected completion error") - static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == 0ULL) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", - __func__, COMPLETION_MSG(wc->status)); + __func__, ib_wc_status_msg(wc->status)); } else { struct rpcrdma_mw *r; r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->r.frmr.fr_state = FRMR_IS_STALE; - pr_err("RPC: %s: frmr %p (stale): %s\n", - __func__, r, COMPLETION_MSG(wc->status)); + r->mw_sendcompletion(wc); } } @@ -297,7 +230,7 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) __func__, rep, wc->byte_len); rep->rr_len = wc->byte_len; - ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, + ib_dma_sync_single_for_cpu(rep->rr_device, rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); prefetch(rdmab_to_msg(rep->rr_rdmabuf)); @@ -308,7 +241,7 @@ out_schedule: out_fail: if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: rep %p: %s\n", - __func__, rep, COMPLETION_MSG(wc->status)); + __func__, rep, ib_wc_status_msg(wc->status)); rep->rr_len = ~0U; goto out_schedule; } @@ -392,31 +325,6 @@ rpcrdma_flush_cqs(struct rpcrdma_ep *ep) rpcrdma_sendcq_process_wc(&wc); } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -static const char * const conn[] = { - "address resolved", - "address error", - "route resolved", - "route error", - "connect request", - "connect response", - "connect error", - "unreachable", - "rejected", - "established", - "disconnected", - "device removal", - "multicast join", - "multicast error", - "address change", - "timewait exit", -}; - -#define CONNECTION_MSG(status) \ - ((status) < ARRAY_SIZE(conn) ? 
\ - conn[(status)] : "unrecognized connection error") -#endif - static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -424,7 +332,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif struct ib_qp_attr *attr = &ia->ri_qp_attr; struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; @@ -480,10 +388,9 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", - __func__, &addr->sin_addr.s_addr, - ntohs(addr->sin_port), ep, - CONNECTION_MSG(event->event)); + dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), ep, + rdma_event_msg(event->event)); break; } @@ -491,19 +398,16 @@ connected: if (connstate == 1) { int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - printk(KERN_INFO "rpcrdma: connection to %pI4:%u " - "on %s, memreg %d slots %d ird %d%s\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - ia->ri_id->device->name, - ia->ri_memreg_strategy, + + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", + sap, rpc_get_port(sap), + ia->ri_device->name, + ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? 
" (low!)" : ""); } else if (connstate < 0) { - printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - connstate); + pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", + sap, rpc_get_port(sap), connstate); } #endif @@ -598,8 +502,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = PTR_ERR(ia->ri_id); goto out1; } + ia->ri_device = ia->ri_id->device; - ia->ri_pd = ib_alloc_pd(ia->ri_id->device); + ia->ri_pd = ib_alloc_pd(ia->ri_device); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); dprintk("RPC: %s: ib_alloc_pd() failed %i\n", @@ -607,7 +512,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_id->device, devattr); + rc = ib_query_device(ia->ri_device, devattr); if (rc) { dprintk("RPC: %s: ib_query_device failed %d\n", __func__, rc); @@ -616,26 +521,22 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { ia->ri_have_dma_lkey = 1; - ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; } if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr->device_cap_flags & + if (((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; - } else { - /* Mind the ia limit on FRMR page list depth */ - ia->ri_max_frmr_depth = min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { - if (!ia->ri_id->device->alloc_fmr) { + if (!ia->ri_device->alloc_fmr) { dprintk("RPC: 
%s: MTHCAFMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_ALLPHYSICAL; @@ -652,13 +553,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) */ switch (memreg) { case RPCRDMA_FRMR: + ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: + ia->ri_ops = &rpcrdma_physical_memreg_ops; mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; case RPCRDMA_MTHCAFMR: + ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; @@ -678,11 +582,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = -ENOMEM; goto out3; } - dprintk("RPC: %s: memory registration strategy is %d\n", - __func__, memreg); - - /* Else will do memory reg/dereg for each chunk */ - ia->ri_memreg_strategy = memreg; + dprintk("RPC: %s: memory registration strategy is '%s'\n", + __func__, ia->ri_ops->ro_displayname); rwlock_init(&ia->ri_qplock); return 0; @@ -713,17 +614,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) dprintk("RPC: %s: ib_dereg_mr returned %i\n", __func__, rc); } + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); rdma_destroy_id(ia->ri_id); ia->ri_id = NULL; } - if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { - rc = ib_dealloc_pd(ia->ri_pd); - dprintk("RPC: %s: ib_dealloc_pd returned %i\n", - __func__, rc); - } + + /* If the pd is still busy, xprtrdma missed freeing a resource */ + if (ia->ri_pd && !IS_ERR(ia->ri_pd)) + WARN_ON(ib_dealloc_pd(ia->ri_pd)); } /* @@ -735,6 +636,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, { struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; + struct ib_cq_init_attr cq_attr = {}; int rc, err; /* check provider's send/recv wr limits */ @@ -743,49 +645,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.event_handler = 
rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; - /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: { - int depth = 7; - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail - * 7. The RDMA_SEND WR - */ - - /* Calculate N if the device max FRMR depth is smaller than - * RPCRDMA_MAX_DATA_SEGS. - */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - int delta = RPCRDMA_MAX_DATA_SEGS - - ia->ri_max_frmr_depth; - - do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; - } while (delta > 0); - - } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; - if (!cdata->max_requests) - return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; - } - break; - } - default: - break; - } + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 
4 : 2); ep->rep_attr.cap.max_recv_sge = 1; @@ -820,9 +684,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); - sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, - rpcrdma_cq_async_error_upcall, ep, - ep->rep_attr.cap.max_send_wr + 1, 0); + cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1; + sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall, + rpcrdma_cq_async_error_upcall, ep, &cq_attr); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", @@ -837,9 +701,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, goto out2; } - recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, - rpcrdma_cq_async_error_upcall, ep, - ep->rep_attr.cap.max_recv_wr + 1, 0); + cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1; + recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall, + rpcrdma_cq_async_error_upcall, ep, &cq_attr); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -944,20 +808,6 @@ retry: rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_reset_frmrs(ia); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_reset_fmrs(ia); - break; - case RPCRDMA_ALLPHYSICAL: - break; - default: - rc = -EIO; - goto out; - } - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); @@ -972,7 +822,7 @@ retry: * More stuff I haven't thought of! * Rrrgh! 
*/ - if (ia->ri_id->device != id->device) { + if (ia->ri_device != id->device) { printk("RPC: %s: can't reconnect on " "different device!\n", __func__); rdma_destroy_id(id); @@ -1114,7 +964,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) goto out_free; } - rep->rr_buffer = &r_xprt->rx_buf; + rep->rr_device = ia->ri_device; + rep->rr_rxprt = r_xprt; return rep; out_free: @@ -1123,91 +974,6 @@ out: return ERR_PTR(rc); } -static int -rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_DATA_SEGS, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - - r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr failed %i\n", - __func__, rc); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_free: - kfree(r); - return rc; -} - -static int -rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_frmr *f; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - f = &r->r.frmr; - - f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr " - "failed %i\n", __func__, rc); - goto out_free; - } - - f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_pgl)) { 
- rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(f->fr_mr); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; - -out_free: - kfree(r); - return rc; -} - int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { @@ -1244,22 +1010,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = rpcrdma_init_frmrs(ia, buf); - if (rc) - goto out; - break; - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_init_fmrs(ia, buf); - if (rc) - goto out; - break; - default: - break; - } + rc = ia->ri_ops->ro_init(r_xprt); + if (rc) + goto out; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1311,47 +1064,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); - - kfree(r); - } -} - -static void -rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - kfree(r); - } -} - void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { @@ -1372,129 +1084,38 @@ 
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_destroy_frmrs(buf); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_destroy_fmrs(buf); - break; - default: - break; - } + ia->ri_ops->ro_destroy(buf); kfree(buf->rb_pool); } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). - */ -static void -rpcrdma_reset_fmrs(struct rpcrdma_ia *ia) +struct rpcrdma_mw * +rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - LIST_HEAD(l); - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); + struct rpcrdma_mw *mw = NULL; - INIT_LIST_HEAD(&l); - list_add(&r->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); + spin_lock(&buf->rb_mwlock); + if (!list_empty(&buf->rb_mws)) { + mw = list_first_entry(&buf->rb_mws, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); } -} + spin_unlock(&buf->rb_mwlock); -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). 
- */ -static void -rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - continue; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - continue; - } - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - -/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving - * some req segments uninitialized. - */ -static void -rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) -{ - if (*mw) { - list_add_tail(&(*mw)->mw_list, &buf->rb_mws); - *mw = NULL; - } + if (!mw) + pr_err("RPC: %s: no MWs available\n", __func__); + return mw; } -/* Cycle mw's back in reverse order, and "spin" them. - * This delays and scrambles reuse as much as possible. 
- */ -static void -rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +void +rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) { - struct rpcrdma_mr_seg *seg = req->rl_segments; - struct rpcrdma_mr_seg *seg1 = seg; - int i; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) - rpcrdma_buffer_put_mr(&seg->rl_mw, buf); - rpcrdma_buffer_put_mr(&seg1->rl_mw, buf); + spin_lock(&buf->rb_mwlock); + list_add_tail(&mw->mw_list, &buf->rb_mws); + spin_unlock(&buf->rb_mwlock); } static void @@ -1504,115 +1125,10 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) req->rl_niovs = 0; if (req->rl_reply) { buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; - req->rl_reply->rr_func = NULL; req->rl_reply = NULL; } } -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). - * Redo only the ib_post_send(). - */ -static void -rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); - - /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ - r->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)r; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", - __func__, r, r->r.frmr.fr_mr->rkey); - - read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - r->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: ib_post_send failed, %i\n", - __func__, rc); - } -} - -static void 
-rpcrdma_retry_flushed_linv(struct list_head *stale, - struct rpcrdma_buffer *buf) -{ - struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct list_head *pos; - struct rpcrdma_mw *r; - unsigned long flags; - - list_for_each(pos, stale) { - r = list_entry(pos, struct rpcrdma_mw, mw_list); - rpcrdma_retry_local_inv(r, ia); - } - - spin_lock_irqsave(&buf->rb_lock, flags); - list_splice_tail(stale, &buf->rb_mws); - spin_unlock_irqrestore(&buf->rb_lock, flags); -} - -static struct rpcrdma_req * -rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, - struct list_head *stale) -{ - struct rpcrdma_mw *r; - int i; - - i = RPCRDMA_MAX_SEGS - 1; - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - if (r->r.frmr.fr_state == FRMR_IS_STALE) { - list_add(&r->mw_list, stale); - continue; - } - req->rl_segments[i].rl_mw = r; - if (unlikely(i-- == 0)) - return req; /* Success */ - } - - /* Not enough entries on rb_mws for this req */ - rpcrdma_buffer_put_sendbuf(req, buf); - rpcrdma_buffer_put_mrs(req, buf); - return NULL; -} - -static struct rpcrdma_req * -rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int i; - - i = RPCRDMA_MAX_SEGS - 1; - while (!list_empty(&buf->rb_mws)) { - r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - req->rl_segments[i].rl_mw = r; - if (unlikely(i-- == 0)) - return req; /* Success */ - } - - /* Not enough entries on rb_mws for this req */ - rpcrdma_buffer_put_sendbuf(req, buf); - rpcrdma_buffer_put_mrs(req, buf); - return NULL; -} - /* * Get a set of request/reply buffers. 
* @@ -1625,12 +1141,11 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { - struct rpcrdma_ia *ia = rdmab_to_ia(buffers); - struct list_head stale; struct rpcrdma_req *req; unsigned long flags; spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_send_index == buffers->rb_max_requests) { spin_unlock_irqrestore(&buffers->rb_lock, flags); dprintk("RPC: %s: out of request buffers\n", __func__); @@ -1649,20 +1164,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; - INIT_LIST_HEAD(&stale); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); - break; - case RPCRDMA_MTHCAFMR: - req = rpcrdma_buffer_get_fmrs(req, buffers); - break; - default: - break; - } spin_unlock_irqrestore(&buffers->rb_lock, flags); - if (!list_empty(&stale)) - rpcrdma_retry_flushed_linv(&stale, buffers); return req; } @@ -1674,19 +1176,10 @@ void rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; - struct rpcrdma_ia *ia = rdmab_to_ia(buffers); unsigned long flags; spin_lock_irqsave(&buffers->rb_lock, flags); rpcrdma_buffer_put_sendbuf(req, buffers); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - case RPCRDMA_MTHCAFMR: - rpcrdma_buffer_put_mrs(req, buffers); - break; - default: - break; - } spin_unlock_irqrestore(&buffers->rb_lock, flags); } @@ -1716,10 +1209,9 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { - struct rpcrdma_buffer *buffers = rep->rr_buffer; + struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; unsigned long flags; - rep->rr_func = NULL; spin_lock_irqsave(&buffers->rb_lock, flags); buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; spin_unlock_irqrestore(&buffers->rb_lock, flags); @@ -1729,6 +1221,14 @@ 
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ +void +rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) +{ + dprintk("RPC: map_one: offset %p iova %llx len %zu\n", + seg->mr_offset, + (unsigned long long)seg->mr_dma, seg->mr_dmalen); +} + static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) @@ -1740,9 +1240,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, /* * All memory passed here was kmalloc'ed, therefore phys-contiguous. */ - iov->addr = ib_dma_map_single(ia->ri_id->device, + iov->addr = ib_dma_map_single(ia->ri_device, va, len, DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) + if (ib_dma_mapping_error(ia->ri_device, iov->addr)) return -ENOMEM; iov->length = len; @@ -1786,8 +1286,8 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia, { int rc; - ib_dma_unmap_single(ia->ri_id->device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); + ib_dma_unmap_single(ia->ri_device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); if (NULL == mr) return 0; @@ -1854,287 +1354,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) } /* - * Wrappers for chunk registration, shared by read/write chunk code. - */ - -static void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) -{ - seg->mr_dir = writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; - seg->mr_dmalen = seg->mr_len; - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(ia->ri_id->device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(ia->ri_id->device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { - dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", - __func__, - (unsigned long long)seg->mr_dma, - seg->mr_offset, seg->mr_dmalen); - } -} - -static void -rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - -static int -rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; - u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > ia->ri_max_frmr_depth) - *nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, mw, i); - - frmr->fr_state = 
FRMR_IS_VALID; - - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (fastreg_wr.wr.fast_reg.length < len) { - rc = -EIO; - goto out_err; - } - - /* Bump the key */ - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); - - fastreg_wr.wr.fast_reg.access_flags = (writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); - if (rc) { - dprintk("RPC: %s: failed ib_post_send for register," - " status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - goto out_err; - } else { - seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return 0; -out_err: - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(ia, --seg); - return rc; -} - -static int -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* 
Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: failed ib_post_send for invalidate," - " status %i\n", __func__, rc); - } - return rc; -} - -static int -rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff, i, rc; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... 
status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - LIST_HEAD(l); - int rc; - - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - read_unlock(&ia->ri_qplock); - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); - return rc; -} - -int -rpcrdma_register_external(struct rpcrdma_mr_seg *seg, - int nsegs, int writing, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc = 0; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - rpcrdma_map_one(ia, seg, writing); - seg->mr_rkey = ia->ri_bind_mem->rkey; - seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; - nsegs = 1; - break; - - /* Registration using frmr registration */ - case RPCRDMA_FRMR: - rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Registration using fmr memory registration */ - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); - break; - - default: - return -EIO; - } - if (rc) - return rc; - - return nsegs; -} - -int -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int nsegs = seg->mr_nsegs, rc; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia, seg); - read_unlock(&ia->ri_qplock); - break; - - case RPCRDMA_FRMR: - rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); - break; - - case RPCRDMA_MTHCAFMR: - rc = 
rpcrdma_deregister_fmr_external(seg, ia); - break; - - default: - break; - } - return nsegs; -} - -/* * Prepost any receive buffer, then post send. * * Receive buffer is donated to hardware, reclaimed upon recv completion. @@ -2156,20 +1375,23 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; send_wr.sg_list = req->rl_send_iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ - ib_dma_sync_single_for_device(ia->ri_id->device, - req->rl_send_iov[3].addr, req->rl_send_iov[3].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_id->device, - req->rl_send_iov[1].addr, req->rl_send_iov[1].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_id->device, - req->rl_send_iov[0].addr, req->rl_send_iov[0].length, - DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_device, + req->rl_send_iov[3].addr, + req->rl_send_iov[3].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_device, + req->rl_send_iov[1].addr, + req->rl_send_iov[1].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_device, + req->rl_send_iov[0].addr, + req->rl_send_iov[0].length, + DMA_TO_DEVICE); if (DECR_CQCOUNT(ep) > 0) send_wr.send_flags = 0; @@ -2202,7 +1424,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; recv_wr.num_sge = 1; - ib_dma_sync_single_for_cpu(ia->ri_id->device, + ib_dma_sync_single_for_cpu(ia->ri_device, rdmab_addr(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf), DMA_BIDIRECTIONAL); @@ -2215,43 +1437,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } -/* Physical mapping means one Read/Write list entry per-page. - * All list entries must fit within an inline buffer - * - * NB: The server must return a Write list for NFS READ, - * which has the same constraint. Factor in the inline - * rsize as well. 
+/* How many chunk list items fit within our inline buffers? */ -static size_t -rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +unsigned int +rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - unsigned int inline_size, pages; + int bytes, segments; - inline_size = min_t(unsigned int, - cdata->inline_wsize, cdata->inline_rsize); - inline_size -= RPCRDMA_HDRLEN_MIN; - pages = inline_size / sizeof(struct rpcrdma_segment); - return pages << PAGE_SHIFT; -} - -static size_t -rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) -{ - return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; -} - -size_t -rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) -{ - size_t result; - - switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_ALLPHYSICAL: - result = rpcrdma_physical_max_payload(r_xprt); - break; - default: - result = rpcrdma_mr_max_payload(r_xprt); + bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); + bytes -= RPCRDMA_HDRLEN_MIN; + if (bytes < sizeof(struct rpcrdma_segment) * 2) { + pr_warn("RPC: %s: inline threshold too small\n", + __func__); + return 0; } - return result; + + segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); + dprintk("RPC: %s: max chunk list size = %d segments\n", + __func__, segments); + return segments; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fb6f0885..f49dd8b38122 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -60,7 +60,9 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; + struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; @@ -68,7 +70,6 @@ struct rpcrdma_ia { int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; - enum rpcrdma_memreg ri_memreg_strategy; unsigned int ri_max_frmr_depth; struct 
ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; @@ -105,6 +106,10 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Force completion handler to ignore the signal + */ +#define RPCRDMA_IGNORE_COMPLETION (0ULL) + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -176,9 +173,8 @@ struct rpcrdma_buffer; struct rpcrdma_rep { unsigned int rr_len; - struct rpcrdma_buffer *rr_buffer; - struct rpc_xprt *rr_xprt; - void (*rr_func)(struct rpcrdma_rep *); + struct ib_device *rr_device; + struct rpcrdma_xprt *rr_rxprt; struct list_head rr_list; struct rpcrdma_regbuf *rr_rdmabuf; }; @@ -206,13 +202,21 @@ struct rpcrdma_frmr { struct ib_fast_reg_page_list *fr_pgl; struct ib_mr *fr_mr; enum rpcrdma_frmr_state fr_state; + struct work_struct fr_work; + struct rpcrdma_xprt *fr_xprt; +}; + +struct rpcrdma_fmr { + struct ib_fmr *fmr; + u64 *physaddrs; }; struct rpcrdma_mw { union { - struct ib_fmr *fmr; + struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; } r; + void (*mw_sendcompletion)(struct ib_wc *); struct list_head mw_list; struct list_head mw_all; }; @@ -258,7 +262,6 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ - enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply 
buffer */ struct ib_sge rl_send_iov[4]; /* for active requests */ @@ -284,15 +287,17 @@ rpcr_to_rdmar(struct rpc_rqst *rqst) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_lock; /* protects indexes */ - u32 rb_max_requests;/* client max requests */ - struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ - struct list_head rb_all; - int rb_send_index; + spinlock_t rb_mwlock; /* protect rb_mws list */ + struct list_head rb_mws; + struct list_head rb_all; + char *rb_pool; + + spinlock_t rb_lock; /* protect buf arrays */ + u32 rb_max_requests; + int rb_send_index; + int rb_recv_index; struct rpcrdma_req **rb_send_bufs; - int rb_recv_index; struct rpcrdma_rep **rb_recv_bufs; - char *rb_pool; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -340,6 +345,28 @@ struct rpcrdma_stats { }; /* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool); + int (*ro_unmap)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init)(struct rpcrdma_xprt *); + void (*ro_destroy)(struct rpcrdma_buffer *); + const char *ro_displayname; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; + +/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. 
* @@ -393,21 +420,66 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); +void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_external(struct rpcrdma_mr_seg *, - int, int, struct rpcrdma_xprt *); -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); + +int frwr_alloc_recovery_wq(void); +void frwr_destroy_recovery_wq(void); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +static inline void +rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, + enum dma_data_direction direction) +{ + seg->mr_dir = direction; + seg->mr_dmalen = seg->mr_len; + + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + + if (ib_dma_mapping_error(device, seg->mr_dma)) + rpcrdma_mapping_error(seg); +} + +static inline void +rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ @@ -418,9 +490,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ -ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); -size_t rpcrdma_max_payload(struct rpcrdma_xprt *); + +/* RPC/RDMA module init - xprtrdma/transport.c + */ +int xprt_rdma_init(void); +void xprt_rdma_cleanup(void); /* Temporary NFS request map cache. 
Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; @@ -429,10 +504,4 @@ extern struct kmem_cache *svc_rdma_ctxt_cachep; /* Workqueue created in svc_rdma.c */ extern struct workqueue_struct *svc_rdma_wq; -#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) -#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD -#else -#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) -#endif - #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 66891e32c5e3..0030376327b7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -527,6 +527,10 @@ static int xs_local_send_request(struct rpc_task *task) true, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - req->rq_bytes_sent, status); + + if (status == -EAGAIN && sock_writeable(transport->inet)) + status = -ENOBUFS; + if (likely(sent > 0) || status == 0) { req->rq_bytes_sent += sent; req->rq_xmit_bytes_sent += sent; @@ -539,6 +543,7 @@ static int xs_local_send_request(struct rpc_task *task) switch (status) { case -ENOBUFS: + break; case -EAGAIN: status = xs_nospace(task); break; @@ -589,6 +594,9 @@ static int xs_udp_send_request(struct rpc_task *task) if (status == -EPERM) goto process_status; + if (status == -EAGAIN && sock_writeable(transport->inet)) + status = -ENOBUFS; + if (sent > 0 || status == 0) { req->rq_xmit_bytes_sent += sent; if (sent >= req->rq_slen) @@ -623,24 +631,6 @@ process_status: } /** - * xs_tcp_shutdown - gracefully shut down a TCP socket - * @xprt: transport - * - * Initiates a graceful shutdown of the TCP socket by calling the - * equivalent of shutdown(SHUT_RDWR); - */ -static void xs_tcp_shutdown(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct socket *sock = transport->sock; - - if (sock != NULL) { - kernel_sock_shutdown(sock, SHUT_RDWR); - trace_rpc_socket_shutdown(xprt, sock); - } -} - -/** * xs_tcp_send_request - write an RPC 
request to a TCP socket * @task: address of RPC task that manages the state of an RPC request * @@ -687,9 +677,6 @@ static int xs_tcp_send_request(struct rpc_task *task) dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); - if (unlikely(sent == 0 && status < 0)) - break; - /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ req->rq_bytes_sent += sent; @@ -699,18 +686,21 @@ static int xs_tcp_send_request(struct rpc_task *task) return 0; } - if (sent != 0) - continue; - status = -EAGAIN; - break; + if (status < 0) + break; + if (sent == 0) { + status = -EAGAIN; + break; + } } + if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) + status = -ENOBUFS; switch (status) { case -ENOTSOCK: status = -ENOTCONN; /* Should we call xs_close() here? */ break; - case -ENOBUFS: case -EAGAIN: status = xs_nospace(task); break; @@ -721,6 +711,7 @@ static int xs_tcp_send_request(struct rpc_task *task) case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: + case -ENOBUFS: case -EPIPE: clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } @@ -786,6 +777,7 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt) xs_sock_reset_connection_flags(xprt); /* Mark transport as closed and wake up all pending tasks */ xprt_disconnect_done(xprt); + xprt_force_disconnect(xprt); } /** @@ -827,6 +819,9 @@ static void xs_reset_transport(struct sock_xprt *transport) if (sk == NULL) return; + if (atomic_read(&transport->xprt.swapper)) + sk_clear_memalloc(sk); + write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; @@ -863,6 +858,13 @@ static void xs_close(struct rpc_xprt *xprt) xprt_disconnect_done(xprt); } +static void xs_inject_disconnect(struct rpc_xprt *xprt) +{ + dprintk("RPC: injecting transport disconnect on xprt=%p\n", + xprt); + xprt_disconnect_done(xprt); +} + static void xs_xprt_free(struct rpc_xprt *xprt) { xs_free_peer_addresses(xprt); @@ -901,7 +903,6 @@ static int 
xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) /** * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets * @sk: socket with data to read - * @len: how much data to read * * Currently this assumes we can read the whole reply in a single gulp. */ @@ -965,7 +966,6 @@ static void xs_local_data_ready(struct sock *sk) /** * xs_udp_data_ready - "data ready" callback for UDP sockets * @sk: socket with data to read - * @len: how much data to read * */ static void xs_udp_data_ready(struct sock *sk) @@ -1389,7 +1389,6 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns /** * xs_tcp_data_ready - "data ready" callback for TCP sockets * @sk: socket with data to read - * @bytes: how much data to read * */ static void xs_tcp_data_ready(struct sock *sk) @@ -1886,9 +1885,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, /** * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint - * @xprt: RPC transport to connect * @transport: socket transport to connect - * @create_sock: function to create a socket of the correct type */ static int xs_local_setup_socket(struct sock_xprt *transport) { @@ -1960,43 +1957,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) msleep_interruptible(15000); } -#ifdef CONFIG_SUNRPC_SWAP +#if IS_ENABLED(CONFIG_SUNRPC_SWAP) +/* + * Note that this should be called with XPRT_LOCKED held (or when we otherwise + * know that we have exclusive access to the socket), to guard against + * races with xs_reset_transport. + */ static void xs_set_memalloc(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - if (xprt->swapper) + /* + * If there's no sock, then we have nothing to set. The + * reconnecting process will get it for us. 
+ */ + if (!transport->inet) + return; + if (atomic_read(&xprt->swapper)) sk_set_memalloc(transport->inet); } /** - * xs_swapper - Tag this transport as being used for swap. + * xs_enable_swap - Tag this transport as being used for swap. * @xprt: transport to tag - * @enable: enable/disable * + * Take a reference to this transport on behalf of the rpc_clnt, and + * optionally mark it for swapping if it wasn't already. */ -int xs_swapper(struct rpc_xprt *xprt, int enable) +static int +xs_enable_swap(struct rpc_xprt *xprt) { - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, - xprt); - int err = 0; + struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); - if (enable) { - xprt->swapper++; - xs_set_memalloc(xprt); - } else if (xprt->swapper) { - xprt->swapper--; - sk_clear_memalloc(transport->inet); - } + if (atomic_inc_return(&xprt->swapper) != 1) + return 0; + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) + return -ERESTARTSYS; + if (xs->inet) + sk_set_memalloc(xs->inet); + xprt_release_xprt(xprt, NULL); + return 0; +} - return err; +/** + * xs_disable_swap - Untag this transport as being used for swap. + * @xprt: transport to tag + * + * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the + * swapper refcount goes to 0, untag the socket as a memalloc socket. 
+ */ +static void +xs_disable_swap(struct rpc_xprt *xprt) +{ + struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); + + if (!atomic_dec_and_test(&xprt->swapper)) + return; + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) + return; + if (xs->inet) + sk_clear_memalloc(xs->inet); + xprt_release_xprt(xprt, NULL); } -EXPORT_SYMBOL_GPL(xs_swapper); #else static void xs_set_memalloc(struct rpc_xprt *xprt) { } + +static int +xs_enable_swap(struct rpc_xprt *xprt) +{ + return -EINVAL; +} + +static void +xs_disable_swap(struct rpc_xprt *xprt) +{ +} #endif static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) @@ -2057,6 +2095,27 @@ out: xprt_wake_pending_tasks(xprt, status); } +/** + * xs_tcp_shutdown - gracefully shut down a TCP socket + * @xprt: transport + * + * Initiates a graceful shutdown of the TCP socket by calling the + * equivalent of shutdown(SHUT_RDWR); + */ +static void xs_tcp_shutdown(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + struct socket *sock = transport->sock; + + if (sock == NULL) + return; + if (xprt_connected(xprt)) { + kernel_sock_shutdown(sock, SHUT_RDWR); + trace_rpc_socket_shutdown(xprt, sock); + } else + xs_reset_transport(transport); +} + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -2067,6 +2126,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) unsigned int keepidle = xprt->timeout->to_initval / HZ; unsigned int keepcnt = xprt->timeout->to_retries + 1; unsigned int opt_on = 1; + unsigned int timeo; /* TCP Keepalive options */ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, @@ -2078,6 +2138,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); + /* TCP user timeout (see 
RFC5482) */ + timeo = jiffies_to_msecs(xprt->timeout->to_initval) * + (xprt->timeout->to_retries + 1); + kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&timeo, sizeof(timeo)); + write_lock_bh(&sk->sk_callback_lock); xs_save_old_callbacks(transport, sk); @@ -2125,9 +2191,6 @@ out: /** * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint - * @xprt: RPC transport to connect - * @transport: socket transport to connect - * @create_sock: function to create a socket of the correct type * * Invoked by a work queue tasklet. */ @@ -2463,6 +2526,8 @@ static struct rpc_xprt_ops xs_local_ops = { .close = xs_close, .destroy = xs_destroy, .print_stats = xs_local_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, }; static struct rpc_xprt_ops xs_udp_ops = { @@ -2482,6 +2547,9 @@ static struct rpc_xprt_ops xs_udp_ops = { .close = xs_close, .destroy = xs_destroy, .print_stats = xs_udp_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, + .inject_disconnect = xs_inject_disconnect, }; static struct rpc_xprt_ops xs_tcp_ops = { @@ -2498,6 +2566,9 @@ static struct rpc_xprt_ops xs_tcp_ops = { .close = xs_tcp_shutdown, .destroy = xs_destroy, .print_stats = xs_tcp_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, + .inject_disconnect = xs_inject_disconnect, }; /* @@ -2515,6 +2586,9 @@ static struct rpc_xprt_ops bc_tcp_ops = { .close = bc_close, .destroy = bc_destroy, .print_stats = xs_tcp_print_stats, + .enable_swap = xs_enable_swap, + .disable_swap = xs_disable_swap, + .inject_disconnect = xs_inject_disconnect, }; static int xs_init_anyaddr(const int family, struct sockaddr *sap) @@ -2982,7 +3056,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) RPC_MAX_RESVPORT); } -static struct kernel_param_ops param_ops_portnr = { +static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; @@ 
-3001,7 +3075,7 @@ static int param_set_slot_table_size(const char *val, RPC_MAX_SLOT_TABLE); } -static struct kernel_param_ops param_ops_slot_table_size = { +static const struct kernel_param_ops param_ops_slot_table_size = { .set = param_set_slot_table_size, .get = param_get_uint, }; @@ -3017,7 +3091,7 @@ static int param_set_max_slot_table_size(const char *val, RPC_MAX_SLOT_TABLE_LIMIT); } -static struct kernel_param_ops param_ops_max_slot_table_size = { +static const struct kernel_param_ops param_ops_max_slot_table_size = { .set = param_set_max_slot_table_size, .get = param_get_uint, }; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8c1e558db118..9f2add3cba26 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1,6 +1,7 @@ /* * net/switchdev/switchdev.c - Switch device API * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us> + * Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -14,87 +15,366 @@ #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/netdevice.h> +#include <linux/if_bridge.h> +#include <net/ip_fib.h> #include <net/switchdev.h> /** - * netdev_switch_parent_id_get - Get ID of a switch + * switchdev_port_attr_get - Get port attribute + * + * @dev: port device + * @attr: attribute to get + */ +int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + struct switchdev_attr first = { + .id = SWITCHDEV_ATTR_UNDEFINED + }; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_attr_get) + return ops->switchdev_port_attr_get(dev, attr); + + if (attr->flags & SWITCHDEV_F_NO_RECURSE) + return err; + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to get attr on + 
* each port. Return -ENODATA if attr values don't + * compare across ports. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_attr_get(lower_dev, attr); + if (err) + break; + if (first.id == SWITCHDEV_ATTR_UNDEFINED) + first = *attr; + else if (memcmp(&first, attr, sizeof(*attr))) + return -ENODATA; + } + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_attr_get); + +static int __switchdev_port_attr_set(struct net_device *dev, + struct switchdev_attr *attr) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_attr_set) + return ops->switchdev_port_attr_set(dev, attr); + + if (attr->flags & SWITCHDEV_F_NO_RECURSE) + return err; + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to set attr on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_port_attr_set(lower_dev, attr); + if (err) + break; + } + + return err; +} + +struct switchdev_attr_set_work { + struct work_struct work; + struct net_device *dev; + struct switchdev_attr attr; +}; + +static void switchdev_port_attr_set_work(struct work_struct *work) +{ + struct switchdev_attr_set_work *asw = + container_of(work, struct switchdev_attr_set_work, work); + int err; + + rtnl_lock(); + err = switchdev_port_attr_set(asw->dev, &asw->attr); + if (err && err != -EOPNOTSUPP) + netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n", + err, asw->attr.id); + rtnl_unlock(); + + dev_put(asw->dev); + kfree(work); +} + +static int switchdev_port_attr_set_defer(struct net_device *dev, + struct switchdev_attr *attr) +{ + struct switchdev_attr_set_work *asw; + + asw = kmalloc(sizeof(*asw), GFP_ATOMIC); + if (!asw) + return -ENOMEM; + + INIT_WORK(&asw->work, switchdev_port_attr_set_work); + + dev_hold(dev); + asw->dev = dev; + memcpy(&asw->attr, attr, sizeof(asw->attr)); + + 
schedule_work(&asw->work); + + return 0; +} + +/** + * switchdev_port_attr_set - Set port attribute + * + * @dev: port device + * @attr: attribute to set + * + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. + */ +int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) +{ + int err; + + if (!rtnl_is_locked()) { + /* Running prepare-commit transaction across stacked + * devices requires nothing moves, so if rtnl_lock is + * not held, schedule a worker thread to hold rtnl_lock + * while setting attr. + */ + + return switchdev_port_attr_set_defer(dev, attr); + } + + /* Phase I: prepare for attr set. Driver/device should fail + * here if there are going to be issues in the commit phase, + * such as lack of resources or support. The driver/device + * should reserve resources needed for the commit phase here, + * but should not commit the attr. + */ + + attr->trans = SWITCHDEV_TRANS_PREPARE; + err = __switchdev_port_attr_set(dev, attr); + if (err) { + /* Prepare phase failed: abort the transaction. Any + * resources reserved in the prepare phase are + * released. + */ + + if (err != -EOPNOTSUPP) { + attr->trans = SWITCHDEV_TRANS_ABORT; + __switchdev_port_attr_set(dev, attr); + } + + return err; + } + + /* Phase II: commit attr set. This cannot fail as a fault + * of driver/device. If it does, it's a bug in the driver/device + * because the driver said everythings was OK in phase I. 
+ */ + + attr->trans = SWITCHDEV_TRANS_COMMIT; + err = __switchdev_port_attr_set(dev, attr); + WARN(err, "%s: Commit of attribute (id=%d) failed.\n", + dev->name, attr->id); + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_attr_set); + +static int __switchdev_port_obj_add(struct net_device *dev, + struct switchdev_obj *obj) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_add) + return ops->switchdev_port_obj_add(dev, obj); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to add object on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = __switchdev_port_obj_add(lower_dev, obj); + if (err) + break; + } + + return err; +} + +/** + * switchdev_port_obj_add - Add port object + * * @dev: port device - * @psid: switch ID + * @obj: object to add * - * Get ID of a switch this port is part of. + * Use a 2-phase prepare-commit transaction model to ensure + * system is not left in a partially updated state due to + * failure from driver/device. + * + * rtnl_lock must be held. */ -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +int switchdev_port_obj_add(struct net_device *dev, struct switchdev_obj *obj) { - const struct net_device_ops *ops = dev->netdev_ops; + int err; + + ASSERT_RTNL(); + + /* Phase I: prepare for obj add. Driver/device should fail + * here if there are going to be issues in the commit phase, + * such as lack of resources or support. The driver/device + * should reserve resources needed for the commit phase here, + * but should not commit the obj. + */ + + obj->trans = SWITCHDEV_TRANS_PREPARE; + err = __switchdev_port_obj_add(dev, obj); + if (err) { + /* Prepare phase failed: abort the transaction. Any + * resources reserved in the prepare phase are + * released. 
+ */ + + if (err != -EOPNOTSUPP) { + obj->trans = SWITCHDEV_TRANS_ABORT; + __switchdev_port_obj_add(dev, obj); + } + + return err; + } - if (!ops->ndo_switch_parent_id_get) - return -EOPNOTSUPP; - return ops->ndo_switch_parent_id_get(dev, psid); + /* Phase II: commit obj add. This cannot fail as a fault + * of driver/device. If it does, it's a bug in the driver/device + * because the driver said everythings was OK in phase I. + */ + + obj->trans = SWITCHDEV_TRANS_COMMIT; + err = __switchdev_port_obj_add(dev, obj); + WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); + + return err; } -EXPORT_SYMBOL(netdev_switch_parent_id_get); +EXPORT_SYMBOL_GPL(switchdev_port_obj_add); /** - * netdev_switch_port_stp_update - Notify switch device port of STP - * state change + * switchdev_port_obj_del - Delete port object + * * @dev: port device - * @state: port STP state + * @obj: object to delete + */ +int switchdev_port_obj_del(struct net_device *dev, struct switchdev_obj *obj) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_del) + return ops->switchdev_port_obj_del(dev, obj); + + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to delete object on + * each port. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_obj_del(lower_dev, obj); + if (err) + break; + } + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_port_obj_del); + +/** + * switchdev_port_obj_dump - Dump port objects * - * Notify switch device port of bridge port STP state change. 
+ * @dev: port device + * @obj: object to dump */ -int netdev_switch_port_stp_update(struct net_device *dev, u8 state) +int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj) { - const struct net_device_ops *ops = dev->netdev_ops; + const struct switchdev_ops *ops = dev->switchdev_ops; + struct net_device *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (ops && ops->switchdev_port_obj_dump) + return ops->switchdev_port_obj_dump(dev, obj); - if (!ops->ndo_switch_port_stp_update) - return -EOPNOTSUPP; - WARN_ON(!ops->ndo_switch_parent_id_get); - return ops->ndo_switch_port_stp_update(dev, state); + /* Switch device port(s) may be stacked under + * bond/team/vlan dev, so recurse down to dump objects on + * first port at bottom of stack. + */ + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = switchdev_port_obj_dump(lower_dev, obj); + break; + } + + return err; } -EXPORT_SYMBOL(netdev_switch_port_stp_update); +EXPORT_SYMBOL_GPL(switchdev_port_obj_dump); -static DEFINE_MUTEX(netdev_switch_mutex); -static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain); +static DEFINE_MUTEX(switchdev_mutex); +static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** - * register_netdev_switch_notifier - Register nofifier + * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. This should be used by code * which needs to monitor events happening in particular device. * Return values are same as for atomic_notifier_chain_register(). 
*/ -int register_netdev_switch_notifier(struct notifier_block *nb) +int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_register(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL(register_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** - * unregister_netdev_switch_notifier - Unregister nofifier + * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. * Return values are same as for atomic_notifier_chain_unregister(). */ -int unregister_netdev_switch_notifier(struct notifier_block *nb) +int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL(unregister_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** - * call_netdev_switch_notifiers - Call nofifiers + * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data @@ -103,125 +383,663 @@ EXPORT_SYMBOL(unregister_netdev_switch_notifier); * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). 
*/ -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info) { int err; info->dev = dev; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); + mutex_unlock(&switchdev_mutex); + return err; +} +EXPORT_SYMBOL_GPL(call_switchdev_notifiers); + +struct switchdev_vlan_dump { + struct switchdev_obj obj; + struct sk_buff *skb; + u32 filter_mask; + u16 flags; + u16 begin; + u16 end; +}; + +static int switchdev_port_vlan_dump_put(struct net_device *dev, + struct switchdev_vlan_dump *dump) +{ + struct bridge_vlan_info vinfo; + + vinfo.flags = dump->flags; + + if (dump->begin == 0 && dump->end == 0) { + return 0; + } else if (dump->begin == dump->end) { + vinfo.vid = dump->begin; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } else { + vinfo.vid = dump->begin; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + vinfo.vid = dump->end; + vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } + + return 0; +} + +static int switchdev_port_vlan_dump_cb(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_vlan_dump *dump = + container_of(obj, struct switchdev_vlan_dump, obj); + struct switchdev_obj_vlan *vlan = &dump->obj.u.vlan; + int err = 0; + + if (vlan->vid_begin > vlan->vid_end) + return -EINVAL; + + if (dump->filter_mask & RTEXT_FILTER_BRVLAN) { + dump->flags = vlan->flags; + for (dump->begin = dump->end = 
vlan->vid_begin; + dump->begin <= vlan->vid_end; + dump->begin++, dump->end++) { + err = switchdev_port_vlan_dump_put(dev, dump); + if (err) + return err; + } + } else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) { + if (dump->begin > vlan->vid_begin && + dump->begin >= vlan->vid_end) { + if ((dump->begin - 1) == vlan->vid_end && + dump->flags == vlan->flags) { + /* prepend */ + dump->begin = vlan->vid_begin; + } else { + err = switchdev_port_vlan_dump_put(dev, dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else if (dump->end <= vlan->vid_begin && + dump->end < vlan->vid_end) { + if ((dump->end + 1) == vlan->vid_begin && + dump->flags == vlan->flags) { + /* append */ + dump->end = vlan->vid_end; + } else { + err = switchdev_port_vlan_dump_put(dev, dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else { + err = -EINVAL; + } + } + return err; } -EXPORT_SYMBOL(call_netdev_switch_notifiers); + +static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, + u32 filter_mask) +{ + struct switchdev_vlan_dump dump = { + .obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .cb = switchdev_port_vlan_dump_cb, + }, + .skb = skb, + .filter_mask = filter_mask, + }; + int err = 0; + + if ((filter_mask & RTEXT_FILTER_BRVLAN) || + (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { + err = switchdev_port_obj_dump(dev, &dump.obj); + if (err) + goto err_out; + if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) + /* last one */ + err = switchdev_port_vlan_dump_put(dev, &dump); + } + +err_out: + return err == -EOPNOTSUPP ? 
0 : err; +} /** - * netdev_switch_port_bridge_setlink - Notify switch device port of bridge - * port attributes + * switchdev_port_bridge_getlink - Get bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags * - * Notify switch device port of bridge port attributes + * Called for SELF on rtnl_bridge_getlink to get bridge port + * attributes. */ -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS, + }; + u16 mode = BRIDGE_MODE_UNDEF; + u32 mask = BR_LEARNING | BR_LEARNING_SYNC; + int err; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return 0; + err = switchdev_port_attr_get(dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, + attr.u.brport_flags, mask, nlflags, + filter_mask, switchdev_port_vlan_fill); +} +EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); + +static int switchdev_port_br_setflag(struct net_device *dev, + struct nlattr *nlattr, + unsigned long brport_flag) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS, + }; + u8 flag = nla_get_u8(nlattr); + int err; + + err = switchdev_port_attr_get(dev, &attr); + if (err) + return err; + + if (flag) + attr.u.brport_flags |= brport_flag; + else + attr.u.brport_flags &= ~brport_flag; + + return switchdev_port_attr_set(dev, &attr); +} + +static const struct nla_policy +switchdev_port_bridge_policy[IFLA_BRPORT_MAX + 1] = { + [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, + [IFLA_BRPORT_COST] = { .type = NLA_U32 }, + [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, + [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, + [IFLA_BRPORT_GUARD] = { .type = NLA_U8 
}, + [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, + [IFLA_BRPORT_FAST_LEAVE] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING_SYNC] = { .type = NLA_U8 }, + [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, +}; + +static int switchdev_port_br_setlink_protinfo(struct net_device *dev, + struct nlattr *protinfo) +{ + struct nlattr *attr; + int rem; + int err; - if (!ops->ndo_bridge_setlink) - return -EOPNOTSUPP; + err = nla_validate_nested(protinfo, IFLA_BRPORT_MAX, + switchdev_port_bridge_policy); + if (err) + return err; + + nla_for_each_nested(attr, protinfo, rem) { + switch (nla_type(attr)) { + case IFLA_BRPORT_LEARNING: + err = switchdev_port_br_setflag(dev, attr, + BR_LEARNING); + break; + case IFLA_BRPORT_LEARNING_SYNC: + err = switchdev_port_br_setflag(dev, attr, + BR_LEARNING_SYNC); + break; + default: + err = -EOPNOTSUPP; + break; + } + if (err) + return err; + } - return ops->ndo_bridge_setlink(dev, nlh, flags); + return 0; +} + +static int switchdev_port_br_afspec(struct net_device *dev, + struct nlattr *afspec, + int (*f)(struct net_device *dev, + struct switchdev_obj *obj)) +{ + struct nlattr *attr; + struct bridge_vlan_info *vinfo; + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + }; + struct switchdev_obj_vlan *vlan = &obj.u.vlan; + int rem; + int err; + + nla_for_each_nested(attr, afspec, rem) { + if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) + continue; + if (nla_len(attr) != sizeof(struct bridge_vlan_info)) + return -EINVAL; + vinfo = nla_data(attr); + vlan->flags = vinfo->flags; + if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { + if (vlan->vid_begin) + return -EINVAL; + vlan->vid_begin = vinfo->vid; + } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { + if (!vlan->vid_begin) + return -EINVAL; + vlan->vid_end = vinfo->vid; + if (vlan->vid_end <= vlan->vid_begin) + return -EINVAL; + err = f(dev, &obj); + if (err) + return err; + memset(vlan, 0, sizeof(*vlan)); + } else { + if 
(vlan->vid_begin) + return -EINVAL; + vlan->vid_begin = vinfo->vid; + vlan->vid_end = vinfo->vid; + err = f(dev, &obj); + if (err) + return err; + memset(vlan, 0, sizeof(*vlan)); + } + } + + return 0; } -EXPORT_SYMBOL(netdev_switch_port_bridge_setlink); /** - * netdev_switch_port_bridge_dellink - Notify switch device port of bridge - * port attribute delete + * switchdev_port_bridge_setlink - Set bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags + * @nlh: netlink header + * @flags: netlink flags * - * Notify switch device port of bridge port attribute delete + * Called for SELF on rtnl_bridge_setlink to set bridge port + * attributes. */ -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { - const struct net_device_ops *ops = dev->netdev_ops; + struct nlattr *protinfo; + struct nlattr *afspec; + int err = 0; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return 0; + protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_PROTINFO); + if (protinfo) { + err = switchdev_port_br_setlink_protinfo(dev, protinfo); + if (err) + return err; + } - if (!ops->ndo_bridge_dellink) - return -EOPNOTSUPP; + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) + err = switchdev_port_br_afspec(dev, afspec, + switchdev_port_obj_add); - return ops->ndo_bridge_dellink(dev, nlh, flags); + return err; } -EXPORT_SYMBOL(netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); /** - * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink - * op for master devices + * switchdev_port_bridge_dellink - Set bridge port attributes * * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge setlink flags + * @nlh: netlink header + * @flags: netlink flags * 
- * Notify master device slaves of bridge port attributes + * Called for SELF on rtnl_bridge_dellink to set bridge port + * attributes. */ -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { - struct net_device *lower_dev; - struct list_head *iter; - int ret = 0, err = 0; + struct nlattr *afspec; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return ret; + afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), + IFLA_AF_SPEC); + if (afspec) + return switchdev_port_br_afspec(dev, afspec, + switchdev_port_obj_del); - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags); - if (err && err != -EOPNOTSUPP) - ret = err; - } + return 0; +} +EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); + +/** + * switchdev_port_fdb_add - Add FDB (MAC/VLAN) entry to port + * + * @ndmsg: netlink hdr + * @nlattr: netlink attributes + * @dev: port device + * @addr: MAC address to add + * @vid: VLAN to add + * + * Add FDB entry to switch device. + */ +int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 vid, u16 nlm_flags) +{ + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .u.fdb = { + .addr = addr, + .vid = vid, + }, + }; + + return switchdev_port_obj_add(dev, &obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); + +/** + * switchdev_port_fdb_del - Delete FDB (MAC/VLAN) entry from port + * + * @ndmsg: netlink hdr + * @nlattr: netlink attributes + * @dev: port device + * @addr: MAC address to delete + * @vid: VLAN to delete + * + * Delete FDB entry from switch device. 
+ */ +int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, const unsigned char *addr, + u16 vid) +{ + struct switchdev_obj obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .u.fdb = { + .addr = addr, + .vid = vid, + }, + }; + + return switchdev_port_obj_del(dev, &obj); +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); + +struct switchdev_fdb_dump { + struct switchdev_obj obj; + struct sk_buff *skb; + struct netlink_callback *cb; + int idx; +}; + +static int switchdev_port_fdb_dump_cb(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_fdb_dump *dump = + container_of(obj, struct switchdev_fdb_dump, obj); + u32 portid = NETLINK_CB(dump->cb->skb).portid; + u32 seq = dump->cb->nlh->nlmsg_seq; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + if (dump->idx < dump->cb->args[0]) + goto skip; + + nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, + sizeof(*ndm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = NTF_SELF; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dev->ifindex; + ndm->ndm_state = NUD_REACHABLE; + + if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr)) + goto nla_put_failure; + + if (obj->u.fdb.vid && nla_put_u16(dump->skb, NDA_VLAN, obj->u.fdb.vid)) + goto nla_put_failure; + + nlmsg_end(dump->skb, nlh); + +skip: + dump->idx++; + return 0; - return ret; +nla_put_failure: + nlmsg_cancel(dump->skb, nlh); + return -EMSGSIZE; } -EXPORT_SYMBOL(ndo_dflt_netdev_switch_port_bridge_setlink); /** - * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink - * op for master devices + * switchdev_port_fdb_dump - Dump port FDB (MAC/VLAN) entries * + * @skb: netlink skb + * @cb: netlink callback * @dev: port device - * @nlh: netlink msg with bridge port attributes - * @flags: bridge dellink flags + * @filter_dev: filter device + * @idx: * - * Notify master device slaves of bridge 
port attribute deletes + * Delete FDB entry from switch device. */ -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, + struct net_device *filter_dev, int idx) { + struct switchdev_fdb_dump dump = { + .obj = { + .id = SWITCHDEV_OBJ_PORT_FDB, + .cb = switchdev_port_fdb_dump_cb, + }, + .skb = skb, + .cb = cb, + .idx = idx, + }; + int err; + + err = switchdev_port_obj_dump(dev, &dump.obj); + if (err) + return err; + + return dump.idx; +} +EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); + +static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) +{ + const struct switchdev_ops *ops = dev->switchdev_ops; struct net_device *lower_dev; + struct net_device *port_dev; struct list_head *iter; - int ret = 0, err = 0; - if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD)) - return ret; + /* Recusively search down until we find a sw port dev. + * (A sw port dev supports switchdev_port_attr_get). + */ + + if (ops && ops->switchdev_port_attr_get) + return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags); - if (err && err != -EOPNOTSUPP) - ret = err; + port_dev = switchdev_get_lowest_dev(lower_dev); + if (port_dev) + return port_dev; + } + + return NULL; +} + +static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) +{ + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + }; + struct switchdev_attr prev_attr; + struct net_device *dev = NULL; + int nhsel; + + /* For this route, all nexthop devs must be on the same switch. 
*/ + + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (!nh->nh_dev) + return NULL; + + dev = switchdev_get_lowest_dev(nh->nh_dev); + if (!dev) + return NULL; + + if (switchdev_port_attr_get(dev, &attr)) + return NULL; + + if (nhsel > 0) { + if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) + return NULL; + if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, + attr.u.ppid.id_len)) + return NULL; + } + + prev_attr = attr; } - return ret; + return dev; +} + +/** + * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry + * + * @dst: route's IPv4 destination address + * @dst_len: destination address length (prefix length) + * @fi: route FIB info structure + * @tos: route TOS + * @type: route type + * @nlflags: netlink flags passed in (NLM_F_*) + * @tb_id: route table ID + * + * Add/modify switch IPv4 route entry. + */ +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id) +{ + struct switchdev_obj fib_obj = { + .id = SWITCHDEV_OBJ_IPV4_FIB, + .u.ipv4_fib = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .nlflags = nlflags, + .tb_id = tb_id, + }, + }; + struct net_device *dev; + int err = 0; + + /* Don't offload route if using custom ip rules or if + * IPv4 FIB offloading has been disabled completely. + */ + +#ifdef CONFIG_IP_MULTIPLE_TABLES + if (fi->fib_net->ipv4.fib_has_custom_rules) + return 0; +#endif + + if (fi->fib_net->ipv4.fib_offload_disabled) + return 0; + + dev = switchdev_get_dev_by_nhs(fi); + if (!dev) + return 0; + + err = switchdev_port_obj_add(dev, &fib_obj); + if (!err) + fi->fib_flags |= RTNH_F_OFFLOAD; + + return err == -EOPNOTSUPP ? 
0 : err; +} +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); + +/** + * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch + * + * @dst: route's IPv4 destination address + * @dst_len: destination address length (prefix length) + * @fi: route FIB info structure + * @tos: route TOS + * @type: route type + * @tb_id: route table ID + * + * Delete IPv4 route entry from switch device. + */ +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + struct switchdev_obj fib_obj = { + .id = SWITCHDEV_OBJ_IPV4_FIB, + .u.ipv4_fib = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .nlflags = 0, + .tb_id = tb_id, + }, + }; + struct net_device *dev; + int err = 0; + + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) + return 0; + + dev = switchdev_get_dev_by_nhs(fi); + if (!dev) + return 0; + + err = switchdev_port_obj_del(dev, &fib_obj); + if (!err) + fi->fib_flags &= ~RTNH_F_OFFLOAD; + + return err == -EOPNOTSUPP ? 0 : err; +} +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); + +/** + * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation + * + * @fi: route FIB info structure + */ +void switchdev_fib_ipv4_abort(struct fib_info *fi) +{ + /* There was a problem installing this route to the offload + * device. For now, until we come up with more refined + * policy handling, abruptly end IPv4 fib offloading for + * for entire net by flushing offload device(s) of all + * IPv4 routes, and mark IPv4 fib offloading broken from + * this point forward. + */ + + fib_flush_external(fi->fib_net); + fi->fib_net->ipv4.fib_offload_disabled = true; } -EXPORT_SYMBOL(ndo_dflt_netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index 91c8a8e031db..c25a3a149dc4 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -26,3 +26,11 @@ config TIPC_MEDIA_IB help Saying Y here will enable support for running TIPC on IP-over-InfiniBand devices. 
+config TIPC_MEDIA_UDP + bool "IP/UDP media type support" + depends on TIPC + select NET_UDP_TUNNEL + help + Saying Y here will enable support for running TIPC over IP/UDP + bool + default y diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 599b1a540d2b..57e460be4692 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -10,5 +10,6 @@ tipc-y += addr.o bcast.o bearer.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ server.o socket.o +tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/tipc/addr.h b/net/tipc/addr.h index c700c2d28e09..93f7c983be33 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -41,10 +41,18 @@ #include <linux/tipc.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include "core.h" #define TIPC_ZONE_MASK 0xff000000u #define TIPC_CLUSTER_MASK 0xfffff000u +static inline u32 tipc_own_addr(struct net *net) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + + return tn->own_addr; +} + static inline u32 tipc_zone_mask(u32 addr) { return addr & TIPC_ZONE_MASK; @@ -55,6 +63,7 @@ static inline u32 tipc_cluster_mask(u32 addr) return addr & TIPC_CLUSTER_MASK; } +u32 tipc_own_addr(struct net *net); int in_own_cluster(struct net *net, u32 addr); int in_own_cluster_exact(struct net *net, u32 addr); int in_own_node(struct net *net, u32 addr); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 3e41704832de..a816382fc8af 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -62,21 +62,8 @@ static void tipc_bclink_lock(struct net *net) static void tipc_bclink_unlock(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_node *node = NULL; - if (likely(!tn->bclink->flags)) { - spin_unlock_bh(&tn->bclink->lock); - return; - } - - if (tn->bclink->flags & TIPC_BCLINK_RESET) { - tn->bclink->flags &= ~TIPC_BCLINK_RESET; - node = tipc_bclink_retransmit_to(net); - } spin_unlock_bh(&tn->bclink->lock); 
- - if (node) - tipc_link_reset_all(node); } void tipc_bclink_input(struct net *net) @@ -91,13 +78,6 @@ uint tipc_bclink_get_mtu(void) return MAX_PKT_DEFAULT_MCAST; } -void tipc_bclink_set_flags(struct net *net, unsigned int flags) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - tn->bclink->flags |= flags; -} - static u32 bcbuf_acks(struct sk_buff *buf) { return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle; @@ -128,6 +108,11 @@ void tipc_bclink_remove_node(struct net *net, u32 addr) tipc_bclink_lock(net); tipc_nmap_remove(&tn->bclink->bcast_nodes, addr); + + /* Last node? => reset backlog queue */ + if (!tn->bclink->bcast_nodes.count) + tipc_link_purge_backlog(&tn->bclink->link); + tipc_bclink_unlock(net); } @@ -136,17 +121,14 @@ static void bclink_set_last_sent(struct net *net) struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *bcl = tn->bcl; - if (bcl->next_out) - bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1); - else - bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); + bcl->silent_intv_cnt = mod(bcl->snd_nxt - 1); } u32 tipc_bclink_get_last_sent(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - return tn->bcl->fsm_msg_cnt; + return tn->bcl->silent_intv_cnt; } static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) @@ -155,7 +137,6 @@ static void bclink_update_last_sent(struct tipc_node *node, u32 seqno) seqno : node->bclink.last_sent; } - /** * tipc_bclink_retransmit_to - get most recent node to request retransmission * @@ -180,7 +161,7 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) struct sk_buff *skb; struct tipc_link *bcl = tn->bcl; - skb_queue_walk(&bcl->outqueue, skb) { + skb_queue_walk(&bcl->transmq, skb) { if (more(buf_seqno(skb), after)) { tipc_link_retransmit(bcl, skb, mod(to - after)); break; @@ -210,14 +191,17 @@ void tipc_bclink_wakeup_users(struct net *net) void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) { struct sk_buff *skb, 
*tmp; - struct sk_buff *next; unsigned int released = 0; struct net *net = n_ptr->net; struct tipc_net *tn = net_generic(net, tipc_net_id); + if (unlikely(!n_ptr->bclink.recv_permitted)) + return; + tipc_bclink_lock(net); + /* Bail out if tx queue is empty (no clean up is required) */ - skb = skb_peek(&tn->bcl->outqueue); + skb = skb_peek(&tn->bcl->transmq); if (!skb) goto exit; @@ -229,42 +213,34 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) * or both sent and unsent messages (otherwise) */ if (tn->bclink->bcast_nodes.count) - acked = tn->bcl->fsm_msg_cnt; + acked = tn->bcl->silent_intv_cnt; else - acked = tn->bcl->next_out_no; + acked = tn->bcl->snd_nxt; } else { /* * Bail out if specified sequence number does not correspond * to a message that has been sent and not yet acknowledged */ if (less(acked, buf_seqno(skb)) || - less(tn->bcl->fsm_msg_cnt, acked) || + less(tn->bcl->silent_intv_cnt, acked) || less_eq(acked, n_ptr->bclink.acked)) goto exit; } /* Skip over packets that node has previously acknowledged */ - skb_queue_walk(&tn->bcl->outqueue, skb) { + skb_queue_walk(&tn->bcl->transmq, skb) { if (more(buf_seqno(skb), n_ptr->bclink.acked)) break; } /* Update packets that node is now acknowledging */ - skb_queue_walk_from_safe(&tn->bcl->outqueue, skb, tmp) { + skb_queue_walk_from_safe(&tn->bcl->transmq, skb, tmp) { if (more(buf_seqno(skb), acked)) break; - - next = tipc_skb_queue_next(&tn->bcl->outqueue, skb); - if (skb != tn->bcl->next_out) { - bcbuf_decr_acks(skb); - } else { - bcbuf_set_acks(skb, 0); - tn->bcl->next_out = next; - bclink_set_last_sent(net); - } - + bcbuf_decr_acks(skb); + bclink_set_last_sent(net); if (bcbuf_acks(skb) == 0) { - __skb_unlink(skb, &tn->bcl->outqueue); + __skb_unlink(skb, &tn->bcl->transmq); kfree_skb(skb); released = 1; } @@ -272,7 +248,7 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) n_ptr->bclink.acked = acked; /* Try resolving broadcast link congestion, if necessary */ - if 
(unlikely(tn->bcl->next_out)) { + if (unlikely(skb_peek(&tn->bcl->backlogq))) { tipc_link_push_packets(tn->bcl); bclink_set_last_sent(net); } @@ -319,7 +295,7 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, buf = tipc_buf_acquire(INT_H_SIZE); if (buf) { struct tipc_msg *msg = buf_msg(buf); - struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferred_queue); + struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferdq); u32 to = skb ? buf_seqno(skb) - 1 : n_ptr->bclink.last_sent; tipc_msg_init(tn->own_addr, msg, BCAST_PROTOCOL, STATE_MSG, @@ -354,13 +330,12 @@ static void bclink_peek_nack(struct net *net, struct tipc_msg *msg) return; tipc_node_lock(n_ptr); - if (n_ptr->bclink.recv_permitted && (n_ptr->bclink.last_in != n_ptr->bclink.last_sent) && (n_ptr->bclink.last_in == msg_bcgap_after(msg))) n_ptr->bclink.oos_state = 2; - tipc_node_unlock(n_ptr); + tipc_node_put(n_ptr); } /* tipc_bclink_xmit - deliver buffer chain to all nodes in cluster @@ -387,14 +362,13 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list) __skb_queue_purge(list); return -EHOSTUNREACH; } - /* Broadcast to all nodes */ if (likely(bclink)) { tipc_bclink_lock(net); if (likely(bclink->bcast_nodes.count)) { rc = __tipc_link_xmit(net, bcl, list); if (likely(!rc)) { - u32 len = skb_queue_len(&bcl->outqueue); + u32 len = skb_queue_len(&bcl->transmq); bclink_set_last_sent(net); bcl->stats.queue_sz_counts++; @@ -440,7 +414,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) */ if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) { tipc_link_proto_xmit(node->active_links[node->addr & 1], - STATE_MSG, 0, 0, 0, 0, 0); + STATE_MSG, 0, 0, 0, 0); tn->bcl->stats.sent_acks++; } } @@ -481,17 +455,18 @@ void tipc_bclink_rcv(struct net *net, struct sk_buff *buf) goto unlock; if (msg_destnode(msg) == tn->own_addr) { tipc_bclink_acknowledge(node, msg_bcast_ack(msg)); - tipc_node_unlock(node); tipc_bclink_lock(net); bcl->stats.recv_nacks++; tn->bclink->retransmit_to = node; 
bclink_retransmit_pkt(tn, msg_bcgap_after(msg), msg_bcgap_to(msg)); tipc_bclink_unlock(net); + tipc_node_unlock(node); } else { tipc_node_unlock(node); bclink_peek_nack(net, msg); } + tipc_node_put(node); goto exit; } @@ -528,11 +503,13 @@ receive: tipc_bclink_unlock(net); tipc_node_unlock(node); } else if (msg_user(msg) == MSG_FRAGMENTER) { - tipc_buf_append(&node->bclink.reasm_buf, &buf); - if (unlikely(!buf && !node->bclink.reasm_buf)) - goto unlock; tipc_bclink_lock(net); bclink_accept_pkt(node, seqno); + tipc_buf_append(&node->bclink.reasm_buf, &buf); + if (unlikely(!buf && !node->bclink.reasm_buf)) { + tipc_bclink_unlock(net); + goto unlock; + } bcl->stats.recv_fragments++; if (buf) { bcl->stats.recv_fragmented++; @@ -559,25 +536,25 @@ receive: if (node->bclink.last_in == node->bclink.last_sent) goto unlock; - if (skb_queue_empty(&node->bclink.deferred_queue)) { + if (skb_queue_empty(&node->bclink.deferdq)) { node->bclink.oos_state = 1; goto unlock; } - msg = buf_msg(skb_peek(&node->bclink.deferred_queue)); + msg = buf_msg(skb_peek(&node->bclink.deferdq)); seqno = msg_seqno(msg); next_in = mod(next_in + 1); if (seqno != next_in) goto unlock; /* Take in-sequence message from deferred queue & deliver it */ - buf = __skb_dequeue(&node->bclink.deferred_queue); + buf = __skb_dequeue(&node->bclink.deferdq); goto receive; } /* Handle out-of-sequence broadcast message */ if (less(next_in, seqno)) { - deferred = tipc_link_defer_pkt(&node->bclink.deferred_queue, + deferred = tipc_link_defer_pkt(&node->bclink.deferdq, buf); bclink_update_last_sent(node, seqno); buf = NULL; @@ -594,6 +571,7 @@ receive: unlock: tipc_node_unlock(node); + tipc_node_put(node); exit: kfree_skb(buf); } @@ -634,7 +612,6 @@ static int tipc_bcbearer_send(struct net *net, struct sk_buff *buf, msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tn->net_id); tn->bcl->stats.sent_info++; - if (WARN_ON(!bclink->bcast_nodes.count)) { dump_stack(); return 0; @@ -827,15 +804,15 @@ int 
tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->next_in_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->next_out_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) goto attr_msg_full; prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->queue_limit[0])) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) goto prop_msg_full; nla_nest_end(msg->skb, prop); @@ -890,6 +867,27 @@ int tipc_bclink_set_queue_limits(struct net *net, u32 limit) return 0; } +int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) +{ + int err; + u32 win; + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + if (!attrs[TIPC_NLA_LINK_PROP]) + return -EINVAL; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); + if (err) + return err; + + if (!props[TIPC_NLA_PROP_WIN]) + return -EOPNOTSUPP; + + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + + return tipc_bclink_set_queue_limits(net, win); +} + int tipc_bclink_init(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -913,16 +911,17 @@ int tipc_bclink_init(struct net *net) sprintf(bcbearer->media.name, "tipc-broadcast"); spin_lock_init(&bclink->lock); - __skb_queue_head_init(&bcl->outqueue); - __skb_queue_head_init(&bcl->deferred_queue); + __skb_queue_head_init(&bcl->transmq); + __skb_queue_head_init(&bcl->backlogq); + __skb_queue_head_init(&bcl->deferdq); skb_queue_head_init(&bcl->wakeupq); - bcl->next_out_no = 1; + bcl->snd_nxt = 1; spin_lock_init(&bclink->node.lock); __skb_queue_head_init(&bclink->arrvq); skb_queue_head_init(&bclink->inputq); bcl->owner = &bclink->node; bcl->owner->net = net; - bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; + bcl->mtu = 
MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); bcl->bearer_id = MAX_BEARERS; rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 43f397fbac55..3c290a48f720 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -55,7 +55,6 @@ struct tipc_bcbearer_pair { struct tipc_bearer *secondary; }; -#define TIPC_BCLINK_RESET 1 #define BCBEARER MAX_BEARERS /** @@ -86,7 +85,6 @@ struct tipc_bcbearer { * @lock: spinlock governing access to structure * @link: (non-standard) broadcast link structure * @node: (non-standard) node structure representing b'cast link's peer node - * @flags: represent bclink states * @bcast_nodes: map of broadcast-capable nodes * @retransmit_to: node that most recently requested a retransmit * @@ -96,7 +94,6 @@ struct tipc_bclink { spinlock_t lock; struct tipc_link link; struct tipc_node node; - unsigned int flags; struct sk_buff_head arrvq; struct sk_buff_head inputq; struct tipc_node_map bcast_nodes; @@ -117,7 +114,6 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, int tipc_bclink_init(struct net *net); void tipc_bclink_stop(struct net *net); -void tipc_bclink_set_flags(struct net *tn, unsigned int flags); void tipc_bclink_add_node(struct net *net, u32 addr); void tipc_bclink_remove_node(struct net *net, u32 addr); struct tipc_node *tipc_bclink_retransmit_to(struct net *tn); @@ -135,6 +131,7 @@ uint tipc_bclink_get_mtu(void); int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list); void tipc_bclink_wakeup_users(struct net *net); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); +int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); void tipc_bclink_input(struct net *net); #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 48852c2dcc03..00bc0e620532 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -48,6 +48,9 @@ static struct tipc_media * const media_info_array[] = { #ifdef 
CONFIG_TIPC_MEDIA_IB &ib_media_info, #endif +#ifdef CONFIG_TIPC_MEDIA_UDP + &udp_media_info, +#endif NULL }; @@ -68,8 +71,7 @@ static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } }; -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, - bool shutting_down); +static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr); /** * tipc_media_find - locates specified media object by name @@ -216,7 +218,8 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) * tipc_enable_bearer - enable bearer with the given name */ static int tipc_enable_bearer(struct net *net, const char *name, - u32 disc_domain, u32 priority) + u32 disc_domain, u32 priority, + struct nlattr *attr[]) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b_ptr; @@ -304,7 +307,7 @@ restart: strcpy(b_ptr->name, name); b_ptr->media = m_ptr; - res = m_ptr->enable_media(net, b_ptr); + res = m_ptr->enable_media(net, b_ptr, attr); if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); @@ -320,7 +323,7 @@ restart: res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr); if (res) { - bearer_disable(net, b_ptr, false); + bearer_disable(net, b_ptr); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", name); return -EINVAL; @@ -340,7 +343,7 @@ restart: static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) { pr_info("Resetting bearer <%s>\n", b_ptr->name); - tipc_link_reset_list(net, b_ptr->identity); + tipc_link_delete_list(net, b_ptr->identity); tipc_disc_reset(net, b_ptr); return 0; } @@ -350,8 +353,7 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) * * Note: This routine assumes caller holds RTNL lock. 
*/ -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, - bool shutting_down) +static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr) { struct tipc_net *tn = net_generic(net, tipc_net_id); u32 i; @@ -359,7 +361,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, pr_info("Disabling bearer <%s>\n", b_ptr->name); b_ptr->media->disable_media(b_ptr); - tipc_link_delete_list(net, b_ptr->identity, shutting_down); + tipc_link_delete_list(net, b_ptr->identity); if (b_ptr->link_req) tipc_disc_delete(b_ptr->link_req); @@ -372,7 +374,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr, kfree_rcu(b_ptr, rcu); } -int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b) +int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, + struct nlattr *attr[]) { struct net_device *dev; char *driver_name = strchr((const char *)b->name, ':') + 1; @@ -536,7 +539,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b_ptr, false); + bearer_disable(dev_net(dev), b_ptr); break; } return NOTIFY_OK; @@ -578,7 +581,7 @@ void tipc_bearer_stop(struct net *net) for (i = 0; i < MAX_BEARERS; i++) { b_ptr = rtnl_dereference(tn->bearer_list[i]); if (b_ptr) { - bearer_disable(net, b_ptr, true); + bearer_disable(net, b_ptr); tn->bearer_list[i] = NULL; } } @@ -586,14 +589,14 @@ void tipc_bearer_stop(struct net *net) /* Caller should hold rtnl_lock to protect the bearer */ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, - struct tipc_bearer *bearer) + struct tipc_bearer *bearer, int nlflags) { void *hdr; struct nlattr *attrs; struct nlattr *prop; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_BEARER_GET); + nlflags, TIPC_NL_BEARER_GET); if (!hdr) return -EMSGSIZE; @@ -652,7 +655,7 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct 
netlink_callback *cb) if (!bearer) continue; - err = __tipc_nl_add_bearer(&msg, bearer); + err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI); if (err) break; } @@ -700,7 +703,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) goto err_out; } - err = __tipc_nl_add_bearer(&msg, bearer); + err = __tipc_nl_add_bearer(&msg, bearer, 0); if (err) goto err_out; rtnl_unlock(); @@ -742,7 +745,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - bearer_disable(net, bearer, false); + bearer_disable(net, bearer); rtnl_unlock(); return 0; @@ -791,7 +794,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) } rtnl_lock(); - err = tipc_enable_bearer(net, bearer, domain, prio); + err = tipc_enable_bearer(net, bearer, domain, prio, attrs); if (err) { rtnl_unlock(); return err; @@ -807,7 +810,7 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) char *name; struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; - struct net *net = genl_info_net(info); + struct net *net = sock_net(skb->sk); if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; @@ -852,14 +855,14 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) } static int __tipc_nl_add_media(struct tipc_nl_msg *msg, - struct tipc_media *media) + struct tipc_media *media, int nlflags) { void *hdr; struct nlattr *attrs; struct nlattr *prop; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_MEDIA_GET); + nlflags, TIPC_NL_MEDIA_GET); if (!hdr) return -EMSGSIZE; @@ -911,7 +914,8 @@ int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); for (; media_info_array[i] != NULL; i++) { - err = __tipc_nl_add_media(&msg, media_info_array[i]); + err = __tipc_nl_add_media(&msg, media_info_array[i], + NLM_F_MULTI); if (err) break; } @@ -958,7 +962,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) goto err_out; } - 
err = __tipc_nl_add_media(&msg, media); + err = __tipc_nl_add_media(&msg, media, 0); if (err) goto err_out; rtnl_unlock(); diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 6b17795ff8bc..dc714d977768 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -38,10 +38,10 @@ #define _TIPC_BEARER_H #include "netlink.h" +#include "core.h" #include <net/genetlink.h> -#define MAX_BEARERS 2 -#define MAX_MEDIA 2 +#define MAX_MEDIA 3 #define MAX_NODES 4096 #define WSIZE 32 @@ -50,14 +50,16 @@ * - the field's actual content and length is defined per media * - remaining unused bytes in the field are set to zero */ -#define TIPC_MEDIA_ADDR_SIZE 32 +#define TIPC_MEDIA_INFO_SIZE 32 #define TIPC_MEDIA_TYPE_OFFSET 3 +#define TIPC_MEDIA_ADDR_OFFSET 4 /* * Identifiers of supported TIPC media types */ #define TIPC_MEDIA_TYPE_ETH 1 #define TIPC_MEDIA_TYPE_IB 2 +#define TIPC_MEDIA_TYPE_UDP 3 /** * struct tipc_node_map - set of node identifiers @@ -76,7 +78,7 @@ struct tipc_node_map { * @broadcast: non-zero if address is a broadcast address */ struct tipc_media_addr { - u8 value[TIPC_MEDIA_ADDR_SIZE]; + u8 value[TIPC_MEDIA_INFO_SIZE]; u8 media_id; u8 broadcast; }; @@ -103,7 +105,8 @@ struct tipc_media { int (*send_msg)(struct net *net, struct sk_buff *buf, struct tipc_bearer *b_ptr, struct tipc_media_addr *dest); - int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr); + int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr, + struct nlattr *attr[]); void (*disable_media)(struct tipc_bearer *b_ptr); int (*addr2str)(struct tipc_media_addr *addr, char *strbuf, @@ -182,6 +185,9 @@ extern struct tipc_media eth_media_info; #ifdef CONFIG_TIPC_MEDIA_IB extern struct tipc_media ib_media_info; #endif +#ifdef CONFIG_TIPC_MEDIA_UDP +extern struct tipc_media udp_media_info; +#endif int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); @@ -196,7 +202,8 @@ int tipc_nl_media_set(struct 
sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); -int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b); +int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, + struct nlattr *attrs[]); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); diff --git a/net/tipc/core.c b/net/tipc/core.c index be1c9fa60b09..005ba5eb0ea4 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -68,7 +68,7 @@ static int __net_init tipc_init_net(struct net *net) if (err) goto out_nametbl; - err = tipc_subscr_start(net); + err = tipc_topsrv_start(net); if (err) goto out_subscr; return 0; @@ -83,7 +83,7 @@ out_sk_rht: static void __net_exit tipc_exit_net(struct net *net) { - tipc_subscr_stop(net); + tipc_topsrv_stop(net); tipc_net_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); diff --git a/net/tipc/core.h b/net/tipc/core.h index 3dc68c7a966d..0fcf133d5cb7 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -60,16 +60,19 @@ #include <net/netns/generic.h> #include <linux/rhashtable.h> -#include "node.h" -#include "bearer.h" -#include "bcast.h" -#include "netlink.h" -#include "link.h" -#include "node.h" -#include "msg.h" +struct tipc_node; +struct tipc_bearer; +struct tipc_bcbearer; +struct tipc_bclink; +struct tipc_link; +struct tipc_name_table; +struct tipc_server; #define TIPC_MOD_VER "2.0.0" +#define NODE_HTABLE_SIZE 512 +#define MAX_BEARERS 3 + extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; @@ -106,6 +109,26 @@ struct tipc_net { atomic_t subscription_count; }; +static inline u16 mod(u16 x) +{ + return x & 0xffffu; +} + +static inline int less_eq(u16 left, u16 right) +{ + 
return mod(right - left) < 32768u; +} + +static inline int more(u16 left, u16 right) +{ + return !less_eq(left, right); +} + +static inline int less(u16 left, u16 right) +{ + return less_eq(left, right) && (mod(right) != mod(left)); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index feef3753615d..967e292f53c8 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -86,9 +86,10 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, msg = buf_msg(buf); tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type, - INT_H_SIZE, dest_domain); + MAX_H_SIZE, dest_domain); msg_set_non_seq(msg, 1); msg_set_node_sig(msg, tn->random); + msg_set_node_capabilities(msg, 0); msg_set_dest_domain(msg, dest_domain); msg_set_bc_netid(msg, tn->net_id); b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr); @@ -133,6 +134,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, u32 net_id = msg_bc_netid(msg); u32 mtyp = msg_type(msg); u32 signature = msg_node_sig(msg); + u16 caps = msg_node_capabilities(msg); bool addr_match = false; bool sign_match = false; bool link_up = false; @@ -167,6 +169,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, if (!node) return; tipc_node_lock(node); + node->capabilities = caps; link = node->links[bearer->identity]; /* Prepare to validate requesting node's signature and media address */ @@ -249,7 +252,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { - rbuf = tipc_buf_acquire(INT_H_SIZE); + rbuf = tipc_buf_acquire(MAX_H_SIZE); if (rbuf) { tipc_disc_init_msg(net, rbuf, DSC_RESP_MSG, bearer); tipc_bearer_send(net, bearer->identity, rbuf, &maddr); @@ -257,6 +260,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, } } tipc_node_unlock(node); + tipc_node_put(node); } /** @@ -359,8 +363,7 @@ int tipc_disc_create(struct 
net *net, struct tipc_bearer *b_ptr, req = kmalloc(sizeof(*req), GFP_ATOMIC); if (!req) return -ENOMEM; - - req->buf = tipc_buf_acquire(INT_H_SIZE); + req->buf = tipc_buf_acquire(MAX_H_SIZE); if (!req->buf) { kfree(req); return -ENOMEM; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 5e1426f1751f..f69a2fde9f4a 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -37,8 +37,6 @@ #include "core.h" #include "bearer.h" -#define ETH_ADDR_OFFSET 4 /* MAC addr position inside address field */ - /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) @@ -53,9 +51,9 @@ static int tipc_eth_addr2str(struct tipc_media_addr *addr, /* Convert from media address format to discovery message addr format */ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { - memset(msg, 0, TIPC_MEDIA_ADDR_SIZE); + memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; - memcpy(msg + ETH_ADDR_OFFSET, addr->value, ETH_ALEN); + memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } @@ -79,7 +77,7 @@ static int tipc_eth_msg2addr(struct tipc_bearer *b, char *msg) { /* Skip past preamble: */ - msg += ETH_ADDR_OFFSET; + msg += TIPC_MEDIA_ADDR_OFFSET; return tipc_eth_raw2addr(b, addr, msg); } diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index 8522eef9c136..e8c16718e3fa 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -57,7 +57,7 @@ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, /* Convert from media address format to discovery message addr format */ static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) { - memset(msg, 0, TIPC_MEDIA_ADDR_SIZE); + memset(msg, 0, TIPC_MEDIA_INFO_SIZE); memcpy(msg, addr->value, INFINIBAND_ALEN); return 0; } diff --git a/net/tipc/link.c b/net/tipc/link.c index 14f09b3cb87c..eaa9fe54b4ae 100644 --- a/net/tipc/link.c +++ 
b/net/tipc/link.c @@ -1,7 +1,7 @@ /* * net/tipc/link.c: TIPC link code * - * Copyright (c) 1996-2007, 2012-2014, Ericsson AB + * Copyright (c) 1996-2007, 2012-2015, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -35,6 +35,7 @@ */ #include "core.h" +#include "subscr.h" #include "link.h" #include "bcast.h" #include "socket.h" @@ -85,27 +86,17 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { */ #define STARTING_EVT 856384768 /* link processing trigger */ #define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ -#define TIMEOUT_EVT 560817u /* link timer expired */ +#define SILENCE_EVT 560817u /* timer dicovered silence from peer */ /* - * The following two 'message types' is really just implementation - * data conveniently stored in the message header. - * They must not be considered part of the protocol + * State value stored in 'failover_pkts' */ -#define OPEN_MSG 0 -#define CLOSED_MSG 1 - -/* - * State value stored in 'exp_msg_count' - */ -#define START_CHANGEOVER 100000u +#define FIRST_FAILOVER 0xffffu static void link_handle_out_of_seq_msg(struct tipc_link *link, struct sk_buff *skb); static void tipc_link_proto_rcv(struct tipc_link *link, struct sk_buff *skb); -static int tipc_link_tunnel_rcv(struct tipc_node *node, - struct sk_buff **skb); static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol); static void link_state_event(struct tipc_link *l_ptr, u32 event); static void link_reset_statistics(struct tipc_link *l_ptr); @@ -114,7 +105,8 @@ static void tipc_link_sync_xmit(struct tipc_link *l); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); - +static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); +static void link_set_timer(struct tipc_link *link, unsigned long time); /* * Simple link 
routines */ @@ -138,32 +130,11 @@ static void tipc_link_put(struct tipc_link *l_ptr) kref_put(&l_ptr->ref, tipc_link_release); } -static void link_init_max_pkt(struct tipc_link *l_ptr) +static struct tipc_link *tipc_parallel_link(struct tipc_link *l) { - struct tipc_node *node = l_ptr->owner; - struct tipc_net *tn = net_generic(node->net, tipc_net_id); - struct tipc_bearer *b_ptr; - u32 max_pkt; - - rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]); - if (!b_ptr) { - rcu_read_unlock(); - return; - } - max_pkt = (b_ptr->mtu & ~3); - rcu_read_unlock(); - - if (max_pkt > MAX_MSG_SIZE) - max_pkt = MAX_MSG_SIZE; - - l_ptr->max_pkt_target = max_pkt; - if (l_ptr->max_pkt_target < MAX_PKT_DEFAULT) - l_ptr->max_pkt = l_ptr->max_pkt_target; - else - l_ptr->max_pkt = MAX_PKT_DEFAULT; - - l_ptr->max_pkt_probes = 0; + if (l->owner->active_links[0] != l) + return l->owner->active_links[0]; + return l->owner->active_links[1]; } /* @@ -194,10 +165,10 @@ static void link_timeout(unsigned long data) tipc_node_lock(l_ptr->owner); /* update counters used in statistical profiling of send traffic */ - l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->outqueue); + l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq); l_ptr->stats.queue_sz_counts++; - skb = skb_peek(&l_ptr->outqueue); + skb = skb_peek(&l_ptr->transmq); if (skb) { struct tipc_msg *msg = buf_msg(skb); u32 length = msg_size(msg); @@ -227,11 +198,12 @@ static void link_timeout(unsigned long data) } /* do all other link processing performed on a periodic basis */ - link_state_event(l_ptr, TIMEOUT_EVT); - - if (l_ptr->next_out) + if (l_ptr->silent_intv_cnt || tipc_bclink_acks_missing(l_ptr->owner)) + link_state_event(l_ptr, SILENCE_EVT); + l_ptr->silent_intv_cnt++; + if (skb_queue_len(&l_ptr->backlogq)) tipc_link_push_packets(l_ptr); - + link_set_timer(l_ptr, l_ptr->keepalive_intv); tipc_node_unlock(l_ptr->owner); tipc_link_put(l_ptr); } @@ -263,8 +235,8 @@ struct tipc_link 
*tipc_link_create(struct tipc_node *n_ptr, if (n_ptr->link_cnt >= MAX_BEARERS) { tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_err("Attempt to establish %uth link to %s. Max %u allowed.\n", - n_ptr->link_cnt, addr_string, MAX_BEARERS); + pr_err("Cannot establish %uth link to %s. Max %u allowed.\n", + n_ptr->link_cnt, addr_string, MAX_BEARERS); return NULL; } @@ -291,7 +263,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, /* note: peer i/f name is updated by reset/activate message */ memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); l_ptr->owner = n_ptr; - l_ptr->checkpoint = 1; l_ptr->peer_session = INVALID_SESSION; l_ptr->bearer_id = b_ptr->identity; link_set_supervision_props(l_ptr, b_ptr->tolerance); @@ -305,16 +276,15 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, msg_set_session(msg, (tn->random & 0xffff)); msg_set_bearer_id(msg, b_ptr->identity); strcpy((char *)msg_data(msg), if_name); - + l_ptr->net_plane = b_ptr->net_plane; + l_ptr->advertised_mtu = b_ptr->mtu; + l_ptr->mtu = l_ptr->advertised_mtu; l_ptr->priority = b_ptr->priority; tipc_link_set_queue_limits(l_ptr, b_ptr->window); - - l_ptr->net_plane = b_ptr->net_plane; - link_init_max_pkt(l_ptr); - - l_ptr->next_out_no = 1; - __skb_queue_head_init(&l_ptr->outqueue); - __skb_queue_head_init(&l_ptr->deferred_queue); + l_ptr->snd_nxt = 1; + __skb_queue_head_init(&l_ptr->transmq); + __skb_queue_head_init(&l_ptr->backlogq); + __skb_queue_head_init(&l_ptr->deferdq); skb_queue_head_init(&l_ptr->wakeupq); skb_queue_head_init(&l_ptr->inputq); skb_queue_head_init(&l_ptr->namedq); @@ -327,19 +297,22 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, } /** - * link_delete - Conditional deletion of link. 
- * If timer still running, real delete is done when it expires - * @link: link to be deleted + * tipc_link_delete - Delete a link + * @l: link to be deleted */ -void tipc_link_delete(struct tipc_link *link) +void tipc_link_delete(struct tipc_link *l) { - tipc_link_reset_fragments(link); - tipc_node_detach_link(link->owner, link); - tipc_link_put(link); + tipc_link_reset(l); + if (del_timer(&l->timer)) + tipc_link_put(l); + l->flags |= LINK_STOPPED; + /* Delete link now, or when timer is finished: */ + tipc_link_reset_fragments(l); + tipc_node_detach_link(l->owner, l); + tipc_link_put(l); } -void tipc_link_delete_list(struct net *net, unsigned int bearer_id, - bool shutting_down) +void tipc_link_delete_list(struct net *net, unsigned int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *link; @@ -349,16 +322,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_lock(node); link = node->links[bearer_id]; - if (!link) { - tipc_node_unlock(node); - continue; - } - tipc_link_reset(link); - if (del_timer(&link->timer)) - tipc_link_put(link); - link->flags |= LINK_STOPPED; - /* Delete link now, or when failover is finished: */ - if (shutting_down || !tipc_node_is_up(node)) + if (link) tipc_link_delete(link); tipc_node_unlock(node); } @@ -366,28 +330,43 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id, } /** - * link_schedule_user - schedule user for wakeup after congestion + * link_schedule_user - schedule a message sender for wakeup after congestion * @link: congested link - * @oport: sending port - * @chain_sz: size of buffer chain that was attempted sent - * @imp: importance of message attempted sent + * @list: message that was attempted sent * Create pseudo msg to send back to user when congestion abates + * Only consumes message if there is an error */ -static bool link_schedule_user(struct tipc_link *link, u32 oport, - uint 
chain_sz, uint imp) +static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) { - struct sk_buff *buf; + struct tipc_msg *msg = buf_msg(skb_peek(list)); + int imp = msg_importance(msg); + u32 oport = msg_origport(msg); + u32 addr = link_own_addr(link); + struct sk_buff *skb; - buf = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, - link_own_addr(link), link_own_addr(link), - oport, 0, 0); - if (!buf) - return false; - TIPC_SKB_CB(buf)->chain_sz = chain_sz; - TIPC_SKB_CB(buf)->chain_imp = imp; - skb_queue_tail(&link->wakeupq, buf); + /* This really cannot happen... */ + if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { + pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); + tipc_link_reset(link); + goto err; + } + /* Non-blocking sender: */ + if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) + return -ELINKCONG; + + /* Create and schedule wakeup pseudo message */ + skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, + addr, addr, oport, 0, 0); + if (!skb) + goto err; + TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); + TIPC_SKB_CB(skb)->chain_imp = imp; + skb_queue_tail(&link->wakeupq, skb); link->stats.link_congs++; - return true; + return -ELINKCONG; +err: + __skb_queue_purge(list); + return -ENOBUFS; } /** @@ -396,19 +375,22 @@ static bool link_schedule_user(struct tipc_link *link, u32 oport, * Move a number of waiting users, as permitted by available space in * the send queue, from link wait queue to node wait queue for wakeup */ -void link_prepare_wakeup(struct tipc_link *link) +void link_prepare_wakeup(struct tipc_link *l) { - uint pend_qsz = skb_queue_len(&link->outqueue); + int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; + int imp, lim; struct sk_buff *skb, *tmp; - skb_queue_walk_safe(&link->wakeupq, skb, tmp) { - if (pend_qsz >= link->queue_limit[TIPC_SKB_CB(skb)->chain_imp]) + skb_queue_walk_safe(&l->wakeupq, skb, tmp) { + imp = TIPC_SKB_CB(skb)->chain_imp; + lim = l->window + l->backlog[imp].limit; + pnd[imp] += 
TIPC_SKB_CB(skb)->chain_sz; + if ((pnd[imp] + l->backlog[imp].len) >= lim) break; - pend_qsz += TIPC_SKB_CB(skb)->chain_sz; - skb_unlink(skb, &link->wakeupq); - skb_queue_tail(&link->inputq, skb); - link->owner->inputq = &link->inputq; - link->owner->action_flags |= TIPC_MSG_EVT; + skb_unlink(skb, &l->wakeupq); + skb_queue_tail(&l->inputq, skb); + l->owner->inputq = &l->inputq; + l->owner->action_flags |= TIPC_MSG_EVT; } } @@ -422,31 +404,42 @@ void tipc_link_reset_fragments(struct tipc_link *l_ptr) l_ptr->reasm_buf = NULL; } +void tipc_link_purge_backlog(struct tipc_link *l) +{ + __skb_queue_purge(&l->backlogq); + l->backlog[TIPC_LOW_IMPORTANCE].len = 0; + l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; + l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; + l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; + l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; +} + /** * tipc_link_purge_queues - purge all pkt queues associated with link * @l_ptr: pointer to link */ void tipc_link_purge_queues(struct tipc_link *l_ptr) { - __skb_queue_purge(&l_ptr->deferred_queue); - __skb_queue_purge(&l_ptr->outqueue); + __skb_queue_purge(&l_ptr->deferdq); + __skb_queue_purge(&l_ptr->transmq); + tipc_link_purge_backlog(l_ptr); tipc_link_reset_fragments(l_ptr); } void tipc_link_reset(struct tipc_link *l_ptr) { u32 prev_state = l_ptr->state; - u32 checkpoint = l_ptr->next_in_no; int was_active_link = tipc_link_is_active(l_ptr); struct tipc_node *owner = l_ptr->owner; + struct tipc_link *pl = tipc_parallel_link(l_ptr); msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); /* Link is down, accept any session */ l_ptr->peer_session = INVALID_SESSION; - /* Prepare for max packet size negotiation */ - link_init_max_pkt(l_ptr); + /* Prepare for renewed mtu size negotiation */ + l_ptr->mtu = l_ptr->advertised_mtu; l_ptr->state = RESET_UNKNOWN; @@ -456,51 +449,39 @@ void tipc_link_reset(struct tipc_link *l_ptr) tipc_node_link_down(l_ptr->owner, l_ptr); tipc_bearer_remove_dest(owner->net, 
l_ptr->bearer_id, l_ptr->addr); - if (was_active_link && tipc_node_active_links(l_ptr->owner)) { - l_ptr->reset_checkpoint = checkpoint; - l_ptr->exp_msg_count = START_CHANGEOVER; + if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { + l_ptr->flags |= LINK_FAILINGOVER; + l_ptr->failover_checkpt = l_ptr->rcv_nxt; + pl->failover_pkts = FIRST_FAILOVER; + pl->failover_checkpt = l_ptr->rcv_nxt; + pl->failover_skb = l_ptr->reasm_buf; + } else { + kfree_skb(l_ptr->reasm_buf); } - /* Clean up all queues, except inputq: */ - __skb_queue_purge(&l_ptr->outqueue); - __skb_queue_purge(&l_ptr->deferred_queue); + __skb_queue_purge(&l_ptr->transmq); + __skb_queue_purge(&l_ptr->deferdq); if (!owner->inputq) owner->inputq = &l_ptr->inputq; skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; - l_ptr->next_out = NULL; - l_ptr->unacked_window = 0; - l_ptr->checkpoint = 1; - l_ptr->next_out_no = 1; - l_ptr->fsm_msg_cnt = 0; + tipc_link_purge_backlog(l_ptr); + l_ptr->reasm_buf = NULL; + l_ptr->rcv_unacked = 0; + l_ptr->snd_nxt = 1; + l_ptr->silent_intv_cnt = 0; l_ptr->stale_count = 0; link_reset_statistics(l_ptr); } -void tipc_link_reset_list(struct net *net, unsigned int bearer_id) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *l_ptr; - struct tipc_node *n_ptr; - - rcu_read_lock(); - list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->links[bearer_id]; - if (l_ptr) - tipc_link_reset(l_ptr); - tipc_node_unlock(n_ptr); - } - rcu_read_unlock(); -} - static void link_activate(struct tipc_link *link) { struct tipc_node *node = link->owner; - link->next_in_no = 1; + link->rcv_nxt = 1; link->stats.recv_info = 1; + link->silent_intv_cnt = 0; tipc_node_link_up(node, link); tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); } @@ -513,7 +494,7 @@ static void link_activate(struct tipc_link *link) static void 
link_state_event(struct tipc_link *l_ptr, unsigned int event) { struct tipc_link *other; - unsigned long cont_intv = l_ptr->cont_intv; + unsigned long timer_intv = l_ptr->keepalive_intv; if (l_ptr->flags & LINK_STOPPED) return; @@ -521,50 +502,33 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) return; /* Not yet. */ - /* Check whether changeover is going on */ - if (l_ptr->exp_msg_count) { - if (event == TIMEOUT_EVT) - link_set_timer(l_ptr, cont_intv); + if (l_ptr->flags & LINK_FAILINGOVER) return; - } switch (l_ptr->state) { case WORKING_WORKING: switch (event) { case TRAFFIC_MSG_EVT: case ACTIVATE_MSG: + l_ptr->silent_intv_cnt = 0; break; - case TIMEOUT_EVT: - if (l_ptr->next_in_no != l_ptr->checkpoint) { - l_ptr->checkpoint = l_ptr->next_in_no; - if (tipc_bclink_acks_missing(l_ptr->owner)) { + case SILENCE_EVT: + if (!l_ptr->silent_intv_cnt) { + if (tipc_bclink_acks_missing(l_ptr->owner)) tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - } else if (l_ptr->max_pkt < l_ptr->max_pkt_target) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - } - link_set_timer(l_ptr, cont_intv); + 0, 0, 0, 0); break; } l_ptr->state = WORKING_UNKNOWN; - l_ptr->fsm_msg_cnt = 0; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv / 4); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); break; case RESET_MSG: pr_debug("%s<%s>, requested by peer\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + 0, 0, 0, 0); break; default: pr_debug("%s%u in WW state\n", link_unk_evt, event); @@ -575,46 +539,33 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) case 
TRAFFIC_MSG_EVT: case ACTIVATE_MSG: l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - link_set_timer(l_ptr, cont_intv); + l_ptr->silent_intv_cnt = 0; break; case RESET_MSG: pr_debug("%s<%s>, requested by peer while probing\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + 0, 0, 0, 0); break; - case TIMEOUT_EVT: - if (l_ptr->next_in_no != l_ptr->checkpoint) { + case SILENCE_EVT: + if (!l_ptr->silent_intv_cnt) { l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; - l_ptr->checkpoint = l_ptr->next_in_no; - if (tipc_bclink_acks_missing(l_ptr->owner)) { + if (tipc_bclink_acks_missing(l_ptr->owner)) tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - } - link_set_timer(l_ptr, cont_intv); - } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) { + 0, 0, 0, 0); + } else if (l_ptr->silent_intv_cnt < + l_ptr->abort_limit) { tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv / 4); + 1, 0, 0, 0); } else { /* Link has failed */ pr_debug("%s<%s>, peer not responding\n", link_rst_msg, l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_UNKNOWN; - l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, RESET_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + 0, 0, 0, 0); } break; default: @@ -630,31 +581,22 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, cont_intv); break; case 
RESET_MSG: l_ptr->state = RESET_RESET; - l_ptr->fsm_msg_cnt = 0; tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 1, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + 1, 0, 0, 0); break; case STARTING_EVT: l_ptr->flags |= LINK_STARTED; - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + link_set_timer(l_ptr, timer_intv); break; - case TIMEOUT_EVT: - tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + case SILENCE_EVT: + tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); break; default: pr_err("%s%u in RU state\n", link_unk_evt, event); @@ -668,21 +610,16 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; - l_ptr->fsm_msg_cnt = 0; link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; + tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); if (l_ptr->owner->working_links == 1) tipc_link_sync_xmit(l_ptr); - link_set_timer(l_ptr, cont_intv); break; case RESET_MSG: break; - case TIMEOUT_EVT: + case SILENCE_EVT: tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0, 0); - l_ptr->fsm_msg_cnt++; - link_set_timer(l_ptr, cont_intv); + 0, 0, 0, 0); break; default: pr_err("%s%u in RR state\n", link_unk_evt, event); @@ -693,104 +630,73 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) } } -/* tipc_link_cong: determine return value and how to treat the - * sent buffer during link congestion. - * - For plain, errorless user data messages we keep the buffer and - * return -ELINKONG. 
- * - For all other messages we discard the buffer and return -EHOSTUNREACH - * - For TIPC internal messages we also reset the link - */ -static int tipc_link_cong(struct tipc_link *link, struct sk_buff_head *list) -{ - struct sk_buff *skb = skb_peek(list); - struct tipc_msg *msg = buf_msg(skb); - uint imp = tipc_msg_tot_importance(msg); - u32 oport = msg_tot_origport(msg); - - if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { - pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - tipc_link_reset(link); - goto drop; - } - if (unlikely(msg_errcode(msg))) - goto drop; - if (unlikely(msg_reroute_cnt(msg))) - goto drop; - if (TIPC_SKB_CB(skb)->wakeup_pending) - return -ELINKCONG; - if (link_schedule_user(link, oport, skb_queue_len(list), imp)) - return -ELINKCONG; -drop: - __skb_queue_purge(list); - return -EHOSTUNREACH; -} - /** * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked * @link: link to use * @list: chain of buffers containing message * - * Consumes the buffer chain, except when returning -ELINKCONG - * Returns 0 if success, otherwise errno: -ELINKCONG, -EMSGSIZE (plain socket - * user data messages) or -EHOSTUNREACH (all other messages/senders) - * Only the socket functions tipc_send_stream() and tipc_send_packet() need - * to act on the return value, since they may need to do more send attempts. + * Consumes the buffer chain, except when returning -ELINKCONG, + * since the caller then may want to make more send attempts. 
+ * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS + * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list) { struct tipc_msg *msg = buf_msg(skb_peek(list)); - uint psz = msg_size(msg); - uint sndlim = link->queue_limit[0]; - uint imp = tipc_msg_tot_importance(msg); - uint mtu = link->max_pkt; - uint ack = mod(link->next_in_no - 1); - uint seqno = link->next_out_no; - uint bc_last_in = link->owner->bclink.last_in; + unsigned int maxwin = link->window; + unsigned int i, imp = msg_importance(msg); + uint mtu = link->mtu; + u16 ack = mod(link->rcv_nxt - 1); + u16 seqno = link->snd_nxt; + u16 bc_last_in = link->owner->bclink.last_in; struct tipc_media_addr *addr = &link->media_addr; - struct sk_buff_head *outqueue = &link->outqueue; - struct sk_buff *skb, *tmp; - - /* Match queue limits against msg importance: */ - if (unlikely(skb_queue_len(outqueue) >= link->queue_limit[imp])) - return tipc_link_cong(link, list); - - /* Has valid packet limit been used ? 
*/ - if (unlikely(psz > mtu)) { + struct sk_buff_head *transmq = &link->transmq; + struct sk_buff_head *backlogq = &link->backlogq; + struct sk_buff *skb, *bskb; + + /* Match msg importance against this and all higher backlog limits: */ + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(link->backlog[i].len >= link->backlog[i].limit)) + return link_schedule_user(link, list); + } + if (unlikely(msg_size(msg) > mtu)) { __skb_queue_purge(list); return -EMSGSIZE; } - - /* Prepare each packet for sending, and add to outqueue: */ - skb_queue_walk_safe(list, skb, tmp) { - __skb_unlink(skb, list); + /* Prepare each packet for sending, and add to relevant queue: */ + while (skb_queue_len(list)) { + skb = skb_peek(list); msg = buf_msg(skb); - msg_set_word(msg, 2, ((ack << 16) | mod(seqno))); + msg_set_seqno(msg, seqno); + msg_set_ack(msg, ack); msg_set_bcast_ack(msg, bc_last_in); - if (skb_queue_len(outqueue) < sndlim) { - __skb_queue_tail(outqueue, skb); - tipc_bearer_send(net, link->bearer_id, - skb, addr); - link->next_out = NULL; - link->unacked_window = 0; - } else if (tipc_msg_bundle(outqueue, skb, mtu)) { + if (likely(skb_queue_len(transmq) < maxwin)) { + __skb_dequeue(list); + __skb_queue_tail(transmq, skb); + tipc_bearer_send(net, link->bearer_id, skb, addr); + link->rcv_unacked = 0; + seqno++; + continue; + } + if (tipc_msg_bundle(skb_peek_tail(backlogq), msg, mtu)) { + kfree_skb(__skb_dequeue(list)); link->stats.sent_bundled++; continue; - } else if (tipc_msg_make_bundle(outqueue, skb, mtu, - link->addr)) { + } + if (tipc_msg_make_bundle(&bskb, msg, mtu, link->addr)) { + kfree_skb(__skb_dequeue(list)); + __skb_queue_tail(backlogq, bskb); + link->backlog[msg_importance(buf_msg(bskb))].len++; link->stats.sent_bundled++; link->stats.sent_bundles++; - if (!link->next_out) - link->next_out = skb_peek_tail(outqueue); - } else { - __skb_queue_tail(outqueue, skb); - if (!link->next_out) - link->next_out = skb; + continue; } - seqno++; + 
link->backlog[imp].len += skb_queue_len(list); + skb_queue_splice_tail_init(list, backlogq); } - link->next_out_no = seqno; + link->snd_nxt = seqno; return 0; } @@ -808,13 +714,25 @@ static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) return __tipc_link_xmit(link->owner->net, link, &head); } +/* tipc_link_xmit_skb(): send single buffer to destination + * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE + * messages, which will not be rejected + * The only exception is datagram messages rerouted after secondary + * lookup, which are rare and safe to dispose of anyway. + * TODO: Return real return value, and let callers use + * tipc_wait_for_sendpkt() where applicable + */ int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; + int rc; skb2list(skb, &head); - return tipc_link_xmit(net, &head, dnode, selector); + rc = tipc_link_xmit(net, &head, dnode, selector); + if (rc == -ELINKCONG) + kfree_skb(skb); + return 0; } /** @@ -841,12 +759,15 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, if (link) rc = __tipc_link_xmit(net, link, list); tipc_node_unlock(node); + tipc_node_put(node); } if (link) return rc; - if (likely(in_own_node(net, dnode))) - return tipc_sk_rcv(net, list); + if (likely(in_own_node(net, dnode))) { + tipc_sk_rcv(net, list); + return 0; + } __skb_queue_purge(list); return rc; @@ -893,14 +814,6 @@ static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf) kfree_skb(buf); } -struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list, - const struct sk_buff *skb) -{ - if (skb_queue_is_last(list, skb)) - return NULL; - return skb->next; -} - /* * tipc_link_push_packets - push unsent packets to bearer * @@ -909,31 +822,29 @@ struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list, * * Called with node locked */ -void tipc_link_push_packets(struct tipc_link *l_ptr) +void 
tipc_link_push_packets(struct tipc_link *link) { - struct sk_buff_head *outqueue = &l_ptr->outqueue; - struct sk_buff *skb = l_ptr->next_out; + struct sk_buff *skb; struct tipc_msg *msg; - u32 next, first; + u16 seqno = link->snd_nxt; + u16 ack = mod(link->rcv_nxt - 1); - skb_queue_walk_from(outqueue, skb) { - msg = buf_msg(skb); - next = msg_seqno(msg); - first = buf_seqno(skb_peek(outqueue)); - - if (mod(next - first) < l_ptr->queue_limit[0]) { - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - if (msg_user(msg) == MSG_BUNDLER) - TIPC_SKB_CB(skb)->bundling = false; - tipc_bearer_send(l_ptr->owner->net, - l_ptr->bearer_id, skb, - &l_ptr->media_addr); - l_ptr->next_out = tipc_skb_queue_next(outqueue, skb); - } else { + while (skb_queue_len(&link->transmq) < link->window) { + skb = __skb_dequeue(&link->backlogq); + if (!skb) break; - } + msg = buf_msg(skb); + link->backlog[msg_importance(msg)].len--; + msg_set_ack(msg, ack); + msg_set_seqno(msg, seqno); + seqno = mod(seqno + 1); + msg_set_bcast_ack(msg, link->owner->bclink.last_in); + link->rcv_unacked = 0; + __skb_queue_tail(&link->transmq, skb); + tipc_bearer_send(link->owner->net, link->bearer_id, + skb, &link->media_addr); } + link->snd_nxt = seqno; } void tipc_link_reset_all(struct tipc_node *node) @@ -979,7 +890,6 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, (unsigned long) TIPC_SKB_CB(buf)->handle); n_ptr = tipc_bclink_retransmit_to(net); - tipc_node_lock(n_ptr); tipc_addr_string_fill(addr_string, n_ptr->addr); pr_info("Broadcast link info for %s\n", addr_string); @@ -991,9 +901,7 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, n_ptr->bclink.oos_state, n_ptr->bclink.last_sent); - tipc_node_unlock(n_ptr); - - tipc_bclink_set_flags(net, TIPC_BCLINK_RESET); + n_ptr->action_flags |= TIPC_BCAST_RESET; l_ptr->stale_count = 0; } } @@ -1009,21 +917,21 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, msg 
= buf_msg(skb); /* Detect repeated retransmit failures */ - if (l_ptr->last_retransmitted == msg_seqno(msg)) { + if (l_ptr->last_retransm == msg_seqno(msg)) { if (++l_ptr->stale_count > 100) { link_retransmit_failure(l_ptr, skb); return; } } else { - l_ptr->last_retransmitted = msg_seqno(msg); + l_ptr->last_retransm = msg_seqno(msg); l_ptr->stale_count = 1; } - skb_queue_walk_from(&l_ptr->outqueue, skb) { - if (!retransmits || skb == l_ptr->next_out) + skb_queue_walk_from(&l_ptr->transmq, skb) { + if (!retransmits) break; msg = buf_msg(skb); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb, &l_ptr->media_addr); @@ -1032,72 +940,43 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, } } -static void link_retrieve_defq(struct tipc_link *link, - struct sk_buff_head *list) -{ - u32 seq_no; - - if (skb_queue_empty(&link->deferred_queue)) - return; - - seq_no = buf_seqno(skb_peek(&link->deferred_queue)); - if (seq_no == mod(link->next_in_no)) - skb_queue_splice_tail_init(&link->deferred_queue, list); -} - -/** - * link_recv_buf_validate - validate basic format of received message - * - * This routine ensures a TIPC message has an acceptable header, and at least - * as much data as the header indicates it should. The routine also ensures - * that the entire message header is stored in the main fragment of the message - * buffer, to simplify future access to message header fields. - * - * Note: Having extra info present in the message header or data areas is OK. - * TIPC will ignore the excess, under the assumption that it is optional info - * introduced by a later release of the protocol. 
+/* link_synch(): check if all packets arrived before the synch + * point have been consumed + * Returns true if the parallel links are synched, otherwise false */ -static int link_recv_buf_validate(struct sk_buff *buf) +static bool link_synch(struct tipc_link *l) { - static u32 min_data_hdr_size[8] = { - SHORT_H_SIZE, MCAST_H_SIZE, NAMED_H_SIZE, BASIC_H_SIZE, - MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE - }; + unsigned int post_synch; + struct tipc_link *pl; - struct tipc_msg *msg; - u32 tipc_hdr[2]; - u32 size; - u32 hdr_size; - u32 min_hdr_size; + pl = tipc_parallel_link(l); + if (pl == l) + goto synched; - /* If this packet comes from the defer queue, the skb has already - * been validated - */ - if (unlikely(TIPC_SKB_CB(buf)->deferred)) - return 1; - - if (unlikely(buf->len < MIN_H_SIZE)) - return 0; - - msg = skb_header_pointer(buf, 0, sizeof(tipc_hdr), tipc_hdr); - if (msg == NULL) - return 0; + /* Was last pre-synch packet added to input queue ? */ + if (less_eq(pl->rcv_nxt, l->synch_point)) + return false; - if (unlikely(msg_version(msg) != TIPC_VERSION)) - return 0; + /* Is it still in the input queue ? */ + post_synch = mod(pl->rcv_nxt - l->synch_point) - 1; + if (skb_queue_len(&pl->inputq) > post_synch) + return false; +synched: + l->flags &= ~LINK_SYNCHING; + return true; +} - size = msg_size(msg); - hdr_size = msg_hdr_sz(msg); - min_hdr_size = msg_isdata(msg) ? 
- min_data_hdr_size[msg_type(msg)] : INT_H_SIZE; +static void link_retrieve_defq(struct tipc_link *link, + struct sk_buff_head *list) +{ + u16 seq_no; - if (unlikely((hdr_size < min_hdr_size) || - (size < hdr_size) || - (buf->len < size) || - (size - hdr_size > TIPC_MAX_USER_MSG_SIZE))) - return 0; + if (skb_queue_empty(&link->deferdq)) + return; - return pskb_may_pull(buf, hdr_size); + seq_no = buf_seqno(skb_peek(&link->deferdq)); + if (seq_no == link->rcv_nxt) + skb_queue_splice_tail_init(&link->deferdq, list); } /** @@ -1117,24 +996,19 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) struct tipc_link *l_ptr; struct sk_buff *skb1, *tmp; struct tipc_msg *msg; - u32 seq_no; - u32 ackd; + u16 seq_no; + u16 ackd; u32 released; skb2list(skb, &head); while ((skb = __skb_dequeue(&head))) { /* Ensure message is well-formed */ - if (unlikely(!link_recv_buf_validate(skb))) - goto discard; - - /* Ensure message data is a single contiguous unit */ - if (unlikely(skb_linearize(skb))) + if (unlikely(!tipc_msg_validate(skb))) goto discard; /* Handle arrival of a non-unicast link message */ msg = buf_msg(skb); - if (unlikely(msg_non_seq(msg))) { if (msg_user(msg) == LINK_CONFIG) tipc_disc_rcv(net, skb, b_ptr); @@ -1152,8 +1026,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) n_ptr = tipc_node_find(net, msg_prevnode(msg)); if (unlikely(!n_ptr)) goto discard; - tipc_node_lock(n_ptr); + tipc_node_lock(n_ptr); /* Locate unicast link endpoint that should handle message */ l_ptr = n_ptr->links[b_ptr->identity]; if (unlikely(!l_ptr)) @@ -1175,21 +1049,20 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) ackd = msg_ack(msg); /* Release acked messages */ - if (n_ptr->bclink.recv_permitted) + if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg))) tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); released = 0; - skb_queue_walk_safe(&l_ptr->outqueue, skb1, tmp) { - if (skb1 == 
l_ptr->next_out || - more(buf_seqno(skb1), ackd)) + skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) { + if (more(buf_seqno(skb1), ackd)) break; - __skb_unlink(skb1, &l_ptr->outqueue); + __skb_unlink(skb1, &l_ptr->transmq); kfree_skb(skb1); released = 1; } /* Try sending any messages link endpoint has pending */ - if (unlikely(l_ptr->next_out)) + if (unlikely(skb_queue_len(&l_ptr->backlogq))) tipc_link_push_packets(l_ptr); if (released && !skb_queue_empty(&l_ptr->wakeupq)) @@ -1217,24 +1090,31 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) } /* Link is now in state WORKING_WORKING */ - if (unlikely(seq_no != mod(l_ptr->next_in_no))) { + if (unlikely(seq_no != l_ptr->rcv_nxt)) { link_handle_out_of_seq_msg(l_ptr, skb); link_retrieve_defq(l_ptr, &head); skb = NULL; goto unlock; } - l_ptr->next_in_no++; - if (unlikely(!skb_queue_empty(&l_ptr->deferred_queue))) - link_retrieve_defq(l_ptr, &head); + l_ptr->silent_intv_cnt = 0; - if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { + /* Synchronize with parallel link if applicable */ + if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { + if (!link_synch(l_ptr)) + goto unlock; + } + l_ptr->rcv_nxt++; + if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) + link_retrieve_defq(l_ptr, &head); + if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); } tipc_link_input(l_ptr, skb); skb = NULL; unlock: tipc_node_unlock(n_ptr); + tipc_node_put(n_ptr); discard: if (unlikely(skb)) kfree_skb(skb); @@ -1271,7 +1151,7 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) node->action_flags |= TIPC_NAMED_MSG_EVT; return true; case MSG_BUNDLER: - case CHANGEOVER_PROTOCOL: + case TUNNEL_PROTOCOL: case MSG_FRAGMENTER: case BCAST_PROTOCOL: return false; @@ -1298,8 +1178,14 @@ static void tipc_link_input(struct tipc_link *link, 
struct sk_buff *skb) return; switch (msg_user(msg)) { - case CHANGEOVER_PROTOCOL: - if (!tipc_link_tunnel_rcv(node, &skb)) + case TUNNEL_PROTOCOL: + if (msg_dup(msg)) { + link->flags |= LINK_SYNCHING; + link->synch_point = msg_seqno(msg_get_wrapped(msg)); + kfree_skb(skb); + break; + } + if (!tipc_link_failover_rcv(link, &skb)) break; if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { tipc_data_input(link, skb); @@ -1337,7 +1223,7 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) { struct sk_buff *skb1; - u32 seq_no = buf_seqno(skb); + u16 seq_no = buf_seqno(skb); /* Empty queue ? */ if (skb_queue_empty(list)) { @@ -1353,7 +1239,7 @@ u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) /* Locate insertion point in queue, then insert; discard if duplicate */ skb_queue_walk(list, skb1) { - u32 curr_seqno = buf_seqno(skb1); + u16 curr_seqno = buf_seqno(skb1); if (seq_no == curr_seqno) { kfree_skb(skb); @@ -1381,24 +1267,23 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, return; } - /* Record OOS packet arrival (force mismatch on next timeout) */ - l_ptr->checkpoint--; + /* Record OOS packet arrival */ + l_ptr->silent_intv_cnt = 0; /* * Discard packet if a duplicate; otherwise add it to deferred queue * and notify peer of gap as per protocol specification */ - if (less(seq_no, mod(l_ptr->next_in_no))) { + if (less(seq_no, l_ptr->rcv_nxt)) { l_ptr->stats.duplicates++; kfree_skb(buf); return; } - if (tipc_link_defer_pkt(&l_ptr->deferred_queue, buf)) { + if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) { l_ptr->stats.deferred_recv++; - TIPC_SKB_CB(buf)->deferred = true; - if ((skb_queue_len(&l_ptr->deferred_queue) % 16) == 1) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1) + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); } else { l_ptr->stats.duplicates++; } @@ 
-1408,15 +1293,16 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, * Send protocol message to the other endpoint. */ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, - u32 gap, u32 tolerance, u32 priority, u32 ack_mtu) + u32 gap, u32 tolerance, u32 priority) { struct sk_buff *buf = NULL; struct tipc_msg *msg = l_ptr->pmsg; u32 msg_size = sizeof(l_ptr->proto_msg); int r_flag; + u16 last_rcv; - /* Don't send protocol message during link changeover */ - if (l_ptr->exp_msg_count) + /* Don't send protocol message during link failover */ + if (l_ptr->flags & LINK_FAILINGOVER) return; /* Abort non-RESET send if communication with node is prohibited */ @@ -1430,51 +1316,34 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net)); if (msg_typ == STATE_MSG) { - u32 next_sent = mod(l_ptr->next_out_no); + u16 next_sent = l_ptr->snd_nxt; if (!tipc_link_is_up(l_ptr)) return; - if (l_ptr->next_out) - next_sent = buf_seqno(l_ptr->next_out); msg_set_next_sent(msg, next_sent); - if (!skb_queue_empty(&l_ptr->deferred_queue)) { - u32 rec = buf_seqno(skb_peek(&l_ptr->deferred_queue)); - gap = mod(rec - mod(l_ptr->next_in_no)); + if (!skb_queue_empty(&l_ptr->deferdq)) { + last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq)); + gap = mod(last_rcv - l_ptr->rcv_nxt); } msg_set_seq_gap(msg, gap); if (gap) l_ptr->stats.sent_nacks++; msg_set_link_tolerance(msg, tolerance); msg_set_linkprio(msg, priority); - msg_set_max_pkt(msg, ack_mtu); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_max_pkt(msg, l_ptr->mtu); + msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); msg_set_probe(msg, probe_msg != 0); - if (probe_msg) { - u32 mtu = l_ptr->max_pkt; - - if ((mtu < l_ptr->max_pkt_target) && - link_working_working(l_ptr) && - l_ptr->fsm_msg_cnt) { - msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; - if (l_ptr->max_pkt_probes == 10) { - 
l_ptr->max_pkt_target = (msg_size - 4); - l_ptr->max_pkt_probes = 0; - msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; - } - l_ptr->max_pkt_probes++; - } - + if (probe_msg) l_ptr->stats.sent_probes++; - } l_ptr->stats.sent_states++; } else { /* RESET_MSG or ACTIVATE_MSG */ - msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1)); + msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1)); msg_set_seq_gap(msg, 0); msg_set_next_sent(msg, 1); msg_set_probe(msg, 0); msg_set_link_tolerance(msg, l_ptr->tolerance); msg_set_linkprio(msg, l_ptr->priority); - msg_set_max_pkt(msg, l_ptr->max_pkt_target); + msg_set_max_pkt(msg, l_ptr->advertised_mtu); } r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr)); @@ -1482,7 +1351,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, msg_set_linkprio(msg, l_ptr->priority); msg_set_size(msg, msg_size); - msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2))); + msg_set_seqno(msg, mod(l_ptr->snd_nxt + (0xffff / 2))); buf = tipc_buf_acquire(msg_size); if (!buf) @@ -1490,10 +1359,9 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); buf->priority = TC_PRIO_CONTROL; - tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf, &l_ptr->media_addr); - l_ptr->unacked_window = 0; + l_ptr->rcv_unacked = 0; kfree_skb(buf); } @@ -1506,13 +1374,10 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, struct sk_buff *buf) { u32 rec_gap = 0; - u32 max_pkt_info; - u32 max_pkt_ack; u32 msg_tol; struct tipc_msg *msg = buf_msg(buf); - /* Discard protocol message during link changeover */ - if (l_ptr->exp_msg_count) + if (l_ptr->flags & LINK_FAILINGOVER) goto exit; if (l_ptr->net_plane != msg_net_plane(msg)) @@ -1551,15 +1416,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, if (msg_linkprio(msg) > l_ptr->priority) l_ptr->priority = msg_linkprio(msg); - max_pkt_info = msg_max_pkt(msg); - if 
(max_pkt_info) { - if (max_pkt_info < l_ptr->max_pkt_target) - l_ptr->max_pkt_target = max_pkt_info; - if (l_ptr->max_pkt > l_ptr->max_pkt_target) - l_ptr->max_pkt = l_ptr->max_pkt_target; - } else { - l_ptr->max_pkt = l_ptr->max_pkt_target; - } + if (l_ptr->mtu > msg_max_pkt(msg)) + l_ptr->mtu = msg_max_pkt(msg); /* Synchronize broadcast link info, if not done previously */ if (!tipc_node_is_up(l_ptr->owner)) { @@ -1592,30 +1450,18 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, } /* Record reception; force mismatch at next timeout: */ - l_ptr->checkpoint--; + l_ptr->silent_intv_cnt = 0; link_state_event(l_ptr, TRAFFIC_MSG_EVT); l_ptr->stats.recv_states++; if (link_reset_unknown(l_ptr)) break; - if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) { - rec_gap = mod(msg_next_sent(msg) - - mod(l_ptr->next_in_no)); - } + if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg))) + rec_gap = mod(msg_next_sent(msg) - l_ptr->rcv_nxt); - max_pkt_ack = msg_max_pkt(msg); - if (max_pkt_ack > l_ptr->max_pkt) { - l_ptr->max_pkt = max_pkt_ack; - l_ptr->max_pkt_probes = 0; - } - - max_pkt_ack = 0; - if (msg_probe(msg)) { + if (msg_probe(msg)) l_ptr->stats.recv_probes++; - if (msg_size(msg) > sizeof(l_ptr->proto_msg)) - max_pkt_ack = msg_size(msg); - } /* Protocol message before retransmits, reduce loss risk */ if (l_ptr->owner->bclink.recv_permitted) @@ -1623,12 +1469,12 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, msg_last_bcast(msg)); if (rec_gap || (msg_probe(msg))) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, rec_gap, 0, - 0, max_pkt_ack); + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, + rec_gap, 0, 0); } if (msg_seq_gap(msg)) { l_ptr->stats.recv_nacks++; - tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->outqueue), + tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq), msg_seq_gap(msg)); } break; @@ -1675,7 +1521,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, */ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) { - u32 msgcount = 
skb_queue_len(&l_ptr->outqueue); + int msgcount; struct tipc_link *tunnel = l_ptr->owner->active_links[0]; struct tipc_msg tunnel_hdr; struct sk_buff *skb; @@ -1684,12 +1530,20 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) if (!tunnel) return; - tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, CHANGEOVER_PROTOCOL, - ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr); + tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL, + FAILOVER_MSG, INT_H_SIZE, l_ptr->addr); + + skb_queue_walk(&l_ptr->backlogq, skb) { + msg_set_seqno(buf_msg(skb), l_ptr->snd_nxt); + l_ptr->snd_nxt = mod(l_ptr->snd_nxt + 1); + } + skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); + tipc_link_purge_backlog(l_ptr); + msgcount = skb_queue_len(&l_ptr->transmq); msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); msg_set_msgcnt(&tunnel_hdr, msgcount); - if (skb_queue_empty(&l_ptr->outqueue)) { + if (skb_queue_empty(&l_ptr->transmq)) { skb = tipc_buf_acquire(INT_H_SIZE); if (skb) { skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE); @@ -1705,7 +1559,7 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) split_bundles = (l_ptr->owner->active_links[0] != l_ptr->owner->active_links[1]); - skb_queue_walk(&l_ptr->outqueue, skb) { + skb_queue_walk(&l_ptr->transmq, skb) { struct tipc_msg *msg = buf_msg(skb); if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) { @@ -1736,157 +1590,111 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) * and sequence order is preserved per sender/receiver socket pair. * Owner node is locked. 
*/ -void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, - struct tipc_link *tunnel) +void tipc_link_dup_queue_xmit(struct tipc_link *link, + struct tipc_link *tnl) { struct sk_buff *skb; - struct tipc_msg tunnel_hdr; - - tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, CHANGEOVER_PROTOCOL, - DUPLICATE_MSG, INT_H_SIZE, l_ptr->addr); - msg_set_msgcnt(&tunnel_hdr, skb_queue_len(&l_ptr->outqueue)); - msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); - skb_queue_walk(&l_ptr->outqueue, skb) { + struct tipc_msg tnl_hdr; + struct sk_buff_head *queue = &link->transmq; + int mcnt; + u16 seqno; + + tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL, + SYNCH_MSG, INT_H_SIZE, link->addr); + mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq); + msg_set_msgcnt(&tnl_hdr, mcnt); + msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id); + +tunnel_queue: + skb_queue_walk(queue, skb) { struct sk_buff *outskb; struct tipc_msg *msg = buf_msg(skb); - u32 length = msg_size(msg); + u32 len = msg_size(msg); - if (msg_user(msg) == MSG_BUNDLER) - msg_set_type(msg, CLOSED_MSG); - msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */ - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - msg_set_size(&tunnel_hdr, length + INT_H_SIZE); - outskb = tipc_buf_acquire(length + INT_H_SIZE); + msg_set_ack(msg, mod(link->rcv_nxt - 1)); + msg_set_bcast_ack(msg, link->owner->bclink.last_in); + msg_set_size(&tnl_hdr, len + INT_H_SIZE); + outskb = tipc_buf_acquire(len + INT_H_SIZE); if (outskb == NULL) { pr_warn("%sunable to send duplicate msg\n", link_co_err); return; } - skb_copy_to_linear_data(outskb, &tunnel_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(outskb, INT_H_SIZE, skb->data, - length); - __tipc_link_xmit_skb(tunnel, outskb); - if (!tipc_link_is_up(l_ptr)) + skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(outskb, INT_H_SIZE, + skb->data, len); + __tipc_link_xmit_skb(tnl, outskb); + if (!tipc_link_is_up(link)) 
return; } -} - -/** - * buf_extract - extracts embedded TIPC message from another message - * @skb: encapsulating message buffer - * @from_pos: offset to extract from - * - * Returns a new message buffer containing an embedded message. The - * encapsulating buffer is left unchanged. - */ -static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) -{ - struct tipc_msg *msg = (struct tipc_msg *)(skb->data + from_pos); - u32 size = msg_size(msg); - struct sk_buff *eb; - - eb = tipc_buf_acquire(size); - if (eb) - skb_copy_to_linear_data(eb, msg, size); - return eb; -} - -/* tipc_link_dup_rcv(): Receive a tunnelled DUPLICATE_MSG packet. - * Owner node is locked. - */ -static void tipc_link_dup_rcv(struct tipc_link *l_ptr, - struct sk_buff *t_buf) -{ - struct sk_buff *buf; - - if (!tipc_link_is_up(l_ptr)) - return; - - buf = buf_extract(t_buf, INT_H_SIZE); - if (buf == NULL) { - pr_warn("%sfailed to extract inner dup pkt\n", link_co_err); + if (queue == &link->backlogq) return; + seqno = link->snd_nxt; + skb_queue_walk(&link->backlogq, skb) { + msg_set_seqno(buf_msg(skb), seqno); + seqno = mod(seqno + 1); } - - /* Add buffer to deferred queue, if applicable: */ - link_handle_out_of_seq_msg(l_ptr, buf); + queue = &link->backlogq; + goto tunnel_queue; } -/* tipc_link_failover_rcv(): Receive a tunnelled ORIGINAL_MSG packet +/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet * Owner node is locked. */ -static struct sk_buff *tipc_link_failover_rcv(struct tipc_link *l_ptr, - struct sk_buff *t_buf) +static bool tipc_link_failover_rcv(struct tipc_link *link, + struct sk_buff **skb) { - struct tipc_msg *t_msg = buf_msg(t_buf); - struct sk_buff *buf = NULL; - struct tipc_msg *msg; - - if (tipc_link_is_up(l_ptr)) - tipc_link_reset(l_ptr); - - /* First failover packet? */ - if (l_ptr->exp_msg_count == START_CHANGEOVER) - l_ptr->exp_msg_count = msg_msgcnt(t_msg); - - /* Should there be an inner packet? 
*/ - if (l_ptr->exp_msg_count) { - l_ptr->exp_msg_count--; - buf = buf_extract(t_buf, INT_H_SIZE); - if (buf == NULL) { - pr_warn("%sno inner failover pkt\n", link_co_err); - goto exit; - } - msg = buf_msg(buf); + struct tipc_msg *msg = buf_msg(*skb); + struct sk_buff *iskb = NULL; + struct tipc_link *pl = NULL; + int bearer_id = msg_bearer_id(msg); + int pos = 0; - if (less(msg_seqno(msg), l_ptr->reset_checkpoint)) { - kfree_skb(buf); - buf = NULL; - goto exit; - } - if (msg_user(msg) == MSG_FRAGMENTER) { - l_ptr->stats.recv_fragments++; - tipc_buf_append(&l_ptr->reasm_buf, &buf); - } + if (msg_type(msg) != FAILOVER_MSG) { + pr_warn("%sunknown tunnel pkt received\n", link_co_err); + goto exit; } -exit: - if ((!l_ptr->exp_msg_count) && (l_ptr->flags & LINK_STOPPED)) - tipc_link_delete(l_ptr); - return buf; -} + if (bearer_id >= MAX_BEARERS) + goto exit; -/* tipc_link_tunnel_rcv(): Receive a tunnelled packet, sent - * via other link as result of a failover (ORIGINAL_MSG) or - * a new active link (DUPLICATE_MSG). Failover packets are - * returned to the active link for delivery upwards. - * Owner node is locked. - */ -static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr, - struct sk_buff **buf) -{ - struct sk_buff *t_buf = *buf; - struct tipc_link *l_ptr; - struct tipc_msg *t_msg = buf_msg(t_buf); - u32 bearer_id = msg_bearer_id(t_msg); + if (bearer_id == link->bearer_id) + goto exit; - *buf = NULL; + pl = link->owner->links[bearer_id]; + if (pl && tipc_link_is_up(pl)) + tipc_link_reset(pl); - if (bearer_id >= MAX_BEARERS) + if (link->failover_pkts == FIRST_FAILOVER) + link->failover_pkts = msg_msgcnt(msg); + + /* Should we expect an inner packet? 
*/ + if (!link->failover_pkts) goto exit; - l_ptr = n_ptr->links[bearer_id]; - if (!l_ptr) + if (!tipc_msg_extract(*skb, &iskb, &pos)) { + pr_warn("%sno inner failover pkt\n", link_co_err); + *skb = NULL; goto exit; + } + link->failover_pkts--; + *skb = NULL; - if (msg_type(t_msg) == DUPLICATE_MSG) - tipc_link_dup_rcv(l_ptr, t_buf); - else if (msg_type(t_msg) == ORIGINAL_MSG) - *buf = tipc_link_failover_rcv(l_ptr, t_buf); - else - pr_warn("%sunknown tunnel pkt received\n", link_co_err); + /* Was this packet already delivered? */ + if (less(buf_seqno(iskb), link->failover_checkpt)) { + kfree_skb(iskb); + iskb = NULL; + goto exit; + } + if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) { + link->stats.recv_fragments++; + tipc_buf_append(&link->failover_skb, &iskb); + } exit: - kfree_skb(t_buf); - return *buf != NULL; + if (!link->failover_pkts && pl) + pl->flags &= ~LINK_FAILINGOVER; + kfree_skb(*skb); + *skb = iskb; + return *skb; } static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) @@ -1897,27 +1705,20 @@ static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) return; l_ptr->tolerance = tol; - l_ptr->cont_intv = msecs_to_jiffies(intv); - l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv) / 4); + l_ptr->keepalive_intv = msecs_to_jiffies(intv); + l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->keepalive_intv)); } -void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { - /* Data messages from this node, inclusive FIRST_FRAGM */ - l_ptr->queue_limit[TIPC_LOW_IMPORTANCE] = window; - l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE] = (window / 3) * 4; - l_ptr->queue_limit[TIPC_HIGH_IMPORTANCE] = (window / 3) * 5; - l_ptr->queue_limit[TIPC_CRITICAL_IMPORTANCE] = (window / 3) * 6; - /* Transiting data messages,inclusive FIRST_FRAGM */ - l_ptr->queue_limit[TIPC_LOW_IMPORTANCE + 4] = 300; - l_ptr->queue_limit[TIPC_MEDIUM_IMPORTANCE + 4] = 600; - 
l_ptr->queue_limit[TIPC_HIGH_IMPORTANCE + 4] = 900; - l_ptr->queue_limit[TIPC_CRITICAL_IMPORTANCE + 4] = 1200; - l_ptr->queue_limit[CONN_MANAGER] = 1200; - l_ptr->queue_limit[CHANGEOVER_PROTOCOL] = 2500; - l_ptr->queue_limit[NAME_DISTRIBUTOR] = 3000; - /* FRAGMENT and LAST_FRAGMENT packets */ - l_ptr->queue_limit[MSG_FRAGMENTER] = 4000; + int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); + + l->window = win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2; + l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } /* tipc_link_find_owner - locate owner node of link by link's name @@ -1965,8 +1766,8 @@ static struct tipc_node *tipc_link_find_owner(struct net *net, static void link_reset_statistics(struct tipc_link *l_ptr) { memset(&l_ptr->stats, 0, sizeof(l_ptr->stats)); - l_ptr->stats.sent_info = l_ptr->next_out_no; - l_ptr->stats.recv_info = l_ptr->next_in_no; + l_ptr->stats.sent_info = l_ptr->snd_nxt; + l_ptr->stats.recv_info = l_ptr->rcv_nxt; } static void link_print(struct tipc_link *l_ptr, const char *str) @@ -2055,6 +1856,9 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + if (strcmp(name, tipc_bclink_name) == 0) + return tipc_nl_bc_link_set(net, attrs); + node = tipc_link_find_owner(net, name, &bearer_id); if (!node) return -EINVAL; @@ -2082,14 +1886,14 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); link_set_supervision_props(link, tol); - tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0, 0); + tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); link->priority = prio; - tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio, 0); + tipc_link_proto_xmit(link, STATE_MSG, 
0, 0, 0, prio); } if (props[TIPC_NLA_PROP_WIN]) { u32 win; @@ -2172,7 +1976,7 @@ msg_full: /* Caller should hold appropriate locks to protect the link */ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, - struct tipc_link *link) + struct tipc_link *link, int nlflags) { int err; void *hdr; @@ -2181,7 +1985,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_net *tn = net_generic(net, tipc_net_id); hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_LINK_GET); + nlflags, TIPC_NL_LINK_GET); if (!hdr) return -EMSGSIZE; @@ -2194,11 +1998,11 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(tn->own_addr))) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->max_pkt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->next_out_no)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt)) goto attr_msg_full; if (tipc_link_is_up(link)) @@ -2216,7 +2020,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, link->tolerance)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, - link->queue_limit[TIPC_LOW_IMPORTANCE])) + link->window)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) goto prop_msg_full; @@ -2254,7 +2058,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, if (!node->links[i]) continue; - err = __tipc_nl_add_link(net, msg, node->links[i]); + err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI); if (err) return err; } @@ -2282,7 +2086,6 @@ int tipc_nl_link_dump(struct 
sk_buff *skb, struct netlink_callback *cb) msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); - if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { @@ -2295,6 +2098,7 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->prev_seq = 1; goto out; } + tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { @@ -2337,50 +2141,53 @@ out: int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct sk_buff *ans_skb; struct tipc_nl_msg msg; - struct tipc_link *link; - struct tipc_node *node; char *name; - int bearer_id; int err; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + if (!info->attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; - name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]); - node = tipc_link_find_owner(net, name, &bearer_id); - if (!node) - return -EINVAL; - ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!ans_skb) + msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg.skb) return -ENOMEM; - msg.skb = ans_skb; - msg.portid = info->snd_portid; - msg.seq = info->snd_seq; - - tipc_node_lock(node); - link = node->links[bearer_id]; - if (!link) { - err = -EINVAL; - goto err_out; - } - - err = __tipc_nl_add_link(net, &msg, link); - if (err) - goto err_out; + if (strcmp(name, tipc_bclink_name) == 0) { + err = tipc_nl_add_bc_link(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } else { + int bearer_id; + struct tipc_node *node; + struct tipc_link *link; - tipc_node_unlock(node); + node = tipc_link_find_owner(net, name, &bearer_id); + if (!node) + return -EINVAL; - return genlmsg_reply(ans_skb, info); + tipc_node_lock(node); + link = node->links[bearer_id]; + if (!link) { + tipc_node_unlock(node); + nlmsg_free(msg.skb); + return -EINVAL; + } -err_out: - tipc_node_unlock(node); - nlmsg_free(ans_skb); + err = __tipc_nl_add_link(net, &msg, link, 0); + tipc_node_unlock(node); + if (err) { + nlmsg_free(msg.skb); + 
return err; + } + } - return err; + return genlmsg_reply(msg.skb, info); } int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) diff --git a/net/tipc/link.h b/net/tipc/link.h index 7aeb52092bf3..ae0a0ea572f2 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -58,8 +58,10 @@ /* Link endpoint execution states */ -#define LINK_STARTED 0x0001 -#define LINK_STOPPED 0x0002 +#define LINK_STARTED 0x0001 +#define LINK_STOPPED 0x0002 +#define LINK_SYNCHING 0x0004 +#define LINK_FAILINGOVER 0x0008 /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) @@ -105,30 +107,29 @@ struct tipc_stats { * @owner: pointer to peer node * @refcnt: reference counter for permanent references (owner node & timer) * @flags: execution state flags for link endpoint instance - * @checkpoint: reference point for triggering link continuity checking * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @cont_intv: link continuity testing interval + * @keepalive_intv: link keepalive timer interval * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM - * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state + * @silent_intv_cnt: # of timer intervals without any reception from peer * @proto_msg: template for control messages generated by link * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') - * @queue_limit: outbound message queue congestion thresholds (indexed by user) + * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover - * @reset_checkpoint: seq # of last acknowledged message at 
time of link reset - * @max_pkt: current maximum packet size for this link - * @max_pkt_target: desired maximum packet size for this link - * @max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target) - * @outqueue: outbound message queue - * @next_out_no: next sequence number to use for outbound messages + * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset + * @mtu: current maximum packet size for this link + * @advertised_mtu: advertised own mtu when link is being established + * @transmitq: queue for sent, non-acked messages + * @backlogq: queue for messages waiting to be sent + * @snt_nxt: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message * @stale_count: # of identical retransmit requests made by peer - * @next_in_no: next sequence number to expect for inbound messages + * @rcv_nxt: next sequence number to expect for inbound messages * @deferred_queue: deferred queue saved OOS b'cast message received from node * @unacked_window: # of inbound messages rx'd without ack'ing back to peer * @inputq: buffer queue for messages to be delivered upwards @@ -149,15 +150,14 @@ struct tipc_link { /* Management and link supervision data */ unsigned int flags; - u32 checkpoint; u32 peer_session; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; - unsigned long cont_intv; + unsigned long keepalive_intv; u32 abort_limit; int state; - u32 fsm_msg_cnt; + u32 silent_intv_cnt; struct { unchar hdr[INT_H_SIZE]; unchar body[TIPC_MAX_IF_NAME]; @@ -165,36 +165,40 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; - u32 queue_limit[15]; /* queue_limit[0]==window limit */ + u16 synch_point; - /* Changeover */ - u32 exp_msg_count; - u32 reset_checkpoint; + /* Failover */ + u16 failover_pkts; + u16 failover_checkpt; + struct sk_buff *failover_skb; /* Max packet negotiation */ - u32 max_pkt; - u32 max_pkt_target; - u32 max_pkt_probes; + u16 mtu; + u16 
advertised_mtu; /* Sending */ - struct sk_buff_head outqueue; - u32 next_out_no; - u32 last_retransmitted; + struct sk_buff_head transmq; + struct sk_buff_head backlogq; + struct { + u16 len; + u16 limit; + } backlog[5]; + u16 snd_nxt; + u16 last_retransm; + u32 window; u32 stale_count; /* Reception */ - u32 next_in_no; - struct sk_buff_head deferred_queue; - u32 unacked_window; + u16 rcv_nxt; + u32 rcv_unacked; + struct sk_buff_head deferdq; struct sk_buff_head inputq; struct sk_buff_head namedq; /* Congestion handling */ - struct sk_buff *next_out; struct sk_buff_head wakeupq; /* Fragmentation/reassembly */ - u32 long_msg_seq_no; struct sk_buff *reasm_buf; /* Statistics */ @@ -207,17 +211,16 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, struct tipc_bearer *b_ptr, const struct tipc_media_addr *media_addr); void tipc_link_delete(struct tipc_link *link); -void tipc_link_delete_list(struct net *net, unsigned int bearer_id, - bool shutting_down); +void tipc_link_delete_list(struct net *net, unsigned int bearer_id); void tipc_link_failover_send_queue(struct tipc_link *l_ptr); void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest); void tipc_link_reset_fragments(struct tipc_link *l_ptr); int tipc_link_is_up(struct tipc_link *l_ptr); int tipc_link_is_active(struct tipc_link *l_ptr); void tipc_link_purge_queues(struct tipc_link *l_ptr); +void tipc_link_purge_backlog(struct tipc_link *l); void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); -void tipc_link_reset_list(struct net *net, unsigned int bearer_id); int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, @@ -225,7 +228,7 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); void tipc_link_proto_xmit(struct tipc_link 
*l_ptr, u32 msg_typ, int prob, - u32 gap, u32 tolerance, u32 priority, u32 acked_mtu); + u32 gap, u32 tolerance, u32 priority); void tipc_link_push_packets(struct tipc_link *l_ptr); u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *buf); void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); @@ -241,39 +244,6 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); void link_prepare_wakeup(struct tipc_link *l); -/* - * Link sequence number manipulation routines (uses modulo 2**16 arithmetic) - */ -static inline u32 buf_seqno(struct sk_buff *buf) -{ - return msg_seqno(buf_msg(buf)); -} - -static inline u32 mod(u32 x) -{ - return x & 0xffffu; -} - -static inline int less_eq(u32 left, u32 right) -{ - return mod(right - left) < 32768u; -} - -static inline int more(u32 left, u32 right) -{ - return !less_eq(left, right); -} - -static inline int less(u32 left, u32 right) -{ - return less_eq(left, right) && (mod(right) != mod(left)); -} - -static inline u32 lesser(u32 left, u32 right) -{ - return less_eq(left, right) ? left : right; -} - static inline u32 link_own_addr(struct tipc_link *l) { return msg_prevnode(l->pmsg); @@ -302,9 +272,4 @@ static inline int link_reset_reset(struct tipc_link *l_ptr) return l_ptr->state == RESET_RESET; } -static inline int link_congested(struct tipc_link *l_ptr) -{ - return skb_queue_len(&l_ptr->outqueue) >= l_ptr->queue_limit[0]; -} - #endif diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b6eb90cd3ef7..08b4cc7d496d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -1,7 +1,7 @@ /* * net/tipc/msg.c: TIPC message header routines * - * Copyright (c) 2000-2006, 2014, Ericsson AB + * Copyright (c) 2000-2006, 2014-2015, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. 
* @@ -165,6 +165,9 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } if (fragid == LAST_FRAGMENT) { + TIPC_SKB_CB(head)->validated = false; + if (unlikely(!tipc_msg_validate(head))) + goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; *headbuf = NULL; @@ -172,7 +175,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } *buf = NULL; return 0; - err: pr_warn_ratelimited("Unable to build fragment list\n"); kfree_skb(*buf); @@ -181,6 +183,48 @@ err: return 0; } +/* tipc_msg_validate - validate basic format of received message + * + * This routine ensures a TIPC message has an acceptable header, and at least + * as much data as the header indicates it should. The routine also ensures + * that the entire message header is stored in the main fragment of the message + * buffer, to simplify future access to message header fields. + * + * Note: Having extra info present in the message header or data areas is OK. + * TIPC will ignore the excess, under the assumption that it is optional info + * introduced by a later release of the protocol. 
+ */ +bool tipc_msg_validate(struct sk_buff *skb) +{ + struct tipc_msg *msg; + int msz, hsz; + + if (unlikely(TIPC_SKB_CB(skb)->validated)) + return true; + if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) + return false; + + hsz = msg_hdr_sz(buf_msg(skb)); + if (unlikely(hsz < MIN_H_SIZE) || (hsz > MAX_H_SIZE)) + return false; + if (unlikely(!pskb_may_pull(skb, hsz))) + return false; + + msg = buf_msg(skb); + if (unlikely(msg_version(msg) != TIPC_VERSION)) + return false; + + msz = msg_size(msg); + if (unlikely(msz < hsz)) + return false; + if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) + return false; + if (unlikely(skb->len < msz)) + return false; + + TIPC_SKB_CB(skb)->validated = true; + return true; +} /** * tipc_msg_build - create buffer chain containing specified header and data @@ -228,6 +272,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr)); msg_set_size(&pkthdr, pktmax); msg_set_fragm_no(&pkthdr, pktno); + msg_set_importance(&pkthdr, msg_importance(mhdr)); /* Prepare first fragment */ skb = tipc_buf_acquire(pktmax); @@ -286,121 +331,134 @@ error: /** * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one - * @list: the buffer chain of the existing buffer ("bundle") - * @skb: buffer to be appended + * @skb: the buffer to append to ("bundle") + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer * Consumes buffer if successful * Returns true if bundling could be performed, otherwise false */ -bool tipc_msg_bundle(struct sk_buff_head *list, struct sk_buff *skb, u32 mtu) +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) { - struct sk_buff *bskb = skb_peek_tail(list); - struct tipc_msg *bmsg = buf_msg(bskb); - struct tipc_msg *msg = buf_msg(skb); - unsigned int bsz = msg_size(bmsg); + struct tipc_msg *bmsg; + unsigned int bsz; unsigned int msz = msg_size(msg); - u32 start = align(bsz); + u32 start, pad; u32 max = mtu - 
INT_H_SIZE; - u32 pad = start - bsz; if (likely(msg_user(msg) == MSG_FRAGMENTER)) return false; - if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) + if (!skb) + return false; + bmsg = buf_msg(skb); + bsz = msg_size(bmsg); + start = align(bsz); + pad = start - bsz; + + if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL)) return false; if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) return false; - if (likely(msg_user(bmsg) != MSG_BUNDLER)) - return false; - if (likely(!TIPC_SKB_CB(bskb)->bundling)) + if (unlikely(msg_user(bmsg) != MSG_BUNDLER)) return false; - if (unlikely(skb_tailroom(bskb) < (pad + msz))) + if (unlikely(skb_tailroom(skb) < (pad + msz))) return false; if (unlikely(max < (start + msz))) return false; + if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) && + (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE)) + return false; - skb_put(bskb, pad + msz); - skb_copy_to_linear_data_offset(bskb, start, skb->data, msz); + skb_put(skb, pad + msz); + skb_copy_to_linear_data_offset(skb, start, msg, msz); msg_set_size(bmsg, start + msz); msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); - kfree_skb(skb); return true; } /** * tipc_msg_extract(): extract bundled inner packet from buffer - * @skb: linear outer buffer, to be extracted from. + * @skb: buffer to be extracted from. * @iskb: extracted inner buffer, to be returned - * @pos: position of msg to be extracted. Returns with pointer of next msg + * @pos: position in outer message of msg to be extracted. + * Returns position of next msg * Consumes outer buffer when last packet extracted * Returns true when when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { - struct tipc_msg *msg = buf_msg(skb); - int imsz; - struct tipc_msg *imsg = (struct tipc_msg *)(msg_data(msg) + *pos); + struct tipc_msg *msg; + int imsz, offset; - /* Is there space left for shortest possible message? 
*/ - if (*pos > (msg_data_sz(msg) - SHORT_H_SIZE)) + *iskb = NULL; + if (unlikely(skb_linearize(skb))) goto none; - imsz = msg_size(imsg); - /* Is there space left for current message ? */ - if ((*pos + imsz) > msg_data_sz(msg)) + msg = buf_msg(skb); + offset = msg_hdr_sz(msg) + *pos; + if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE))) goto none; - *iskb = tipc_buf_acquire(imsz); - if (!*iskb) + + *iskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!*iskb)) + goto none; + skb_pull(*iskb, offset); + imsz = msg_size(buf_msg(*iskb)); + skb_trim(*iskb, imsz); + if (unlikely(!tipc_msg_validate(*iskb))) goto none; - skb_copy_to_linear_data(*iskb, imsg, imsz); *pos += align(imsz); return true; none: kfree_skb(skb); + kfree_skb(*iskb); *iskb = NULL; return false; } /** * tipc_msg_make_bundle(): Create bundle buf and append message to its tail - * @list: the buffer chain - * @skb: buffer to be appended and replaced + * @list: the buffer chain, where head is the buffer to replace/append + * @skb: buffer to be created, appended to and returned in case of success + * @msg: message to be appended * @mtu: max allowable size for the bundle buffer, inclusive header * @dnode: destination node for message. 
(Not always present in header) - * Replaces buffer if successful * Returns true if success, otherwise false */ -bool tipc_msg_make_bundle(struct sk_buff_head *list, - struct sk_buff *skb, u32 mtu, u32 dnode) +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode) { - struct sk_buff *bskb; + struct sk_buff *_skb; struct tipc_msg *bmsg; - struct tipc_msg *msg = buf_msg(skb); u32 msz = msg_size(msg); u32 max = mtu - INT_H_SIZE; if (msg_user(msg) == MSG_FRAGMENTER) return false; - if (msg_user(msg) == CHANGEOVER_PROTOCOL) + if (msg_user(msg) == TUNNEL_PROTOCOL) return false; if (msg_user(msg) == BCAST_PROTOCOL) return false; if (msz > (max / 2)) return false; - bskb = tipc_buf_acquire(max); - if (!bskb) + _skb = tipc_buf_acquire(max); + if (!_skb) return false; - skb_trim(bskb, INT_H_SIZE); - bmsg = buf_msg(bskb); + skb_trim(_skb, INT_H_SIZE); + bmsg = buf_msg(_skb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); + if (msg_isdata(msg)) + msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); + else + msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); - TIPC_SKB_CB(bskb)->bundling = true; - __skb_queue_tail(list, bskb); - return tipc_msg_bundle(list, skb, mtu); + tipc_msg_bundle(_skb, msg, mtu); + *skb = _skb; + return true; } /** @@ -415,21 +473,17 @@ bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, int err) { struct tipc_msg *msg = buf_msg(buf); - uint imp = msg_importance(msg); struct tipc_msg ohdr; uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE); if (skb_linearize(buf)) goto exit; + msg = buf_msg(buf); if (msg_dest_droppable(msg)) goto exit; if (msg_errcode(msg)) goto exit; - memcpy(&ohdr, msg, msg_hdr_sz(msg)); - imp = min_t(uint, imp + 1, TIPC_CRITICAL_IMPORTANCE); - if (msg_isdata(msg)) - msg_set_importance(msg, imp); msg_set_errcode(msg, err); 
msg_set_origport(msg, msg_destport(&ohdr)); msg_set_destport(msg, msg_origport(&ohdr)); @@ -462,15 +516,18 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, { struct tipc_msg *msg = buf_msg(skb); u32 dport; + u32 own_addr = tipc_own_addr(net); if (!msg_isdata(msg)) return false; if (!msg_named(msg)) return false; + if (msg_errcode(msg)) + return false; *err = -TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; - if (msg_reroute_cnt(msg) > 0) + if (msg_reroute_cnt(msg)) return false; *dnode = addr_domain(net, msg_lookup_scope(msg)); dport = tipc_nametbl_translate(net, msg_nametype(msg), @@ -478,6 +535,8 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, if (!dport) return false; msg_incr_reroute_cnt(msg); + if (*dnode != own_addr) + msg_set_prevnode(msg, own_addr); msg_set_destnode(msg, *dnode); msg_set_destport(msg, dport); *err = TIPC_OK; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 9ace47f44a69..19c45fb66238 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, 2014, Ericsson AB + * Copyright (c) 2000-2007, 2014-2015 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. 
* @@ -54,6 +54,8 @@ struct plist; * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ +#define TIPC_SYSTEM_IMPORTANCE 4 + /* * Payload message types @@ -64,6 +66,19 @@ struct plist; #define TIPC_DIRECT_MSG 3 /* + * Internal message users + */ +#define BCAST_PROTOCOL 5 +#define MSG_BUNDLER 6 +#define LINK_PROTOCOL 7 +#define CONN_MANAGER 8 +#define TUNNEL_PROTOCOL 10 +#define NAME_DISTRIBUTOR 11 +#define MSG_FRAGMENTER 12 +#define LINK_CONFIG 13 +#define SOCK_WAKEUP 14 /* pseudo user */ + +/* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ @@ -76,7 +91,7 @@ struct plist; #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) -#define TIPC_MEDIA_ADDR_OFFSET 5 +#define TIPC_MEDIA_INFO_OFFSET 5 /** * TIPC message buffer code @@ -87,12 +102,12 @@ struct plist; * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields * are word aligned for quicker access */ -#define BUF_HEADROOM LL_MAX_HEADER +#define BUF_HEADROOM (LL_MAX_HEADER + 48) struct tipc_skb_cb { void *handle; struct sk_buff *tail; - bool deferred; + bool validated; bool wakeup_pending; bool bundling; u16 chain_sz; @@ -170,16 +185,6 @@ static inline void msg_set_user(struct tipc_msg *m, u32 n) msg_set_bits(m, 0, 25, 0xf, n); } -static inline u32 msg_importance(struct tipc_msg *m) -{ - return msg_bits(m, 0, 25, 0xf); -} - -static inline void msg_set_importance(struct tipc_msg *m, u32 i) -{ - msg_set_user(m, i); -} - static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; @@ -235,6 +240,15 @@ static inline void msg_set_size(struct tipc_msg *m, u32 sz) m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } +static inline unchar *msg_data(struct tipc_msg *m) +{ + return ((unchar *)m) + msg_hdr_sz(m); +} + +static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +{ + return (struct tipc_msg *)msg_data(m); +} /* * Word 1 @@ -299,12 +313,12 @@ static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) 
msg_set_bits(m, 1, 19, 0x3, n); } -static inline u32 msg_bcast_ack(struct tipc_msg *m) +static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } -static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) +static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } @@ -313,22 +327,22 @@ static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) /* * Word 2 */ -static inline u32 msg_ack(struct tipc_msg *m) +static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } -static inline void msg_set_ack(struct tipc_msg *m, u32 n) +static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } -static inline u32 msg_seqno(struct tipc_msg *m) +static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } -static inline void msg_set_seqno(struct tipc_msg *m, u32 n) +static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } @@ -336,6 +350,29 @@ static inline void msg_set_seqno(struct tipc_msg *m, u32 n) /* * Words 3-10 */ +static inline u32 msg_importance(struct tipc_msg *m) +{ + int usr = msg_user(m); + + if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) + return usr; + if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) + return msg_bits(m, 5, 13, 0x7); + return TIPC_SYSTEM_IMPORTANCE; +} + +static inline void msg_set_importance(struct tipc_msg *m, u32 i) +{ + int usr = msg_user(m); + + if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) + msg_set_bits(m, 5, 13, 0x7, i); + else if (i < TIPC_SYSTEM_IMPORTANCE) + msg_set_user(m, i); + else + pr_warn("Trying to set illegal importance in message\n"); +} + static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); @@ -348,6 +385,8 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) static inline u32 msg_origport(struct tipc_msg *m) { + if (msg_user(m) == 
MSG_FRAGMENTER) + m = msg_get_wrapped(m); return msg_word(m, 4); } @@ -443,35 +482,11 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) msg_set_word(m, 10, n); } -static inline unchar *msg_data(struct tipc_msg *m) -{ - return ((unchar *)m) + msg_hdr_sz(m); -} - -static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) -{ - return (struct tipc_msg *)msg_data(m); -} - /* * Constants and routines used to read and write TIPC internal message headers */ /* - * Internal message users - */ -#define BCAST_PROTOCOL 5 -#define MSG_BUNDLER 6 -#define LINK_PROTOCOL 7 -#define CONN_MANAGER 8 -#define ROUTE_DISTRIBUTOR 9 /* obsoleted */ -#define CHANGEOVER_PROTOCOL 10 -#define NAME_DISTRIBUTOR 11 -#define MSG_FRAGMENTER 12 -#define LINK_CONFIG 13 -#define SOCK_WAKEUP 14 /* pseudo user */ - -/* * Connection management protocol message types */ #define CONN_PROBE 0 @@ -501,8 +516,8 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) /* * Changeover tunnel message types */ -#define DUPLICATE_MSG 0 -#define ORIGINAL_MSG 1 +#define SYNCH_MSG 0 +#define FAILOVER_MSG 1 /* * Config protocol message types @@ -510,7 +525,6 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 - /* * Word 1 */ @@ -534,6 +548,24 @@ static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 0, 0xffff, n); } +static inline u32 msg_node_capabilities(struct tipc_msg *m) +{ + return msg_bits(m, 1, 15, 0x1fff); +} + +static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 15, 0x1fff, n); +} + +static inline bool msg_dup(struct tipc_msg *m) +{ + if (likely(msg_user(m) != TUNNEL_PROTOCOL)) + return false; + if (msg_type(m) != SYNCH_MSG) + return false; + return true; +} /* * Word 2 @@ -688,7 +720,7 @@ static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) static inline char *msg_media_addr(struct tipc_msg *m) { - return (char 
*)&m->hdr[TIPC_MEDIA_ADDR_OFFSET]; + return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } /* @@ -734,21 +766,8 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -static inline u32 tipc_msg_tot_importance(struct tipc_msg *m) -{ - if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) - return msg_importance(msg_get_wrapped(m)); - return msg_importance(m); -} - -static inline u32 msg_tot_origport(struct tipc_msg *m) -{ - if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) - return msg_origport(msg_get_wrapped(m)); - return msg_origport(m); -} - struct sk_buff *tipc_buf_acquire(u32 size); +bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, int err); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, @@ -757,9 +776,9 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); -bool tipc_msg_bundle(struct sk_buff_head *list, struct sk_buff *skb, u32 mtu); -bool tipc_msg_make_bundle(struct sk_buff_head *list, - struct sk_buff *skb, u32 mtu, u32 dnode); +bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); +bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, + u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); @@ -767,6 +786,11 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode, int *err); struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list); +static inline u16 buf_seqno(struct sk_buff *skb) +{ + return msg_seqno(buf_msg(skb)); +} + /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be 
peeked in * Returns pointer to first buffer in list, if any diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index fcb07915aaac..41e7b7e4dda0 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -98,7 +98,7 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb) continue; if (!tipc_node_active_links(node)) continue; - oskb = skb_copy(skb, GFP_ATOMIC); + oskb = pskb_copy(skb, GFP_ATOMIC); if (!oskb) break; msg_set_destnode(buf_msg(oskb), dnode); @@ -244,6 +244,7 @@ static void tipc_publ_subscribe(struct net *net, struct publication *publ, tipc_node_lock(node); list_add_tail(&publ->nodesub_list, &node->publ_list); tipc_node_unlock(node); + tipc_node_put(node); } static void tipc_publ_unsubscribe(struct net *net, struct publication *publ, @@ -258,6 +259,7 @@ static void tipc_publ_unsubscribe(struct net *net, struct publication *publ, tipc_node_lock(node); list_del_init(&publ->nodesub_list); tipc_node_unlock(node); + tipc_node_put(node); } /** diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 105ba7adf06f..0f47f08bf38f 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -330,13 +330,9 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, /* Any subscriptions waiting for notification? 
*/ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscr_report_overlap(s, - publ->lower, - publ->upper, - TIPC_PUBLISHED, - publ->ref, - publ->node, - created_subseq); + tipc_subscrp_report_overlap(s, publ->lower, publ->upper, + TIPC_PUBLISHED, publ->ref, + publ->node, created_subseq); } return publ; } @@ -404,13 +400,9 @@ found: /* Notify any waiting subscriptions */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscr_report_overlap(s, - publ->lower, - publ->upper, - TIPC_WITHDRAWN, - publ->ref, - publ->node, - removed_subseq); + tipc_subscrp_report_overlap(s, publ->lower, publ->upper, + TIPC_WITHDRAWN, publ->ref, + publ->node, removed_subseq); } return publ; @@ -432,19 +424,17 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscr_overlap(s, sseq->lower, sseq->upper)) { + if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; list_for_each_entry(crs, &info->zone_list, zone_list) { - tipc_subscr_report_overlap(s, - sseq->lower, - sseq->upper, - TIPC_PUBLISHED, - crs->ref, - crs->node, - must_report); + tipc_subscrp_report_overlap(s, sseq->lower, + sseq->upper, + TIPC_PUBLISHED, + crs->ref, crs->node, + must_report); must_report = 0; } } @@ -811,8 +801,8 @@ static void tipc_purge_publications(struct net *net, struct name_seq *seq) sseq = seq->sseqs; info = sseq->info; list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) { - tipc_nametbl_remove_publ(net, publ->type, publ->lower, - publ->node, publ->ref, publ->key); + tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node, + publ->ref, publ->key); kfree_rcu(publ, rcu); } hlist_del_init_rcu(&seq->ns_list); diff --git a/net/tipc/net.c b/net/tipc/net.c index a54f3cbe2246..d6d1399ae229 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -40,6 +40,7 @@ #include "subscr.h" 
#include "socket.h" #include "node.h" +#include "bcast.h" static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index ce9121e8e990..53e0fee80086 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg { int rep_type; int rep_size; int req_type; + struct net *net; struct sk_buff *rep; struct tlv_desc *req; struct sock *dst_sk; @@ -68,7 +69,8 @@ struct tipc_nl_compat_cmd_dump { struct tipc_nl_compat_cmd_doit { int (*doit)(struct sk_buff *skb, struct genl_info *info); - int (*transcode)(struct sk_buff *skb, struct tipc_nl_compat_msg *msg); + int (*transcode)(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg); }; static int tipc_skb_tailroom(struct sk_buff *skb) @@ -281,7 +283,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (!trans_buf) return -ENOMEM; - err = (*cmd->transcode)(trans_buf, msg); + err = (*cmd->transcode)(cmd, trans_buf, msg); if (err) goto trans_out; @@ -353,7 +355,8 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, nla_len(bearer[TIPC_NLA_BEARER_NAME])); } -static int tipc_nl_compat_bearer_enable(struct sk_buff *skb, +static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { struct nlattr *prop; @@ -385,7 +388,8 @@ static int tipc_nl_compat_bearer_enable(struct sk_buff *skb, return 0; } -static int tipc_nl_compat_bearer_disable(struct sk_buff *skb, +static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; @@ -576,11 +580,81 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, &link_info, sizeof(link_info)); } -static int tipc_nl_compat_link_set(struct sk_buff *skb, - struct 
tipc_nl_compat_msg *msg) +static int __tipc_add_link_prop(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg, + struct tipc_link_config *lc) +{ + switch (msg->cmd) { + case TIPC_CMD_SET_LINK_PRI: + return nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value)); + case TIPC_CMD_SET_LINK_TOL: + return nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value)); + case TIPC_CMD_SET_LINK_WINDOW: + return nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value)); + } + + return -EINVAL; +} + +static int tipc_nl_compat_media_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) { - struct nlattr *link; struct nlattr *prop; + struct nlattr *media; + struct tipc_link_config *lc; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + media = nla_nest_start(skb, TIPC_NLA_MEDIA); + if (!media) + return -EMSGSIZE; + + if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) + return -EMSGSIZE; + + prop = nla_nest_start(skb, TIPC_NLA_MEDIA_PROP); + if (!prop) + return -EMSGSIZE; + + __tipc_add_link_prop(skb, msg, lc); + nla_nest_end(skb, prop); + nla_nest_end(skb, media); + + return 0; +} + +static int tipc_nl_compat_bearer_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct nlattr *prop; + struct nlattr *bearer; + struct tipc_link_config *lc; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + bearer = nla_nest_start(skb, TIPC_NLA_BEARER); + if (!bearer) + return -EMSGSIZE; + + if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) + return -EMSGSIZE; + + prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP); + if (!prop) + return -EMSGSIZE; + + __tipc_add_link_prop(skb, msg, lc); + nla_nest_end(skb, prop); + nla_nest_end(skb, bearer); + + return 0; +} + +static int __tipc_nl_compat_link_set(struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct nlattr *prop; + struct nlattr *link; struct tipc_link_config *lc; lc = (struct tipc_link_config *)TLV_DATA(msg->req); @@ -596,24 +670,40 @@ static int tipc_nl_compat_link_set(struct sk_buff *skb, if 
(!prop) return -EMSGSIZE; - if (msg->cmd == TIPC_CMD_SET_LINK_PRI) { - if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value))) - return -EMSGSIZE; - } else if (msg->cmd == TIPC_CMD_SET_LINK_TOL) { - if (nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value))) - return -EMSGSIZE; - } else if (msg->cmd == TIPC_CMD_SET_LINK_WINDOW) { - if (nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value))) - return -EMSGSIZE; - } - + __tipc_add_link_prop(skb, msg, lc); nla_nest_end(skb, prop); nla_nest_end(skb, link); return 0; } -static int tipc_nl_compat_link_reset_stats(struct sk_buff *skb, +static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, + struct tipc_nl_compat_msg *msg) +{ + struct tipc_link_config *lc; + struct tipc_bearer *bearer; + struct tipc_media *media; + + lc = (struct tipc_link_config *)TLV_DATA(msg->req); + + media = tipc_media_find(lc->name); + if (media) { + cmd->doit = &tipc_nl_media_set; + return tipc_nl_compat_media_set(skb, msg); + } + + bearer = tipc_bearer_find(msg->net, lc->name); + if (bearer) { + cmd->doit = &tipc_nl_bearer_set; + return tipc_nl_compat_bearer_set(skb, msg); + } + + return __tipc_nl_compat_link_set(skb, msg); +} + +static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { char *name; @@ -851,7 +941,8 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, sizeof(node_info)); } -static int tipc_nl_compat_net_set(struct sk_buff *skb, +static int tipc_nl_compat_net_set(struct tipc_nl_compat_cmd_doit *cmd, + struct sk_buff *skb, struct tipc_nl_compat_msg *msg) { u32 val; @@ -1007,7 +1098,6 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) struct nlmsghdr *req_nlh; struct nlmsghdr *rep_nlh; struct tipc_genlmsghdr *req_userhdr = info->userhdr; - struct net *net = genl_info_net(info); memset(&msg, 0, sizeof(msg)); @@ -1015,6 +1105,7 @@ static int tipc_nl_compat_recv(struct 
sk_buff *skb, struct genl_info *info) msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN; msg.cmd = req_userhdr->cmd; msg.dst_sk = info->dst_sk; + msg.net = genl_info_net(info); if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN); @@ -1030,7 +1121,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) } err = tipc_nl_compat_handle(&msg); - if (err == -EOPNOTSUPP) + if ((err == -EOPNOTSUPP) || (err == -EPERM)) msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); else if (err == -EINVAL) msg.rep = tipc_get_err_tlv(TIPC_CFG_TLV_ERROR); @@ -1043,7 +1134,7 @@ send: rep_nlh = nlmsg_hdr(msg.rep); memcpy(rep_nlh, info->nlhdr, len); rep_nlh->nlmsg_len = msg.rep->len; - genlmsg_unicast(net, msg.rep, NETLINK_CB(skb).portid); + genlmsg_unicast(msg.net, msg.rep, NETLINK_CB(skb).portid); return err; } diff --git a/net/tipc/node.c b/net/tipc/node.c index 86152de8248d..0b1d61a5f853 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2014, Ericsson AB + * Copyright (c) 2000-2006, 2012-2015, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. 
* @@ -39,9 +39,11 @@ #include "node.h" #include "name_distr.h" #include "socket.h" +#include "bcast.h" static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); +static void tipc_node_delete(struct tipc_node *node); struct tipc_sock_conn { u32 port; @@ -67,6 +69,23 @@ static unsigned int tipc_hashfn(u32 addr) return addr & (NODE_HTABLE_SIZE - 1); } +static void tipc_node_kref_release(struct kref *kref) +{ + struct tipc_node *node = container_of(kref, struct tipc_node, kref); + + tipc_node_delete(node); +} + +void tipc_node_put(struct tipc_node *node) +{ + kref_put(&node->kref, tipc_node_kref_release); +} + +static void tipc_node_get(struct tipc_node *node) +{ + kref_get(&node->kref); +} + /* * tipc_node_find - locate specified node object, if it exists */ @@ -82,6 +101,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr) hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)], hash) { if (node->addr == addr) { + tipc_node_get(node); rcu_read_unlock(); return node; } @@ -106,12 +126,13 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) } n_ptr->addr = addr; n_ptr->net = net; + kref_init(&n_ptr->kref); spin_lock_init(&n_ptr->lock); INIT_HLIST_NODE(&n_ptr->hash); INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->publ_list); INIT_LIST_HEAD(&n_ptr->conn_sks); - __skb_queue_head_init(&n_ptr->bclink.deferred_queue); + __skb_queue_head_init(&n_ptr->bclink.deferdq); hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n_ptr->addr < temp_node->addr) @@ -120,16 +141,17 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) list_add_tail_rcu(&n_ptr->list, &temp_node->list); n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; n_ptr->signature = INVALID_NODE_SIG; + tipc_node_get(n_ptr); exit: spin_unlock_bh(&tn->node_list_lock); return n_ptr; } -static void tipc_node_delete(struct tipc_net 
*tn, struct tipc_node *n_ptr) +static void tipc_node_delete(struct tipc_node *node) { - list_del_rcu(&n_ptr->list); - hlist_del_rcu(&n_ptr->hash); - kfree_rcu(n_ptr, rcu); + list_del_rcu(&node->list); + hlist_del_rcu(&node->hash); + kfree_rcu(node, rcu); } void tipc_node_stop(struct net *net) @@ -139,7 +161,7 @@ void tipc_node_stop(struct net *net) spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) - tipc_node_delete(tn, node); + tipc_node_put(node); spin_unlock_bh(&tn->node_list_lock); } @@ -147,6 +169,7 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; + int err = 0; if (in_own_node(net, dnode)) return 0; @@ -157,8 +180,10 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); - if (!conn) - return -EHOSTUNREACH; + if (!conn) { + err = -EHOSTUNREACH; + goto exit; + } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; @@ -166,7 +191,9 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) tipc_node_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_unlock(node); - return 0; +exit: + tipc_node_put(node); + return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) @@ -189,6 +216,7 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) kfree(conn); } tipc_node_unlock(node); + tipc_node_put(node); } /** @@ -227,8 +255,8 @@ void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) active[0] = active[1] = l_ptr; exit: /* Leave room for changeover header when returning 'mtu' to users: */ - n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; } /** @@ -292,11 +320,10 @@ void 
tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) /* Leave room for changeover header when returning 'mtu' to users: */ if (active[0]) { - n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; return; } - /* Loopback link went down? No fragmentation needed from now on. */ if (n_ptr->addr == tn->own_addr) { n_ptr->act_mtus[0] = MAX_MSG_SIZE; @@ -354,7 +381,7 @@ static void node_lost_contact(struct tipc_node *n_ptr) /* Flush broadcast link info associated with lost node */ if (n_ptr->bclink.recv_permitted) { - __skb_queue_purge(&n_ptr->bclink.deferred_queue); + __skb_queue_purge(&n_ptr->bclink.deferdq); if (n_ptr->bclink.reasm_buf) { kfree_skb(n_ptr->bclink.reasm_buf); @@ -367,18 +394,17 @@ static void node_lost_contact(struct tipc_node *n_ptr) n_ptr->bclink.recv_permitted = false; } - /* Abort link changeover */ + /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link *l_ptr = n_ptr->links[i]; if (!l_ptr) continue; - l_ptr->reset_checkpoint = l_ptr->next_in_no; - l_ptr->exp_msg_count = 0; + l_ptr->flags &= ~LINK_FAILINGOVER; + l_ptr->failover_checkpt = 0; + l_ptr->failover_pkts = 0; + kfree_skb(l_ptr->failover_skb); + l_ptr->failover_skb = NULL; tipc_link_reset_fragments(l_ptr); - - /* Link marked for deletion after failover? 
=> do it now */ - if (l_ptr->flags & LINK_STOPPED) - tipc_link_delete(l_ptr); } n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN; @@ -417,19 +443,25 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; + int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); - if ((bearer_id >= MAX_BEARERS) || !node) - return -EINVAL; + if (!node) + return err; + + if (bearer_id >= MAX_BEARERS) + goto exit; + tipc_node_lock(node); link = node->links[bearer_id]; if (link) { strncpy(linkname, link->name, len); - tipc_node_unlock(node); - return 0; + err = 0; } +exit: tipc_node_unlock(node); - return -EINVAL; + tipc_node_put(node); + return err; } void tipc_node_unlock(struct tipc_node *node) @@ -459,7 +491,7 @@ void tipc_node_unlock(struct tipc_node *node) TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP | TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT | - TIPC_NAMED_MSG_EVT); + TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET); spin_unlock_bh(&node->lock); @@ -488,6 +520,9 @@ void tipc_node_unlock(struct tipc_node *node) if (flags & TIPC_BCAST_MSG_EVT) tipc_bclink_input(net); + + if (flags & TIPC_BCAST_RESET) + tipc_link_reset_all(node); } /* Caller should hold node lock for the passed node */ @@ -542,17 +577,21 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); - - if (last_addr && !tipc_node_find(net, last_addr)) { - rcu_read_unlock(); - /* We never set seq or call nl_dump_check_consistent() this - * means that setting prev_seq here will cause the consistence - * check to fail in the netlink callback handler. Resulting in - * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if - * the node state changed while we released the lock. 
- */ - cb->prev_seq = 1; - return -EPIPE; + if (last_addr) { + node = tipc_node_find(net, last_addr); + if (!node) { + rcu_read_unlock(); + /* We never set seq or call nl_dump_check_consistent() + * this means that setting prev_seq here will cause the + * consistence check to fail in the netlink callback + * handler. Resulting in the NLMSG_DONE message having + * the NLM_F_DUMP_INTR flag set if the node state + * changed while we released the lock. + */ + cb->prev_seq = 1; + return -EPIPE; + } + tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { diff --git a/net/tipc/node.h b/net/tipc/node.h index 3d18c66b7f78..5a834cf142c8 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,8 +45,6 @@ /* Out-of-range value for node signature */ #define INVALID_NODE_SIG 0x10000 -#define NODE_HTABLE_SIZE 512 - /* Flags used to take different actions according to flag type * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down @@ -64,7 +62,8 @@ enum { TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7), TIPC_NAMED_MSG_EVT = (1 << 8), - TIPC_BCAST_MSG_EVT = (1 << 9) + TIPC_BCAST_MSG_EVT = (1 << 9), + TIPC_BCAST_RESET = (1 << 10) }; /** @@ -84,7 +83,7 @@ struct tipc_node_bclink { u32 last_sent; u32 oos_state; u32 deferred_size; - struct sk_buff_head deferred_queue; + struct sk_buff_head deferdq; struct sk_buff *reasm_buf; int inputq_map; bool recv_permitted; @@ -93,6 +92,7 @@ struct tipc_node_bclink { /** * struct tipc_node - TIPC node structure * @addr: network address of node + * @ref: reference counter to node object * @lock: spinlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain @@ -106,6 +106,7 @@ struct tipc_node_bclink { * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number 
of links to node + * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @publ_list: list of publications @@ -113,6 +114,7 @@ struct tipc_node_bclink { */ struct tipc_node { u32 addr; + struct kref kref; spinlock_t lock; struct net *net; struct hlist_node hash; @@ -125,7 +127,8 @@ struct tipc_node { struct tipc_node_bclink bclink; struct list_head list; int link_cnt; - int working_links; + u16 working_links; + u16 capabilities; u32 signature; u32 link_id; struct list_head publ_list; @@ -134,6 +137,7 @@ struct tipc_node { }; struct tipc_node *tipc_node_find(struct net *net, u32 addr); +void tipc_node_put(struct tipc_node *node); struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); @@ -168,10 +172,12 @@ static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) node = tipc_node_find(net, addr); - if (likely(node)) + if (likely(node)) { mtu = node->act_mtus[selector & 1]; - else + tipc_node_put(node); + } else { mtu = MAX_MSG_SIZE; + } return mtu; } diff --git a/net/tipc/server.c b/net/tipc/server.c index eadd4ed45905..922e04a43396 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -37,11 +37,13 @@ #include "core.h" #include "socket.h" #include <net/sock.h> +#include <linux/module.h> /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 #define MAX_RECV_MSG_COUNT 25 #define CF_CONNECTED 1 +#define CF_SERVER 2 #define sock2con(x) ((struct tipc_conn *)(x)->sk_user_data) @@ -88,9 +90,19 @@ static void tipc_clean_outqueues(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); + struct sockaddr_tipc *saddr = con->server->saddr; + struct socket *sock = con->sock; + struct sock *sk; 
- if (con->sock) { - tipc_sock_release_local(con->sock); + if (sock) { + sk = sock->sk; + if (test_bit(CF_SERVER, &con->flags)) { + __module_get(sock->ops->owner); + __module_get(sk->sk_prot_creator->owner); + } + saddr->scope = -TIPC_NODE_SCOPE; + kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); + sock_release(sock); con->sock = NULL; } @@ -281,7 +293,7 @@ static int tipc_accept_from_sock(struct tipc_conn *con) struct tipc_conn *newcon; int ret; - ret = tipc_sock_accept_local(sock, &newsock, O_NONBLOCK); + ret = kernel_accept(sock, &newsock, O_NONBLOCK); if (ret < 0) return ret; @@ -297,6 +309,10 @@ static int tipc_accept_from_sock(struct tipc_conn *con) /* Notify that new connection is incoming */ newcon->usr_data = s->tipc_conn_new(newcon->conid); + if (!newcon->usr_data) { + sock_release(newsock); + return -ENOMEM; + } /* Wake up receive process in case of 'SYN+' message */ newsock->sk->sk_data_ready(newsock->sk); @@ -309,7 +325,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) struct socket *sock = NULL; int ret; - ret = tipc_sock_create_local(s->net, s->type, &sock); + ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock); if (ret < 0) return NULL; ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, @@ -337,11 +353,31 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) pr_err("Unknown socket type %d\n", s->type); goto create_err; } + + /* As server's listening socket owner and creator is the same module, + * we have to decrease TIPC module reference count to guarantee that + * it remains zero after the server socket is created, otherwise, + * executing "rmmod" command is unable to make TIPC module deleted + * after TIPC module is inserted successfully. 
+ * + * However, the reference count is ever increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of TIPC socket's proto_ops struct; another is to increment the + * reference count of owner of TIPC proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. + */ + module_put(sock->ops->owner); + module_put(sock->sk->sk_prot_creator->owner); + set_bit(CF_SERVER, &con->flags); + return sock; create_err: + kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); - con->sock = NULL; return NULL; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b4d4467d0bb0..3a7567f690f3 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -35,13 +35,13 @@ */ #include <linux/rhashtable.h> -#include <linux/jhash.h> #include "core.h" #include "name_table.h" #include "node.h" #include "link.h" #include "name_distr.h" #include "socket.h" +#include "bcast.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -74,6 +74,7 @@ * @link_cong: non-zero if owner must sleep because of link congestion * @sent_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer + * @remote: 'connected' peer for dgram/rdm * @node: hash table node * @rcu: rcu struct for tipc_sock */ @@ -96,6 +97,7 @@ struct tipc_sock { bool link_cong; uint sent_unacked; uint rcv_unacked; + struct sockaddr_tipc remote; struct rhash_head node; struct rcu_head rcu; }; @@ -114,13 +116,14 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); +static int 
__tipc_send_stream(struct socket *sock, struct msghdr *m, + size_t dsz); +static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; - static struct proto tipc_proto; -static struct proto tipc_proto_kern; static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, @@ -130,6 +133,8 @@ static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } }; +static const struct rhashtable_params tsk_rht_params; + /* * Revised TIPC socket locking policy: * @@ -338,11 +343,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, } /* Allocate socket's protocol area */ - if (!kern) - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); - else - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto_kern); - + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern); if (sk == NULL) return -ENOMEM; @@ -380,75 +381,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock, return 0; } -/** - * tipc_sock_create_local - create TIPC socket from inside TIPC module - * @type: socket type - SOCK_RDM or SOCK_SEQPACKET - * - * We cannot use sock_creat_kern here because it bumps module user count. - * Since socket owner and creator is the same module we must make sure - * that module count remains zero for module local sockets, otherwise - * we cannot do rmmod. - * - * Returns 0 on success, errno otherwise - */ -int tipc_sock_create_local(struct net *net, int type, struct socket **res) -{ - int rc; - - rc = sock_create_lite(AF_TIPC, type, 0, res); - if (rc < 0) { - pr_err("Failed to create kernel socket\n"); - return rc; - } - tipc_sk_create(net, *res, 0, 1); - - return 0; -} - -/** - * tipc_sock_release_local - release socket created by tipc_sock_create_local - * @sock: the socket to be released. 
- * - * Module reference count is not incremented when such sockets are created, - * so we must keep it from being decremented when they are released. - */ -void tipc_sock_release_local(struct socket *sock) -{ - tipc_release(sock); - sock->ops = NULL; - sock_release(sock); -} - -/** - * tipc_sock_accept_local - accept a connection on a socket created - * with tipc_sock_create_local. Use this function to avoid that - * module reference count is inadvertently incremented. - * - * @sock: the accepting socket - * @newsock: reference to the new socket to be created - * @flags: socket flags - */ - -int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, - int flags) -{ - struct sock *sk = sock->sk; - int ret; - - ret = sock_create_lite(sk->sk_family, sk->sk_type, - sk->sk_protocol, newsock); - if (ret < 0) - return ret; - - ret = tipc_accept(sock, *newsock, flags); - if (ret < 0) { - sock_release(*newsock); - return ret; - } - (*newsock)->ops = sock->ops; - return ret; -} - static void tipc_sk_callback(struct rcu_head *head) { struct tipc_sock *tsk = container_of(head, struct tipc_sock, rcu); @@ -478,7 +410,7 @@ static int tipc_release(struct socket *sock) struct net *net; struct tipc_sock *tsk; struct sk_buff *skb; - u32 dnode, probing_state; + u32 dnode; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -516,10 +448,7 @@ static int tipc_release(struct socket *sock) } tipc_sk_withdraw(tsk, 0, NULL); - probing_state = tsk->probing_state; - if (del_timer_sync(&sk->sk_timer) && - probing_state != TIPC_CONN_PROBING) - sock_put(sk); + sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); if (tsk->connected) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, @@ -892,7 +821,6 @@ static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) /** * tipc_sendmsg - send message in connectionless manner - * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure * @m: message to send * @dsz: amount of 
user data to be sent @@ -904,9 +832,21 @@ static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) * * Returns the number of bytes sent on success, or errno otherwise */ -static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, +static int tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) { + struct sock *sk = sock->sk; + int ret; + + lock_sock(sk); + ret = __tipc_sendmsg(sock, m, dsz); + release_sock(sk); + + return ret; +} + +static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) +{ DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); @@ -915,49 +855,40 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, u32 dnode, dport; struct sk_buff_head *pktchain = &sk->sk_write_queue; struct sk_buff *skb; - struct tipc_name_seq *seq = &dest->addr.nameseq; + struct tipc_name_seq *seq; struct iov_iter save; u32 mtu; long timeo; int rc; - if (unlikely(!dest)) - return -EDESTADDRREQ; - - if (unlikely((m->msg_namelen < sizeof(*dest)) || - (dest->family != AF_TIPC))) - return -EINVAL; - if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; - - if (iocb) - lock_sock(sk); - + if (unlikely(!dest)) { + if (tsk->connected && sock->state == SS_READY) + dest = &tsk->remote; + else + return -EDESTADDRREQ; + } else if (unlikely(m->msg_namelen < sizeof(*dest)) || + dest->family != AF_TIPC) { + return -EINVAL; + } if (unlikely(sock->state != SS_READY)) { - if (sock->state == SS_LISTENING) { - rc = -EPIPE; - goto exit; - } - if (sock->state != SS_UNCONNECTED) { - rc = -EISCONN; - goto exit; - } - if (tsk->published) { - rc = -EOPNOTSUPP; - goto exit; - } + if (sock->state == SS_LISTENING) + return -EPIPE; + if (sock->state != SS_UNCONNECTED) + return -EISCONN; + if (tsk->published) + return -EOPNOTSUPP; if (dest->addrtype == TIPC_ADDR_NAME) { tsk->conn_type = dest->addr.name.name.type; tsk->conn_instance = dest->addr.name.name.instance; } } - + seq = 
&dest->addr.nameseq; timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); if (dest->addrtype == TIPC_ADDR_MCAST) { - rc = tipc_sendmcast(sock, seq, m, dsz, timeo); - goto exit; + return tipc_sendmcast(sock, seq, m, dsz, timeo); } else if (dest->addrtype == TIPC_ADDR_NAME) { u32 type = dest->addr.name.name.type; u32 inst = dest->addr.name.name.instance; @@ -972,10 +903,8 @@ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, dport = tipc_nametbl_translate(net, type, inst, &dnode); msg_set_destnode(mhdr, dnode); msg_set_destport(mhdr, dport); - if (unlikely(!dport && !dnode)) { - rc = -EHOSTUNREACH; - goto exit; - } + if (unlikely(!dport && !dnode)) + return -EHOSTUNREACH; } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; msg_set_type(mhdr, TIPC_DIRECT_MSG); @@ -990,7 +919,7 @@ new_mtu: mtu = tipc_node_get_mtu(net, dnode, tsk->portid); rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain); if (rc < 0) - goto exit; + return rc; do { skb = skb_peek(pktchain); @@ -1013,9 +942,6 @@ new_mtu: if (rc) __skb_queue_purge(pktchain); } while (!rc); -exit: - if (iocb) - release_sock(sk); return rc; } @@ -1052,7 +978,6 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) /** * tipc_send_stream - send stream-oriented data - * @iocb: (unused) * @sock: socket structure * @m: data to send * @dsz: total length of data to be transmitted @@ -1062,8 +987,19 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) * Returns the number of bytes sent on success (or partial success), * or errno if no data sent */ -static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t dsz) +static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) +{ + struct sock *sk = sock->sk; + int ret; + + lock_sock(sk); + ret = __tipc_send_stream(sock, m, dsz); + release_sock(sk); + + return ret; +} + +static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) { struct 
sock *sk = sock->sk; struct net *net = sock_net(sk); @@ -1080,7 +1016,7 @@ static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, /* Handle implied connection establishment */ if (unlikely(dest)) { - rc = tipc_sendmsg(iocb, sock, m, dsz); + rc = __tipc_sendmsg(sock, m, dsz); if (dsz && (dsz == rc)) tsk->sent_unacked = 1; return rc; @@ -1088,15 +1024,11 @@ static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, if (dsz > (uint)INT_MAX) return -EMSGSIZE; - if (iocb) - lock_sock(sk); - if (unlikely(sock->state != SS_CONNECTED)) { if (sock->state == SS_DISCONNECTING) - rc = -EPIPE; + return -EPIPE; else - rc = -ENOTCONN; - goto exit; + return -ENOTCONN; } timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); @@ -1108,7 +1040,7 @@ next: send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain); if (unlikely(rc < 0)) - goto exit; + return rc; do { if (likely(!tsk_conn_cong(tsk))) { rc = tipc_link_xmit(net, pktchain, dnode, portid); @@ -1133,15 +1065,12 @@ next: if (rc) __skb_queue_purge(pktchain); } while (!rc); -exit: - if (iocb) - release_sock(sk); + return sent ? 
sent : rc; } /** * tipc_send_packet - send a connection-oriented message - * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure * @m: message to send * @dsz: length of data to be transmitted @@ -1150,13 +1079,12 @@ exit: * * Returns the number of bytes sent on success, or errno otherwise */ -static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t dsz) +static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz) { if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; - return tipc_send_stream(iocb, sock, m, dsz); + return tipc_send_stream(sock, m, dsz); } /* tipc_sk_finish_conn - complete the setup of a connection @@ -1317,12 +1245,12 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; err = -EAGAIN; if (!timeo) break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; } finish_wait(sk_sleep(sk), &wait); *timeop = timeo; @@ -1331,7 +1259,6 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) /** * tipc_recvmsg - receive packet-oriented message - * @iocb: (unused) * @m: descriptor for message info * @buf_len: total size of user buffer area * @flags: receive flags @@ -1341,8 +1268,8 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) * * Returns size of returned message data, errno otherwise */ -static int tipc_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t buf_len, int flags) +static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, + int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); @@ -1426,7 +1353,6 @@ exit: /** * tipc_recv_stream - receive stream-oriented data - * @iocb: (unused) * @m: descriptor for message info * @buf_len: total size of user buffer area * @flags: receive flags @@ -1436,8 
+1362,8 @@ exit: * * Returns size of returned message data, errno otherwise */ -static int tipc_recv_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t buf_len, int flags) +static int tipc_recv_stream(struct socket *sock, struct msghdr *m, + size_t buf_len, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); @@ -1836,13 +1762,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { u32 dnode, dport = 0; - int err = -TIPC_ERR_NO_PORT; + int err; struct sk_buff *skb; struct tipc_sock *tsk; struct tipc_net *tn; struct sock *sk; while (skb_queue_len(inputq)) { + err = -TIPC_ERR_NO_PORT; skb = NULL; dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); @@ -1909,17 +1836,26 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int destlen, int flags) { struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; struct msghdr m = {NULL,}; - long timeout = (flags & O_NONBLOCK) ? 0 : tipc_sk(sk)->conn_timeout; + long timeout = (flags & O_NONBLOCK) ? 
0 : tsk->conn_timeout; socket_state previous; - int res; + int res = 0; lock_sock(sk); - /* For now, TIPC does not allow use of connect() with DGRAM/RDM types */ + /* DGRAM/RDM connect(), just save the destaddr */ if (sock->state == SS_READY) { - res = -EOPNOTSUPP; + if (dst->family == AF_UNSPEC) { + memset(&tsk->remote, 0, sizeof(struct sockaddr_tipc)); + tsk->connected = 0; + } else if (destlen != sizeof(struct sockaddr_tipc)) { + res = -EINVAL; + } else { + memcpy(&tsk->remote, dest, destlen); + tsk->connected = 1; + } goto exit; } @@ -1947,7 +1883,7 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, if (!timeout) m.msg_flags = MSG_DONTWAIT; - res = tipc_sendmsg(NULL, sock, &m, 0); + res = __tipc_sendmsg(sock, &m, 0); if ((res < 0) && (res != -EWOULDBLOCK)) goto exit; @@ -2027,12 +1963,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) err = -EINVAL; if (sock->state != SS_LISTENING) break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; err = -EAGAIN; if (!timeo) break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; } finish_wait(sk_sleep(sk), &wait); return err; @@ -2071,6 +2007,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 1); if (res) goto exit; + security_sk_clone(sock->sk, new_sock->sk); new_sk = new_sock->sk; new_tsock = tipc_sk(new_sk); @@ -2103,7 +2040,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) struct msghdr m = {NULL,}; tsk_advance_rx_queue(sk); - tipc_send_packet(NULL, new_sock, &m, 0); + __tipc_send_stream(new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); @@ -2154,7 +2091,6 @@ restart: TIPC_CONN_SHUTDOWN)) tipc_link_xmit_skb(net, skb, dnode, tsk->portid); - tipc_node_remove_conn(net, dnode, tsk->portid); } else { dnode = tsk_peer_node(tsk); @@ -2205,11 +2141,17 @@ 
static void tipc_sk_timeout(unsigned long data) peer_node = tsk_peer_node(tsk); if (tsk->probing_state == TIPC_CONN_PROBING) { - /* Previous probe not answered -> self abort */ - skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, - TIPC_CONN_MSG, SHORT_H_SIZE, 0, - own_node, peer_node, tsk->portid, - peer_port, TIPC_ERR_NO_PORT); + if (!sock_owned_by_user(sk)) { + sk->sk_socket->state = SS_DISCONNECTING; + tsk->connected = 0; + tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), + tsk_peer_port(tsk)); + sk->sk_state_change(sk); + } else { + /* Try again later */ + sk_reset_timer(sk, &sk->sk_timer, (HZ / 20)); + } + } else { skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, peer_node, own_node, @@ -2312,7 +2254,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) struct tipc_sock *tsk; rcu_read_lock(); - tsk = rhashtable_lookup(&tn->sk_rht, &portid); + tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params); if (tsk) sock_hold(&tsk->sk); rcu_read_unlock(); @@ -2334,7 +2276,8 @@ static int tipc_sk_insert(struct tipc_sock *tsk) portid = TIPC_MIN_PORT; tsk->portid = portid; sock_hold(&tsk->sk); - if (rhashtable_lookup_insert(&tn->sk_rht, &tsk->node)) + if (!rhashtable_lookup_insert_fast(&tn->sk_rht, &tsk->node, + tsk_rht_params)) return 0; sock_put(&tsk->sk); } @@ -2347,26 +2290,27 @@ static void tipc_sk_remove(struct tipc_sock *tsk) struct sock *sk = &tsk->sk; struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id); - if (rhashtable_remove(&tn->sk_rht, &tsk->node)) { + if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) { WARN_ON(atomic_read(&sk->sk_refcnt) == 1); __sock_put(sk); } } +static const struct rhashtable_params tsk_rht_params = { + .nelem_hint = 192, + .head_offset = offsetof(struct tipc_sock, node), + .key_offset = offsetof(struct tipc_sock, portid), + .key_len = sizeof(u32), /* portid */ + .max_size = 1048576, + .min_size = 256, + .automatic_shrinking = true, +}; + int 
tipc_sk_rht_init(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct rhashtable_params rht_params = { - .nelem_hint = 192, - .head_offset = offsetof(struct tipc_sock, node), - .key_offset = offsetof(struct tipc_sock, portid), - .key_len = sizeof(u32), /* portid */ - .hashfn = jhash, - .max_shift = 20, /* 1M */ - .min_shift = 8, /* 256 */ - }; - return rhashtable_init(&tn->sk_rht, &rht_params); + return rhashtable_init(&tn->sk_rht, &tsk_rht_params); } void tipc_sk_rht_destroy(struct net *net) @@ -2609,12 +2553,6 @@ static struct proto tipc_proto = { .sysctl_rmem = sysctl_tipc_rmem }; -static struct proto tipc_proto_kern = { - .name = "TIPC", - .obj_size = sizeof(struct tipc_sock), - .sysctl_rmem = sysctl_tipc_rmem -}; - /** * tipc_socket_init - initialize TIPC socket interface * diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 238f1b7bd9bd..bf6551389522 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -44,10 +44,6 @@ SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) int tipc_socket_init(void); void tipc_socket_stop(void); -int tipc_sock_create_local(struct net *net, int type, struct socket **res); -void tipc_sock_release_local(struct socket *sock); -int tipc_sock_accept_local(struct socket *sock, struct socket **newsock, - int flags); int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 72c339e432aa..350cca33ee0a 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -40,16 +40,21 @@ /** * struct tipc_subscriber - TIPC network topology subscriber + * @kref: reference counter to tipc_subscription object * @conid: connection identifier to server connecting to subscriber * @lock: control access to subscriber - * @subscription_list: list of subscription objects for this subscriber + * @subscrp_list: list of subscription objects for this subscriber */ struct 
tipc_subscriber { + struct kref kref; int conid; spinlock_t lock; - struct list_head subscription_list; + struct list_head subscrp_list; }; +static void tipc_subscrp_delete(struct tipc_subscription *sub); +static void tipc_subscrb_put(struct tipc_subscriber *subscriber); + /** * htohl - convert value to endianness used by destination * @in: value to convert @@ -62,9 +67,9 @@ static u32 htohl(u32 in, int swap) return swap ? swab32(in) : in; } -static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node) +static void tipc_subscrp_send_event(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port_ref, u32 node) { struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; @@ -82,12 +87,13 @@ static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower, } /** - * tipc_subscr_overlap - test for subscription overlap with the given values + * tipc_subscrp_check_overlap - test for subscription overlap with the + * given values * * Returns 1 if there is overlap, otherwise 0. 
*/ -int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper) +int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper) { if (found_lower < sub->seq.lower) found_lower = sub->seq.lower; @@ -98,151 +104,121 @@ int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, return 1; } -/** - * tipc_subscr_report_overlap - issue event if there is subscription overlap - * - * Protected by nameseq.lock in name_table.c - */ -void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, int must) +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper, u32 event, u32 port_ref, + u32 node, int must) { - if (!tipc_subscr_overlap(sub, found_lower, found_upper)) + if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper)) return; if (!must && !(sub->filter & TIPC_SUB_PORTS)) return; - subscr_send_event(sub, found_lower, found_upper, event, port_ref, node); + tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, + node); } -static void subscr_timeout(unsigned long data) +static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; struct tipc_subscriber *subscriber = sub->subscriber; - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - - /* The spin lock per subscriber is used to protect its members */ - spin_lock_bh(&subscriber->lock); - - /* Validate timeout (in case subscription is being cancelled) */ - if (sub->timeout == TIPC_WAIT_FOREVER) { - spin_unlock_bh(&subscriber->lock); - return; - } - /* Unlink subscription from name table */ - tipc_nametbl_unsubscribe(sub); - - /* Unlink subscription from subscriber */ - list_del(&sub->subscription_list); + /* Notify subscriber of timeout */ + tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, + TIPC_SUBSCR_TIMEOUT, 0, 
0); + spin_lock_bh(&subscriber->lock); + tipc_subscrp_delete(sub); spin_unlock_bh(&subscriber->lock); - /* Notify subscriber of timeout */ - subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, - TIPC_SUBSCR_TIMEOUT, 0, 0); + tipc_subscrb_put(subscriber); +} - /* Now destroy subscription */ - kfree(sub); - atomic_dec(&tn->subscription_count); +static void tipc_subscrb_kref_release(struct kref *kref) +{ + struct tipc_subscriber *subcriber = container_of(kref, + struct tipc_subscriber, kref); + + kfree(subcriber); } -/** - * subscr_del - delete a subscription within a subscription list - * - * Called with subscriber lock held. - */ -static void subscr_del(struct tipc_subscription *sub) +static void tipc_subscrb_put(struct tipc_subscriber *subscriber) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + kref_put(&subscriber->kref, tipc_subscrb_kref_release); +} - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscription_list); - kfree(sub); - atomic_dec(&tn->subscription_count); +static void tipc_subscrb_get(struct tipc_subscriber *subscriber) +{ + kref_get(&subscriber->kref); } -/** - * subscr_terminate - terminate communication with a subscriber - * - * Note: Must call it in process context since it might sleep. 
- */ -static void subscr_terminate(struct tipc_subscription *sub) +static struct tipc_subscriber *tipc_subscrb_create(int conid) { - struct tipc_subscriber *subscriber = sub->subscriber; - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + struct tipc_subscriber *subscriber; + + subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC); + if (!subscriber) { + pr_warn("Subscriber rejected, no memory\n"); + return NULL; + } + kref_init(&subscriber->kref); + INIT_LIST_HEAD(&subscriber->subscrp_list); + subscriber->conid = conid; + spin_lock_init(&subscriber->lock); - tipc_conn_terminate(tn->topsrv, subscriber->conid); + return subscriber; } -static void subscr_release(struct tipc_subscriber *subscriber) +static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub; - struct tipc_subscription *sub_temp; + struct tipc_subscription *sub, *temp; spin_lock_bh(&subscriber->lock); - /* Destroy any existing subscriptions for subscriber */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, - subscription_list) { - if (sub->timeout != TIPC_WAIT_FOREVER) { - spin_unlock_bh(&subscriber->lock); - del_timer_sync(&sub->timer); - spin_lock_bh(&subscriber->lock); + list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, + subscrp_list) { + if (del_timer(&sub->timer)) { + tipc_subscrp_delete(sub); + tipc_subscrb_put(subscriber); } - subscr_del(sub); } spin_unlock_bh(&subscriber->lock); - /* Now destroy subscriber */ - kfree(subscriber); + tipc_subscrb_put(subscriber); } -/** - * subscr_cancel - handle subscription cancellation request - * - * Called with subscriber lock held. Routine must temporarily release lock - * to enable the subscription timeout routine to finish without deadlocking; - * the lock is then reclaimed to allow caller to release it upon return. - * - * Note that fields of 's' use subscriber's endianness! 
- */ -static void subscr_cancel(struct tipc_subscr *s, - struct tipc_subscriber *subscriber) +static void tipc_subscrp_delete(struct tipc_subscription *sub) { - struct tipc_subscription *sub; - struct tipc_subscription *sub_temp; - int found = 0; + struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); + kfree(sub); + atomic_dec(&tn->subscription_count); +} + +static void tipc_subscrp_cancel(struct tipc_subscr *s, + struct tipc_subscriber *subscriber) +{ + struct tipc_subscription *sub, *temp; + spin_lock_bh(&subscriber->lock); /* Find first matching subscription, exit if not found */ - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, - subscription_list) { + list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, + subscrp_list) { if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - found = 1; + if (del_timer(&sub->timer)) { + tipc_subscrp_delete(sub); + tipc_subscrb_put(subscriber); + } break; } } - if (!found) - return; - - /* Cancel subscription timer (if used), then delete subscription */ - if (sub->timeout != TIPC_WAIT_FOREVER) { - sub->timeout = TIPC_WAIT_FOREVER; - spin_unlock_bh(&subscriber->lock); - del_timer_sync(&sub->timer); - spin_lock_bh(&subscriber->lock); - } - subscr_del(sub); + spin_unlock_bh(&subscriber->lock); } -/** - * subscr_subscribe - create subscription for subscriber - * - * Called with subscriber lock held. 
- */ -static int subscr_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, - struct tipc_subscription **sub_p) +static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s, + struct tipc_subscriber *subscriber, + struct tipc_subscription **sub_p) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; @@ -254,7 +230,7 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, /* Detect & process a subscription cancellation request */ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - subscr_cancel(s, subscriber); + tipc_subscrp_cancel(s, subscriber); return 0; } @@ -286,64 +262,51 @@ static int subscr_subscribe(struct net *net, struct tipc_subscr *s, kfree(sub); return -EINVAL; } - list_add(&sub->subscription_list, &subscriber->subscription_list); + spin_lock_bh(&subscriber->lock); + list_add(&sub->subscrp_list, &subscriber->subscrp_list); + spin_unlock_bh(&subscriber->lock); sub->subscriber = subscriber; sub->swap = swap; - memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr)); + memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); - if (sub->timeout != TIPC_WAIT_FOREVER) { - setup_timer(&sub->timer, subscr_timeout, (unsigned long)sub); - mod_timer(&sub->timer, jiffies + sub->timeout); - } + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); + if (sub->timeout != TIPC_WAIT_FOREVER) + sub->timeout += jiffies; + if (!mod_timer(&sub->timer, sub->timeout)) + tipc_subscrb_get(subscriber); *sub_p = sub; return 0; } /* Handle one termination request for the subscriber */ -static void subscr_conn_shutdown_event(int conid, void *usr_data) +static void tipc_subscrb_shutdown_cb(int conid, void *usr_data) { - subscr_release((struct tipc_subscriber *)usr_data); + tipc_subscrb_delete((struct tipc_subscriber *)usr_data); } /* Handle one request to create a new subscription for the subscriber */ -static void 
subscr_conn_msg_event(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len) +static void tipc_subscrb_rcv_cb(struct net *net, int conid, + struct sockaddr_tipc *addr, void *usr_data, + void *buf, size_t len) { struct tipc_subscriber *subscriber = usr_data; struct tipc_subscription *sub = NULL; + struct tipc_net *tn = net_generic(net, tipc_net_id); - spin_lock_bh(&subscriber->lock); - if (subscr_subscribe(net, (struct tipc_subscr *)buf, subscriber, - &sub) < 0) { - spin_unlock_bh(&subscriber->lock); - subscr_terminate(sub); - return; - } + tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub); if (sub) tipc_nametbl_subscribe(sub); - spin_unlock_bh(&subscriber->lock); + else + tipc_conn_terminate(tn->topsrv, subscriber->conid); } /* Handle one request to establish a new subscriber */ -static void *subscr_named_msg_event(int conid) +static void *tipc_subscrb_connect_cb(int conid) { - struct tipc_subscriber *subscriber; - - /* Create subscriber object */ - subscriber = kzalloc(sizeof(struct tipc_subscriber), GFP_ATOMIC); - if (subscriber == NULL) { - pr_warn("Subscriber rejected, no memory\n"); - return NULL; - } - INIT_LIST_HEAD(&subscriber->subscription_list); - subscriber->conid = conid; - spin_lock_init(&subscriber->lock); - - return (void *)subscriber; + return (void *)tipc_subscrb_create(conid); } -int tipc_subscr_start(struct net *net) +int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); const char name[] = "topology_server"; @@ -370,9 +333,9 @@ int tipc_subscr_start(struct net *net) topsrv->imp = TIPC_CRITICAL_IMPORTANCE; topsrv->type = SOCK_SEQPACKET; topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); - topsrv->tipc_conn_recvmsg = subscr_conn_msg_event; - topsrv->tipc_conn_new = subscr_named_msg_event; - topsrv->tipc_conn_shutdown = subscr_conn_shutdown_event; + topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb; + topsrv->tipc_conn_new = 
tipc_subscrb_connect_cb; + topsrv->tipc_conn_shutdown = tipc_subscrb_shutdown_cb; strncpy(topsrv->name, name, strlen(name) + 1); tn->topsrv = topsrv; @@ -381,7 +344,7 @@ int tipc_subscr_start(struct net *net) return tipc_server_start(topsrv); } -void tipc_subscr_stop(struct net *net) +void tipc_topsrv_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_server *topsrv = tn->topsrv; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 33488bd9fe3c..92ee18cc5fe6 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -54,7 +54,7 @@ struct tipc_subscriber; * @filter: event filtering to be done for subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list - * @subscription_list: adjacent subscriptions in subscriber's subscription list + * @subscrp_list: adjacent subscriptions in subscriber's subscription list * @server_ref: object reference of server port associated with subscription * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription @@ -67,17 +67,17 @@ struct tipc_subscription { u32 filter; struct timer_list timer; struct list_head nameseq_list; - struct list_head subscription_list; + struct list_head subscrp_list; int swap; struct tipc_event evt; }; -int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper); -void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, int must); -int tipc_subscr_start(struct net *net); -void tipc_subscr_stop(struct net *net); +int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower, + u32 found_upper); +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, u32 event, + u32 port_ref, u32 node, int must); +int tipc_topsrv_start(struct net *net); +void 
tipc_topsrv_stop(struct net *net); #endif diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c new file mode 100644 index 000000000000..66deebc66aa1 --- /dev/null +++ b/net/tipc/udp_media.c @@ -0,0 +1,448 @@ +/* net/tipc/udp_media.c: IP bearer support for TIPC + * + * Copyright (c) 2015, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/socket.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/inet.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/list.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/udp_tunnel.h> +#include <net/addrconf.h> +#include <linux/tipc_netlink.h> +#include "core.h" +#include "bearer.h" + +/* IANA assigned UDP port */ +#define UDP_PORT_DEFAULT 6118 + +static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { + [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, + [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, + [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, +}; + +/** + * struct udp_media_addr - IP/UDP addressing information + * + * This is the bearer level originating address used in neighbor discovery + * messages, and all fields should be in network byte order + */ +struct udp_media_addr { + __be16 proto; + __be16 udp_port; + union { + struct in_addr ipv4; + struct in6_addr ipv6; + }; +}; + +/** + * struct udp_bearer - ip/udp bearer data structure + * @bearer: associated generic tipc bearer + * @ubsock: bearer associated socket + * @ifindex: local address scope + * @work: used to schedule deferred work on a bearer + */ +struct udp_bearer { + struct tipc_bearer __rcu *bearer; + struct socket *ubsock; + u32 
ifindex; + struct work_struct work; +}; + +/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ +static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, + struct udp_media_addr *ua) +{ + memset(addr, 0, sizeof(struct tipc_media_addr)); + addr->media_id = TIPC_MEDIA_TYPE_UDP; + memcpy(addr->value, ua, sizeof(struct udp_media_addr)); + if (ntohs(ua->proto) == ETH_P_IP) { + if (ipv4_is_multicast(ua->ipv4.s_addr)) + addr->broadcast = 1; + } else if (ntohs(ua->proto) == ETH_P_IPV6) { + if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST) + addr->broadcast = 1; + } else { + pr_err("Invalid UDP media address\n"); + } +} + +/* tipc_udp_addr2str - convert ip/udp address to string */ +static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) +{ + struct udp_media_addr *ua = (struct udp_media_addr *)&a->value; + + if (ntohs(ua->proto) == ETH_P_IP) + snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port)); + else if (ntohs(ua->proto) == ETH_P_IPV6) + snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port)); + else + pr_err("Invalid UDP media address\n"); + return 0; +} + +/* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */ +static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a, + char *msg) +{ + struct udp_media_addr *ua; + + ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET); + if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; + tipc_udp_media_addr_set(a, ua); + return 0; +} + +/* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */ +static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) +{ + memset(msg, 0, TIPC_MEDIA_INFO_SIZE); + msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP; + memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value, + sizeof(struct udp_media_addr)); + return 0; +} + +/* tipc_send_msg - enqueue a send request */ +static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, + 
struct tipc_bearer *b, + struct tipc_media_addr *dest) +{ + int ttl, err = 0; + struct udp_bearer *ub; + struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value; + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct sk_buff *clone; + struct rtable *rt; + + clone = skb_clone(skb, GFP_ATOMIC); + skb_set_inner_protocol(clone, htons(ETH_P_TIPC)); + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + err = -ENODEV; + goto tx_error; + } + if (dst->proto == htons(ETH_P_IP)) { + struct flowi4 fl = { + .daddr = dst->ipv4.s_addr, + .saddr = src->ipv4.s_addr, + .flowi4_mark = clone->mark, + .flowi4_proto = IPPROTO_UDP + }; + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto tx_error; + } + ttl = ip4_dst_hoplimit(&rt->dst); + err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, clone, + src->ipv4.s_addr, + dst->ipv4.s_addr, 0, ttl, 0, + src->udp_port, dst->udp_port, + false, true); + if (err < 0) { + ip_rt_put(rt); + goto tx_error; + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct dst_entry *ndst; + struct flowi6 fl6 = { + .flowi6_oif = ub->ifindex, + .daddr = dst->ipv6, + .saddr = src->ipv6, + .flowi6_proto = IPPROTO_UDP + }; + err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6); + if (err) + goto tx_error; + ttl = ip6_dst_hoplimit(ndst); + err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, clone, + ndst->dev, &src->ipv6, + &dst->ipv6, 0, ttl, src->udp_port, + dst->udp_port, false); +#endif + } + return err; + +tx_error: + kfree_skb(clone); + return err; +} + +/* tipc_udp_recv - read data from bearer socket */ +static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_bearer *ub; + struct tipc_bearer *b; + + ub = rcu_dereference_sk_user_data(sk); + if (!ub) { + pr_err_ratelimited("Failed to get UDP bearer reference"); + kfree_skb(skb); + return 0; + } + + skb_pull(skb, sizeof(struct udphdr)); + rcu_read_lock(); + b = rcu_dereference_rtnl(ub->bearer); + + if (b) { + 
tipc_rcv(sock_net(sk), skb, b); + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + +static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) +{ + int err = 0; + struct ip_mreqn mreqn; + struct sock *sk = ub->ubsock->sk; + + if (ntohs(remote->proto) == ETH_P_IP) { + if (!ipv4_is_multicast(remote->ipv4.s_addr)) + return 0; + mreqn.imr_multiaddr = remote->ipv4; + mreqn.imr_ifindex = ub->ifindex; + err = ip_mc_join_group(sk, &mreqn); +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (!ipv6_addr_is_multicast(&remote->ipv6)) + return 0; + err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex, + &remote->ipv6); +#endif + } + return err; +} + +/** + * parse_options - build local/remote addresses from configuration + * @attrs: netlink config data + * @ub: UDP bearer instance + * @local: local bearer IP address/port + * @remote: peer or multicast IP/port + */ +static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, + struct udp_media_addr *local, + struct udp_media_addr *remote) +{ + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + struct sockaddr_storage *sa_local, *sa_remote; + + if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) + goto err; + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, + attrs[TIPC_NLA_BEARER_UDP_OPTS], + tipc_nl_udp_policy)) + goto err; + if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { + sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]); + sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]); + } else { +err: + pr_err("Invalid UDP bearer configuration"); + return -EINVAL; + } + if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) { + struct sockaddr_in *ip4; + + ip4 = (struct sockaddr_in *)sa_local; + local->proto = htons(ETH_P_IP); + local->udp_port = ip4->sin_port; + local->ipv4.s_addr = ip4->sin_addr.s_addr; + + ip4 = (struct sockaddr_in *)sa_remote; + remote->proto = htons(ETH_P_IP); + remote->udp_port = ip4->sin_port; + remote->ipv4.s_addr = ip4->sin_addr.s_addr; + return 0; + +#if 
IS_ENABLED(CONFIG_IPV6) + } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) { + struct sockaddr_in6 *ip6; + + ip6 = (struct sockaddr_in6 *)sa_local; + local->proto = htons(ETH_P_IPV6); + local->udp_port = ip6->sin6_port; + local->ipv6 = ip6->sin6_addr; + ub->ifindex = ip6->sin6_scope_id; + + ip6 = (struct sockaddr_in6 *)sa_remote; + remote->proto = htons(ETH_P_IPV6); + remote->udp_port = ip6->sin6_port; + remote->ipv6 = ip6->sin6_addr; + return 0; +#endif + } + return -EADDRNOTAVAIL; +} + +/** + * tipc_udp_enable - callback to create a new udp bearer instance + * @net: network namespace + * @b: pointer to generic tipc_bearer + * @attrs: netlink bearer configuration + * + * validate the bearer parameters and initialize the udp bearer + * rtnl_lock should be held + */ +static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, + struct nlattr *attrs[]) +{ + int err = -EINVAL; + struct udp_bearer *ub; + struct udp_media_addr *remote; + struct udp_media_addr local = {0}; + struct udp_port_cfg udp_conf = {0}; + struct udp_tunnel_sock_cfg tuncfg = {NULL}; + + ub = kzalloc(sizeof(*ub), GFP_ATOMIC); + if (!ub) + return -ENOMEM; + + remote = (struct udp_media_addr *)&b->bcast_addr.value; + memset(remote, 0, sizeof(struct udp_media_addr)); + err = parse_options(attrs, ub, &local, remote); + if (err) + goto err; + + b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; + b->bcast_addr.broadcast = 1; + rcu_assign_pointer(b->media_ptr, ub); + rcu_assign_pointer(ub->bearer, b); + tipc_udp_media_addr_set(&b->addr, &local); + if (local.proto == htons(ETH_P_IP)) { + struct net_device *dev; + + dev = __ip_dev_find(net, local.ipv4.s_addr, false); + if (!dev) { + err = -ENODEV; + goto err; + } + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + udp_conf.use_udp_checksums = false; + ub->ifindex = dev->ifindex; + b->mtu = dev->mtu - sizeof(struct iphdr) + - sizeof(struct udphdr); +#if IS_ENABLED(CONFIG_IPV6) + } else if (local.proto == 
htons(ETH_P_IPV6)) { + udp_conf.family = AF_INET6; + udp_conf.use_udp6_tx_checksums = true; + udp_conf.use_udp6_rx_checksums = true; + udp_conf.local_ip6 = in6addr_any; + b->mtu = 1280; +#endif + } else { + err = -EAFNOSUPPORT; + goto err; + } + udp_conf.local_udp_port = local.udp_port; + err = udp_sock_create(net, &udp_conf, &ub->ubsock); + if (err) + goto err; + tuncfg.sk_user_data = ub; + tuncfg.encap_type = 1; + tuncfg.encap_rcv = tipc_udp_recv; + tuncfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); + + if (enable_mcast(ub, remote)) + goto err; + return 0; +err: + kfree(ub); + return err; +} + +/* cleanup_bearer - break the socket/bearer association */ +static void cleanup_bearer(struct work_struct *work) +{ + struct udp_bearer *ub = container_of(work, struct udp_bearer, work); + + if (ub->ubsock) + udp_tunnel_sock_release(ub->ubsock); + synchronize_net(); + kfree(ub); +} + +/* tipc_udp_disable - detach bearer from socket */ +static void tipc_udp_disable(struct tipc_bearer *b) +{ + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + pr_err("UDP bearer instance not found\n"); + return; + } + if (ub->ubsock) + sock_set_flag(ub->ubsock->sk, SOCK_DEAD); + RCU_INIT_POINTER(b->media_ptr, NULL); + RCU_INIT_POINTER(ub->bearer, NULL); + + /* sock_release need to be done outside of rtnl lock */ + INIT_WORK(&ub->work, cleanup_bearer); + schedule_work(&ub->work); +} + +struct tipc_media udp_media_info = { + .send_msg = tipc_udp_send_msg, + .enable_media = tipc_udp_enable, + .disable_media = tipc_udp_disable, + .addr2str = tipc_udp_addr2str, + .addr2msg = tipc_udp_addr2msg, + .msg2addr = tipc_udp_msg2addr, + .priority = TIPC_DEF_LINK_PRI, + .tolerance = TIPC_DEF_LINK_TOL, + .window = TIPC_DEF_LINK_WIN, + .type_id = TIPC_MEDIA_TYPE_UDP, + .hwaddr_len = 0, + .name = "udp" +}; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 526b6edab018..03ee4d359f6a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ 
-140,12 +140,17 @@ static struct hlist_head *unix_sockets_unbound(void *addr) #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - memcpy(UNIXSID(skb), &scm->secid, sizeof(u32)); + UNIXCB(skb).secid = scm->secid; } static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { - scm->secid = *UNIXSID(skb); + scm->secid = UNIXCB(skb).secid; +} + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return (scm->secid == UNIXCB(skb).secid); } #else static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -153,6 +158,11 @@ static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) { } + +static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) +{ + return true; +} #endif /* CONFIG_SECURITY_NETWORK */ /* @@ -305,7 +315,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->path.dentry; - if (dentry && dentry->d_inode == i) { + if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); goto found; } @@ -516,20 +526,20 @@ static unsigned int unix_dgram_poll(struct file *, struct socket *, poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); -static int unix_stream_sendmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t); -static int unix_stream_recvmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t, int); -static int unix_dgram_sendmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t); -static int unix_dgram_recvmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t, int); +static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); +static int unix_stream_recvmsg(struct socket *, 
struct msghdr *, size_t, int); +static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, + size_t size, int flags); +static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, + struct pipe_inode_info *, size_t size, + unsigned int flags); +static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); +static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); -static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t); -static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, - struct msghdr *, size_t, int); +static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); +static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, + int); static int unix_set_peek_off(struct sock *sk, int val) { @@ -563,7 +573,8 @@ static const struct proto_ops unix_stream_ops = { .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, - .sendpage = sock_no_sendpage, + .sendpage = unix_stream_sendpage, + .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, }; @@ -625,7 +636,7 @@ static struct proto unix_proto = { */ static struct lock_class_key af_unix_sk_receive_queue_lock_key; -static struct sock *unix_create1(struct net *net, struct socket *sock) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { struct sock *sk = NULL; struct unix_sock *u; @@ -634,7 +645,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); if (!sk) goto out; @@ -693,7 +704,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, 
sock) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -783,7 +794,7 @@ static struct sock *unix_find_other(struct net *net, err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); err = inode_permission(inode, MAY_WRITE); if (err) goto put_fail; @@ -844,7 +855,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) */ err = security_path_mknod(&path, dentry, mode, 0); if (!err) { - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); if (!err) { res->mnt = mntget(path.mnt); res->dentry = dget(dentry); @@ -910,7 +921,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out_up; } addr->hash = UNIX_HASH_SIZE; - hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1); spin_lock(&unix_table_lock); u->path = path; list = &unix_socket_table[hash]; @@ -1093,7 +1104,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL); + newsk = unix_create1(sock_net(sk), NULL, 0); if (newsk == NULL) goto out; @@ -1413,6 +1424,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; + unix_get_secdata(scm, skb); if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1442,8 +1454,8 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, * Send AF_UNIX data. 
*/ -static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); @@ -1508,7 +1520,6 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (err < 0) goto out_free; max_level = err + 1; - unix_get_secdata(&scm, skb); skb_put(skb, len - data_len); skb->data_len = data_len; @@ -1622,8 +1633,8 @@ out: */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) -static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk = sock->sk; struct sock *other = NULL; @@ -1725,8 +1736,103 @@ out_err: return sent ? : err; } -static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, + int offset, size_t size, int flags) +{ + int err = 0; + bool send_sigpipe = true; + struct sock *other, *sk = socket->sk; + struct sk_buff *skb, *newskb = NULL, *tail = NULL; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + other = unix_peer(sk); + if (!other || sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + if (false) { +alloc_skb: + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->readlock); + newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, + &err, 0); + if (!newskb) + return err; + } + + /* we must acquire readlock as we modify already present + * skbs in the sk_receive_queue and mess with skb->len + */ + err = mutex_lock_interruptible(&unix_sk(other)->readlock); + if (err) { + err = flags & MSG_DONTWAIT ? 
-EAGAIN : -ERESTARTSYS; + send_sigpipe = false; + goto err; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + goto err_unlock; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; + goto err_state_unlock; + } + + skb = skb_peek_tail(&other->sk_receive_queue); + if (tail && tail == skb) { + skb = newskb; + } else if (!skb) { + if (newskb) + skb = newskb; + else + goto alloc_skb; + } else if (newskb) { + /* this is fast path, we don't necessarily need to + * call to kfree_skb even though with newskb == NULL + * this - does no harm + */ + consume_skb(newskb); + } + + if (skb_append_pagefrags(skb, page, offset, size)) { + tail = skb; + goto alloc_skb; + } + + skb->len += size; + skb->data_len += size; + skb->truesize += size; + atomic_add(size, &sk->sk_wmem_alloc); + + if (newskb) + __skb_queue_tail(&other->sk_receive_queue, newskb); + + unix_state_unlock(other); + mutex_unlock(&unix_sk(other)->readlock); + + other->sk_data_ready(other); + + return size; + +err_state_unlock: + unix_state_unlock(other); +err_unlock: + mutex_unlock(&unix_sk(other)->readlock); +err: + kfree_skb(newskb); + if (send_sigpipe && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + +static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { int err; struct sock *sk = sock->sk; @@ -1741,19 +1847,18 @@ static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, if (msg->msg_namelen) msg->msg_namelen = 0; - return unix_dgram_sendmsg(kiocb, sock, msg, len); + return unix_dgram_sendmsg(sock, msg, len); } -static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct sock *sk = sock->sk; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; - return 
unix_dgram_recvmsg(iocb, sock, msg, size, flags); + return unix_dgram_recvmsg(sock, msg, size, flags); } static void unix_copy_addr(struct msghdr *msg, struct sock *sk) @@ -1766,9 +1871,8 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk) } } -static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; @@ -1867,8 +1971,9 @@ out: * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last) + struct sk_buff *last, unsigned int last_len) { + struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); @@ -1876,7 +1981,9 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (skb_peek_tail(&sk->sk_receive_queue) != last || + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail != last || + (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -1887,6 +1994,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); + + if (sock_flag(sk, SOCK_DEAD)) + break; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } @@ -1900,39 +2011,50 @@ static unsigned int unix_skb_len(const struct sk_buff *skb) return skb->len - UNIXCB(skb).consumed; } -static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, - int flags) +struct unix_stream_read_state { + int (*recv_actor)(struct sk_buff *, int, int, + struct unix_stream_read_state *); + struct socket *socket; + struct msghdr *msg; + struct pipe_inode_info *pipe; + size_t size; + int flags; + unsigned int splice_flags; +}; + +static 
int unix_stream_read_generic(struct unix_stream_read_state *state) { struct scm_cookie scm; + struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); int copied = 0; + int flags = state->flags; int noblock = flags & MSG_DONTWAIT; - int check_creds = 0; + bool check_creds = false; int target; int err = 0; long timeo; int skip; + size_t size = state->size; + unsigned int last_len; err = -EINVAL; if (sk->sk_state != TCP_ESTABLISHED) goto out; err = -EOPNOTSUPP; - if (flags&MSG_OOB) + if (flags & MSG_OOB) goto out; - target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); + memset(&scm, 0, sizeof(scm)); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - - memset(&scm, 0, sizeof(scm)); - err = mutex_lock_interruptible(&u->readlock); if (unlikely(err)) { /* recvmsg() in non blocking mode is supposed to return -EAGAIN @@ -1947,7 +2069,12 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb, *last; unix_state_lock(sk); + if (sock_flag(sk, SOCK_DEAD)) { + err = -ECONNRESET; + goto unlock; + } last = skb = skb_peek(&sk->sk_receive_queue); + last_len = last ? 
last->len : 0; again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; @@ -1970,16 +2097,17 @@ again: break; mutex_unlock(&u->readlock); - timeo = unix_stream_data_wait(sk, timeo, last); + timeo = unix_stream_data_wait(sk, timeo, last, + last_len); - if (signal_pending(current) - || mutex_lock_interruptible(&u->readlock)) { + if (signal_pending(current) || + mutex_lock_interruptible(&u->readlock)) { err = sock_intr_errno(timeo); goto out; } continue; - unlock: +unlock: unix_state_unlock(sk); break; } @@ -1988,6 +2116,7 @@ again: while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; + last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -1999,23 +2128,27 @@ again: /* Never glue messages from different writers */ if ((UNIXCB(skb).pid != scm.pid) || !uid_eq(UNIXCB(skb).uid, scm.creds.uid) || - !gid_eq(UNIXCB(skb).gid, scm.creds.gid)) + !gid_eq(UNIXCB(skb).gid, scm.creds.gid) || + !unix_secdata_eq(&scm, skb)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - check_creds = 1; + unix_set_secdata(&scm, skb); + check_creds = true; } /* Copy address just once */ - if (sunaddr) { - unix_copy_addr(msg, skb->sk); + if (state->msg && state->msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, + state->msg->msg_name); + unix_copy_addr(state->msg, skb->sk); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, - msg, chunk)) { + chunk = state->recv_actor(skb, skip, chunk, state); + if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; @@ -2053,11 +2186,85 @@ again: } while (size); mutex_unlock(&u->readlock); - scm_recv(sock, msg, &scm, flags); + if (state->msg) + scm_recv(sock, state->msg, &scm, flags); + else + scm_destroy(&scm); out: return copied ? 
: err; } +static int unix_stream_read_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + int ret; + + ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + state->msg, chunk); + return ret ?: chunk; +} + +static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sock, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state); +} + +static ssize_t skb_unix_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + struct unix_sock *u = unix_sk(sk); + + mutex_unlock(&u->readlock); + ret = splice_to_pipe(pipe, spd); + mutex_lock(&u->readlock); + + return ret; +} + +static int unix_stream_splice_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + return skb_splice_bits(skb, state->socket->sk, + UNIXCB(skb).consumed + skip, + state->pipe, chunk, state->splice_flags, + skb_unix_socket_splice); +} + +static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t size, unsigned int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_splice_actor, + .socket = sock, + .pipe = pipe, + .size = size, + .splice_flags = flags, + }; + + if (unlikely(*ppos)) + return -ESPIPE; + + if (sock->file->f_flags & O_NONBLOCK || + flags & SPLICE_F_NONBLOCK) + state.flags = MSG_DONTWAIT; + + return unix_stream_read_generic(&state); +} + static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; diff --git a/net/unix/diag.c b/net/unix/diag.c index ef542fbca9fe..c512f64d5287 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -25,7 +25,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) if (dentry) { struct unix_diag_vfs uv = { - .udiag_vfs_ino = 
dentry->d_inode->i_ino, + .udiag_vfs_ino = d_backing_inode(dentry)->i_ino, .udiag_vfs_dev = dentry->d_sb->s_dev, }; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 99f7012b23b9..a73a226f2d33 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); unsigned int unix_tot_inflight; - struct sock *unix_get_socket(struct file *filp) { struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); - /* - * Socket ? - */ + /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); struct sock *s = sock->sk; - /* - * PF_UNIX ? - */ + /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; } return u_sock; } -/* - * Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. */ void unix_inflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); + if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -142,10 +139,13 @@ void unix_inflight(struct file *fp) void unix_notinflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + if (s) { struct unix_sock *u = unix_sk(s); + spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); + if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; @@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* - * Do we have file descriptors ? - */ + /* Do we have file descriptors ? 
*/ if (UNIXCB(skb).fp) { bool hit = false; - /* - * Process the descriptors of this socket - */ + /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; + while (nfd--) { - /* - * Get the socket the fd matches - * if it indeed does so - */ + /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); + if (sk) { struct unix_sock *u = unix_sk(sk); - /* - * Ignore non-candidates, they could + /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; + func(u); } } @@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { - if (x->sk_state != TCP_LISTEN) + if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); - else { + } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); - /* - * For a listening socket collect the queued embryos + /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); - /* - * An embryo cannot be in-flight, so it's safe + /* An embryo cannot be in-flight, so it's safe * to use the list link. 
*/ BUG_ON(!list_empty(&u->link)); @@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk) static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); - /* - * If this still might be part of a cycle, move it to the end + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ @@ -263,8 +255,7 @@ static bool gc_in_progress; void wait_for_unix_gc(void) { - /* - * If number of inflight sockets is insane, + /* If number of inflight sockets is insane, * force a garbage collect right now. */ if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress) @@ -288,8 +279,7 @@ void unix_gc(void) goto out; gc_in_progress = true; - /* - * First, select candidates for garbage collection. Only + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * @@ -320,15 +310,13 @@ void unix_gc(void) } } - /* - * Now remove all internal in-flight reference to children of + /* Now remove all internal in-flight reference to children of * the candidates. */ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); - /* - * Restore the references for children of all candidates, + /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * @@ -350,8 +338,7 @@ void unix_gc(void) } list_del(&cursor); - /* - * not_cycle_list contains those sockets which do not make up a + /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(¬_cycle_list)) { @@ -360,8 +347,7 @@ void unix_gc(void) list_move_tail(&u->link, &gc_inflight_list); } - /* - * Now gc_candidates contains only garbage. Restore original + /* Now gc_candidates contains only garbage. 
Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 1d0e39c9a3e2..df5fc6b340f1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -581,13 +581,14 @@ struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, - unsigned short type) + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; - sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; @@ -949,8 +950,8 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, return mask; } -static int vsock_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { int err; struct sock *sk; @@ -1062,11 +1063,10 @@ out: return err; } -static int vsock_dgram_recvmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len, int flags) +static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { - return transport->dgram_dequeue(kiocb, vsock_sk(sock->sk), msg, len, - flags); + return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags); } static const struct proto_ops vsock_dgram_ops = { @@ -1505,8 +1505,8 @@ static int vsock_stream_getsockopt(struct socket *sock, return 0; } -static int vsock_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) { struct sock *sk; struct vsock_sock *vsk; @@ -1644,9 +1644,8 @@ out: static int -vsock_stream_recvmsg(struct kiocb *kiocb, - struct socket *sock, - struct msghdr *msg, size_t len, int flags) +vsock_stream_recvmsg(struct socket *sock, struct 
msghdr *msg, size_t len, + int flags) { struct sock *sk; struct vsock_sock *vsk; @@ -1868,7 +1867,7 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM; + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; } static const struct net_proto_family vsock_family_ops = { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 7f3255084a6c..1f63daff3965 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1022,7 +1022,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type); + sk->sk_type, 0); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; @@ -1730,8 +1730,7 @@ static int vmci_transport_dgram_enqueue( return err - sizeof(*dg); } -static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, - struct vsock_sock *vsk, +static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags) { diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 29c8675f9a11..4f5543dd2524 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -175,13 +175,21 @@ config CFG80211_INTERNAL_REGDB Most distributions have a CRDA package. So if unsure, say N. config CFG80211_WEXT - bool "cfg80211 wireless extensions compatibility" + bool "cfg80211 wireless extensions compatibility" if !CFG80211_WEXT_EXPORT depends on CFG80211 select WEXT_CORE + default y if CFG80211_WEXT_EXPORT help Enable this option if you need old userspace for wireless extensions with cfg80211-based drivers. +config CFG80211_WEXT_EXPORT + bool + depends on CFG80211 + help + Drivers should select this option if they require cfg80211's + wext compatibility symbols to be exported. 
+ config LIB80211 tristate default n diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 7aaf7415dc4c..59cabc9bce69 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -698,19 +698,20 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_chandef_usable); /* - * For GO only, check if the channel can be used under permissive conditions - * mandated by the some regulatory bodies, i.e., the channel is marked with - * IEEE80211_CHAN_GO_CONCURRENT and there is an additional station interface + * Check if the channel can be used under permissive conditions mandated by + * some regulatory bodies, i.e., the channel is marked with + * IEEE80211_CHAN_IR_CONCURRENT and there is an additional station interface * associated to an AP on the same channel or on the same UNII band * (assuming that the AP is an authorized master). - * In addition allow the GO to operate on a channel on which indoor operation is + * In addition allow operation on a channel on which indoor operation is * allowed, iff we are currently operating in an indoor environment. 
*/ -static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, +static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, + enum nl80211_iftype iftype, struct ieee80211_channel *chan) { - struct wireless_dev *wdev_iter; - struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx); + struct wireless_dev *wdev; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_RTNL(); @@ -718,32 +719,48 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) return false; + /* only valid for GO and TDLS off-channel (station/p2p-CL) */ + if (iftype != NL80211_IFTYPE_P2P_GO && + iftype != NL80211_IFTYPE_STATION && + iftype != NL80211_IFTYPE_P2P_CLIENT) + return false; + if (regulatory_indoor_allowed() && (chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) return true; - if (!(chan->flags & IEEE80211_CHAN_GO_CONCURRENT)) + if (!(chan->flags & IEEE80211_CHAN_IR_CONCURRENT)) return false; /* * Generally, it is possible to rely on another device/driver to allow - * the GO concurrent relaxation, however, since the device can further + * the IR concurrent relaxation, however, since the device can further * enforce the relaxation (by doing a similar verifications as this), * and thus fail the GO instantiation, consider only the interfaces of * the current registered device. 
*/ - list_for_each_entry(wdev_iter, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wdev_list, list) { struct ieee80211_channel *other_chan = NULL; int r1, r2; - if (wdev_iter->iftype != NL80211_IFTYPE_STATION || - !netif_running(wdev_iter->netdev)) - continue; - - wdev_lock(wdev_iter); - if (wdev_iter->current_bss) - other_chan = wdev_iter->current_bss->pub.channel; - wdev_unlock(wdev_iter); + wdev_lock(wdev); + if (wdev->iftype == NL80211_IFTYPE_STATION && + wdev->current_bss) + other_chan = wdev->current_bss->pub.channel; + + /* + * If a GO already operates on the same GO_CONCURRENT channel, + * this one (maybe the same one) can beacon as well. We allow + * the operation even if the station we relied on with + * GO_CONCURRENT is disconnected now. But then we must make sure + * we're not outdoor on an indoor-only channel. + */ + if (iftype == NL80211_IFTYPE_P2P_GO && + wdev->iftype == NL80211_IFTYPE_P2P_GO && + wdev->beacon_interval && + !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY)) + other_chan = wdev->chandef.chan; + wdev_unlock(wdev); if (!other_chan) continue; @@ -780,25 +797,18 @@ static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev, return false; } -bool cfg80211_reg_can_beacon(struct wiphy *wiphy, - struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype) +static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype, + bool check_no_ir) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); bool res; u32 prohibited_flags = IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_RADAR; - trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype); + trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir); - /* - * Under certain conditions suggested by the some regulatory bodies - * a GO can operate on channels marked with IEEE80211_NO_IR - * so set this flag only if such relaxations are not enabled and - * the conditions are not met. 
- */ - if (iftype != NL80211_IFTYPE_P2P_GO || - !cfg80211_go_permissive_chan(rdev, chandef->chan)) + if (check_no_ir) prohibited_flags |= IEEE80211_CHAN_NO_IR; if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 && @@ -812,8 +822,36 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, trace_cfg80211_return_bool(res); return res; } + +bool cfg80211_reg_can_beacon(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, true); +} EXPORT_SYMBOL(cfg80211_reg_can_beacon); +bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_iftype iftype) +{ + bool check_no_ir; + + ASSERT_RTNL(); + + /* + * Under certain conditions suggested by some regulatory bodies a + * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag + * only if such relaxations are not enabled and the conditions are not + * met. + */ + check_no_ir = !cfg80211_ir_permissive_chan(wiphy, iftype, + chandef->chan); + + return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir); +} +EXPORT_SYMBOL(cfg80211_reg_can_beacon_relax); + int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { diff --git a/net/wireless/core.h b/net/wireless/core.h index 801cd49c5a0c..311eef26bf88 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -222,6 +222,7 @@ struct cfg80211_event { const u8 *ie; size_t ie_len; u16 reason; + bool locally_generated; } dc; struct { u8 bssid[ETH_ALEN]; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index e24fc585c883..4c55fab9b4e4 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -30,7 +30,7 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, return; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, - WLAN_CAPABILITY_IBSS, WLAN_CAPABILITY_IBSS); + IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY_ANY); if 
(WARN_ON(!bss)) return; @@ -533,7 +533,7 @@ int cfg80211_ibss_wext_giwap(struct net_device *dev, else if (wdev->wext.ibss.bssid) memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); else - memset(ap_addr->sa_data, 0, ETH_ALEN); + eth_zero_addr(ap_addr->sa_data); wdev_unlock(wdev); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 2c52b59e43f3..7aae329e2b4e 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -229,7 +229,8 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, return -EALREADY; req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, - WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS); + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); if (!req.bss) return -ENOENT; @@ -296,7 +297,8 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, rdev->wiphy.vht_capa_mod_mask); req->bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len, - WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS); + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); if (!req->bss) return -ENOENT; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b6f84f6a2a09..76b41578a838 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -399,6 +399,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, + [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -638,8 +639,8 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, if ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_GO_CONCURRENT) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_GO_CONCURRENT)) + if ((chan->flags & IEEE80211_CHAN_IR_CONCURRENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_IR_CONCURRENT)) goto 
nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_20MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ)) @@ -1098,8 +1099,6 @@ static int nl80211_send_wowlan(struct sk_buff *msg, if (large && nl80211_send_wowlan_tcp_caps(rdev, msg)) return -ENOBUFS; - /* TODO: send wowlan net detect */ - nla_nest_end(msg, nl_wowlan); return 0; @@ -2004,7 +2003,8 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) { + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, + iftype)) { result = -EINVAL; break; } @@ -2668,7 +2668,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) wdev = rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL80211_ATTR_IFNAME]), - type, err ? NULL : &flags, &params); + NET_NAME_USER, type, err ? NULL : &flags, + &params); if (WARN_ON(!wdev)) { nlmsg_free(msg); return -EPROTO; @@ -3403,8 +3404,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } else if (!nl80211_get_ap_channel(rdev, &params)) return -EINVAL; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef, - wdev->iftype)) + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef, + wdev->iftype)) return -EINVAL; if (info->attrs[NL80211_ATTR_ACL_POLICY]) { @@ -4061,7 +4062,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; break; case CFG80211_STA_MESH_PEER_USER: - if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION) + if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION && + params->plink_action != NL80211_PLINK_ACTION_BLOCK) return -EINVAL; break; } @@ -4968,7 +4970,10 @@ static int parse_reg_rule(struct nlattr *tb[], static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) { char *data = NULL; + bool is_indoor; enum nl80211_user_reg_hint_type user_reg_hint_type; + u32 owner_nlportid; + /* * You should only get this
when cfg80211 hasn't yet initialized @@ -4994,7 +4999,15 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); return regulatory_hint_user(data, user_reg_hint_type); case NL80211_USER_REG_HINT_INDOOR: - return regulatory_hint_indoor_user(); + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + owner_nlportid = info->snd_portid; + is_indoor = !!info->attrs[NL80211_ATTR_REG_INDOOR]; + } else { + owner_nlportid = 0; + is_indoor = true; + } + + return regulatory_hint_indoor(is_indoor, owner_nlportid); default: return -EINVAL; } @@ -5275,7 +5288,7 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); - FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 1, 0xffffffff, + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); if (mask_out) @@ -5653,7 +5666,7 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) } } - r = set_regdom(rd); + r = set_regdom(rd, REGD_SOURCE_CRDA); /* set_regdom took ownership */ rd = NULL; @@ -5693,8 +5706,8 @@ static int nl80211_parse_random_mac(struct nlattr **attrs, int i; if (!attrs[NL80211_ATTR_MAC] && !attrs[NL80211_ATTR_MAC_MASK]) { - memset(mac_addr, 0, ETH_ALEN); - memset(mac_addr_mask, 0, ETH_ALEN); + eth_zero_addr(mac_addr); + eth_zero_addr(mac_addr_mask); mac_addr[0] = 0x2; mac_addr_mask[0] = 0x3; @@ -6480,8 +6493,8 @@ skip_beacons: if (err) return err; - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef, - wdev->iftype)) + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef, + wdev->iftype)) return -EINVAL; err = cfg80211_chandef_dfs_required(wdev->wiphy, @@ -7275,8 +7288,18 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) break; case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: - if (rdev->wiphy.features & NL80211_FEATURE_HT_IBSS) - break; + if
(!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + return -EINVAL; + break; + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + return -EINVAL; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_VHT_IBSS)) + return -EINVAL; + break; default: return -EINVAL; } @@ -7389,8 +7412,8 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) static struct sk_buff * __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, - int approxlen, u32 portid, u32 seq, - enum nl80211_commands cmd, + struct wireless_dev *wdev, int approxlen, + u32 portid, u32 seq, enum nl80211_commands cmd, enum nl80211_attrs attr, const struct nl80211_vendor_cmd_info *info, gfp_t gfp) @@ -7421,6 +7444,16 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, goto nla_put_failure; } + if (wdev) { + if (nla_put_u64(skb, NL80211_ATTR_WDEV, + wdev_id(wdev))) + goto nla_put_failure; + if (wdev->netdev && + nla_put_u32(skb, NL80211_ATTR_IFINDEX, + wdev->netdev->ifindex)) + goto nla_put_failure; + } + data = nla_nest_start(skb, attr); ((void **)skb->cb)[0] = rdev; @@ -7435,6 +7468,7 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, } struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy, + struct wireless_dev *wdev, enum nl80211_commands cmd, enum nl80211_attrs attr, int vendor_event_idx, @@ -7460,7 +7494,7 @@ struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy, return NULL; } - return __cfg80211_alloc_vendor_skb(rdev, approxlen, 0, 0, + return __cfg80211_alloc_vendor_skb(rdev, wdev, approxlen, 0, 0, cmd, attr, info, gfp); } EXPORT_SYMBOL(__cfg80211_alloc_event_skb); @@ -8761,8 +8795,8 @@ static int nl80211_send_wowlan_tcp(struct sk_buff *msg, if (!nl_tcp) return -ENOBUFS; - if (nla_put_be32(msg, NL80211_WOWLAN_TCP_SRC_IPV4, tcp->src) || - nla_put_be32(msg, NL80211_WOWLAN_TCP_DST_IPV4, tcp->dst) 
|| + if (nla_put_in_addr(msg, NL80211_WOWLAN_TCP_SRC_IPV4, tcp->src) || + nla_put_in_addr(msg, NL80211_WOWLAN_TCP_DST_IPV4, tcp->dst) || nla_put(msg, NL80211_WOWLAN_TCP_DST_MAC, ETH_ALEN, tcp->dst_mac) || nla_put_u16(msg, NL80211_WOWLAN_TCP_SRC_PORT, tcp->src_port) || nla_put_u16(msg, NL80211_WOWLAN_TCP_DST_PORT, tcp->dst_port) || @@ -8808,6 +8842,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL, req->interval)) return -ENOBUFS; + if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) + return -ENOBUFS; + freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!freqs) return -ENOBUFS; @@ -8993,8 +9030,8 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, cfg = kzalloc(size, GFP_KERNEL); if (!cfg) return -ENOMEM; - cfg->src = nla_get_be32(tb[NL80211_WOWLAN_TCP_SRC_IPV4]); - cfg->dst = nla_get_be32(tb[NL80211_WOWLAN_TCP_DST_IPV4]); + cfg->src = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_SRC_IPV4]); + cfg->dst = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_DST_IPV4]); memcpy(cfg->dst_mac, nla_data(tb[NL80211_WOWLAN_TCP_DST_MAC]), ETH_ALEN); if (tb[NL80211_WOWLAN_TCP_SRC_PORT]) @@ -9094,6 +9131,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) const struct wiphy_wowlan_support *wowlan = rdev->wiphy.wowlan; int err, i; bool prev_enabled = rdev->wiphy.wowlan_config; + bool regular = false; if (!wowlan) return -EOPNOTSUPP; @@ -9121,12 +9159,14 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT)) return -EINVAL; new_triggers.disconnect = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) { if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT)) return -EINVAL; new_triggers.magic_pkt = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED]) @@ -9136,24 +9176,28 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if 
(!(wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE)) return -EINVAL; new_triggers.gtk_rekey_failure = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST]) { if (!(wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ)) return -EINVAL; new_triggers.eap_identity_req = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE]) { if (!(wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE)) return -EINVAL; new_triggers.four_way_handshake = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_RFKILL_RELEASE]) { if (!(wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE)) return -EINVAL; new_triggers.rfkill_release = true; + regular = true; } if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) { @@ -9162,6 +9206,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) int rem, pat_len, mask_len, pkt_offset; struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; + regular = true; + nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], rem) n_patterns++; @@ -9223,6 +9269,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } if (tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION]) { + regular = true; err = nl80211_parse_wowlan_tcp( rdev, tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION], &new_triggers); @@ -9231,6 +9278,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } if (tb[NL80211_WOWLAN_TRIG_NET_DETECT]) { + regular = true; err = nl80211_parse_wowlan_nd( rdev, wowlan, tb[NL80211_WOWLAN_TRIG_NET_DETECT], &new_triggers); @@ -9238,6 +9286,17 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) goto error; } + /* The 'any' trigger means the device continues operating more or less + * as in its normal operation mode and wakes up the host on most of the + * normal interrupts (like packet RX, ...) + * It therefore makes little sense to combine with the more constrained + * wakeup trigger modes. 
+ */ + if (new_triggers.any && regular) { + err = -EINVAL; + goto error; + } + ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL); if (!ntrig) { err = -ENOMEM; @@ -9906,7 +9965,7 @@ struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy, if (WARN_ON(!rdev->cur_cmd_info)) return NULL; - return __cfg80211_alloc_vendor_skb(rdev, approxlen, + return __cfg80211_alloc_vendor_skb(rdev, NULL, approxlen, rdev->cur_cmd_info->snd_portid, rdev->cur_cmd_info->snd_seq, cmd, attr, NULL, GFP_KERNEL); @@ -10112,7 +10171,8 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb, return -EINVAL; /* we will be active on the TDLS link */ - if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype)) + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, + wdev->iftype)) return -EINVAL; /* don't allow switching to DFS channels */ @@ -12775,6 +12835,11 @@ static int nl80211_netlink_notify(struct notifier_block * nb, rcu_read_unlock(); + /* + * It is possible that the user space process that is controlling the + * indoor setting disappeared, so notify the regulatory core. 
+ */ + regulatory_netlink_notify(notify->portid); return NOTIFY_OK; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 35cfb7134bdb..c6e83a7468c0 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -35,13 +35,14 @@ static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, + unsigned char name_assign_type, enum nl80211_iftype type, u32 *flags, struct vif_params *params) { struct wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); - ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, type, flags, - params); + ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, + type, flags, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 48dfc7b4e981..aa2d75482017 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -82,17 +82,12 @@ * be intersected with the current one. * @REG_REQ_ALREADY_SET: the regulatory request will not change the current * regulatory settings, and no further processing is required. - * @REG_REQ_USER_HINT_HANDLED: a non alpha2 user hint was handled and no - * further processing is required, i.e., not need to update last_request - * etc. This should be used for user hints that do not provide an alpha2 - * but some other type of regulatory hint, i.e., indoor operation. */ enum reg_request_treatment { REG_REQ_OK, REG_REQ_IGNORE, REG_REQ_INTERSECT, REG_REQ_ALREADY_SET, - REG_REQ_USER_HINT_HANDLED, }; static struct regulatory_request core_request_world = { @@ -133,9 +128,17 @@ static int reg_num_devs_support_basehint; * State variable indicating if the platform on which the devices * are attached is operating in an indoor environment. The state variable * is relevant for all registered devices. 
- * (protected by RTNL) */ static bool reg_is_indoor; +static spinlock_t reg_indoor_lock; + +/* Used to track the userspace process controlling the indoor setting */ +static u32 reg_is_indoor_portid; + +/* Max number of consecutive attempts to communicate with CRDA */ +#define REG_MAX_CRDA_TIMEOUTS 10 + +static u32 reg_crda_timeouts; static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { @@ -487,7 +490,7 @@ static void reg_regdb_search(struct work_struct *work) mutex_unlock(&reg_regdb_search_mutex); if (!IS_ERR_OR_NULL(regdom)) - set_regdom(regdom); + set_regdom(regdom, REGD_SOURCE_INTERNAL_DB); rtnl_unlock(); } @@ -537,14 +540,19 @@ static int call_crda(const char *alpha2) snprintf(country, sizeof(country), "COUNTRY=%c%c", alpha2[0], alpha2[1]); + /* query internal regulatory database (if it exists) */ + reg_regdb_query(alpha2); + + if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) { + pr_debug("Exceeded CRDA call max attempts. Not calling CRDA\n"); + return -EINVAL; + } + if (!is_world_regdom((char *) alpha2)) - pr_info("Calling CRDA for country: %c%c\n", + pr_debug("Calling CRDA for country: %c%c\n", alpha2[0], alpha2[1]); else - pr_info("Calling CRDA to update world regulatory domain\n"); - - /* query internal regulatory database (if it exists) */ - reg_regdb_query(alpha2); + pr_debug("Calling CRDA to update world regulatory domain\n"); return kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env); } @@ -554,6 +562,9 @@ reg_call_crda(struct regulatory_request *request) { if (call_crda(request->alpha2)) return REG_REQ_IGNORE; + + queue_delayed_work(system_power_efficient_wq, + &reg_timeout, msecs_to_jiffies(3142)); return REG_REQ_OK; } @@ -978,8 +989,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_OFDM; if (rd_flags & NL80211_RRF_NO_OUTDOOR) channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; - if (rd_flags & NL80211_RRF_GO_CONCURRENT) - channel_flags |= IEEE80211_CHAN_GO_CONCURRENT; + if (rd_flags & NL80211_RRF_IR_CONCURRENT) +
channel_flags |= IEEE80211_CHAN_IR_CONCURRENT; if (rd_flags & NL80211_RRF_NO_HT40MINUS) channel_flags |= IEEE80211_CHAN_NO_HT40MINUS; if (rd_flags & NL80211_RRF_NO_HT40PLUS) @@ -1248,13 +1259,6 @@ static bool reg_request_cell_base(struct regulatory_request *request) return request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE; } -static bool reg_request_indoor(struct regulatory_request *request) -{ - if (request->initiator != NL80211_REGDOM_SET_BY_USER) - return false; - return request->user_reg_hint_type == NL80211_USER_REG_HINT_INDOOR; -} - bool reg_last_request_cell_base(void) { return reg_request_cell_base(get_last_request()); @@ -1585,7 +1589,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_ADHOC: - return cfg80211_reg_can_beacon(wiphy, &chandef, iftype); + return cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype); case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: return cfg80211_chandef_usable(wiphy, &chandef, @@ -1800,8 +1804,7 @@ static void reg_set_request_processed(void) need_more_processing = true; spin_unlock(&reg_requests_lock); - if (lr->initiator == NL80211_REGDOM_SET_BY_USER) - cancel_delayed_work(&reg_timeout); + cancel_delayed_work(&reg_timeout); if (need_more_processing) schedule_work(&reg_work); @@ -1833,11 +1836,6 @@ __reg_process_hint_user(struct regulatory_request *user_request) { struct regulatory_request *lr = get_last_request(); - if (reg_request_indoor(user_request)) { - reg_is_indoor = true; - return REG_REQ_USER_HINT_HANDLED; - } - if (reg_request_cell_base(user_request)) return reg_ignore_cell_hint(user_request); @@ -1885,8 +1883,7 @@ reg_process_hint_user(struct regulatory_request *user_request) treatment = __reg_process_hint_user(user_request); if (treatment == REG_REQ_IGNORE || - treatment == REG_REQ_ALREADY_SET || - treatment == REG_REQ_USER_HINT_HANDLED) { + treatment == REG_REQ_ALREADY_SET) {
reg_free_request(user_request); return treatment; } @@ -1947,7 +1944,6 @@ reg_process_hint_driver(struct wiphy *wiphy, case REG_REQ_OK: break; case REG_REQ_IGNORE: - case REG_REQ_USER_HINT_HANDLED: reg_free_request(driver_request); return treatment; case REG_REQ_INTERSECT: @@ -2047,7 +2043,6 @@ reg_process_hint_country_ie(struct wiphy *wiphy, case REG_REQ_OK: break; case REG_REQ_IGNORE: - case REG_REQ_USER_HINT_HANDLED: /* fall through */ case REG_REQ_ALREADY_SET: reg_free_request(country_ie_request); @@ -2086,11 +2081,8 @@ static void reg_process_hint(struct regulatory_request *reg_request) case NL80211_REGDOM_SET_BY_USER: treatment = reg_process_hint_user(reg_request); if (treatment == REG_REQ_IGNORE || - treatment == REG_REQ_ALREADY_SET || - treatment == REG_REQ_USER_HINT_HANDLED) + treatment == REG_REQ_ALREADY_SET) return; - queue_delayed_work(system_power_efficient_wq, - &reg_timeout, msecs_to_jiffies(3142)); return; case NL80211_REGDOM_SET_BY_DRIVER: if (!wiphy) @@ -2177,6 +2169,13 @@ static void reg_process_pending_hints(void) } reg_process_hint(reg_request); + + lr = get_last_request(); + + spin_lock(&reg_requests_lock); + if (!list_empty(&reg_requests_list) && lr && lr->processed) + schedule_work(&reg_work); + spin_unlock(&reg_requests_lock); } /* Processes beacon hints -- this has nothing to do with country IEs */ @@ -2304,27 +2303,58 @@ int regulatory_hint_user(const char *alpha2, request->initiator = NL80211_REGDOM_SET_BY_USER; request->user_reg_hint_type = user_reg_hint_type; + /* Allow calling CRDA again */ + reg_crda_timeouts = 0; + queue_regulatory_request(request); return 0; } -int regulatory_hint_indoor_user(void) +int regulatory_hint_indoor(bool is_indoor, u32 portid) { - struct regulatory_request *request; + spin_lock(&reg_indoor_lock); - request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL); - if (!request) - return -ENOMEM; + /* It is possible that more than one user space process is trying to + * configure the indoor setting.
To handle such cases, clear the indoor + * setting in case that some process does not think that the device + * is operating in an indoor environment. In addition, if a user space + * process indicates that it is controlling the indoor setting, save its + * portid, i.e., make it the owner. + */ + reg_is_indoor = is_indoor; + if (reg_is_indoor) { + if (!reg_is_indoor_portid) + reg_is_indoor_portid = portid; + } else { + reg_is_indoor_portid = 0; + } - request->wiphy_idx = WIPHY_IDX_INVALID; - request->initiator = NL80211_REGDOM_SET_BY_USER; - request->user_reg_hint_type = NL80211_USER_REG_HINT_INDOOR; - queue_regulatory_request(request); + spin_unlock(&reg_indoor_lock); + + if (!is_indoor) + reg_check_channels(); return 0; } +void regulatory_netlink_notify(u32 portid) +{ + spin_lock(&reg_indoor_lock); + + if (reg_is_indoor_portid != portid) { + spin_unlock(&reg_indoor_lock); + return; + } + + reg_is_indoor = false; + reg_is_indoor_portid = 0; + + spin_unlock(&reg_indoor_lock); + + reg_check_channels(); +} + /* Driver hints */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) { @@ -2345,6 +2375,9 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_DRIVER; + /* Allow calling CRDA again */ + reg_crda_timeouts = 0; + queue_regulatory_request(request); return 0; @@ -2398,6 +2431,9 @@ void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band, request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE; request->country_ie_env = env; + /* Allow calling CRDA again */ + reg_crda_timeouts = 0; + queue_regulatory_request(request); request = NULL; out: @@ -2486,13 +2522,22 @@ static void restore_regulatory_settings(bool reset_user) char alpha2[2]; char world_alpha2[2]; struct reg_beacon *reg_beacon, *btmp; - struct regulatory_request *reg_request, *tmp; LIST_HEAD(tmp_reg_req_list); struct cfg80211_registered_device *rdev; ASSERT_RTNL(); - reg_is_indoor = false; + /* + * Clear the
indoor setting in case that it is not controlled by user + * space, as otherwise there is no guarantee that the device is still + * operating in an indoor environment. + */ + spin_lock(&reg_indoor_lock); + if (reg_is_indoor && !reg_is_indoor_portid) { + reg_is_indoor = false; + reg_check_channels(); + } + spin_unlock(&reg_indoor_lock); reset_regdomains(true, &world_regdom); restore_alpha2(alpha2, reset_user); @@ -2504,11 +2549,7 @@ static void restore_regulatory_settings(bool reset_user) * settings. */ spin_lock(&reg_requests_lock); - list_for_each_entry_safe(reg_request, tmp, &reg_requests_list, list) { - if (reg_request->initiator != NL80211_REGDOM_SET_BY_USER) - continue; - list_move_tail(&reg_request->list, &tmp_reg_req_list); - } + list_splice_tail_init(&reg_requests_list, &tmp_reg_req_list); spin_unlock(&reg_requests_lock); /* Clear beacon hints */ @@ -2871,7 +2912,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd, * multiple drivers can be ironed out later. Caller must've already * kmalloc'd the rd structure.
*/ -int set_regdom(const struct ieee80211_regdomain *rd) +int set_regdom(const struct ieee80211_regdomain *rd, + enum ieee80211_regd_source regd_src) { struct regulatory_request *lr; bool user_reset = false; @@ -2882,6 +2924,9 @@ int set_regdom(const struct ieee80211_regdomain *rd) return -EINVAL; } + if (regd_src == REGD_SOURCE_CRDA) + reg_crda_timeouts = 0; + lr = get_last_request(); /* Note that this doesn't update the wiphys, this is done below */ @@ -3041,6 +3086,7 @@ static void reg_timeout_work(struct work_struct *work) { REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n"); rtnl_lock(); + reg_crda_timeouts++; restore_regulatory_settings(true); rtnl_unlock(); } @@ -3089,6 +3135,7 @@ int __init regulatory_init(void) spin_lock_init(&reg_requests_lock); spin_lock_init(&reg_pending_beacons_lock); + spin_lock_init(&reg_indoor_lock); reg_regdb_size_check(); diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 4b45d6e61d24..9f495d76eca0 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -16,6 +16,11 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +enum ieee80211_regd_source { + REGD_SOURCE_INTERNAL_DB, + REGD_SOURCE_CRDA, +}; + extern const struct ieee80211_regdomain __rcu *cfg80211_regdomain; bool reg_is_valid_request(const char *alpha2); @@ -25,7 +30,20 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy); int regulatory_hint_user(const char *alpha2, enum nl80211_user_reg_hint_type user_reg_hint_type); -int regulatory_hint_indoor_user(void); + +/** + * regulatory_hint_indoor - hint operation in indoor env. or not + * @is_indoor: if true indicates that user space thinks that the + * device is operating in an indoor environment. + * @portid: the netlink port ID on which the hint was given.
+ */ +int regulatory_hint_indoor(bool is_indoor, u32 portid); + +/** + * regulatory_netlink_notify - notify on released netlink socket + * @portid: the netlink socket port ID + */ +void regulatory_netlink_notify(u32 portid); void wiphy_regulatory_register(struct wiphy *wiphy); void wiphy_regulatory_deregister(struct wiphy *wiphy); @@ -33,7 +51,9 @@ void wiphy_regulatory_deregister(struct wiphy *wiphy); int __init regulatory_init(void); void regulatory_exit(void); -int set_regdom(const struct ieee80211_regdomain *rd); +int set_regdom(const struct ieee80211_regdomain *rd, + enum ieee80211_regd_source regd_src); + unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, const struct ieee80211_reg_rule *rule); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index c705c3e2b751..3a50aa2553bf 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -531,24 +531,78 @@ static int cmp_bss(struct cfg80211_bss *a, } } +static bool cfg80211_bss_type_match(u16 capability, + enum ieee80211_band band, + enum ieee80211_bss_type bss_type) +{ + bool ret = true; + u16 mask, val; + + if (bss_type == IEEE80211_BSS_TYPE_ANY) + return ret; + + if (band == IEEE80211_BAND_60GHZ) { + mask = WLAN_CAPABILITY_DMG_TYPE_MASK; + switch (bss_type) { + case IEEE80211_BSS_TYPE_ESS: + val = WLAN_CAPABILITY_DMG_TYPE_AP; + break; + case IEEE80211_BSS_TYPE_PBSS: + val = WLAN_CAPABILITY_DMG_TYPE_PBSS; + break; + case IEEE80211_BSS_TYPE_IBSS: + val = WLAN_CAPABILITY_DMG_TYPE_IBSS; + break; + default: + return false; + } + } else { + mask = WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS; + switch (bss_type) { + case IEEE80211_BSS_TYPE_ESS: + val = WLAN_CAPABILITY_ESS; + break; + case IEEE80211_BSS_TYPE_IBSS: + val = WLAN_CAPABILITY_IBSS; + break; + case IEEE80211_BSS_TYPE_MBSS: + val = 0; + break; + default: + return false; + } + } + + ret = ((capability & mask) == val); + return ret; +} + /* Returned bss is reference counted and must be cleaned up appropriately. 
*/ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, const u8 *ssid, size_t ssid_len, - u16 capa_mask, u16 capa_val) + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *res = NULL; unsigned long now = jiffies; + int bss_privacy; - trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, capa_mask, - capa_val); + trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, bss_type, + privacy); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { - if ((bss->pub.capability & capa_mask) != capa_val) + if (!cfg80211_bss_type_match(bss->pub.capability, + bss->pub.channel->band, bss_type)) + continue; + + bss_privacy = (bss->pub.capability & WLAN_CAPABILITY_PRIVACY); + if ((privacy == IEEE80211_PRIVACY_ON && !bss_privacy) || + (privacy == IEEE80211_PRIVACY_OFF && bss_privacy)) continue; if (channel && bss->pub.channel != channel) continue; @@ -896,6 +950,7 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, struct cfg80211_bss_ies *ies; struct ieee80211_channel *channel; struct cfg80211_internal_bss tmp = {}, *res; + int bss_type; bool signal_valid; if (WARN_ON(!wiphy)) @@ -950,8 +1005,15 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, if (!res) return NULL; - if (res->pub.capability & WLAN_CAPABILITY_ESS) - regulatory_hint_found_beacon(wiphy, channel, gfp); + if (channel->band == IEEE80211_BAND_60GHZ) { + bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; + if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || + bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + } else { + if (res->pub.capability & WLAN_CAPABILITY_ESS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + } trace_cfg80211_return_bss(&res->pub); /* cfg80211_bss_update gives us a referenced result */ @@ -973,6 +1035,7 @@ 
cfg80211_inform_bss_width_frame(struct wiphy *wiphy, bool signal_valid; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); + int bss_type; BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); @@ -1025,8 +1088,15 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, if (!res) return NULL; - if (res->pub.capability & WLAN_CAPABILITY_ESS) - regulatory_hint_found_beacon(wiphy, channel, gfp); + if (channel->band == IEEE80211_BAND_60GHZ) { + bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; + if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || + bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + } else { + if (res->pub.capability & WLAN_CAPABILITY_ESS) + regulatory_hint_found_beacon(wiphy, channel, gfp); + } trace_cfg80211_return_bss(&res->pub); /* cfg80211_bss_update gives us a referenced result */ @@ -1237,17 +1307,17 @@ int cfg80211_wext_siwscan(struct net_device *dev, kfree(creq); return err; } -EXPORT_SYMBOL_GPL(cfg80211_wext_siwscan); +EXPORT_WEXT_HANDLER(cfg80211_wext_siwscan); -static void ieee80211_scan_add_ies(struct iw_request_info *info, - const struct cfg80211_bss_ies *ies, - char **current_ev, char *end_buf) +static char *ieee80211_scan_add_ies(struct iw_request_info *info, + const struct cfg80211_bss_ies *ies, + char *current_ev, char *end_buf) { const u8 *pos, *end, *next; struct iw_event iwe; if (!ies) - return; + return current_ev; /* * If needed, fragment the IEs buffer (at IE boundaries) into short @@ -1264,10 +1334,11 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info, memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = next - pos; - *current_ev = iwe_stream_add_point(info, *current_ev, - end_buf, &iwe, - (void *)pos); - + current_ev = iwe_stream_add_point_check(info, current_ev, + end_buf, &iwe, + (void *)pos); + if (IS_ERR(current_ev)) + return current_ev; pos = 
next; } @@ -1275,10 +1346,14 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info, memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = end - pos; - *current_ev = iwe_stream_add_point(info, *current_ev, - end_buf, &iwe, - (void *)pos); + current_ev = iwe_stream_add_point_check(info, current_ev, + end_buf, &iwe, + (void *)pos); + if (IS_ERR(current_ev)) + return current_ev; } + + return current_ev; } static char * @@ -1289,7 +1364,8 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, const struct cfg80211_bss_ies *ies; struct iw_event iwe; const u8 *ie; - u8 *buf, *cfg, *p; + u8 buf[50]; + u8 *cfg, *p, *tmp; int rem, i, sig; bool ismesh = false; @@ -1297,22 +1373,28 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, iwe.cmd = SIOCGIWAP; iwe.u.ap_addr.sa_family = ARPHRD_ETHER; memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_ADDR_LEN); + current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, + IW_EV_ADDR_LEN); + if (IS_ERR(current_ev)) + return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); iwe.u.freq.e = 0; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_FREQ_LEN); + current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + if (IS_ERR(current_ev)) + return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = bss->pub.channel->center_freq; iwe.u.freq.e = 6; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, - IW_EV_FREQ_LEN); + current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, + IW_EV_FREQ_LEN); + if (IS_ERR(current_ev)) + return current_ev; if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { memset(&iwe, 0, sizeof(iwe)); @@ -1341,8 +1423,11 @@ ieee80211_bss(struct wiphy 
*wiphy, struct iw_request_info *info, /* not reached */ break; } - current_ev = iwe_stream_add_event(info, current_ev, end_buf, - &iwe, IW_EV_QUAL_LEN); + current_ev = iwe_stream_add_event_check(info, current_ev, + end_buf, &iwe, + IW_EV_QUAL_LEN); + if (IS_ERR(current_ev)) + return current_ev; } memset(&iwe, 0, sizeof(iwe)); @@ -1352,8 +1437,10 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, else iwe.u.data.flags = IW_ENCODE_DISABLED; iwe.u.data.length = 0; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, ""); + current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, + &iwe, ""); + if (IS_ERR(current_ev)) + return current_ev; rcu_read_lock(); ies = rcu_dereference(bss->pub.ies); @@ -1371,66 +1458,91 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, (u8 *)ie + 2); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, &iwe, + (u8 *)ie + 2); + if (IS_ERR(current_ev)) + goto unlock; break; case WLAN_EID_MESH_ID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, (u8 *)ie + 2); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, &iwe, + (u8 *)ie + 2); + if (IS_ERR(current_ev)) + goto unlock; break; case WLAN_EID_MESH_CONFIG: ismesh = true; if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) break; - buf = kmalloc(50, GFP_ATOMIC); - if (!buf) - break; cfg = (u8 *)ie + 2; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; sprintf(buf, "Mesh Network Path Selection Protocol ID: " "0x%02X", cfg[0]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if 
(IS_ERR(current_ev)) + goto unlock; sprintf(buf, "Path Selection Metric ID: 0x%02X", cfg[1]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; sprintf(buf, "Congestion Control Mode ID: 0x%02X", cfg[2]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; sprintf(buf, "Authentication ID: 0x%02X", cfg[4]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; sprintf(buf, "Formation Info: 0x%02X", cfg[5]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; sprintf(buf, "Capabilities: 0x%02X", cfg[6]); iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, - &iwe, buf); - kfree(buf); + current_ev = iwe_stream_add_point_check(info, + current_ev, + end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; break; case WLAN_EID_SUPP_RATES: case WLAN_EID_EXT_SUPP_RATES: @@ -1445,8 +1557,14 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, for (i = 0; i < ie[1]; i++) { 
iwe.u.bitrate.value = ((ie[i + 2] & 0x7f) * 500000); + tmp = p; p = iwe_stream_add_value(info, current_ev, p, - end_buf, &iwe, IW_EV_PARAM_LEN); + end_buf, &iwe, + IW_EV_PARAM_LEN); + if (p == tmp) { + current_ev = ERR_PTR(-E2BIG); + goto unlock; + } } current_ev = p; break; @@ -1465,31 +1583,35 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, iwe.u.mode = IW_MODE_MASTER; else iwe.u.mode = IW_MODE_ADHOC; - current_ev = iwe_stream_add_event(info, current_ev, end_buf, - &iwe, IW_EV_UINT_LEN); - } - - buf = kmalloc(31, GFP_ATOMIC); - if (buf) { - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf)); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, end_buf, - &iwe, buf); - memset(&iwe, 0, sizeof(iwe)); - iwe.cmd = IWEVCUSTOM; - sprintf(buf, " Last beacon: %ums ago", - elapsed_jiffies_msecs(bss->ts)); - iwe.u.data.length = strlen(buf); - current_ev = iwe_stream_add_point(info, current_ev, - end_buf, &iwe, buf); - kfree(buf); + current_ev = iwe_stream_add_event_check(info, current_ev, + end_buf, &iwe, + IW_EV_UINT_LEN); + if (IS_ERR(current_ev)) + goto unlock; } - ieee80211_scan_add_ies(info, ies, &current_ev, end_buf); + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, + &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; + memset(&iwe, 0, sizeof(iwe)); + iwe.cmd = IWEVCUSTOM; + sprintf(buf, " Last beacon: %ums ago", + elapsed_jiffies_msecs(bss->ts)); + iwe.u.data.length = strlen(buf); + current_ev = iwe_stream_add_point_check(info, current_ev, + end_buf, &iwe, buf); + if (IS_ERR(current_ev)) + goto unlock; + + current_ev = ieee80211_scan_add_ies(info, ies, current_ev, end_buf); + + unlock: rcu_read_unlock(); - return current_ev; } @@ -1501,19 +1623,27 @@ static int
ieee80211_scan_results(struct cfg80211_registered_device *rdev, char *current_ev = buf; char *end_buf = buf + len; struct cfg80211_internal_bss *bss; + int err = 0; spin_lock_bh(&rdev->bss_lock); cfg80211_bss_expire(rdev); list_for_each_entry(bss, &rdev->bss_list, list) { if (buf + len - current_ev <= IW_EV_ADDR_LEN) { - spin_unlock_bh(&rdev->bss_lock); - return -E2BIG; + err = -E2BIG; + break; } current_ev = ieee80211_bss(&rdev->wiphy, info, bss, current_ev, end_buf); + if (IS_ERR(current_ev)) { + err = PTR_ERR(current_ev); + break; + } } spin_unlock_bh(&rdev->bss_lock); + + if (err) + return err; return current_ev - buf; } @@ -1545,5 +1675,5 @@ int cfg80211_wext_giwscan(struct net_device *dev, return res; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwscan); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwscan); #endif diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 0ab3711c79a0..8020b5b094d4 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -42,7 +42,7 @@ struct cfg80211_conn { CFG80211_CONN_CONNECTED, } state; u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; - u8 *ie; + const u8 *ie; size_t ie_len; bool auto_auth, prev_bssid_valid; }; @@ -257,19 +257,15 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; - u16 capa = WLAN_CAPABILITY_ESS; ASSERT_WDEV_LOCK(wdev); - if (wdev->conn->params.privacy) - capa |= WLAN_CAPABILITY_PRIVACY; - bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, - WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_PRIVACY, - capa); + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; @@ -427,6 +423,62 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) schedule_work(&rdev->conn_work); } +static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, + const u8 *ies, size_t ies_len, + const 
u8 **out_ies, size_t *out_ies_len) +{ + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + u8 *buf; + size_t offs; + + if (!rdev->wiphy.extended_capabilities_len || + (ies && cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, ies, ies_len))) { + *out_ies = kmemdup(ies, ies_len, GFP_KERNEL); + if (!*out_ies) + return -ENOMEM; + *out_ies_len = ies_len; + return 0; + } + + buf = kmalloc(ies_len + rdev->wiphy.extended_capabilities_len + 2, + GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (ies_len) { + static const u8 before_extcapa[] = { + /* not listing IEs expected to be created by driver */ + WLAN_EID_RSN, + WLAN_EID_QOS_CAPA, + WLAN_EID_RRM_ENABLED_CAPABILITIES, + WLAN_EID_MOBILITY_DOMAIN, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + WLAN_EID_BSS_COEX_2040, + }; + + offs = ieee80211_ie_split(ies, ies_len, before_extcapa, + ARRAY_SIZE(before_extcapa), 0); + memcpy(buf, ies, offs); + /* leave a whole for extended capabilities IE */ + memcpy(buf + offs + rdev->wiphy.extended_capabilities_len + 2, + ies + offs, ies_len - offs); + } else { + offs = 0; + } + + /* place extended capabilities IE (with only driver capabilities) */ + buf[offs] = WLAN_EID_EXT_CAPABILITY; + buf[offs + 1] = rdev->wiphy.extended_capabilities_len; + memcpy(buf + offs + 2, + rdev->wiphy.extended_capabilities, + rdev->wiphy.extended_capabilities_len); + + *out_ies = buf; + *out_ies_len = ies_len + rdev->wiphy.extended_capabilities_len + 2; + + return 0; +} + static int cfg80211_sme_connect(struct wireless_dev *wdev, struct cfg80211_connect_params *connect, const u8 *prev_bssid) @@ -457,16 +509,14 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN); } - if (connect->ie) { - wdev->conn->ie = kmemdup(connect->ie, connect->ie_len, - GFP_KERNEL); - wdev->conn->params.ie = wdev->conn->ie; - if (!wdev->conn->ie) { - kfree(wdev->conn); - wdev->conn = NULL; - return -ENOMEM; - } + if (cfg80211_sme_get_conn_ies(wdev, connect->ie, 
connect->ie_len, + &wdev->conn->ie, + &wdev->conn->params.ie_len)) { + kfree(wdev->conn); + wdev->conn = NULL; + return -ENOMEM; } + wdev->conn->params.ie = wdev->conn->ie; if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) { wdev->conn->auto_auth = true; @@ -637,8 +687,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, wdev->ssid, wdev->ssid_len, - WLAN_CAPABILITY_ESS, - WLAN_CAPABILITY_ESS); + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY); if (bss) cfg80211_hold_bss(bss_from_pub(bss)); } @@ -795,8 +845,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_bss *bss; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid, - wdev->ssid_len, WLAN_CAPABILITY_ESS, - WLAN_CAPABILITY_ESS); + wdev->ssid_len, + IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); if (WARN_ON(!bss)) return; @@ -888,7 +938,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, } void cfg80211_disconnected(struct net_device *dev, u16 reason, - const u8 *ie, size_t ie_len, gfp_t gfp) + const u8 *ie, size_t ie_len, + bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -904,6 +955,7 @@ void cfg80211_disconnected(struct net_device *dev, u16 reason, ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; + ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 9ee6bc1a7610..9cee0220665d 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -86,7 +86,7 @@ static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device 
*rdev) { struct wireless_dev *wdev; @@ -95,7 +95,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) cfg80211_leave(rdev, wdev); } -static int wiphy_suspend(struct device *dev, pm_message_t state) +static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; @@ -136,6 +136,11 @@ static int wiphy_resume(struct device *dev) return ret; } + +static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); +#define WIPHY_PM_OPS (&wiphy_pm_ops) +#else +#define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(struct device *d) @@ -151,10 +156,7 @@ struct class ieee80211_class = { .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .dev_uevent = wiphy_uevent, -#ifdef CONFIG_PM - .suspend = wiphy_suspend, - .resume = wiphy_resume, -#endif + .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b17b3692f8c2..a808279a432a 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -7,6 +7,7 @@ #include <linux/tracepoint.h> #include <linux/rtnetlink.h> +#include <linux/etherdevice.h> #include <net/cfg80211.h> #include "core.h" @@ -15,7 +16,7 @@ if (given_mac) \ memcpy(__entry->entry_mac, given_mac, ETH_ALEN); \ else \ - memset(__entry->entry_mac, 0, ETH_ALEN); \ + eth_zero_addr(__entry->entry_mac); \ } while (0) #define MAC_PR_FMT "%pM" #define MAC_PR_ARG(entry_mac) (__entry->entry_mac) @@ -627,6 +628,7 @@ DECLARE_EVENT_CLASS(station_add_change, __field(u8, plink_state) __field(u8, uapsd_queues) __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap)) + __array(char, vlan, IFNAMSIZ) ), TP_fast_assign( WIPHY_ASSIGN; @@ -644,16 +646,19 @@ DECLARE_EVENT_CLASS(station_add_change, if (params->ht_capa) memcpy(__entry->ht_capa, params->ht_capa, sizeof(struct ieee80211_ht_cap)); + memset(__entry->vlan, 0, sizeof(__entry->vlan)); + if (params->vlan) + 
memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", station flags mask: %u, station flags set: %u, " "station modify mask: %u, listen interval: %d, aid: %u, " - "plink action: %u, plink state: %u, uapsd queues: %u", + "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), __entry->sta_flags_mask, __entry->sta_flags_set, __entry->sta_modify_mask, __entry->listen_interval, __entry->aid, __entry->plink_action, __entry->plink_state, - __entry->uapsd_queues) + __entry->uapsd_queues, __entry->vlan) ); DEFINE_EVENT(station_add_change, rdev_add_station, @@ -1077,7 +1082,7 @@ TRACE_EVENT(rdev_auth, if (req->bss) MAC_ASSIGN(bssid, req->bss->bssid); else - memset(__entry->bssid, 0, ETH_ALEN); + eth_zero_addr(__entry->bssid); __entry->auth_type = req->auth_type; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: " MAC_PR_FMT, @@ -1103,7 +1108,7 @@ TRACE_EVENT(rdev_assoc, if (req->bss) MAC_ASSIGN(bssid, req->bss->bssid); else - memset(__entry->bssid, 0, ETH_ALEN); + eth_zero_addr(__entry->bssid); MAC_ASSIGN(prev_bssid, req->prev_bssid); __entry->use_mfp = req->use_mfp; __entry->flags = req->flags; @@ -1153,7 +1158,7 @@ TRACE_EVENT(rdev_disassoc, if (req->bss) MAC_ASSIGN(bssid, req->bss->bssid); else - memset(__entry->bssid, 0, ETH_ALEN); + eth_zero_addr(__entry->bssid); __entry->reason_code = req->reason_code; __entry->local_state_change = req->local_state_change; ), @@ -2353,20 +2358,23 @@ TRACE_EVENT(cfg80211_cqm_rssi_notify, TRACE_EVENT(cfg80211_reg_can_beacon, TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, - enum nl80211_iftype iftype), - TP_ARGS(wiphy, chandef, iftype), + enum nl80211_iftype iftype, bool check_no_ir), + TP_ARGS(wiphy, chandef, iftype, check_no_ir), TP_STRUCT__entry( WIPHY_ENTRY CHAN_DEF_ENTRY __field(enum nl80211_iftype, iftype) + __field(bool, check_no_ir) ), TP_fast_assign( 
WIPHY_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->iftype = iftype; + __entry->check_no_ir = check_no_ir; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d", - WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype) + TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s", + WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype, + BOOL_TO_STR(__entry->check_no_ir)) ); TRACE_EVENT(cfg80211_chandef_dfs_required, @@ -2636,28 +2644,30 @@ DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_stopped, TRACE_EVENT(cfg80211_get_bss, TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, const u8 *ssid, size_t ssid_len, - u16 capa_mask, u16 capa_val), - TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, capa_mask, capa_val), + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy), + TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy), TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY MAC_ENTRY(bssid) __dynamic_array(u8, ssid, ssid_len) - __field(u16, capa_mask) - __field(u16, capa_val) + __field(enum ieee80211_bss_type, bss_type) + __field(enum ieee80211_privacy, privacy) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_ASSIGN(channel); MAC_ASSIGN(bssid, bssid); memcpy(__get_dynamic_array(ssid), ssid, ssid_len); - __entry->capa_mask = capa_mask; - __entry->capa_val = capa_val; - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", " MAC_PR_FMT ", buf: %#.2x, " - "capa_mask: %d, capa_val: %u", WIPHY_PR_ARG, CHAN_PR_ARG, - MAC_PR_ARG(bssid), ((u8 *)__get_dynamic_array(ssid))[0], - __entry->capa_mask, __entry->capa_val) + __entry->bss_type = bss_type; + __entry->privacy = privacy; + ), + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", " MAC_PR_FMT + ", buf: %#.2x, bss_type: %d, privacy: %d", + WIPHY_PR_ARG, CHAN_PR_ARG, MAC_PR_ARG(bssid), + ((u8 *)__get_dynamic_array(ssid))[0], __entry->bss_type, + __entry->privacy) ); TRACE_EVENT(cfg80211_inform_bss_width_frame, diff --git a/net/wireless/util.c b/net/wireless/util.c index 
6903dbdcb8c1..baf7218cec15 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -887,7 +887,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, - ev->dc.reason, true); + ev->dc.reason, + !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, @@ -944,7 +945,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; - if (ntype != otype && netif_running(dev)) { + if (ntype != otype) { dev->ieee80211_ptr->use_4addr = false; dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); @@ -1290,12 +1291,54 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, } EXPORT_SYMBOL(cfg80211_get_p2p_attr); +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +{ + int i; + + for (i = 0; i < n_ids; i++) + if (ids[i] == id) + return true; + return false; +} + +size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, + const u8 *after_ric, int n_after_ric, + size_t offset) +{ + size_t pos = offset; + + while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) { + if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { + pos += 2 + ies[pos + 1]; + + while (pos < ielen && + !ieee80211_id_in_list(after_ric, n_after_ric, + ies[pos])) + pos += 2 + ies[pos + 1]; + } else { + pos += 2 + ies[pos + 1]; + } + } + + return pos; +} +EXPORT_SYMBOL(ieee80211_ie_split_ric); + +size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset) +{ + return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset); +} +EXPORT_SYMBOL(ieee80211_ie_split); + bool ieee80211_operating_class_to_band(u8 operating_class, enum ieee80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: + case 128 ... 
130: *band = IEEE80211_BAND_5GHZ; return true; case 81: @@ -1313,6 +1356,135 @@ bool ieee80211_operating_class_to_band(u8 operating_class, } EXPORT_SYMBOL(ieee80211_operating_class_to_band); +bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, + u8 *op_class) +{ + u8 vht_opclass; + u16 freq = chandef->center_freq1; + + if (freq >= 2412 && freq <= 2472) { + if (chandef->width > NL80211_CHAN_WIDTH_40) + return false; + + /* 2.407 GHz, channels 1..13 */ + if (chandef->width == NL80211_CHAN_WIDTH_40) { + if (freq > chandef->chan->center_freq) + *op_class = 83; /* HT40+ */ + else + *op_class = 84; /* HT40- */ + } else { + *op_class = 81; + } + + return true; + } + + if (freq == 2484) { + if (chandef->width > NL80211_CHAN_WIDTH_40) + return false; + + *op_class = 82; /* channel 14 */ + return true; + } + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_80: + vht_opclass = 128; + break; + case NL80211_CHAN_WIDTH_160: + vht_opclass = 129; + break; + case NL80211_CHAN_WIDTH_80P80: + vht_opclass = 130; + break; + case NL80211_CHAN_WIDTH_10: + case NL80211_CHAN_WIDTH_5: + return false; /* unsupported for now */ + default: + vht_opclass = 0; + break; + } + + /* 5 GHz, channels 36..48 */ + if (freq >= 5180 && freq <= 5240) { + if (vht_opclass) { + *op_class = vht_opclass; + } else if (chandef->width == NL80211_CHAN_WIDTH_40) { + if (freq > chandef->chan->center_freq) + *op_class = 116; + else + *op_class = 117; + } else { + *op_class = 115; + } + + return true; + } + + /* 5 GHz, channels 52..64 */ + if (freq >= 5260 && freq <= 5320) { + if (vht_opclass) { + *op_class = vht_opclass; + } else if (chandef->width == NL80211_CHAN_WIDTH_40) { + if (freq > chandef->chan->center_freq) + *op_class = 119; + else + *op_class = 120; + } else { + *op_class = 118; + } + + return true; + } + + /* 5 GHz, channels 100..144 */ + if (freq >= 5500 && freq <= 5720) { + if (vht_opclass) { + *op_class = vht_opclass; + } else if (chandef->width == NL80211_CHAN_WIDTH_40) { 
+ if (freq > chandef->chan->center_freq) + *op_class = 122; + else + *op_class = 123; + } else { + *op_class = 121; + } + + return true; + } + + /* 5 GHz, channels 149..169 */ + if (freq >= 5745 && freq <= 5845) { + if (vht_opclass) { + *op_class = vht_opclass; + } else if (chandef->width == NL80211_CHAN_WIDTH_40) { + if (freq > chandef->chan->center_freq) + *op_class = 126; + else + *op_class = 127; + } else if (freq <= 5805) { + *op_class = 124; + } else { + *op_class = 125; + } + + return true; + } + + /* 56.16 GHz, channel 1..4 */ + if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 4) { + if (chandef->width >= NL80211_CHAN_WIDTH_40) + return false; + + *op_class = 180; + return true; + } + + /* not supported yet */ + return false; +} +EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); + int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, u32 beacon_int) { diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 5b24d39d7903..fd682832a0e3 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -63,7 +63,7 @@ int cfg80211_wext_giwname(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwname); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwname); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, u32 *mode, char *extra) @@ -99,7 +99,7 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, return cfg80211_change_iface(rdev, dev, type, NULL, &vifparams); } -EXPORT_SYMBOL_GPL(cfg80211_wext_siwmode); +EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, u32 *mode, char *extra) @@ -134,7 +134,7 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, } return 0; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwmode); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode); int cfg80211_wext_giwrange(struct net_device *dev, @@ -248,7 +248,7 @@ int 
cfg80211_wext_giwrange(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwrange); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange); /** @@ -303,7 +303,7 @@ int cfg80211_wext_siwrts(struct net_device *dev, return err; } -EXPORT_SYMBOL_GPL(cfg80211_wext_siwrts); +EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, @@ -317,7 +317,7 @@ int cfg80211_wext_giwrts(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwrts); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, @@ -343,7 +343,7 @@ int cfg80211_wext_siwfrag(struct net_device *dev, return err; } -EXPORT_SYMBOL_GPL(cfg80211_wext_siwfrag); +EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, @@ -357,7 +357,7 @@ int cfg80211_wext_giwfrag(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwfrag); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag); static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, @@ -427,7 +427,7 @@ int cfg80211_wext_giwretry(struct net_device *dev, return 0; } -EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry); +EXPORT_WEXT_HANDLER(cfg80211_wext_giwretry); static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, struct net_device *dev, bool pairwise, @@ -1333,6 +1333,8 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN); wdev_unlock(wdev); + memset(&sinfo, 0, sizeof(sinfo)); + if (rdev_get_station(rdev, dev, bssid, &sinfo)) return NULL; diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index ebcacca2f731..94c7405a5413 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -4,6 +4,12 @@ #include <net/iw_handler.h> #include <linux/wireless.h> +#ifdef 
CONFIG_CFG80211_WEXT_EXPORT +#define EXPORT_WEXT_HANDLER(h) EXPORT_SYMBOL_GPL(h) +#else +#define EXPORT_WEXT_HANDLER(h) +#endif /* CONFIG_CFG80211_WEXT_EXPORT */ + int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 368611c05739..a4e8af3321d2 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -322,7 +322,7 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev, if (wdev->current_bss) memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN); else - memset(ap_addr->sa_data, 0, ETH_ALEN); + eth_zero_addr(ap_addr->sa_data); wdev_unlock(wdev); return 0; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d9149b68b9bc..a750f330b8dd 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -515,10 +515,10 @@ static struct proto x25_proto = { .obj_size = sizeof(struct x25_sock), }; -static struct sock *x25_alloc_socket(struct net *net) +static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; - struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto); + struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; @@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOBUFS; - if ((sk = x25_alloc_socket(net)) == NULL) + if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); @@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) goto out; - if ((sk = x25_alloc_socket(sock_net(osk))) == NULL) + if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); @@ -1077,8 +1077,7 @@ out_clear_request: goto out; } -static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) +static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = 
sock->sk; struct x25_sock *x25 = x25_sk(sk); @@ -1252,8 +1251,7 @@ out_kfree_skb: } -static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size, +static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 12e82a5e4ad5..42f7c76cf853 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -31,6 +31,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqniv", .icv_truncbits = 64, } }, @@ -49,6 +50,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqniv", .icv_truncbits = 96, } }, @@ -67,6 +69,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqniv", .icv_truncbits = 128, } }, @@ -85,6 +88,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqniv", .icv_truncbits = 64, } }, @@ -103,6 +107,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqniv", .icv_truncbits = 96, } }, @@ -121,6 +126,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqniv", .icv_truncbits = 128, } }, @@ -139,6 +145,7 @@ static struct xfrm_algo_desc aead_list[] = { .uinfo = { .aead = { + .geniv = "seqiv", .icv_truncbits = 128, } }, @@ -152,6 +159,18 @@ static struct xfrm_algo_desc aead_list[] = { .sadb_alg_maxbits = 256 } }, +{ + .name = "rfc7539esp(chacha20,poly1305)", + + .uinfo = { + .aead = { + .geniv = "seqniv", + .icv_truncbits = 128, + } + }, + + .pfkey_supported = 0, +}, }; static struct xfrm_algo_desc aalg_list[] = { @@ -353,6 +372,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 64, } @@ -373,6 +393,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 192, } @@ -393,6 +414,7 @@ static 
struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 128, } @@ -413,6 +435,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 64, .defkeybits = 128, } @@ -433,6 +456,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -453,6 +477,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -473,6 +498,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -493,6 +519,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } @@ -512,6 +539,7 @@ static struct xfrm_algo_desc ealg_list[] = { .uinfo = { .encr = { + .geniv = "seqiv", .blockbits = 128, .defkeybits = 160, /* 128-bit key + 32-bit nonce */ } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 85d1d4764612..60ce7014e1b0 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -13,6 +13,8 @@ #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> +#include <net/ip_tunnels.h> +#include <net/ip6_tunnel.h> static struct kmem_cache *secpath_cachep __read_mostly; @@ -29,7 +31,7 @@ int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); @@ -186,6 +188,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) struct xfrm_state *x = NULL; xfrm_address_t *daddr; struct xfrm_mode *inner_mode; + u32 mark = skb->mark; unsigned int family; int decaps = 0; int async = 0; @@ -203,6 
+206,18 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SPI_SKB_CB(skb)->daddroff); family = XFRM_SPI_SKB_CB(skb)->family; + /* if tunnel is present override skb->mark value with tunnel i_key */ + if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) { + switch (family) { + case AF_INET: + mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); + break; + case AF_INET6: + mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); + break; + } + } + /* Allocate new secpath or COW existing one. */ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { struct sec_path *sp; @@ -229,7 +244,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family); + x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); @@ -238,19 +253,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; - if (xfrm_tunnel_check(skb, x, family)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); - goto drop; - } - spin_lock(&x->lock); - if (unlikely(x->km.state == XFRM_STATE_ACQ)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); - goto drop_unlock; - } if (unlikely(x->km.state != XFRM_STATE_VALID)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); + if (x->km.state == XFRM_STATE_ACQ) + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + else + XFRM_INC_STATS(net, + LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } @@ -271,6 +281,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) spin_unlock(&x->lock); + if (xfrm_tunnel_check(skb, x, family)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); + goto drop; + } + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); XFRM_SKB_CB(skb)->seq.input.low = seq; diff --git 
a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 7c532856b398..68ada2ca4b60 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -19,7 +19,7 @@ #include <net/dst.h> #include <net/xfrm.h> -static int xfrm_output2(struct sk_buff *skb); +static int xfrm_output2(struct sock *sk, struct sk_buff *skb); static int xfrm_skb_check_space(struct sk_buff *skb) { @@ -38,6 +38,18 @@ static int xfrm_skb_check_space(struct sk_buff *skb) return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC); } +/* Children define the path of the packet through the + * Linux networking. Thus, destinations are stackable. + */ + +static struct dst_entry *skb_dst_pop(struct sk_buff *skb) +{ + struct dst_entry *child = dst_clone(skb_dst(skb)->child); + + skb_dst_drop(skb); + return child; +} + static int xfrm_output_one(struct sk_buff *skb, int err) { struct dst_entry *dst = skb_dst(skb); @@ -130,7 +142,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) return dst_output(skb); err = nf_hook(skb_dst(skb)->ops->family, - NF_INET_POST_ROUTING, skb, + NF_INET_POST_ROUTING, skb->sk, skb, NULL, skb_dst(skb)->dev, xfrm_output2); if (unlikely(err != 1)) goto out; @@ -144,12 +156,12 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output_resume); -static int xfrm_output2(struct sk_buff *skb) +static int xfrm_output2(struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(skb, 1); } -static int xfrm_output_gso(struct sk_buff *skb) +static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb) { struct sk_buff *segs; @@ -165,7 +177,7 @@ static int xfrm_output_gso(struct sk_buff *skb) int err; segs->next = NULL; - err = xfrm_output2(segs); + err = xfrm_output2(sk, segs); if (unlikely(err)) { kfree_skb_list(nskb); @@ -178,13 +190,13 @@ static int xfrm_output_gso(struct sk_buff *skb) return 0; } -int xfrm_output(struct sk_buff *skb) +int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); int err; if (skb_is_gso(skb)) - return 
xfrm_output_gso(skb); + return xfrm_output_gso(sk, skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { err = skb_checksum_help(skb); @@ -195,7 +207,7 @@ int xfrm_output(struct sk_buff *skb) } } - return xfrm_output2(skb); + return xfrm_output2(sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 638af0655aaf..18cead7645be 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -315,14 +315,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) } EXPORT_SYMBOL(xfrm_policy_destroy); -static void xfrm_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); -} - /* Rule must be locked. Release descentant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ @@ -335,7 +327,7 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); - xfrm_queue_purge(&policy->polq.hold_queue); + skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); @@ -708,6 +700,9 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; + if (skb_queue_empty(&pq->hold_queue)) + return; + __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); @@ -716,9 +711,6 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); - if (skb_queue_empty(&list)) - return; - pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); @@ -1012,7 +1004,9 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else - x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); + x = list_first_entry(&walk->walk.all, + struct xfrm_policy_walk_entry, all); + 
list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; @@ -1120,6 +1114,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { + if ((pol->priority >= priority) && ret) + break; + err = xfrm_policy_match(pol, fl, type, family, dir); if (err) { if (err == -ESRCH) @@ -1128,13 +1125,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, ret = ERR_PTR(err); goto fail; } - } else if (pol->priority < priority) { + } else { ret = pol; break; } } - if (ret) - xfrm_pol_hold(ret); + + xfrm_pol_hold(ret); fail: read_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -1955,7 +1952,7 @@ out: purge_queue: pq->timeout = 0; - xfrm_queue_purge(&pq->hold_queue); + skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } @@ -2814,7 +2811,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) @@ -3209,16 +3206,17 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { + if ((pol->priority >= priority) && ret) + break; + if (xfrm_migrate_selector_match(sel, &pol->selector) && - pol->type == type && - pol->priority < priority) { + pol->type == type) { ret = pol; break; } } - if (ret) - xfrm_pol_hold(ret); + xfrm_pol_hold(ret); read_unlock_bh(&net->xfrm.xfrm_policy_lock); diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index dab57daae408..4fd725a0c500 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -99,6 +99,7 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { 
XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; + XFRM_SKB_CB(skb)->seq.output.hi = 0; if (unlikely(x->replay.oseq == 0)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); @@ -177,6 +178,7 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; + XFRM_SKB_CB(skb)->seq.output.hi = 0; if (unlikely(replay_esn->oseq == 0)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index de971b6d38c5..9895a8c56d8c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -927,8 +927,8 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, x->id.spi != spi) continue; - spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_state_hold(x); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); @@ -1043,12 +1043,12 @@ static struct xfrm_state *__find_acq_core(struct net *net, break; case AF_INET6: - *(struct in6_addr *)x->sel.daddr.a6 = *(struct in6_addr *)daddr; - *(struct in6_addr *)x->sel.saddr.a6 = *(struct in6_addr *)saddr; + x->sel.daddr.in6 = daddr->in6; + x->sel.saddr.in6 = saddr->in6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; - *(struct in6_addr *)x->props.saddr.a6 = *(struct in6_addr *)saddr; - *(struct in6_addr *)x->id.daddr.a6 = *(struct in6_addr *)daddr; + x->props.saddr.in6 = saddr->in6; + x->id.daddr.in6 = daddr->in6; break; } @@ -1626,7 +1626,7 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else - x = list_entry(&walk->all, struct xfrm_state_walk, all); + x = list_first_entry(&walk->all, struct xfrm_state_walk, all); list_for_each_entry_from(x, &net->xfrm.state_all, all) { if (x->state == XFRM_STATE_DEAD) continue; @@ -1908,7 +1908,7 @@ int 
xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) - err = -ENOBUFS; + err = -EEXIST; else rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_state_afinfo_lock); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7de2ed9ec46d..bd16c6c7e1e7 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -289,6 +289,31 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, return 0; } +static int attach_crypt(struct xfrm_state *x, struct nlattr *rta) +{ + struct xfrm_algo *p, *ualg; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = nla_data(rta); + + algo = xfrm_ealg_get_byname(ualg->alg_name, 1); + if (!algo) + return -ENOSYS; + x->props.ealgo = algo->desc.sadb_alg_id; + + p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); + if (!p) + return -ENOMEM; + + strcpy(p->alg_name, algo->name); + x->ealg = p; + x->geniv = algo->uinfo.encr.geniv; + return 0; +} + static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, struct nlattr *rta) { @@ -349,8 +374,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, return 0; } -static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, - struct nlattr *rta) +static int attach_aead(struct xfrm_state *x, struct nlattr *rta) { struct xfrm_algo_aead *p, *ualg; struct xfrm_algo_desc *algo; @@ -363,14 +387,15 @@ static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1); if (!algo) return -ENOSYS; - *props = algo->desc.sadb_alg_id; + x->props.ealgo = algo->desc.sadb_alg_id; p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL); if (!p) return -ENOMEM; strcpy(p->alg_name, algo->name); - *algpp = p; + x->aead = p; + x->geniv = algo->uinfo.aead.geniv; return 0; } @@ -515,8 +540,7 @@ static struct xfrm_state *xfrm_state_construct(struct net 
*net, if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); - if ((err = attach_aead(&x->aead, &x->props.ealgo, - attrs[XFRMA_ALG_AEAD]))) + if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD]))) goto error; if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo, attrs[XFRMA_ALG_AUTH_TRUNC]))) @@ -526,9 +550,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, attrs[XFRMA_ALG_AUTH]))) goto error; } - if ((err = attach_one_algo(&x->ealg, &x->props.ealgo, - xfrm_ealg_get_byname, - attrs[XFRMA_ALG_CRYPT]))) + if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT]))) goto error; if ((err = attach_one_algo(&x->calg, &x->props.calgo, xfrm_calg_get_byname, @@ -2423,6 +2445,11 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) const struct xfrm_link *link; int type, err; +#ifdef CONFIG_COMPAT + if (is_compat_task()) + return -ENOTSUPP; +#endif + type = nlh->nlmsg_type; if (type > XFRM_MSG_MAX) return -EINVAL; |