From a2b4a79b88b24c49d98d45a06a014ffd22ada1a4 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 10 Sep 2017 20:29:45 +0300 Subject: spi: uapi: spidev: add missing ioctl header The SPI_IOC_MESSAGE() macro references _IOC_SIZEBITS. Add linux/ioctl.h to make sure this macro is defined. This fixes the following build failure of lcdproc with the musl libc: In file included from .../sysroot/usr/include/sys/ioctl.h:7:0, from hd44780-spi.c:31: hd44780-spi.c: In function 'spi_transfer': hd44780-spi.c:89:24: error: '_IOC_SIZEBITS' undeclared (first use in this function) status = ioctl(p->fd, SPI_IOC_MESSAGE(1), &xfer); ^ Signed-off-by: Baruch Siach Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- include/uapi/linux/spi/spidev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/spi/spidev.h b/include/uapi/linux/spi/spidev.h index dd5f21e75805..856de39d0b89 100644 --- a/include/uapi/linux/spi/spidev.h +++ b/include/uapi/linux/spi/spidev.h @@ -23,6 +23,7 @@ #define SPIDEV_H #include +#include /* User space versions of kernel symbols for SPI clocking modes, * matching -- cgit v1.2.3 From 2bbbd96357ce76cc45ec722c00f654aa7b189112 Mon Sep 17 00:00:00 2001 From: Jan Luebbe Date: Mon, 28 Aug 2017 17:25:16 +0200 Subject: bus: mbus: fix window size calculation for 4GB windows At least the Armada XP SoC supports 4GB on a single DRAM window. Because the size register values contain the actual size - 1, the MSB is set in that case. For example, the SDRAM window's control register's value is 0xffffffe1 for 4GB (bits 31 to 24 contain the size). The MBUS driver reads back each window's size from registers and calculates the actual size as (control_reg | ~DDR_SIZE_MASK) + 1, which overflows for 32 bit values, resulting in other miscalculations further on (a bad RAM window for the CESA crypto engine calculated by mvebu_mbus_setup_cpu_target_nooverlap() in my case). This patch changes the type in 'struct mbus_dram_window' from u32 to u64, which allows us to keep using the same register calculation code in most MBUS-using drivers (which calculate ->size - 1 again). Fixes: fddddb52a6c4 ("bus: introduce an Marvell EBU MBus driver") CC: stable@vger.kernel.org Signed-off-by: Jan Luebbe Signed-off-by: Gregory CLEMENT --- drivers/bus/mvebu-mbus.c | 2 +- include/linux/mbus.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c index c7f396903184..70db4d5638a6 100644 --- a/drivers/bus/mvebu-mbus.c +++ b/drivers/bus/mvebu-mbus.c @@ -720,7 +720,7 @@ mvebu_mbus_default_setup_cpu_target(struct mvebu_mbus_state *mbus) if (mbus->hw_io_coherency) w->mbus_attr |= ATTR_HW_COHERENCY; w->base = base & DDR_BASE_CS_LOW_MASK; - w->size = (size | ~DDR_SIZE_MASK) + 1; + w->size = (u64)(size | ~DDR_SIZE_MASK) + 1; } } mvebu_mbus_dram_info.num_cs = cs; diff --git a/include/linux/mbus.h b/include/linux/mbus.h index 0d3f14fd2621..4773145246ed 100644 --- a/include/linux/mbus.h +++ b/include/linux/mbus.h @@ -31,8 +31,8 @@ struct mbus_dram_target_info struct mbus_dram_window { u8 cs_index; u8 mbus_attr; - u32 base; - u32 size; + u64 base; + u64 size; } cs[4]; }; -- cgit v1.2.3 From 30ae9610d275f8f03f5bf7612ce71d8af6fc400b Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 9 Oct 2017 11:46:55 -0500 Subject: irqchip/gic-v3-its: Add missing changes to support 52bit physical address The current ITS driver works fine as long as normal memory and GICR regions are located within the lower 48bit (>=0 && <2^48) physical address space. Some of the registers GICR_PEND/PROP, GICR_VPEND/VPROP and GITS_CBASER are handled properly but not all when configuring the hardware with 52bit physical address. This patch does the following changes to support 52bit PA. -Handle 52bit PA in GITS_BASERn. -Fix ITT_addr width to 52bits, bits[51:8]. -Fix RDbase width to 52bits, bits[51:16]. -Fix VPT_addr width to 52bits, bits[51:16]. Definition of the GITS_BASERn register when ITS PageSize is 64KB: -Bits[47:16] of the register provide bits[47:16] of the table PA. -Bits[15:12] of the register provide bits[51:48] of the table PA. -Bits[15:00] of the base physical address are 0. Signed-off-by: Shanker Donthineni Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 26 +++++++++++++++++++++----- include/linux/irqchip/arm-gic-v3.h | 2 ++ 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 991cf33750c6..e88395605e32 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -312,7 +312,7 @@ static void its_encode_size(struct its_cmd_block *cmd, u8 size) static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) { - its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 50, 8); + its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); } static void its_encode_valid(struct its_cmd_block *cmd, int valid) @@ -322,7 +322,7 @@ static void its_encode_valid(struct its_cmd_block *cmd, int valid) static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) { - its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 50, 16); + its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16); } static void its_encode_collection(struct its_cmd_block *cmd, u16 col) @@ -362,7 +362,7 @@ static void its_encode_its_list(struct its_cmd_block *cmd, u16 its_list) static void its_encode_vpt_addr(struct its_cmd_block *cmd, u64 vpt_pa) { - its_mask_encode(&cmd->raw_cmd[3], vpt_pa >> 16, 50, 16); + its_mask_encode(&cmd->raw_cmd[3], vpt_pa >> 16, 51, 16); } static void its_encode_vpt_size(struct its_cmd_block *cmd, u8 vpt_size) @@ -1482,9 +1482,9 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, u64 val = its_read_baser(its, baser); u64 esz = GITS_BASER_ENTRY_SIZE(val); u64 type = GITS_BASER_TYPE(val); + u64 baser_phys, tmp; u32 alloc_pages; void *base; - u64 tmp; retry_alloc_baser: alloc_pages = (PAGE_ORDER_TO_SIZE(order) / psz); @@ -1500,8 +1500,24 @@ retry_alloc_baser: if (!base) return -ENOMEM; + baser_phys = virt_to_phys(base); + + /* Check if the physical address of the memory is above 48bits */ + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && (baser_phys >> 48)) { + + /* 52bit PA is supported only when PageSize=64K */ + if (psz != SZ_64K) { + pr_err("ITS: no 52bit PA support when psz=%d\n", psz); + free_pages((unsigned long)base, order); + return -ENXIO; + } + + /* Convert 52bit PA to 48bit field */ + baser_phys = GITS_BASER_PHYS_52_to_48(baser_phys); + } + retry_baser: - val = (virt_to_phys(base) | + val = (baser_phys | (type << GITS_BASER_TYPE_SHIFT) | ((esz - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | ((alloc_pages - 1) << GITS_BASER_PAGES_SHIFT) | diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 1ea576c8126f..14b74f22d43c 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -372,6 +372,8 @@ #define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) #define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) +#define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) -- cgit v1.2.3 From 20608924cc2e6bdeaf6f58ccbe9ddfe12dbfa082 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 4 Oct 2017 14:26:26 +0200 Subject: genirq: generic chip: Add irq_gc_mask_disable_and_ack_set() The irq_gc_mask_disable_reg_and_ack() function name implies that it provides the combined functions of irq_gc_mask_disable_reg() and irq_gc_ack(). However, the implementation does not actually do that since it writes the mask instead of the disable register. It also does not maintain the mask cache which makes it inappropriate to use with other masking functions. In addition, commit 659fb32d1b67 ("genirq: replace irq_gc_ack() with {set,clr}_bit variants (fwd)") effectively renamed irq_gc_ack() to irq_gc_ack_set_bit() so this function probably should have also been renamed at that time. The generic chip code currently provides three functions for use with the irq_mask member of the irq_chip structure and two functions for use with the irq_ack member of the irq_chip structure. These functions could be combined into six functions for use with the irq_mask_ack member of the irq_chip structure. However, since only one of the combinations is currently used, only the function irq_gc_mask_disable_and_ack_set() is added by this commit. The '_reg' and '_bit' portions of the base function name were left out of the new combined function name in an attempt to keep the function name length manageable with the 80 character source code line length while still allowing the distinct aspects of each combination to be captured by the name. If other combinations are desired in the future please add them to the irq generic chip library at that time. Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- include/linux/irq.h | 1 + kernel/irq/generic-chip.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index d4728bf6a537..494d328f7051 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1010,6 +1010,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); void irq_gc_mask_disable_reg_and_ack(struct irq_data *d); +void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5270a54b9fa4..ec5fe9a0cb05 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -150,6 +150,31 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) irq_gc_unlock(gc); } +/** + * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt + * @d: irq_data + * + * This generic implementation of the irq_mask_ack method is for chips + * with separate enable/disable registers instead of a single mask + * register and where a pending interrupt is acknowledged by setting a + * bit. + * + * Note: This is the only permutation currently used. Similar generic + * functions should be added here if other permutations are required. + */ +void irq_gc_mask_disable_and_ack_set(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; + + irq_gc_lock(gc); + irq_reg_writel(gc, mask, ct->regs.disable); + *ct->mask_cache &= ~mask; + irq_reg_writel(gc, mask, ct->regs.ack); + irq_gc_unlock(gc); +} + /** * irq_gc_eoi - EOI interrupt * @d: irq_data -- cgit v1.2.3 From 0d08af35f16a0cc418ad2afde3bc5f70ace82705 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 4 Oct 2017 14:28:17 +0200 Subject: genirq: generic chip: remove irq_gc_mask_disable_reg_and_ack() Any usage of the irq_gc_mask_disable_reg_and_ack() function has been replaced with the desired functionality. The incorrect and ambiguously named function is removed here to prevent accidental misuse. Signed-off-by: Doug Berger Signed-off-by: Marc Zyngier --- include/linux/irq.h | 1 - kernel/irq/generic-chip.c | 16 ---------------- 2 files changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 494d328f7051..5ad10948ea95 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1009,7 +1009,6 @@ void irq_gc_mask_clr_bit(struct irq_data *d); void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d); void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ec5fe9a0cb05..c26c5bb6b491 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -134,22 +134,6 @@ void irq_gc_ack_clr_bit(struct irq_data *d) irq_gc_unlock(gc); } -/** - * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt - * @d: irq_data - */ -void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) -{ - struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = d->mask; - - irq_gc_lock(gc); - irq_reg_writel(gc, mask, ct->regs.mask); - irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); -} - /** * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt * @d: irq_data -- cgit v1.2.3 From 0ad646c81b2182f7fa67ec0c8c825e0ee165696d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 11:58:53 -0700 Subject: tun: call dev_get_valid_name() before register_netdevice() register_netdevice() could fail early when we have an invalid dev name, in which case ->ndo_uninit() is not called. For tun device, this is a problem because a timer etc. are already initialized and it expects ->ndo_uninit() to clean them up. We could move these initializations into a ->ndo_init() so that register_netdevice() knows better, however this is still complicated due to the logic in tun_detach(). Therefore, I choose to just call dev_get_valid_name() before register_netdevice(), which is quicker and much easier to audit. And for this specific case, it is already enough. Fixes: 96442e42429e ("tuntap: choose the txq based on rxq") Reported-by: Dmitry Alexeev Cc: Jason Wang Cc: "Michael S. Tsirkin" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 +++ include/linux/netdevice.h | 3 +++ net/core/dev.c | 6 +++--- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5ce580f413b9..e21bf90b819f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2027,6 +2027,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (!dev) return -ENOMEM; + err = dev_get_valid_name(net, dev, name); + if (err) + goto err_free_dev; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f535779d9dc1..2eaac7d75af4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3694,6 +3694,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); +int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name); + #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) diff --git a/net/core/dev.c b/net/core/dev.c index 588b473194a8..11596a302a26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1147,9 +1147,8 @@ static int dev_alloc_name_ns(struct net *net, return ret; } -static int dev_get_valid_name(struct net *net, - struct net_device *dev, - const char *name) +int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1165,6 +1164,7 @@ static int dev_get_valid_name(struct net *net, return 0; } +EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device -- cgit v1.2.3 From 0bfe649fbb1337400065fa47679b381b2ac845f0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 16 Oct 2017 17:05:57 +0200 Subject: fq_impl: Properly enforce memory limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fq structure would fail to properly enforce the memory limit in the case where the packet being enqueued was bigger than the packet being removed to bring the memory usage down. So keep dropping packets until the memory usage is back below the limit. Also, fix the statistics for memory limit violations. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 4e6131cd3f43..ac1a2317941e 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -146,6 +146,7 @@ static void fq_tin_enqueue(struct fq *fq, fq_flow_get_default_t get_default_func) { struct fq_flow *flow; + bool oom; lockdep_assert_held(&fq->lock); @@ -167,8 +168,8 @@ static void fq_tin_enqueue(struct fq *fq, } __skb_queue_tail(&flow->queue, skb); - - if (fq->backlog > fq->limit || fq->memory_usage > fq->memory_limit) { + oom = (fq->memory_usage > fq->memory_limit); + while (fq->backlog > fq->limit || oom) { flow = list_first_entry_or_null(&fq->backlogs, struct fq_flow, backlogchain); @@ -183,8 +184,10 @@ static void fq_tin_enqueue(struct fq *fq, flow->tin->overlimit++; fq->overlimit++; - if (fq->memory_usage > fq->memory_limit) + if (oom) { fq->overmemory++; + oom = (fq->memory_usage > fq->memory_limit); + } } } -- cgit v1.2.3 From 363b02dab09b3226f3bd1420dad9c72b79a42a76 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Oct 2017 16:43:25 +0100 Subject: KEYS: Fix race between updating and finding a negative key Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection error into one field such that: (1) The instantiation state can be modified/read atomically. (2) The error can be accessed atomically with the state. (3) The error isn't stored unioned with the payload pointers. This deals with the problem that the state is spread over three different objects (two bits and a separate variable) and reading or updating them atomically isn't practical, given that not only can uninstantiated keys change into instantiated or rejected keys, but rejected keys can also turn into instantiated keys - and someone accessing the key might not be using any locking. The main side effect of this problem is that what was held in the payload may change, depending on the state. For instance, you might observe the key to be in the rejected state. You then read the cached error, but if the key semaphore wasn't locked, the key might've become instantiated between the two reads - and you might now have something in hand that isn't actually an error code. The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative error code if the key is negatively instantiated. The key_is_instantiated() function is replaced with key_is_positive() to avoid confusion as negative keys are also 'instantiated'. Additionally, barriering is included: (1) Order payload-set before state-set during instantiation. (2) Order state-read before payload-read when using the key. Further separate barriering is necessary if RCU is being used to access the payload content after reading the payload pointers. Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data") Cc: stable@vger.kernel.org # v4.4+ Reported-by: Eric Biggers Signed-off-by: David Howells Reviewed-by: Eric Biggers --- include/linux/key.h | 47 ++++++++++++++++++++------------ net/dns_resolver/dns_key.c | 2 +- security/keys/big_key.c | 4 +-- security/keys/encrypted-keys/encrypted.c | 2 +- security/keys/gc.c | 8 +++--- security/keys/key.c | 31 +++++++++++++-------- security/keys/keyctl.c | 9 +++--- security/keys/keyring.c | 10 +++---- security/keys/proc.c | 7 +++-- security/keys/process_keys.c | 2 +- security/keys/request_key.c | 7 ++--- security/keys/request_key_auth.c | 2 +- security/keys/trusted.c | 2 +- security/keys/user_defined.c | 4 +-- 14 files changed, 80 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/linux/key.h b/include/linux/key.h index e315e16b6ff8..8a15cabe928d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -138,6 +138,11 @@ struct key_restriction { struct key_type *keytype; }; +enum key_state { + KEY_IS_UNINSTANTIATED, + KEY_IS_POSITIVE, /* Positively instantiated */ +}; + /*****************************************************************************/ /* * authentication token / access credential / keyring @@ -169,6 +174,7 @@ struct key { * - may not match RCU dereferenced payload * - payload should contain own length */ + short state; /* Key state (+) or rejection error (-) */ #ifdef KEY_DEBUGGING unsigned magic; @@ -176,18 +182,16 @@ struct key { #endif unsigned long flags; /* status flags (change with bitops) */ -#define KEY_FLAG_INSTANTIATED 0 /* set if key has been instantiated */ -#define KEY_FLAG_DEAD 1 /* set if key type has been deleted */ -#define KEY_FLAG_REVOKED 2 /* set if key had been revoked */ -#define KEY_FLAG_IN_QUOTA 3 /* set if key consumes quota */ -#define KEY_FLAG_USER_CONSTRUCT 4 /* set if key is being constructed in userspace */ -#define KEY_FLAG_NEGATIVE 5 /* set if key is negative */ -#define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ -#define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ -#define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ -#define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ -#define KEY_FLAG_KEEP 10 /* set if key should not be removed */ -#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ +#define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ +#define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ +#define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ +#define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ +#define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ +#define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ +#define KEY_FLAG_BUILTIN 6 /* set if key is built in to the kernel */ +#define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ +#define KEY_FLAG_KEEP 8 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -213,7 +217,6 @@ struct key { struct list_head name_link; struct assoc_array keys; }; - int reject_error; }; /* This is set on a keyring to restrict the addition of a link to a key @@ -353,17 +356,27 @@ extern void key_set_timeout(struct key *, unsigned); #define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ #define KEY_NEED_ALL 0x3f /* All the above permissions */ +static inline short key_read_state(const struct key *key) +{ + /* Barrier versus mark_key_instantiated(). */ + return smp_load_acquire(&key->state); +} + /** - * key_is_instantiated - Determine if a key has been positively instantiated + * key_is_positive - Determine if a key has been positively instantiated * @key: The key to check. * * Return true if the specified key has been positively instantiated, false * otherwise. */ -static inline bool key_is_instantiated(const struct key *key) +static inline bool key_is_positive(const struct key *key) +{ + return key_read_state(key) == KEY_IS_POSITIVE; +} + +static inline bool key_is_negative(const struct key *key) { - return test_bit(KEY_FLAG_INSTANTIATED, &key->flags) && - !test_bit(KEY_FLAG_NEGATIVE, &key->flags); + return key_read_state(key) < 0; } #define dereference_key_rcu(KEY) \ diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 8737412c7b27..e1d4d898a007 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -224,7 +224,7 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data) static void dns_resolver_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); - if (key_is_instantiated(key)) { + if (key_is_positive(key)) { int err = PTR_ERR(key->payload.data[dns_key_error]); if (err) diff --git a/security/keys/big_key.c b/security/keys/big_key.c index e607830b6154..929e14978c42 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -247,7 +247,7 @@ void big_key_revoke(struct key *key) /* clear the quota */ key_payload_reserve(key, 0); - if (key_is_instantiated(key) && + if (key_is_positive(key) && (size_t)key->payload.data[big_key_len] > BIG_KEY_FILE_THRESHOLD) vfs_truncate(path, 0); } @@ -279,7 +279,7 @@ void big_key_describe(const struct key *key, struct seq_file *m) seq_puts(m, key->description); - if (key_is_instantiated(key)) + if (key_is_positive(key)) seq_printf(m, ": %zu [%s]", datalen, datalen > BIG_KEY_FILE_THRESHOLD ? "file" : "buff"); diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 535db141f4da..d92cbf9687c3 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -854,7 +854,7 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep) size_t datalen = prep->datalen; int ret = 0; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (key_is_negative(key)) return -ENOKEY; if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; diff --git a/security/keys/gc.c b/security/keys/gc.c index 87cb260e4890..f01d48cb3de1 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -129,15 +129,15 @@ static noinline void key_gc_unused_keys(struct list_head *keys) while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); + short state = key->state; + list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); /* Throw away the key data if the key is instantiated */ - if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags) && - !test_bit(KEY_FLAG_NEGATIVE, &key->flags) && - key->type->destroy) + if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); @@ -151,7 +151,7 @@ static noinline void key_gc_unused_keys(struct list_head *keys) } atomic_dec(&key->user->nkeys); - if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) + if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); diff --git a/security/keys/key.c b/security/keys/key.c index eb914a838840..9385e7cc710f 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -401,6 +401,18 @@ int key_payload_reserve(struct key *key, size_t datalen) } EXPORT_SYMBOL(key_payload_reserve); +/* + * Change the key state to being instantiated. + */ +static void mark_key_instantiated(struct key *key, int reject_error) +{ + /* Commit the payload before setting the state; barrier versus + * key_read_state(). + */ + smp_store_release(&key->state, + (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); +} + /* * Instantiate a key and link it into the target keyring atomically. Must be * called with the target keyring's semaphore writelocked. The target key's @@ -424,14 +436,14 @@ static int __key_instantiate_and_link(struct key *key, mutex_lock(&key_construction_mutex); /* can't instantiate twice */ - if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + if (key->state == KEY_IS_UNINSTANTIATED) { /* instantiate the key */ ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); - set_bit(KEY_FLAG_INSTANTIATED, &key->flags); + mark_key_instantiated(key, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; @@ -577,13 +589,10 @@ int key_reject_and_link(struct key *key, mutex_lock(&key_construction_mutex); /* can't instantiate twice */ - if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + if (key->state == KEY_IS_UNINSTANTIATED) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); - key->reject_error = -error; - smp_wmb(); - set_bit(KEY_FLAG_NEGATIVE, &key->flags); - set_bit(KEY_FLAG_INSTANTIATED, &key->flags); + mark_key_instantiated(key, -error); now = current_kernel_time(); key->expiry = now.tv_sec + timeout; key_schedule_gc(key->expiry + key_gc_delay); @@ -752,8 +761,8 @@ static inline key_ref_t __key_update(key_ref_t key_ref, ret = key->type->update(key, prep); if (ret == 0) - /* updating a negative key instantiates it */ - clear_bit(KEY_FLAG_NEGATIVE, &key->flags); + /* Updating a negative key positively instantiates it */ + mark_key_instantiated(key, 0); up_write(&key->sem); @@ -986,8 +995,8 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) ret = key->type->update(key, &prep); if (ret == 0) - /* updating a negative key instantiates it */ - clear_bit(KEY_FLAG_NEGATIVE, &key->flags); + /* Updating a negative key positively instantiates it */ + mark_key_instantiated(key, 0); up_write(&key->sem); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 365ff85d7e27..76d22f726ae4 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -766,10 +766,9 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { - ret = -ENOKEY; - goto error2; - } + ret = key_read_state(key); + if (ret < 0) + goto error2; /* Negatively instantiated */ /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); @@ -901,7 +900,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) atomic_dec(&key->user->nkeys); atomic_inc(&newowner->nkeys); - if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + if (key->state != KEY_IS_UNINSTANTIATED) { atomic_dec(&key->user->nikeys); atomic_inc(&newowner->nikeys); } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 4fa82a8a9c0e..06173b091a74 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -414,7 +414,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) else seq_puts(m, "[anon]"); - if (key_is_instantiated(keyring)) { + if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else @@ -553,7 +553,8 @@ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); - unsigned long kflags = key->flags; + unsigned long kflags = READ_ONCE(key->flags); + short state = READ_ONCE(key->state); kenter("{%d}", key->serial); @@ -597,9 +598,8 @@ static int keyring_search_iterator(const void *object, void *iterator_data) if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ - if (kflags & (1 << KEY_FLAG_NEGATIVE)) { - smp_rmb(); - ctx->result = ERR_PTR(key->reject_error); + if (state < 0) { + ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } diff --git a/security/keys/proc.c b/security/keys/proc.c index de834309d100..4089ce1f7757 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -182,6 +182,7 @@ static int proc_keys_show(struct seq_file *m, void *v) unsigned long timo; key_ref_t key_ref, skey_ref; char xbuf[16]; + short state; int rc; struct keyring_search_context ctx = { @@ -236,17 +237,19 @@ static int proc_keys_show(struct seq_file *m, void *v) sprintf(xbuf, "%luw", timo / (60*60*24*7)); } + state = key_read_state(key); + #define showflag(KEY, LETTER, FLAG) \ (test_bit(FLAG, &(KEY)->flags) ? LETTER : '-') seq_printf(m, "%08x %c%c%c%c%c%c%c %5d %4s %08x %5d %5d %-9.9s ", key->serial, - showflag(key, 'I', KEY_FLAG_INSTANTIATED), + state != KEY_IS_UNINSTANTIATED ? 'I' : '-', showflag(key, 'R', KEY_FLAG_REVOKED), showflag(key, 'D', KEY_FLAG_DEAD), showflag(key, 'Q', KEY_FLAG_IN_QUOTA), showflag(key, 'U', KEY_FLAG_USER_CONSTRUCT), - showflag(key, 'N', KEY_FLAG_NEGATIVE), + state < 0 ? 'N' : '-', showflag(key, 'i', KEY_FLAG_INVALIDATED), refcount_read(&key->usage), xbuf, diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 293d3598153b..740affd65ee9 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -730,7 +730,7 @@ try_again: ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && - !test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) + key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; /* check the permissions */ diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 63e63a42db3c..e8036cd0ad54 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -595,10 +595,9 @@ int wait_for_key_construction(struct key *key, bool intr) intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret) return -ERESTARTSYS; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { - smp_rmb(); - return key->reject_error; - } + ret = key_read_state(key); + if (ret < 0) + return ret; return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 6ebf1af8fce9..424e1d90412e 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -73,7 +73,7 @@ static void request_key_auth_describe(const struct key *key, seq_puts(m, "key:"); seq_puts(m, key->description); - if (key_is_instantiated(key)) + if (key_is_positive(key)) seq_printf(m, " pid:%d ci:%zu", rka->pid, rka->callout_len); } diff --git a/security/keys/trusted.c b/security/keys/trusted.c index ddfaebf60fc8..bd85315cbfeb 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1066,7 +1066,7 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep) char *datablob; int ret = 0; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (key_is_negative(key)) return -ENOKEY; p = key->payload.data[0]; if (!p->migratable) diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index 3d8c68eba516..9f558bedba23 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -114,7 +114,7 @@ int user_update(struct key *key, struct key_preparsed_payload *prep) /* attach the new data, displacing the old */ key->expiry = prep->expiry; - if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (key_is_positive(key)) zap = dereference_key_locked(key); rcu_assign_keypointer(key, prep->payload.data[0]); prep->payload.data[0] = NULL; @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(user_destroy); void user_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); - if (key_is_instantiated(key)) + if (key_is_positive(key)) seq_printf(m, ": %u", key->datalen); } -- cgit v1.2.3 From a91d66129fb9bcead12af3ed2008d6ddbf179509 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 16 Oct 2017 11:39:28 +0200 Subject: ALSA: hda - Fix incorrect TLV callback check introduced during set_fs() removal The commit 99b5c5bb9a54 ("ALSA: hda - Remove the use of set_fs()") converted the get_kctl_0dB_offset() call for killing set_fs() usage in HD-audio codec code. The conversion assumed that the TLV callback used in HD-audio code is only snd_hda_mixer_amp() and applies the TLV calculation locally. Although this assumption is correct, and all slave kctls are actually with that callback, the current code is still utterly buggy; it doesn't hit this condition and falls back to the next check. It's because the function gets called after adding slave kctls to vmaster. By assigning a slave kctl, the slave kctl object is faked inside vmaster code, and the whole kctl ops are overridden. Thus the callback op points to a different value from what we've assumed. More badly, as reported by the KERNEXEC and UDEREF features of PaX, the code flow turns into the unexpected pitfall. The next fallback check is SNDRV_CTL_ELEM_ACCESS_TLV_READ access bit, and this always hits for each kctl with TLV. Then it evaluates the callback function pointer wrongly as if it were a TLV array. Although currently its side-effect is fairly limited, this incorrect reference may lead to an unpleasant result. For addressing the regression, this patch introduces a new helper to vmaster code, snd_ctl_apply_vmaster_slaves(). This works similarly like the existing map_slaves() in hda_codec.c: it loops over the slave list of the given master, and applies the given function to each slave. Then the initializer function receives the right kctl object and we can compare the correct pointer instead of the faked one. Also, for catching the similar breakage in future, give an error message when the unexpected TLV callback is found and bail out immediately. Fixes: 99b5c5bb9a54 ("ALSA: hda - Remove the use of set_fs()") Reported-by: PaX Team Cc: # v4.13 Signed-off-by: Takashi Iwai --- include/sound/control.h | 3 ++ sound/core/vmaster.c | 31 +++++++++++++++ sound/pci/hda/hda_codec.c | 97 +++++++++++++++++++++++++++-------------------- 3 files changed, 89 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/sound/control.h b/include/sound/control.h index bd7246de58e7..a1f1152bc687 100644 --- a/include/sound/control.h +++ b/include/sound/control.h @@ -248,6 +248,9 @@ int snd_ctl_add_vmaster_hook(struct snd_kcontrol *kctl, void *private_data); void snd_ctl_sync_vmaster(struct snd_kcontrol *kctl, bool hook_only); #define snd_ctl_sync_vmaster_hook(kctl) snd_ctl_sync_vmaster(kctl, true) +int snd_ctl_apply_vmaster_slaves(struct snd_kcontrol *kctl, + int (*func)(struct snd_kcontrol *, void *), + void *arg); /* * Helper functions for jack-detection controls diff --git a/sound/core/vmaster.c b/sound/core/vmaster.c index 6c58e6f73a01..e43af18d4383 100644 --- a/sound/core/vmaster.c +++ b/sound/core/vmaster.c @@ -484,3 +484,34 @@ void snd_ctl_sync_vmaster(struct snd_kcontrol *kcontrol, bool hook_only) master->hook(master->hook_private_data, master->val); } EXPORT_SYMBOL_GPL(snd_ctl_sync_vmaster); + +/** + * snd_ctl_apply_vmaster_slaves - Apply function to each vmaster slave + * @kctl: vmaster kctl element + * @func: function to apply + * @arg: optional function argument + * + * Apply the function @func to each slave kctl of the given vmaster kctl. + * Returns 0 if successful, or a negative error code. + */ +int snd_ctl_apply_vmaster_slaves(struct snd_kcontrol *kctl, + int (*func)(struct snd_kcontrol *, void *), + void *arg) +{ + struct link_master *master; + struct link_slave *slave; + int err; + + master = snd_kcontrol_chip(kctl); + err = master_init(master); + if (err < 0) + return err; + list_for_each_entry(slave, &master->slaves, list) { + err = func(&slave->slave, arg); + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(snd_ctl_apply_vmaster_slaves); diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index b6cf9684c2ec..a0989d231fd0 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -1803,36 +1803,6 @@ static int check_slave_present(struct hda_codec *codec, return 1; } -/* guess the value corresponding to 0dB */ -static int get_kctl_0dB_offset(struct hda_codec *codec, - struct snd_kcontrol *kctl, int *step_to_check) -{ - int _tlv[4]; - const int *tlv = NULL; - int val = -1; - - if ((kctl->vd[0].access & SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK) && - kctl->tlv.c == snd_hda_mixer_amp_tlv) { - get_ctl_amp_tlv(kctl, _tlv); - tlv = _tlv; - } else if (kctl->vd[0].access & SNDRV_CTL_ELEM_ACCESS_TLV_READ) - tlv = kctl->tlv.p; - if (tlv && tlv[0] == SNDRV_CTL_TLVT_DB_SCALE) { - int step = tlv[3]; - step &= ~TLV_DB_SCALE_MUTE; - if (!step) - return -1; - if (*step_to_check && *step_to_check != step) { - codec_err(codec, "Mismatching dB step for vmaster slave (%d!=%d)\n", - *step_to_check, step); - return -1; - } - *step_to_check = step; - val = -tlv[2] / step; - } - return val; -} - /* call kctl->put with the given value(s) */ static int put_kctl_with_value(struct snd_kcontrol *kctl, int val) { @@ -1847,19 +1817,58 @@ static int put_kctl_with_value(struct snd_kcontrol *kctl, int val) return 0; } -/* initialize the slave volume with 0dB */ -static int init_slave_0dB(struct hda_codec *codec, - void *data, struct snd_kcontrol *slave) +struct slave_init_arg { + struct hda_codec *codec; + int step; +}; + +/* initialize the slave volume with 0dB via snd_ctl_apply_vmaster_slaves() */ +static int init_slave_0dB(struct snd_kcontrol *kctl, void *_arg) { - int offset = get_kctl_0dB_offset(codec, slave, data); - if (offset > 0) - put_kctl_with_value(slave, offset); + struct slave_init_arg *arg = _arg; + int _tlv[4]; + const int *tlv = NULL; + int step; + int val; + + if (kctl->vd[0].access & SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK) { + if (kctl->tlv.c != snd_hda_mixer_amp_tlv) { + codec_err(arg->codec, + "Unexpected TLV callback for slave %s:%d\n", + kctl->id.name, kctl->id.index); + return 0; /* ignore */ + } + get_ctl_amp_tlv(kctl, _tlv); + tlv = _tlv; + } else if (kctl->vd[0].access & SNDRV_CTL_ELEM_ACCESS_TLV_READ) + tlv = kctl->tlv.p; + + if (!tlv || tlv[0] != SNDRV_CTL_TLVT_DB_SCALE) + return 0; + + step = tlv[3]; + step &= ~TLV_DB_SCALE_MUTE; + if (!step) + return 0; + if (arg->step && arg->step != step) { + codec_err(arg->codec, + "Mismatching dB step for vmaster slave (%d!=%d)\n", + arg->step, step); + return 0; + } + + arg->step = step; + val = -tlv[2] / step; + if (val > 0) { + put_kctl_with_value(kctl, val); + return val; + } + return 0; } -/* unmute the slave */ -static int init_slave_unmute(struct hda_codec *codec, - void *data, struct snd_kcontrol *slave) +/* unmute the slave via snd_ctl_apply_vmaster_slaves() */ +static int init_slave_unmute(struct snd_kcontrol *slave, void *_arg) { return put_kctl_with_value(slave, 1); } @@ -1919,9 +1928,13 @@ int __snd_hda_add_vmaster(struct hda_codec *codec, char *name, /* init with master mute & zero volume */ put_kctl_with_value(kctl, 0); if (init_slave_vol) { - int step = 0; - map_slaves(codec, slaves, suffix, - tlv ? init_slave_0dB : init_slave_unmute, &step); + struct slave_init_arg arg = { + .codec = codec, + .step = 0, + }; + snd_ctl_apply_vmaster_slaves(kctl, + tlv ? init_slave_0dB : init_slave_unmute, + &arg); } if (ctl_ret) -- cgit v1.2.3 From 55dfce873dca46df00304c44a568d7933bffff89 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 9 Oct 2017 11:09:33 -0700 Subject: Input: factor out and export input_device_id matching code Factor out and export input_match_device_id() so that modules may use it. It will be needed by joydev to blacklist accelerometers in composite devices. Tested-by: Roderick Colenbrander Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 83 +++++++++++++++++++++++---------------------------- include/linux/input.h | 3 ++ 2 files changed, 41 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/input/input.c b/drivers/input/input.c index d268fdc23c64..02e6ea7955fe 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -933,58 +933,51 @@ int input_set_keycode(struct input_dev *dev, } EXPORT_SYMBOL(input_set_keycode); +bool input_match_device_id(const struct input_dev *dev, + const struct input_device_id *id) +{ + if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) + if (id->bustype != dev->id.bustype) + return false; + + if (id->flags & INPUT_DEVICE_ID_MATCH_VENDOR) + if (id->vendor != dev->id.vendor) + return false; + + if (id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT) + if (id->product != dev->id.product) + return false; + + if (id->flags & INPUT_DEVICE_ID_MATCH_VERSION) + if (id->version != dev->id.version) + return false; + + if (!bitmap_subset(id->evbit, dev->evbit, EV_MAX) || + !bitmap_subset(id->keybit, dev->keybit, KEY_MAX) || + !bitmap_subset(id->relbit, dev->relbit, REL_MAX) || + !bitmap_subset(id->absbit, dev->absbit, ABS_MAX) || + !bitmap_subset(id->mscbit, dev->mscbit, MSC_MAX) || + !bitmap_subset(id->ledbit, dev->ledbit, LED_MAX) || + !bitmap_subset(id->sndbit, dev->sndbit, SND_MAX) || + !bitmap_subset(id->ffbit, dev->ffbit, FF_MAX) || + !bitmap_subset(id->swbit, dev->swbit, SW_MAX)) { + return false; + } + + return true; +} +EXPORT_SYMBOL(input_match_device_id); + static const struct input_device_id *input_match_device(struct input_handler *handler, struct input_dev *dev) { const struct input_device_id *id; for (id = handler->id_table; id->flags || id->driver_info; id++) { - - if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) - if (id->bustype != dev->id.bustype) - continue; - - if (id->flags & INPUT_DEVICE_ID_MATCH_VENDOR) - if (id->vendor != dev->id.vendor) - continue; - - if (id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT) - if (id->product != dev->id.product) - continue; - - if (id->flags & INPUT_DEVICE_ID_MATCH_VERSION) - if (id->version != dev->id.version) - continue; - - if (!bitmap_subset(id->evbit, dev->evbit, EV_MAX)) - continue; - - if (!bitmap_subset(id->keybit, dev->keybit, KEY_MAX)) - continue; - - if (!bitmap_subset(id->relbit, dev->relbit, REL_MAX)) - continue; - - if (!bitmap_subset(id->absbit, dev->absbit, ABS_MAX)) - continue; - - if (!bitmap_subset(id->mscbit, dev->mscbit, MSC_MAX)) - continue; - - if (!bitmap_subset(id->ledbit, dev->ledbit, LED_MAX)) - continue; - - if (!bitmap_subset(id->sndbit, dev->sndbit, SND_MAX)) - continue; - - if (!bitmap_subset(id->ffbit, dev->ffbit, FF_MAX)) - continue; - - if (!bitmap_subset(id->swbit, dev->swbit, SW_MAX)) - continue; - - if (!handler->match || handler->match(handler, dev)) + if (input_match_device_id(dev, id) && + (!handler->match || handler->match(handler, dev))) { return id; + } } return NULL; diff --git a/include/linux/input.h b/include/linux/input.h index fb5e23c7ed98..2a44650e449d 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -469,6 +469,9 @@ int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke); int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke); +bool input_match_device_id(const struct input_dev *dev, + const struct input_device_id *id); + void input_enable_softrepeat(struct input_dev *dev, int delay, int period); extern struct class input_class; -- cgit v1.2.3 From 8724ecb072293f109a6f5dc93be8a98bf61fe14f Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 9 Oct 2017 12:01:14 -0700 Subject: Input: allow matching device IDs on property bits Let's allow matching input devices on their property bits, both in-kernel and when generating module aliases. Tested-by: Roderick Colenbrander Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 3 ++- include/linux/input.h | 4 ++++ include/linux/mod_devicetable.h | 3 +++ scripts/mod/devicetable-offsets.c | 1 + scripts/mod/file2alias.c | 6 +++++- 5 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/input/input.c b/drivers/input/input.c index 02e6ea7955fe..762bfb9487dc 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -960,7 +960,8 @@ bool input_match_device_id(const struct input_dev *dev, !bitmap_subset(id->ledbit, dev->ledbit, LED_MAX) || !bitmap_subset(id->sndbit, dev->sndbit, SND_MAX) || !bitmap_subset(id->ffbit, dev->ffbit, FF_MAX) || - !bitmap_subset(id->swbit, dev->swbit, SW_MAX)) { + !bitmap_subset(id->swbit, dev->swbit, SW_MAX) || + !bitmap_subset(id->propbit, dev->propbit, INPUT_PROP_MAX)) { return false; } diff --git a/include/linux/input.h b/include/linux/input.h index 2a44650e449d..7c7516eb7d76 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -234,6 +234,10 @@ struct input_dev { #error "SW_MAX and INPUT_DEVICE_ID_SW_MAX do not match" #endif +#if INPUT_PROP_MAX != INPUT_DEVICE_ID_PROP_MAX +#error "INPUT_PROP_MAX and INPUT_DEVICE_ID_PROP_MAX do not match" +#endif + #define INPUT_DEVICE_ID_MATCH_DEVICE \ (INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT) #define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION \ diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 3f74ef2281e8..72f0b7f19c59 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -293,6 +293,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_SND_MAX 0x07 #define INPUT_DEVICE_ID_FF_MAX 0x7f #define INPUT_DEVICE_ID_SW_MAX 0x0f +#define INPUT_DEVICE_ID_PROP_MAX 0x1f #define INPUT_DEVICE_ID_MATCH_BUS 1 #define INPUT_DEVICE_ID_MATCH_VENDOR 2 @@ -308,6 +309,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_MATCH_SNDBIT 0x0400 #define INPUT_DEVICE_ID_MATCH_FFBIT 0x0800 #define INPUT_DEVICE_ID_MATCH_SWBIT 0x1000 +#define INPUT_DEVICE_ID_MATCH_PROPBIT 0x2000 struct input_device_id { @@ -327,6 +329,7 @@ struct input_device_id { kernel_ulong_t sndbit[INPUT_DEVICE_ID_SND_MAX / BITS_PER_LONG + 1]; kernel_ulong_t ffbit[INPUT_DEVICE_ID_FF_MAX / BITS_PER_LONG + 1]; kernel_ulong_t swbit[INPUT_DEVICE_ID_SW_MAX / BITS_PER_LONG + 1]; + kernel_ulong_t propbit[INPUT_DEVICE_ID_PROP_MAX / BITS_PER_LONG + 1]; kernel_ulong_t driver_info; }; diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index e4d90e50f6fe..812657ab5aa3 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -105,6 +105,7 @@ int main(void) DEVID_FIELD(input_device_id, sndbit); DEVID_FIELD(input_device_id, ffbit); DEVID_FIELD(input_device_id, swbit); + DEVID_FIELD(input_device_id, propbit); DEVID(eisa_device_id); DEVID_FIELD(eisa_device_id, sig); diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 29d6699d5a06..bc25898f6df0 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -761,7 +761,7 @@ static void do_input(char *alias, sprintf(alias + strlen(alias), "%X,*", i); } -/* input:b0v0p0e0-eXkXrXaXmXlXsXfXwX where X is comma-separated %02X. */ +/* input:b0v0p0e0-eXkXrXaXmXlXsXfXwXprX where X is comma-separated %02X. */ static int do_input_entry(const char *filename, void *symval, char *alias) { @@ -779,6 +779,7 @@ static int do_input_entry(const char *filename, void *symval, DEF_FIELD_ADDR(symval, input_device_id, sndbit); DEF_FIELD_ADDR(symval, input_device_id, ffbit); DEF_FIELD_ADDR(symval, input_device_id, swbit); + DEF_FIELD_ADDR(symval, input_device_id, propbit); sprintf(alias, "input:"); @@ -816,6 +817,9 @@ static int do_input_entry(const char *filename, void *symval, sprintf(alias + strlen(alias), "w*"); if (flags & INPUT_DEVICE_ID_MATCH_SWBIT) do_input(alias, *swbit, 0, INPUT_DEVICE_ID_SW_MAX); + sprintf(alias + strlen(alias), "pr*"); + if (flags & INPUT_DEVICE_ID_MATCH_PROPBIT) + do_input(alias, *propbit, 0, INPUT_DEVICE_ID_PROP_MAX); return 1; } ADD_TO_DEVTABLE("input", input_device_id, do_input_entry); -- cgit v1.2.3 From a961e40917fb14614d368d8bc9782ca4d6a8cd11 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 19 Oct 2017 13:30:15 -0400 Subject: membarrier: Provide register expedited private command This introduces a "register private expedited" membarrier command which allows eventual removal of important memory barrier constraints on the scheduler fast-paths. It changes how the "private expedited" membarrier command (new to 4.14) is used from user-space. This new command allows processes to register their intent to use the private expedited command. This affects how the expedited private command introduced in 4.14-rc is meant to be used, and should be merged before 4.14 final. Processes are now required to register before using MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM. This fixes a problem that arose when designing requested extensions to sys_membarrier() to allow JITs to efficiently flush old code from instruction caches. Several potential algorithms are much less painful if the user register intent to use this functionality early on, for example, before the process spawns the second thread. Registering at this time removes the need to interrupt each and every thread in that process at the first expedited sys_membarrier() system call. Signed-off-by: Mathieu Desnoyers Acked-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Viro Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/mm_types.h | 3 +++ include/linux/sched/mm.h | 16 ++++++++++++++++ include/uapi/linux/membarrier.h | 23 ++++++++++++++++------- kernel/sched/membarrier.c | 34 ++++++++++++++++++++++++++++++---- 5 files changed, 66 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 5470d3c1892a..3e14ba25f678 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; + membarrier_execve(current); acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 46f4ecf5479a..1861ea8dba77 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,9 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ +#ifdef CONFIG_MEMBARRIER + atomic_t membarrier_state; +#endif #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index ae53e413fb13..ab9bf7b73954 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -211,4 +211,20 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +#ifdef CONFIG_MEMBARRIER +enum { + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), + MEMBARRIER_STATE_SWITCH_MM = (1U << 1), +}; + +static inline void membarrier_execve(struct task_struct *t) +{ + atomic_set(&t->mm->membarrier_state, 0); +} +#else +static inline void membarrier_execve(struct task_struct *t) +{ +} +#endif + #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index 6d47b3249d8a..4e01ad7ffe98 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -52,21 +52,30 @@ * (non-running threads are de facto in such a * state). This only covers threads from the * same processes as the caller thread. This - * command returns 0. The "expedited" commands - * complete faster than the non-expedited ones, - * they never block, but have the downside of - * causing extra overhead. + * command returns 0 on success. The + * "expedited" commands complete faster than + * the non-expedited ones, they never block, + * but have the downside of causing extra + * overhead. A process needs to register its + * intent to use the private expedited command + * prior to using it, otherwise this command + * returns -EPERM. + * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + * Register the process intent to use + * MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always + * returns 0. * * Command to be passed to the membarrier system call. The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to * the value 0. */ enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_SHARED = (1 << 0), + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ - MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4), }; #endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a92fddc22747..dd7908743dab 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "sched.h" /* for cpu_rq(). */ @@ -26,21 +27,26 @@ * except MEMBARRIER_CMD_QUERY. */ #define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } -static void membarrier_private_expedited(void) +static int membarrier_private_expedited(void) { int cpu; bool fallback = false; cpumask_var_t tmpmask; + if (!(atomic_read(¤t->mm->membarrier_state) + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + if (num_online_cpus() == 1) - return; + return 0; /* * Matches memory barriers around rq->curr modification in @@ -94,6 +100,24 @@ static void membarrier_private_expedited(void) * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ + return 0; +} + +static void membarrier_register_private_expedited(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + /* + * We need to consider threads belonging to different thread + * groups, which use the same mm. (CLONE_VM but not + * CLONE_THREAD). + */ + if (atomic_read(&mm->membarrier_state) + & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) + return; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + &mm->membarrier_state); } /** @@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) synchronize_sched(); return 0; case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - membarrier_private_expedited(); + return membarrier_private_expedited(); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + membarrier_register_private_expedited(); return 0; default: return -EINVAL; -- cgit v1.2.3 From 27fdb35fe99011d86bcc54f62fe84712c53f4d05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Oct 2017 14:26:21 -0700 Subject: doc: Fix various RCU docbook comment-header problems Because many of RCU's files have not been included into docbook, a number of errors have accumulated. This commit fixes them. Signed-off-by: Paul E. McKenney Signed-off-by: Linus Torvalds --- include/linux/rculist.h | 2 +- include/linux/rcupdate.h | 22 ++++++++++++++-------- include/linux/srcu.h | 1 + kernel/rcu/srcutree.c | 2 +- kernel/rcu/sync.c | 9 ++++++--- kernel/rcu/tree.c | 18 ++++++++++-------- 6 files changed, 33 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index b1fd8bf85fdc..2bea1d5e9930 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -276,7 +276,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, #define list_entry_rcu(ptr, type, member) \ container_of(lockless_dereference(ptr), type, member) -/** +/* * Where are list_empty_rcu() and list_first_entry_rcu()? * * Implementing those functions following their counterparts list_empty() and diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de50d8a4cf41..1a9f70d44af9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -523,7 +523,7 @@ static inline void rcu_preempt_sleep_check(void) { } * Return the value of the specified RCU-protected pointer, but omit * both the smp_read_barrier_depends() and the READ_ONCE(). This * is useful in cases where update-side locks prevent the value of the - * pointer from changing. Please note that this primitive does -not- + * pointer from changing. Please note that this primitive does *not* * prevent the compiler from repeating this reference or combining it * with other references, so it should not be used without protection * of appropriate locks. @@ -568,7 +568,7 @@ static inline void rcu_preempt_sleep_check(void) { } * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows: - * + * `` * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); @@ -579,6 +579,7 @@ static inline void rcu_preempt_sleep_check(void) { } * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); + *`` */ #define rcu_pointer_handoff(p) (p) @@ -778,18 +779,21 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /** * RCU_INIT_POINTER() - initialize an RCU protected pointer + * @p: The pointer to be initialized. + * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * - * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer -or- + * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent - * RCU readers from concurrently accessing this pointer -or- + * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to - * readers either at compile time or via rcu_assign_pointer() -and- - * a. You have not made -any- reader-visible changes to - * this structure since then -or- + * readers either at compile time or via rcu_assign_pointer() *and* + * + * a. You have not made *any* reader-visible changes to + * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to @@ -805,7 +809,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the - * external-to-structure pointer -after- you have completely initialized + * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no @@ -819,6 +823,8 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer + * @p: The pointer to be initialized. + * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 39af9bc0f653..62be8966e837 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -78,6 +78,7 @@ void synchronize_srcu(struct srcu_struct *sp); /** * srcu_read_lock_held - might we be in SRCU read-side critical section? + * @sp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 729a8706751d..6d5880089ff6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. + * @rhp: structure to be used for queueing the SRCU callback. * @func: function to be invoked after the SRCU grace period * * The callback function will be invoked some time after a full SRCU diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 50d1861f7759..3f943efcf61c 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) } /** + * rcu_sync_enter_start - Force readers onto slow path for multiple updates + * @rsp: Pointer to rcu_sync structure to use for synchronization + * * Must be called after rcu_sync_init() and before first use. * * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() @@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) /** * rcu_sync_func() - Callback function managing reader access to fastpath - * @rsp: Pointer to rcu_sync structure to use for synchronization + * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * * This function is passed to one of the call_rcu() functions by * rcu_sync_exit(), so that it is invoked after a grace period following the @@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp) * rcu_sync_exit(). Otherwise, set all state back to idle so that readers * can again use their fastpaths. */ -static void rcu_sync_func(struct rcu_head *rcu) +static void rcu_sync_func(struct rcu_head *rhp) { - struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; BUG_ON(rsp->gp_state != GP_PASSED); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0ad62b0e7b8..3e3650e94ae6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, * read-side critical sections have completed. call_rcu_sched() assumes * that the read-side critical sections end on enabling of preemption * or on voluntary preemption. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR - * - anything that disables preemption. + * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. * * These may be nested. * @@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * handler. This means that read-side critical sections in process * context must not be interrupted by softirqs. This interface is to be * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. - * OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * These may be nested. + * RCU read-side critical sections are delimited by: + * + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * + * These may be nested. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. -- cgit v1.2.3 From 34f79502bbcfab659b8729da68b5e387f96eb4c1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 18 Oct 2017 07:10:36 -0700 Subject: bpf: avoid preempt enable/disable in sockmap using tcp_skb_cb region SK_SKB BPF programs are run from the socket/tcp context but early in the stack before much of the TCP metadata is needed in tcp_skb_cb. So we can use some unused fields to place BPF metadata needed for SK_SKB programs when implementing the redirect function. This allows us to drop the preempt disable logic. It does however require an API change so sk_redirect_map() has been updated to additionally provide ctx_ptr to skb. Note, we do however continue to disable/enable preemption around actual BPF program running to account for map updates. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 2 +- include/net/tcp.h | 5 ++++ kernel/bpf/sockmap.c | 19 +++++++------- net/core/filter.c | 29 +++++++++++----------- samples/sockmap/sockmap_kern.c | 2 +- tools/include/uapi/linux/bpf.h | 3 ++- tools/testing/selftests/bpf/bpf_helpers.h | 2 +- tools/testing/selftests/bpf/sockmap_verdict_prog.c | 4 +-- 8 files changed, 36 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d29e58fde364..818a0b26249e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -728,7 +728,7 @@ void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_redirect(u32 ifindex); -struct sock *do_sk_redirect_map(void); +struct sock *do_sk_redirect_map(struct sk_buff *skb); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/net/tcp.h b/include/net/tcp.h index 89974c5286d8..b1ef98ebce53 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -840,6 +840,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ + struct { + __u32 key; + __u32 flags; + struct bpf_map *map; + } bpf; }; }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index c68899d5b246..beaabb21c3a3 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -39,6 +39,7 @@ #include #include #include +#include struct bpf_stab { struct bpf_map map; @@ -101,9 +102,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) return SK_DROP; skb_orphan(skb); + /* We need to ensure that BPF metadata for maps is also cleared + * when we orphan the skb so that we don't have the possibility + * to reference a stale map. + */ + TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; bpf_compute_data_end(skb); + preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); + preempt_enable(); skb->sk = NULL; return rc; @@ -114,17 +122,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) struct sock *sk; int rc; - /* Because we use per cpu values to feed input from sock redirect - * in BPF program to do_sk_redirect_map() call we need to ensure we - * are not preempted. RCU read lock is not sufficient in this case - * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. - */ - preempt_disable(); rc = smap_verdict_func(psock, skb); switch (rc) { case SK_REDIRECT: - sk = do_sk_redirect_map(); - preempt_enable(); + sk = do_sk_redirect_map(skb); if (likely(sk)) { struct smap_psock *peer = smap_psock_sk(sk); @@ -141,8 +142,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) /* Fall through and free skb otherwise */ case SK_DROP: default: - if (rc != SK_REDIRECT) - preempt_enable(); kfree_skb(skb); } } diff --git a/net/core/filter.c b/net/core/filter.c index 74b8c91fb5f4..ca1ba0bbfbc2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1839,31 +1839,31 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); if (unlikely(flags)) return SK_ABORTED; - ri->ifindex = key; - ri->flags = flags; - ri->map = map; + tcb->bpf.key = key; + tcb->bpf.flags = flags; + tcb->bpf.map = map; return SK_REDIRECT; } -struct sock *do_sk_redirect_map(void) +struct sock *do_sk_redirect_map(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk = NULL; - if (ri->map) { - sk = __sock_map_lookup_elem(ri->map, ri->ifindex); + if (tcb->bpf.map) { + sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - ri->ifindex = 0; - ri->map = NULL; - /* we do not clear flags for future lookup */ + tcb->bpf.key = 0; + tcb->bpf.map = NULL; } return sk; @@ -1873,9 +1873,10 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index f9b38ef82dc2..52b0053274f4 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -62,7 +62,7 @@ int bpf_prog2(struct __sk_buff *skb) ret = 1; bpf_printk("sockmap: %d -> %d @ %d\n", lport, bpf_ntohl(rport), ret); - return bpf_sk_redirect_map(&sock_map, ret, 0); + return bpf_sk_redirect_map(skb, &sock_map, ret, 0); } SEC("sockops") diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 43ab5c402f98..be9a631a69f7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -569,9 +569,10 @@ union bpf_attr { * @flags: reserved for future use * Return: 0 on success or negative error code * - * int bpf_sk_redirect_map(map, key, flags) + * int bpf_sk_redirect_map(skb, map, key, flags) * Redirect skb to a sock in map using key as a lookup key for the * sock in map. + * @skb: pointer to skb * @map: pointer to sockmap * @key: key to lookup sock in map * @flags: reserved for future use diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 36fb9161b34a..b2e02bdcd098 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -65,7 +65,7 @@ static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_setsockopt; -static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = +static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, unsigned long long flags) = diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/sockmap_verdict_prog.c index 9b99bd10807d..2cd2d552938b 100644 --- a/tools/testing/selftests/bpf/sockmap_verdict_prog.c +++ b/tools/testing/selftests/bpf/sockmap_verdict_prog.c @@ -61,8 +61,8 @@ int bpf_prog2(struct __sk_buff *skb) bpf_printk("verdict: data[0] = redir(%u:%u)\n", map, sk); if (!map) - return bpf_sk_redirect_map(&sock_map_rx, sk, 0); - return bpf_sk_redirect_map(&sock_map_tx, sk, 0); + return bpf_sk_redirect_map(skb, &sock_map_rx, sk, 0); + return bpf_sk_redirect_map(skb, &sock_map_tx, sk, 0); } char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c92e8c02fe664155ac4234516e32544bec0f113d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Oct 2017 09:04:13 -0700 Subject: tcp/dccp: fix ireq->opt races syzkaller found another bug in DCCP/TCP stacks [1] For the reasons explained in commit ce1050089c96 ("tcp/dccp: fix ireq->pktopts race"), we need to make sure we do not access ireq->opt unless we own the request sock. Note the opt field is renamed to ireq_opt to ease grep games. [1] BUG: KASAN: use-after-free in ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474 Read of size 1 at addr ffff8801c951039c by task syz-executor5/3295 CPU: 1 PID: 3295 Comm: syz-executor5 Not tainted 4.14.0-rc4+ #80 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:427 ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474 tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1135 tcp_send_ack.part.37+0x3bb/0x650 net/ipv4/tcp_output.c:3587 tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3557 __tcp_ack_snd_check+0x2c6/0x4b0 net/ipv4/tcp_input.c:5072 tcp_ack_snd_check net/ipv4/tcp_input.c:5085 [inline] tcp_rcv_state_process+0x2eff/0x4850 net/ipv4/tcp_input.c:6071 tcp_child_process+0x342/0x990 net/ipv4/tcp_minisocks.c:816 tcp_v4_rcv+0x1827/0x2f80 net/ipv4/tcp_ipv4.c:1682 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:249 [inline] ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:464 [inline] ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:249 [inline] ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 netif_receive_skb+0xae/0x390 net/core/dev.c:4611 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 call_write_iter include/linux/fs.h:1770 [inline] new_sync_write fs/read_write.c:468 [inline] __vfs_write+0x68a/0x970 fs/read_write.c:481 vfs_write+0x18f/0x510 fs/read_write.c:543 SYSC_write fs/read_write.c:588 [inline] SyS_write+0xef/0x220 fs/read_write.c:580 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x40c341 RSP: 002b:00007f469523ec10 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 000000000040c341 RDX: 0000000000000037 RSI: 0000000020004000 RDI: 0000000000000015 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000f4240 R11: 0000000000000293 R12: 00000000004b7fd1 R13: 00000000ffffffff R14: 0000000020000000 R15: 0000000000025000 Allocated by task 3295: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 __do_kmalloc mm/slab.c:3725 [inline] __kmalloc+0x162/0x760 mm/slab.c:3734 kmalloc include/linux/slab.h:498 [inline] tcp_v4_save_options include/net/tcp.h:1962 [inline] tcp_v4_init_req+0x2d3/0x3e0 net/ipv4/tcp_ipv4.c:1271 tcp_conn_request+0xf6d/0x3410 net/ipv4/tcp_input.c:6283 tcp_v4_conn_request+0x157/0x210 net/ipv4/tcp_ipv4.c:1313 tcp_rcv_state_process+0x8ea/0x4850 net/ipv4/tcp_input.c:5857 tcp_v4_do_rcv+0x55c/0x7d0 net/ipv4/tcp_ipv4.c:1482 tcp_v4_rcv+0x2d10/0x2f80 net/ipv4/tcp_ipv4.c:1711 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:249 [inline] ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:464 [inline] ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:249 [inline] ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 netif_receive_skb+0xae/0x390 net/core/dev.c:4611 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 call_write_iter include/linux/fs.h:1770 [inline] new_sync_write fs/read_write.c:468 [inline] __vfs_write+0x68a/0x970 fs/read_write.c:481 vfs_write+0x18f/0x510 fs/read_write.c:543 SYSC_write fs/read_write.c:588 [inline] SyS_write+0xef/0x220 fs/read_write.c:580 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 3306: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kfree+0xca/0x250 mm/slab.c:3820 inet_sock_destruct+0x59d/0x950 net/ipv4/af_inet.c:157 __sk_destruct+0xfd/0x910 net/core/sock.c:1560 sk_destruct+0x47/0x80 net/core/sock.c:1595 __sk_free+0x57/0x230 net/core/sock.c:1603 sk_free+0x2a/0x40 net/core/sock.c:1614 sock_put include/net/sock.h:1652 [inline] inet_csk_complete_hashdance+0xd5/0xf0 net/ipv4/inet_connection_sock.c:959 tcp_check_req+0xf4d/0x1620 net/ipv4/tcp_minisocks.c:765 tcp_v4_rcv+0x17f6/0x2f80 net/ipv4/tcp_ipv4.c:1675 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:249 [inline] ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:464 [inline] ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:249 [inline] ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587 netif_receive_skb+0xae/0x390 net/core/dev.c:4611 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792 call_write_iter include/linux/fs.h:1770 [inline] new_sync_write fs/read_write.c:468 [inline] __vfs_write+0x68a/0x970 fs/read_write.c:481 vfs_write+0x18f/0x510 fs/read_write.c:543 SYSC_write fs/read_write.c:588 [inline] SyS_write+0xef/0x220 fs/read_write.c:580 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 +- net/dccp/ipv4.c | 13 ++++++++----- net/ipv4/cipso_ipv4.c | 24 +++++++----------------- net/ipv4/inet_connection_sock.c | 8 +++----- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 22 +++++++++++++--------- 7 files changed, 34 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index aa95053dfc78..425752f768d2 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -96,7 +96,7 @@ struct inet_request_sock { kmemcheck_bitfield_end(flags); u32 ir_mark; union { - struct ip_options_rcu *opt; + struct ip_options_rcu __rcu *ireq_opt; #if IS_ENABLED(CONFIG_IPV6) struct { struct ipv6_txoptions *ipv6_opt; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 001c08696334..0490916864f9 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -414,8 +414,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; - newinet->inet_opt = ireq->opt; - ireq->opt = NULL; + RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->inet_id = jiffies; @@ -430,7 +429,10 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - + if (*own_req) + ireq->ireq_opt = NULL; + else + newinet->inet_opt = NULL; return newsk; exit_overflow: @@ -441,6 +443,7 @@ exit: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: + newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); dccp_done(newsk); goto exit; @@ -492,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req ireq->ir_rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq->opt); + rcu_dereference(ireq->ireq_opt)); err = net_xmit_eval(err); } @@ -548,7 +551,7 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(inet_rsk(req)->opt); + kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } void dccp_syn_ack_timeout(const struct request_sock *req) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2ae8f54cb321..82178cc69c96 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1951,7 +1951,7 @@ int cipso_v4_req_setattr(struct request_sock *req, buf = NULL; req_inet = inet_rsk(req); - opt = xchg(&req_inet->opt, opt); + opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); if (opt) kfree_rcu(opt, rcu); @@ -1973,11 +1973,13 @@ req_setattr_failure: * values on failure. * */ -static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) +static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { + struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; - struct ip_options_rcu *opt = *opt_ptr; + if (!opt || opt->opt.cipso == 0) + return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; @@ -2039,14 +2041,10 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) */ void cipso_v4_sock_delattr(struct sock *sk) { - int hdr_delta; - struct ip_options_rcu *opt; struct inet_sock *sk_inet; + int hdr_delta; sk_inet = inet_sk(sk); - opt = rcu_dereference_protected(sk_inet->inet_opt, 1); - if (!opt || opt->opt.cipso == 0) - return; hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (sk_inet->is_icsk && hdr_delta > 0) { @@ -2066,15 +2064,7 @@ void cipso_v4_sock_delattr(struct sock *sk) */ void cipso_v4_req_delattr(struct request_sock *req) { - struct ip_options_rcu *opt; - struct inet_request_sock *req_inet; - - req_inet = inet_rsk(req); - opt = req_inet->opt; - if (!opt || opt->opt.cipso == 0) - return; - - cipso_v4_delopt(&req_inet->opt); + cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 67aec7a10686..5ec9136a7c36 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -540,9 +540,10 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct rtable *rt; + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -576,10 +577,9 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct flowi4 *fl4; struct rtable *rt; + opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; - rcu_read_lock(); - opt = rcu_dereference(newinet->inet_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -592,13 +592,11 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; - rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: - rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b1bb1b3a1082..77cf32a80952 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - ireq->opt = tcp_v4_save_options(sock_net(sk), skb); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5d7656beeee..7eec3383702b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6196,7 +6196,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct inet_request_sock *ireq = inet_rsk(req); kmemcheck_annotate_bitfield(ireq, flags); - ireq->opt = NULL; + ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; #endif diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 85164d4d3e53..4c43365c374c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -877,7 +877,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq->opt); + rcu_dereference(ireq->ireq_opt)); err = net_xmit_eval(err); } @@ -889,7 +889,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - kfree(inet_rsk(req)->opt); + kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); } #ifdef CONFIG_TCP_MD5SIG @@ -1265,10 +1265,11 @@ static void tcp_v4_init_req(struct request_sock *req, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = sock_net(sk_listener); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, @@ -1355,10 +1356,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; - inet_opt = ireq->opt; - rcu_assign_pointer(newinet->inet_opt, inet_opt); - ireq->opt = NULL; + newinet->inet_saddr = ireq->ir_loc_addr; + inet_opt = rcu_dereference(ireq->ireq_opt); + RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; @@ -1403,9 +1403,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - if (*own_req) + if (likely(*own_req)) { tcp_move_syn(newtp, req); - + ireq->ireq_opt = NULL; + } else { + newinet->inet_opt = NULL; + } return newsk; exit_overflow: @@ -1416,6 +1419,7 @@ exit: tcp_listendrop(sk); return NULL; put_and_exit: + newinet->inet_opt = NULL; inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto exit; -- cgit v1.2.3 From 88796e7e5c457cae72833196cb98e6895dd107e2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 20 Oct 2017 10:13:46 -0700 Subject: sched/swait: Document it clearly that the swait facilities are special and shouldn't be used We currently welcome using swait over wait whenever possible because it is a slimmer data structure. However, Linus has made it very clear that he does not want this used, unless under very specific RT scenarios (such as current users). Update the comments before kernel hipsters start thinking swait is the cool thing to do. Signed-off-by: Davidlohr Bueso Acked-by: Luis R. Rodriguez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: wagi@monom.org Link: http://lkml.kernel.org/r/20171020171346.24445-1-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/swait.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/swait.h b/include/linux/swait.h index 73e97a08d3d0..cf30f5022472 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -9,13 +9,16 @@ /* * Simple wait queues * - * While these are very similar to the other/complex wait queues (wait.h) the - * most important difference is that the simple waitqueue allows for - * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold - * times. + * While these are very similar to regular wait queues (wait.h) the most + * important difference is that the simple waitqueue allows for deterministic + * behaviour -- IOW it has strictly bounded IRQ and lock hold times. * - * In order to make this so, we had to drop a fair number of features of the - * other waitqueue code; notably: + * Mainly, this is accomplished by two things. Firstly not allowing swake_up_all + * from IRQ disabled, and dropping the lock upon every wakeup, giving a higher + * priority task a chance to run. + * + * Secondly, we had to drop a fair number of features of the other waitqueue + * code; notably: * * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right @@ -24,12 +27,14 @@ * - the exclusive mode; because this requires preserving the list order * and this is hard. * - * - custom wake functions; because you cannot give any guarantees about - * random code. - * - * As a side effect of this; the data structures are slimmer. + * - custom wake callback functions; because you cannot give any guarantees + * about random code. This also allows swait to be used in RT, such that + * raw spinlock can be used for the swait queue head. * - * One would recommend using this wait queue where possible. + * As a side effect of these; the data structures are slimmer albeit more ad-hoc. + * For all the above, note that simple wait queues should _only_ be used under + * very specific realtime constraints -- it is best to stick with the regular + * wait queues in most cases. */ struct task_struct; -- cgit v1.2.3 From 0cc2b4e5a020fc7f4d1795741c116c983e9467d7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 Oct 2017 15:20:45 +0200 Subject: PM / QoS: Fix device resume latency PM QoS The special value of 0 for device resume latency PM QoS means "no restriction", but there are two problems with that. First, device resume latency PM QoS requests with 0 as the value are always put in front of requests with positive values in the priority lists used internally by the PM QoS framework, causing 0 to be chosen as an effective constraint value. However, that 0 is then interpreted as "no restriction" effectively overriding the other requests with specific restrictions which is incorrect. Second, the users of device resume latency PM QoS have no way to specify that *any* resume latency at all should be avoided, which is an artificial limitation in general. To address these issues, modify device resume latency PM QoS to use S32_MAX as the "no constraint" value and 0 as the "no latency at all" one and rework its users (the cpuidle menu governor, the genpd QoS governor and the runtime PM framework) to follow these changes. Also add a special "n/a" value to the corresponding user space I/F to allow user space to indicate that it cannot accept any resume latencies at all for the given device. Fixes: 85dc0b8a4019 (PM / QoS: Make it possible to expose PM QoS latency constraints) Link: https://bugzilla.kernel.org/show_bug.cgi?id=197323 Reported-by: Reinette Chatre Tested-by: Reinette Chatre Signed-off-by: Rafael J. Wysocki Acked-by: Alex Shi Cc: All applicable --- Documentation/ABI/testing/sysfs-devices-power | 4 +- drivers/base/cpu.c | 3 +- drivers/base/power/domain_governor.c | 53 +++++++++++++++------------ drivers/base/power/qos.c | 2 +- drivers/base/power/runtime.c | 2 +- drivers/base/power/sysfs.c | 25 +++++++++++-- drivers/cpuidle/governors/menu.c | 4 +- include/linux/pm_qos.h | 5 ++- 8 files changed, 63 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index 676fdf5f2a99..5cbb6f038615 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -211,7 +211,9 @@ Description: device, after it has been suspended at run time, from a resume request to the moment the device will be ready to process I/O, in microseconds. If it is equal to 0, however, this means that - the PM QoS resume latency may be arbitrary. + the PM QoS resume latency may be arbitrary and the special value + "n/a" means that user space cannot accept any resume latency at + all for the given device. Not all drivers support this attribute. If it isn't supported, it is not present. diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b4d817..227bac5f1191 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -377,7 +377,8 @@ int register_cpu(struct cpu *cpu, int num) per_cpu(cpu_sys_devices, num) = &cpu->dev; register_cpu_under_node(num, cpu_to_node(num)); - dev_pm_qos_expose_latency_limit(&cpu->dev, 0); + dev_pm_qos_expose_latency_limit(&cpu->dev, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); return 0; } diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index 281f949c5ffe..51751cc8c9e6 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -14,23 +14,20 @@ static int dev_update_qos_constraint(struct device *dev, void *data) { s64 *constraint_ns_p = data; - s32 constraint_ns = -1; + s64 constraint_ns = -1; if (dev->power.subsys_data && dev->power.subsys_data->domain_data) constraint_ns = dev_gpd_data(dev)->td.effective_constraint_ns; - if (constraint_ns < 0) { + if (constraint_ns < 0) constraint_ns = dev_pm_qos_read_value(dev); - constraint_ns *= NSEC_PER_USEC; - } - if (constraint_ns == 0) + + if (constraint_ns == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) return 0; - /* - * constraint_ns cannot be negative here, because the device has been - * suspended. - */ - if (constraint_ns < *constraint_ns_p || *constraint_ns_p == 0) + constraint_ns *= NSEC_PER_USEC; + + if (constraint_ns < *constraint_ns_p || *constraint_ns_p < 0) *constraint_ns_p = constraint_ns; return 0; @@ -63,10 +60,14 @@ static bool default_suspend_ok(struct device *dev) spin_unlock_irqrestore(&dev->power.lock, flags); - if (constraint_ns < 0) + if (constraint_ns == 0) return false; - constraint_ns *= NSEC_PER_USEC; + if (constraint_ns == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) + constraint_ns = -1; + else + constraint_ns *= NSEC_PER_USEC; + /* * We can walk the children without any additional locking, because * they all have been suspended at this point and their @@ -76,14 +77,19 @@ static bool default_suspend_ok(struct device *dev) device_for_each_child(dev, &constraint_ns, dev_update_qos_constraint); - if (constraint_ns > 0) { - constraint_ns -= td->suspend_latency_ns + - td->resume_latency_ns; - if (constraint_ns == 0) - return false; + if (constraint_ns < 0) { + /* The children have no constraints. */ + td->effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; + td->cached_suspend_ok = true; + } else { + constraint_ns -= td->suspend_latency_ns + td->resume_latency_ns; + if (constraint_ns > 0) { + td->effective_constraint_ns = constraint_ns; + td->cached_suspend_ok = true; + } else { + td->effective_constraint_ns = 0; + } } - td->effective_constraint_ns = constraint_ns; - td->cached_suspend_ok = constraint_ns >= 0; /* * The children have been suspended already, so we don't need to take @@ -145,13 +151,14 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, td = &to_gpd_data(pdd)->td; constraint_ns = td->effective_constraint_ns; /* default_suspend_ok() need not be called before us. */ - if (constraint_ns < 0) { + if (constraint_ns < 0) constraint_ns = dev_pm_qos_read_value(pdd->dev); - constraint_ns *= NSEC_PER_USEC; - } - if (constraint_ns == 0) + + if (constraint_ns == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) continue; + constraint_ns *= NSEC_PER_USEC; + /* * constraint_ns cannot be negative here, because the device has * been suspended. diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 277d43a83f53..7d29286d9313 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -189,7 +189,7 @@ static int dev_pm_qos_constraints_allocate(struct device *dev) plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; - c->no_constraint_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; + c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 7bcf80fa9ada..13e015905543 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -253,7 +253,7 @@ static int rpm_check_suspend_allowed(struct device *dev) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; - else if (__dev_pm_qos_read_value(dev) < 0) + else if (__dev_pm_qos_read_value(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 156ab57bca77..632077f05c57 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -218,7 +218,14 @@ static ssize_t pm_qos_resume_latency_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_pm_qos_requested_resume_latency(dev)); + s32 value = dev_pm_qos_requested_resume_latency(dev); + + if (value == 0) + return sprintf(buf, "n/a\n"); + else if (value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) + value = 0; + + return sprintf(buf, "%d\n", value); } static ssize_t pm_qos_resume_latency_store(struct device *dev, @@ -228,11 +235,21 @@ static ssize_t pm_qos_resume_latency_store(struct device *dev, s32 value; int ret; - if (kstrtos32(buf, 0, &value)) - return -EINVAL; + if (!kstrtos32(buf, 0, &value)) { + /* + * Prevent users from writing negative or "no constraint" values + * directly. + */ + if (value < 0 || value == PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) + return -EINVAL; - if (value < 0) + if (value == 0) + value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; + } else if (!strcmp(buf, "n/a") || !strcmp(buf, "n/a\n")) { + value = 0; + } else { return -EINVAL; + } ret = dev_pm_qos_update_request(dev->power.qos->resume_latency_req, value); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 48eaf2879228..aa390404e85f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -298,8 +298,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->needs_update = 0; } - /* resume_latency is 0 means no restriction */ - if (resume_latency && resume_latency < latency_req) + if (resume_latency < latency_req && + resume_latency != PM_QOS_RESUME_LATENCY_NO_CONSTRAINT) latency_req = resume_latency; /* Special case when user has set very strict latency requirement */ diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 032b55909145..6737a8c9e8c6 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -27,16 +27,17 @@ enum pm_qos_flags_status { PM_QOS_FLAGS_ALL, }; -#define PM_QOS_DEFAULT_VALUE -1 +#define PM_QOS_DEFAULT_VALUE (-1) +#define PM_QOS_LATENCY_ANY S32_MAX #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 #define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 +#define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) -#define PM_QOS_LATENCY_ANY ((s32)(~(__u32)0 >> 1)) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) #define PM_QOS_FLAG_REMOTE_WAKEUP (1 << 1) -- cgit v1.2.3 From 829385f08ae99740276cbd46c9db29764c519211 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 20 Oct 2017 16:40:43 -0700 Subject: strparser: Use delayed work instead of timer for msg timeout Sock lock may be taken in the message timer function which is a problem since timers run in BH. Instead of timers use delayed_work. Reported-by: Eric Dumazet Fixes: bbb03029a899 ("strparser: Generalize strparser") Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/strparser.h | 3 +-- net/strparser/strparser.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index 7dc131d62ad5..d96b59f45eba 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -74,10 +74,9 @@ struct strparser { u32 unrecov_intr : 1; struct sk_buff **skb_nextp; - struct timer_list msg_timer; struct sk_buff *skb_head; unsigned int need_bytes; - struct delayed_work delayed_work; + struct delayed_work msg_timer_work; struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index d4ea46a5f233..c5fda15ba319 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -49,7 +49,7 @@ static void strp_abort_strp(struct strparser *strp, int err) { /* Unrecoverable error in receive */ - del_timer(&strp->msg_timer); + cancel_delayed_work(&strp->msg_timer_work); if (strp->stopped) return; @@ -68,7 +68,7 @@ static void strp_abort_strp(struct strparser *strp, int err) static void strp_start_timer(struct strparser *strp, long timeo) { if (timeo) - mod_timer(&strp->msg_timer, timeo); + mod_delayed_work(strp_wq, &strp->msg_timer_work, timeo); } /* Lower lock held */ @@ -319,7 +319,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, eaten += (cand_len - extra); /* Hurray, we have a new message! */ - del_timer(&strp->msg_timer); + cancel_delayed_work(&strp->msg_timer_work); strp->skb_head = NULL; STRP_STATS_INCR(strp->stats.msgs); @@ -450,9 +450,10 @@ static void strp_work(struct work_struct *w) do_strp_work(container_of(w, struct strparser, work)); } -static void strp_msg_timeout(unsigned long arg) +static void strp_msg_timeout(struct work_struct *w) { - struct strparser *strp = (struct strparser *)arg; + struct strparser *strp = container_of(w, struct strparser, + msg_timer_work.work); /* Message assembly timed out */ STRP_STATS_INCR(strp->stats.msg_timeouts); @@ -505,9 +506,7 @@ int strp_init(struct strparser *strp, struct sock *sk, strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp; - setup_timer(&strp->msg_timer, strp_msg_timeout, - (unsigned long)strp); - + INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout); INIT_WORK(&strp->work, strp_work); return 0; @@ -532,7 +531,7 @@ void strp_done(struct strparser *strp) { WARN_ON(!strp->stopped); - del_timer_sync(&strp->msg_timer); + cancel_delayed_work_sync(&strp->msg_timer_work); cancel_work_sync(&strp->work); if (strp->skb_head) { -- cgit v1.2.3 From be0f161ef141e4df368aa3f417a1c2ab9c362e75 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 28 Sep 2017 15:33:50 -0500 Subject: net/mlx5e: DCBNL, Implement tc with ets type and zero bandwidth Previously, tc with ets type and zero bandwidth is not accepted by driver. This behavior does not follow the IEEE802.1qaz spec. If there are tcs with ets type and zero bandwidth, these tcs are assigned to the lowest priority tc_group #0. We equally distribute 100% bw of the tc_group #0 to these zero bandwidth ets tcs. Also, the non zero bandwidth ets tcs are assigned to tc_group #1. If there is no zero bandwidth ets tc, the non zero bandwidth ets tcs are assigned to tc_group #0. Fixes: cdcf11212b22 ("net/mlx5e: Validate BW weight values of ETS") Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 113 +++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/port.c | 21 ++++ include/linux/mlx5/port.h | 2 + 3 files changed, 106 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index c1d384fca4dc..51c4cc00a186 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -41,6 +41,11 @@ #define MLX5E_CEE_STATE_UP 1 #define MLX5E_CEE_STATE_DOWN 0 +enum { + MLX5E_VENDOR_TC_GROUP_NUM = 7, + MLX5E_LOWEST_PRIO_GROUP = 0, +}; + /* If dcbx mode is non-host set the dcbx mode to host. */ static int mlx5e_dcbnl_set_dcbx_mode(struct mlx5e_priv *priv, @@ -85,6 +90,9 @@ static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev, { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; + u8 tc_group[IEEE_8021QAZ_MAX_TCS]; + bool is_tc_group_6_exist = false; + bool is_zero_bw_ets_tc = false; int err = 0; int i; @@ -96,37 +104,64 @@ static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev, err = mlx5_query_port_prio_tc(mdev, i, &ets->prio_tc[i]); if (err) return err; - } - for (i = 0; i < ets->ets_cap; i++) { + err = mlx5_query_port_tc_group(mdev, i, &tc_group[i]); + if (err) + return err; + err = mlx5_query_port_tc_bw_alloc(mdev, i, &ets->tc_tx_bw[i]); if (err) return err; + + if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC && + tc_group[i] == (MLX5E_LOWEST_PRIO_GROUP + 1)) + is_zero_bw_ets_tc = true; + + if (tc_group[i] == (MLX5E_VENDOR_TC_GROUP_NUM - 1)) + is_tc_group_6_exist = true; + } + + /* Report 0% ets tc if exits*/ + if (is_zero_bw_ets_tc) { + for (i = 0; i < ets->ets_cap; i++) + if (tc_group[i] == MLX5E_LOWEST_PRIO_GROUP) + ets->tc_tx_bw[i] = 0; + } + + /* Update tc_tsa based on fw setting*/ + for (i = 0; i < ets->ets_cap; i++) { if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC) priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_ETS; + else if (tc_group[i] == MLX5E_VENDOR_TC_GROUP_NUM && + !is_tc_group_6_exist) + priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR; } - memcpy(ets->tc_tsa, priv->dcbx.tc_tsa, sizeof(ets->tc_tsa)); return err; } -enum { - MLX5E_VENDOR_TC_GROUP_NUM = 7, - MLX5E_ETS_TC_GROUP_NUM = 0, -}; - static void mlx5e_build_tc_group(struct ieee_ets *ets, u8 *tc_group, int max_tc) { bool any_tc_mapped_to_ets = false; + bool ets_zero_bw = false; int strict_group; int i; - for (i = 0; i <= max_tc; i++) - if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) + for (i = 0; i <= max_tc; i++) { + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) { any_tc_mapped_to_ets = true; + if (!ets->tc_tx_bw[i]) + ets_zero_bw = true; + } + } - strict_group = any_tc_mapped_to_ets ? 1 : 0; + /* strict group has higher priority than ets group */ + strict_group = MLX5E_LOWEST_PRIO_GROUP; + if (any_tc_mapped_to_ets) + strict_group++; + if (ets_zero_bw) + strict_group++; for (i = 0; i <= max_tc; i++) { switch (ets->tc_tsa[i]) { @@ -137,7 +172,9 @@ static void mlx5e_build_tc_group(struct ieee_ets *ets, u8 *tc_group, int max_tc) tc_group[i] = strict_group++; break; case IEEE_8021QAZ_TSA_ETS: - tc_group[i] = MLX5E_ETS_TC_GROUP_NUM; + tc_group[i] = MLX5E_LOWEST_PRIO_GROUP; + if (ets->tc_tx_bw[i] && ets_zero_bw) + tc_group[i] = MLX5E_LOWEST_PRIO_GROUP + 1; break; } } @@ -146,8 +183,22 @@ static void mlx5e_build_tc_group(struct ieee_ets *ets, u8 *tc_group, int max_tc) static void mlx5e_build_tc_tx_bw(struct ieee_ets *ets, u8 *tc_tx_bw, u8 *tc_group, int max_tc) { + int bw_for_ets_zero_bw_tc = 0; + int last_ets_zero_bw_tc = -1; + int num_ets_zero_bw = 0; int i; + for (i = 0; i <= max_tc; i++) { + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS && + !ets->tc_tx_bw[i]) { + num_ets_zero_bw++; + last_ets_zero_bw_tc = i; + } + } + + if (num_ets_zero_bw) + bw_for_ets_zero_bw_tc = MLX5E_MAX_BW_ALLOC / num_ets_zero_bw; + for (i = 0; i <= max_tc; i++) { switch (ets->tc_tsa[i]) { case IEEE_8021QAZ_TSA_VENDOR: @@ -157,12 +208,26 @@ static void mlx5e_build_tc_tx_bw(struct ieee_ets *ets, u8 *tc_tx_bw, tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; break; case IEEE_8021QAZ_TSA_ETS: - tc_tx_bw[i] = ets->tc_tx_bw[i]; + tc_tx_bw[i] = ets->tc_tx_bw[i] ? + ets->tc_tx_bw[i] : + bw_for_ets_zero_bw_tc; break; } } + + /* Make sure the total bw for ets zero bw group is 100% */ + if (last_ets_zero_bw_tc != -1) + tc_tx_bw[last_ets_zero_bw_tc] += + MLX5E_MAX_BW_ALLOC % num_ets_zero_bw; } +/* If there are ETS BW 0, + * Set ETS group # to 1 for all ETS non zero BW tcs. Their sum must be 100%. + * Set group #0 to all the ETS BW 0 tcs and + * equally splits the 100% BW between them + * Report both group #0 and #1 as ETS type. + * All the tcs in group #0 will be reported with 0% BW. + */ int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets) { struct mlx5_core_dev *mdev = priv->mdev; @@ -188,7 +253,6 @@ int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets) return err; memcpy(priv->dcbx.tc_tsa, ets->tc_tsa, sizeof(ets->tc_tsa)); - return err; } @@ -209,17 +273,9 @@ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev, } /* Validate Bandwidth Sum */ - for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { - if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) { - if (!ets->tc_tx_bw[i]) { - netdev_err(netdev, - "Failed to validate ETS: BW 0 is illegal\n"); - return -EINVAL; - } - + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) bw_sum += ets->tc_tx_bw[i]; - } - } if (bw_sum != 0 && bw_sum != 100) { netdev_err(netdev, @@ -533,8 +589,7 @@ static void mlx5e_dcbnl_getpgtccfgtx(struct net_device *netdev, static void mlx5e_dcbnl_getpgbwgcfgtx(struct net_device *netdev, int pgid, u8 *bw_pct) { - struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5_core_dev *mdev = priv->mdev; + struct ieee_ets ets; if (pgid >= CEE_DCBX_MAX_PGS) { netdev_err(netdev, @@ -542,8 +597,8 @@ static void mlx5e_dcbnl_getpgbwgcfgtx(struct net_device *netdev, return; } - if (mlx5_query_port_tc_bw_alloc(mdev, pgid, bw_pct)) - *bw_pct = 0; + mlx5e_dcbnl_ieee_getets(netdev, &ets); + *bw_pct = ets.tc_tx_bw[pgid]; } static void mlx5e_dcbnl_setpfccfg(struct net_device *netdev, @@ -739,8 +794,6 @@ static void mlx5e_ets_init(struct mlx5e_priv *priv) ets.prio_tc[i] = i; } - memcpy(priv->dcbx.tc_tsa, ets.tc_tsa, sizeof(ets.tc_tsa)); - /* tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) */ ets.prio_tc[0] = 1; ets.prio_tc[1] = 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 1975d4388d4f..e07061f565d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -677,6 +677,27 @@ int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group) } EXPORT_SYMBOL_GPL(mlx5_set_port_tc_group); +int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, + u8 tc, u8 *tc_group) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, + tc_configuration[tc]); + + *tc_group = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + group); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_tc_group); + int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw) { u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index c57d4b7de3a8..c59af8ab753a 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -157,6 +157,8 @@ int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev, u8 prio, u8 *tc); int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group); +int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, + u8 tc, u8 *tc_group); int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw); int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 tc, u8 *bw_pct); -- cgit v1.2.3 From 06f877d613be3621604c2520ec0351d9fbdca15f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Oct 2017 08:20:31 -0700 Subject: tcp/dccp: fix other lockdep splats accessing ireq_opt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In my first attempt to fix the lockdep splat, I forgot we could enter inet_csk_route_req() with a freshly allocated request socket, for which refcount has not yet been elevated, due to complex SLAB_TYPESAFE_BY_RCU rules. We either are in rcu_read_lock() section _or_ we own a refcount on the request. Correct RCU verb to use here is rcu_dereference_check(), although it is not possible to prove we actually own a reference on a shared refcount :/ In v2, I added ireq_opt_deref() helper and use in three places, to fix other possible splats. [ 49.844590] lockdep_rcu_suspicious+0xea/0xf3 [ 49.846487] inet_csk_route_req+0x53/0x14d [ 49.848334] tcp_v4_route_req+0xe/0x10 [ 49.850174] tcp_conn_request+0x31c/0x6a0 [ 49.851992] ? __lock_acquire+0x614/0x822 [ 49.854015] tcp_v4_conn_request+0x5a/0x79 [ 49.855957] ? tcp_v4_conn_request+0x5a/0x79 [ 49.858052] tcp_rcv_state_process+0x98/0xdcc [ 49.859990] ? sk_filter_trim_cap+0x2f6/0x307 [ 49.862085] tcp_v4_do_rcv+0xfc/0x145 [ 49.864055] ? tcp_v4_do_rcv+0xfc/0x145 [ 49.866173] tcp_v4_rcv+0x5ab/0xaf9 [ 49.868029] ip_local_deliver_finish+0x1af/0x2e7 [ 49.870064] ip_local_deliver+0x1b2/0x1c5 [ 49.871775] ? inet_del_offload+0x45/0x45 [ 49.873916] ip_rcv_finish+0x3f7/0x471 [ 49.875476] ip_rcv+0x3f1/0x42f [ 49.876991] ? ip_local_deliver_finish+0x2e7/0x2e7 [ 49.878791] __netif_receive_skb_core+0x6d3/0x950 [ 49.880701] ? process_backlog+0x7e/0x216 [ 49.882589] __netif_receive_skb+0x1d/0x5e [ 49.884122] process_backlog+0x10c/0x216 [ 49.885812] net_rx_action+0x147/0x3df Fixes: a6ca7abe53633 ("tcp/dccp: fix lockdep splat in inet_csk_route_req()") Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races") Signed-off-by: Eric Dumazet Reported-by: kernel test robot Reported-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/net/inet_sock.h | 6 ++++++ net/dccp/ipv4.c | 2 +- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp_ipv4.c | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f768d2..db8162dd8c0b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -132,6 +132,12 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, return sk->sk_bound_dev_if; } +static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) +{ + return rcu_dereference_check(ireq->ireq_opt, + refcount_read(&ireq->req.rsk_refcnt) > 0); +} + struct inet_cork { unsigned int flags; __be32 addr; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0490916864f9..e65fcb45c3f6 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -495,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req ireq->ir_rmt_addr); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + ireq_opt_deref(ireq)); err = net_xmit_eval(err); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 18cd2eae758f..b47a59cb3573 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -543,8 +543,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = rcu_dereference_protected(ireq->ireq_opt, - refcount_read(&req->rsk_refcnt) > 0); + opt = ireq_opt_deref(ireq); + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4c43365c374c..5b027c69cbc5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -877,7 +877,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + ireq_opt_deref(ireq)); err = net_xmit_eval(err); } -- cgit v1.2.3 From dea6e19f4ef746aa18b4c33d1a7fed54356796ed Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Fri, 27 Oct 2017 00:00:16 -0700 Subject: tap: reference to KVA of an unloaded module causes kernel panic The commit 9a393b5d5988 ("tap: tap as an independent module") created a separate tap module that implements tap functionality and exports interfaces that will be used by macvtap and ipvtap modules to create create respective tap devices. However, that patch introduced a regression wherein the modules macvtap and ipvtap can be removed (through modprobe -r) while there are applications using the respective /dev/tapX devices. These applications cause kernel to hold reference to /dev/tapX through 'struct cdev macvtap_cdev' and 'struct cdev ipvtap_dev' defined in macvtap and ipvtap modules respectively. So, when the application is later closed the kernel panics because we are referencing KVA that is present in the unloaded modules. ----------8<------- Example ----------8<---------- $ sudo ip li add name mv0 link enp7s0 type macvtap $ sudo ip li show mv0 |grep mv0| awk -e '{print $1 $2}' 14:mv0@enp7s0: $ cat /dev/tap14 & $ lsmod |egrep -i 'tap|vlan' macvtap 16384 0 macvlan 24576 1 macvtap tap 24576 3 macvtap $ sudo modprobe -r macvtap $ fg cat /dev/tap14 ^C <...system panics...> BUG: unable to handle kernel paging request at ffffffffa038c500 IP: cdev_put+0xf/0x30 ----------8<-----------------8<---------- The fix is to set cdev.owner to the module that creates the tap device (either macvtap or ipvtap). With this set, the operations (in fs/char_dev.c) on char device holds and releases the module through cdev_get() and cdev_put() and will not allow the module to unload prematurely. Fixes: 9a393b5d5988ea4e (tap: tap as an independent module) Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvtap.c | 4 ++-- drivers/net/macvtap.c | 4 ++-- drivers/net/tap.c | 5 +++-- include/linux/if_tap.h | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/ipvlan/ipvtap.c b/drivers/net/ipvlan/ipvtap.c index 5dea2063dbc8..0bcc07f346c3 100644 --- a/drivers/net/ipvlan/ipvtap.c +++ b/drivers/net/ipvlan/ipvtap.c @@ -197,8 +197,8 @@ static int ipvtap_init(void) { int err; - err = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap"); - + err = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap", + THIS_MODULE); if (err) goto out1; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index c2d0ea2fb019..cba5cb3b849a 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -204,8 +204,8 @@ static int macvtap_init(void) { int err; - err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap"); - + err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", + THIS_MODULE); if (err) goto out1; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 98ee6cc2875d..1b10fcc6a58d 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1249,8 +1249,8 @@ static int tap_list_add(dev_t major, const char *device_name) return 0; } -int tap_create_cdev(struct cdev *tap_cdev, - dev_t *tap_major, const char *device_name) +int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, + const char *device_name, struct module *module) { int err; @@ -1259,6 +1259,7 @@ int tap_create_cdev(struct cdev *tap_cdev, goto out1; cdev_init(tap_cdev, &tap_fops); + tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157da0dc..9ae41cdd0d4c 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -73,8 +73,8 @@ void tap_del_queues(struct tap_dev *tap); int tap_get_minor(dev_t major, struct tap_dev *tap); void tap_free_minor(dev_t major, struct tap_dev *tap); int tap_queue_resize(struct tap_dev *tap); -int tap_create_cdev(struct cdev *tap_cdev, - dev_t *tap_major, const char *device_name); +int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, + const char *device_name, struct module *module); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev); #endif /*_LINUX_IF_TAP_H_*/ -- cgit v1.2.3 From 8108a77515126f6db4374e8593956e20430307c0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:34 -0700 Subject: bpf: bpf_compute_data uses incorrect cb structure SK_SKB program types use bpf_compute_data to store the end of the packet data. However, bpf_compute_data assumes the cb is stored in the qdisc layer format. But, for SK_SKB this is the wrong layer of the stack for this type. It happens to work (sort of!) because in most cases nothing happens to be overwritten today. This is very fragile and error prone. Fortunately, we have another hole in tcp_skb_cb we can use so lets put the data_end value there. Note, SK_SKB program types do not use data_meta, they are failed by sk_skb_is_valid_access(). Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + kernel/bpf/sockmap.c | 12 ++++++++++-- net/core/filter.c | 27 ++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b1ef98ebce53..33599d17522d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -844,6 +844,7 @@ struct tcp_skb_cb { __u32 key; __u32 flags; struct bpf_map *map; + void *data_end; } bpf; }; }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 2b6eb35ae5d3..6778fb773934 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -93,6 +93,14 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +/* compute the linear packet data range [data, data_end) for skb when + * sk_skb type programs are in use. + */ +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -108,7 +116,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -368,7 +376,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/core/filter.c b/net/core/filter.c index aa0265997f93..68eaa2f81a8e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4243,6 +4243,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct __sk_buff, data_end): + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, bpf.data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + default: + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } + + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4301,7 +4326,7 @@ const struct bpf_verifier_ops sock_ops_prog_ops = { const struct bpf_verifier_ops sk_skb_prog_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, - .convert_ctx_access = bpf_convert_ctx_access, + .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; -- cgit v1.2.3 From bfa640757e9378c2f26867e723f1287e94f5a7ad Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:53 -0700 Subject: bpf: rename sk_actions to align with bpf infrastructure Recent additions to support multiple programs in cgroups impose a strict requirement, "all yes is yes, any no is no". To enforce this the infrastructure requires the 'no' return code, SK_DROP in this case, to be 0. To apply these rules to SK_SKB program types the sk_actions return codes need to be adjusted. This fix adds SK_PASS and makes 'SK_DROP = 0'. Finally, remove SK_ABORTED to remove any chance that the API may allow aborted program flows to be passed up the stack. This would be incorrect behavior and allow programs to break existing policies. Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 6 +++--- kernel/bpf/sockmap.c | 3 ++- net/core/filter.c | 5 +++-- tools/include/uapi/linux/bpf.h | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f90860d1f897..0d7948ce2128 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -575,7 +575,7 @@ union bpf_attr { * @map: pointer to sockmap * @key: key to lookup sock in map * @flags: reserved for future use - * Return: SK_REDIRECT + * Return: SK_PASS * * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops @@ -786,8 +786,8 @@ struct xdp_md { }; enum sk_action { - SK_ABORTED = 0, - SK_DROP, + SK_DROP = 0, + SK_PASS, SK_REDIRECT, }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6778fb773934..66f00a2b27f4 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -122,7 +122,8 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) preempt_enable(); skb->sk = NULL; - return rc; + return rc == SK_PASS ? + (TCP_SKB_CB(skb)->bpf.map ? SK_REDIRECT : SK_PASS) : SK_DROP; } static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index 68eaa2f81a8e..6ae94f825f72 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1844,14 +1844,15 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + /* If user passes invalid input drop the packet. */ if (unlikely(flags)) - return SK_ABORTED; + return SK_DROP; tcb->bpf.key = key; tcb->bpf.flags = flags; tcb->bpf.map = map; - return SK_REDIRECT; + return SK_PASS; } struct sock *do_sk_redirect_map(struct sk_buff *skb) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 24b35a1fd4d6..c174971afbe6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -787,8 +787,8 @@ struct xdp_md { }; enum sk_action { - SK_ABORTED = 0, - SK_DROP, + SK_DROP = 0, + SK_PASS, SK_REDIRECT, }; -- cgit v1.2.3 From 1da4fc97cbf89514e417a3df46eaec864a9b8a48 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:54 +0800 Subject: sctp: fix some type cast warnings introduced by stream reconf These warnings were found by running 'make C=2 M=net/sctp/'. They are introduced by not aware of Endian when coding stream reconf patches. Since commit c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable") enabled stream reconf feature for users, the Fixes tag below would use it. Fixes: c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable") Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 32 ++++++++++++++++---------------- include/net/sctp/sm.h | 2 +- include/net/sctp/ulpevent.h | 2 +- net/sctp/sm_make_chunk.c | 5 +++-- net/sctp/stream.c | 26 +++++++++++++++++--------- net/sctp/ulpevent.c | 2 +- 6 files changed, 39 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 82b171e1aa0b..09d7412e9cb0 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -716,28 +716,28 @@ struct sctp_reconf_chunk { struct sctp_strreset_outreq { struct sctp_paramhdr param_hdr; - __u32 request_seq; - __u32 response_seq; - __u32 send_reset_at_tsn; - __u16 list_of_streams[0]; + __be32 request_seq; + __be32 response_seq; + __be32 send_reset_at_tsn; + __be16 list_of_streams[0]; }; struct sctp_strreset_inreq { struct sctp_paramhdr param_hdr; - __u32 request_seq; - __u16 list_of_streams[0]; + __be32 request_seq; + __be16 list_of_streams[0]; }; struct sctp_strreset_tsnreq { struct sctp_paramhdr param_hdr; - __u32 request_seq; + __be32 request_seq; }; struct sctp_strreset_addstrm { struct sctp_paramhdr param_hdr; - __u32 request_seq; - __u16 number_of_streams; - __u16 reserved; + __be32 request_seq; + __be16 number_of_streams; + __be16 reserved; }; enum { @@ -752,16 +752,16 @@ enum { struct sctp_strreset_resp { struct sctp_paramhdr param_hdr; - __u32 response_seq; - __u32 result; + __be32 response_seq; + __be32 result; }; struct sctp_strreset_resptsn { struct sctp_paramhdr param_hdr; - __u32 response_seq; - __u32 result; - __u32 senders_next_tsn; - __u32 receivers_next_tsn; + __be32 response_seq; + __be32 result; + __be32 senders_next_tsn; + __be32 receivers_next_tsn; }; #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2db3d3a9ce1d..88233cf8b8d4 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, struct sctp_fwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc); struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc, - __u16 stream_num, __u16 *stream_list, + __u16 stream_num, __be16 *stream_list, bool out, bool in); struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc); diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index b8c86ec1a8f5..231dc42f1da6 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -130,7 +130,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, - __u16 stream_num, __u16 *stream_list, gfp_t gfp); + __u16 stream_num, __be16 *stream_list, gfp_t gfp); struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( const struct sctp_association *asoc, __u16 flags, diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ca8f196b6c6c..57c55045f5a7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3591,7 +3591,7 @@ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, - __u16 stream_num, __u16 *stream_list, + __u16 stream_num, __be16 *stream_list, bool out, bool in) { struct sctp_strreset_outreq outreq; @@ -3788,7 +3788,8 @@ bool sctp_verify_reconf(const struct sctp_association *asoc, { struct sctp_reconf_chunk *hdr; union sctp_params param; - __u16 last = 0, cnt = 0; + __be16 last = 0; + __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr, params) { diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 63ea15503714..fa8371ff05c4 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -118,6 +118,7 @@ int sctp_send_reset_streams(struct sctp_association *asoc, __u16 i, str_nums, *str_list; struct sctp_chunk *chunk; int retval = -EINVAL; + __be16 *nstr_list; bool out, in; if (!asoc->peer.reconf_capable || @@ -148,13 +149,18 @@ int sctp_send_reset_streams(struct sctp_association *asoc, if (str_list[i] >= stream->incnt) goto out; + nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL); + if (!nstr_list) { + retval = -ENOMEM; + goto out; + } + for (i = 0; i < str_nums; i++) - str_list[i] = htons(str_list[i]); + nstr_list[i] = htons(str_list[i]); - chunk = sctp_make_strreset_req(asoc, str_nums, str_list, out, in); + chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in); - for (i = 0; i < str_nums; i++) - str_list[i] = ntohs(str_list[i]); + kfree(nstr_list); if (!chunk) { retval = -ENOMEM; @@ -305,7 +311,7 @@ out: } static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param( - struct sctp_association *asoc, __u32 resp_seq, + struct sctp_association *asoc, __be32 resp_seq, __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; @@ -345,8 +351,9 @@ struct sctp_chunk *sctp_process_strreset_outreq( { struct sctp_strreset_outreq *outreq = param.v; struct sctp_stream *stream = &asoc->stream; - __u16 i, nums, flags = 0, *str_p = NULL; __u32 result = SCTP_STRRESET_DENIED; + __u16 i, nums, flags = 0; + __be16 *str_p = NULL; __u32 request_seq; request_seq = ntohl(outreq->request_seq); @@ -439,8 +446,9 @@ struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; struct sctp_chunk *chunk = NULL; - __u16 i, nums, *str_p; __u32 request_seq; + __u16 i, nums; + __be16 *str_p; request_seq = ntohl(inreq->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || @@ -769,7 +777,7 @@ struct sctp_chunk *sctp_process_strreset_resp( if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { struct sctp_strreset_outreq *outreq; - __u16 *str_p; + __be16 *str_p; outreq = (struct sctp_strreset_outreq *)req; str_p = outreq->list_of_streams; @@ -794,7 +802,7 @@ struct sctp_chunk *sctp_process_strreset_resp( nums, str_p, GFP_ATOMIC); } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { struct sctp_strreset_inreq *inreq; - __u16 *str_p; + __be16 *str_p; /* if the result is performed, it's impossible for inreq */ if (result == SCTP_STRRESET_PERFORMED) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 67abc0194f30..5447228bf1a0 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -847,7 +847,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, __u16 stream_num, - __u16 *stream_list, gfp_t gfp) + __be16 *stream_list, gfp_t gfp) { struct sctp_stream_reset_event *sreset; struct sctp_ulpevent *event; -- cgit v1.2.3 From 978aa0474115f3f5848949f2efce4def0766a5cb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:57 +0800 Subject: sctp: fix some type cast warnings introduced since very beginning These warnings were found by running 'make C=2 M=net/sctp/'. They are there since very beginning. Note after this patch, there still one warning left in sctp_outq_flush(): sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM) Since it has been moved to sctp_stream_outq_migrate on net-next, to avoid the extra job when merging net-next to net, I will post the fix for it after the merging is done. Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 2 +- include/uapi/linux/sctp.h | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/sm_make_chunk.c | 4 ++-- net/sctp/sm_sideeffect.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 09d7412e9cb0..da803dfc7a39 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -231,7 +231,7 @@ struct sctp_datahdr { __be32 tsn; __be16 stream; __be16 ssn; - __be32 ppid; + __u32 ppid; __u8 payload[0]; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6217ff8500a1..84fc2914b7fb 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -376,7 +376,7 @@ struct sctp_remote_error { __u16 sre_type; __u16 sre_flags; __u32 sre_length; - __u16 sre_error; + __be16 sre_error; sctp_assoc_t sre_assoc_id; __u8 sre_data[0]; }; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7fe9e1d1b7ec..a6dfa86c0201 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -738,7 +738,7 @@ static int sctp_v6_skb_iif(const struct sk_buff *skb) /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { - return *((__u32 *)(ipv6_hdr(skb))) & htonl(1 << 20); + return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 57c55045f5a7..514465b03829 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2854,7 +2854,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); - param.crr_id = i; + param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); @@ -2867,7 +2867,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); - param.crr_id = i; + param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, ¶m); sctp_addto_chunk(retval, addr_param_len, &addr_param); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 8f2762bba879..e2d9a4b49c9c 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1607,12 +1607,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_INIT_FAILED: - sctp_cmd_init_failed(commands, asoc, cmd->obj.err); + sctp_cmd_init_failed(commands, asoc, cmd->obj.u32); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, - subtype, chunk, cmd->obj.err); + subtype, chunk, cmd->obj.u32); break; case SCTP_CMD_INIT_COUNTER_INC: -- cgit v1.2.3 From 7aa0045dadb6ef37485ea9f2a7d28278ca588b51 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:28 -0700 Subject: net_sched: introduce a workqueue for RCU callbacks of tc filter This patch introduces a dedicated workqueue for tc filters so that each tc filter's RCU callback could defer their action destroy work to this workqueue. The helper tcf_queue_work() is introduced for them to use. Because we hold RTNL lock when calling tcf_block_put(), we can not simply flush works inside it, therefore we have to defer it again to this workqueue and make sure all flying RCU callbacks have already queued their work before this one, in other words, to ensure this is the last one to execute to prevent any use-after-free. On the other hand, this makes tcf_block_put() ugly and harder to understand. Since David and Eric strongly dislike adding synchronize_rcu(), this is probably the only solution that could make everyone happy. Please also see the code comments below. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 2 ++ net/sched/cls_api.c | 68 +++++++++++++++++++++++++++++++++++------------ 3 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e80edd8879ef..3009547f3c66 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -2,6 +2,7 @@ #define __NET_PKT_CLS_H #include +#include #include #include @@ -17,6 +18,8 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +bool tcf_queue_work(struct work_struct *work); + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 135f5a2dd931..0dec8a23be57 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -271,6 +272,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct work_struct work; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..045d13679ad6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -77,6 +77,8 @@ out: } EXPORT_SYMBOL(register_tcf_proto_ops); +static struct workqueue_struct *tc_filter_wq; + int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; @@ -86,6 +88,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) * tcf_proto_ops's destroy() handler. */ rcu_barrier(); + flush_workqueue(tc_filter_wq); write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { @@ -100,6 +103,12 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) } EXPORT_SYMBOL(unregister_tcf_proto_ops); +bool tcf_queue_work(struct work_struct *work) +{ + return queue_work(tc_filter_wq, work); +} +EXPORT_SYMBOL(tcf_queue_work); + /* Select new prio value from the range, managed by kernel. */ static inline u32 tcf_auto_prio(struct tcf_proto *tp) @@ -266,23 +275,30 @@ err_chain_create: } EXPORT_SYMBOL(tcf_block_get); -void tcf_block_put(struct tcf_block *block) +static void tcf_block_put_final(struct work_struct *work) { + struct tcf_block *block = container_of(work, struct tcf_block, work); struct tcf_chain *chain, *tmp; - if (!block) - return; - - /* XXX: Standalone actions are not allowed to jump to any chain, and - * bound actions should be all removed after flushing. However, - * filters are destroyed in RCU callbacks, we have to hold the chains - * first, otherwise we would always race with RCU callbacks on this list - * without proper locking. - */ + /* At this point, all the chains should have refcnt == 1. */ + rtnl_lock(); + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); + rtnl_unlock(); + kfree(block); +} - /* Wait for existing RCU callbacks to cool down. */ - rcu_barrier(); +/* XXX: Standalone actions are not allowed to jump to any chain, and bound + * actions should be all removed after flushing. However, filters are destroyed + * in RCU callbacks, we have to hold the chains first, otherwise we would + * always race with RCU callbacks on this list without proper locking. + */ +static void tcf_block_put_deferred(struct work_struct *work) +{ + struct tcf_block *block = container_of(work, struct tcf_block, work); + struct tcf_chain *chain; + rtnl_lock(); /* Hold a refcnt for all chains, except 0, in case they are gone. */ list_for_each_entry(chain, &block->chain_list, list) if (chain->index) @@ -292,13 +308,27 @@ void tcf_block_put(struct tcf_block *block) list_for_each_entry(chain, &block->chain_list, list) tcf_chain_flush(chain); - /* Wait for RCU callbacks to release the reference count. */ + INIT_WORK(&block->work, tcf_block_put_final); + /* Wait for RCU callbacks to release the reference count and make + * sure their works have been queued before this. + */ rcu_barrier(); + tcf_queue_work(&block->work); + rtnl_unlock(); +} - /* At this point, all the chains should have refcnt == 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) - tcf_chain_put(chain); - kfree(block); +void tcf_block_put(struct tcf_block *block) +{ + if (!block) + return; + + INIT_WORK(&block->work, tcf_block_put_deferred); + /* Wait for existing RCU callbacks to cool down, make sure their works + * have been queued before this. We can not flush pending works here + * because we are holding the RTNL lock. + */ + rcu_barrier(); + tcf_queue_work(&block->work); } EXPORT_SYMBOL(tcf_block_put); @@ -1030,6 +1060,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev); static int __init tc_filter_init(void) { + tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0); + if (!tc_filter_wq) + return -ENOMEM; + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, -- cgit v1.2.3 From 2a9a86d5c81389cd9afe6a4fea42c585733cd705 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 30 Oct 2017 09:10:46 +0200 Subject: PM / QoS: Fix default runtime_pm device resume latency The recent change to the PM QoS framework to introduce a proper no constraint value overlooked to handle the devices which don't implement PM QoS OPS. Runtime PM is one of the more severely impacted subsystems, failing every attempt to runtime suspend a device. This leads into some nasty second level issues like probe failures and increased power consumption among other things. Fix this by adding a proper return value for devices that don't implement PM QoS. Fixes: 0cc2b4e5a020 (PM / QoS: Fix device resume latency PM QoS) Signed-off-by: Tero Kristo Cc: All applicable Signed-off-by: Rafael J. Wysocki --- include/linux/pm_qos.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 6737a8c9e8c6..d68b0569a5eb 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -175,7 +175,8 @@ static inline s32 dev_pm_qos_requested_flags(struct device *dev) static inline s32 dev_pm_qos_raw_read_value(struct device *dev) { return IS_ERR_OR_NULL(dev->power.qos) ? - 0 : pm_qos_read_value(&dev->power.qos->resume_latency); + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : + pm_qos_read_value(&dev->power.qos->resume_latency); } #else static inline enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, -- cgit v1.2.3