From 24d28e4f1271cb2f91613dada8f2acccd00eff56 Mon Sep 17 00:00:00 2001 From: Nick Dyer Date: Tue, 5 Jun 2018 10:17:51 -0700 Subject: Input: synaptics-rmi4 - convert irq distribution to irq_domain Convert the RMI driver to use the standard mechanism for distributing IRQs to the various functions. Tested on: * S7300 (F11, F34, F54) * S7817 (F12, F34, F54) Signed-off-by: Nick Dyer Acked-by: Christopher Heiny Signed-off-by: Dmitry Torokhov --- include/linux/rmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/rmi.h b/include/linux/rmi.h index 64125443f8a6..5ef5c7c412a7 100644 --- a/include/linux/rmi.h +++ b/include/linux/rmi.h @@ -354,6 +354,8 @@ struct rmi_driver_data { struct mutex irq_mutex; struct input_dev *input; + struct irq_domain *irqdomain; + u8 pdt_props; u8 num_rx_electrodes; -- cgit v1.2.3 From bf6247a70f2b7569180304041b63b59ab14217bc Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 5 Jun 2018 10:08:44 -0700 Subject: Input: make input_report_slot_state() return boolean Let's make input_report_slot_state() return boolean representing whether the contact is active or not. This will allow writing code like: if (input_mt_report_slot_state(input, obj->mt_tool, obj->type != RMI_2D_OBJECT_NONE) { input_event(sensor->input, EV_ABS, ABS_MT_POSITION_X, obj->x); input_event(sensor->input, EV_ABS, ABS_MT_POSITION_Y, obj->y); ... } instead of: input_mt_report_slot_state(input, obj->mt_tool, obj->type != RMI_2D_OBJECT_NONE); if (obj->type != RMI_2D_OBJECT_NONE) { input_event(sensor->input, EV_ABS, ABS_MT_POSITION_X, obj->x); input_event(sensor->input, EV_ABS, ABS_MT_POSITION_Y, obj->y); ... } Reviewed-by: Henrik Rydberg Acked-by: Benjamin Tissoires Signed-off-by: Dmitry Torokhov --- drivers/input/input-mt.c | 10 +++++++--- include/linux/input/mt.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index a1bbec9cda8d..8de52e3c00c0 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -131,8 +131,10 @@ EXPORT_SYMBOL(input_mt_destroy_slots); * inactive, or if the tool type is changed, a new tracking id is * assigned to the slot. The tool type is only reported if the * corresponding absbit field is set. + * + * Returns true if contact is active. */ -void input_mt_report_slot_state(struct input_dev *dev, +bool input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active) { struct input_mt *mt = dev->mt; @@ -140,14 +142,14 @@ void input_mt_report_slot_state(struct input_dev *dev, int id; if (!mt) - return; + return false; slot = &mt->slots[mt->slot]; slot->frame = mt->frame; if (!active) { input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); - return; + return false; } id = input_mt_get_value(slot, ABS_MT_TRACKING_ID); @@ -156,6 +158,8 @@ void input_mt_report_slot_state(struct input_dev *dev, input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id); input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type); + + return true; } EXPORT_SYMBOL(input_mt_report_slot_state); diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h index d7188de4db96..3f4bf60b0bb5 100644 --- a/include/linux/input/mt.h +++ b/include/linux/input/mt.h @@ -100,7 +100,7 @@ static inline bool input_is_mt_axis(int axis) return axis == ABS_MT_SLOT || input_is_mt_value(axis); } -void input_mt_report_slot_state(struct input_dev *dev, +bool input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active); void input_mt_report_finger_count(struct input_dev *dev, int count); -- cgit v1.2.3 From 9ce7bc036ae4cfe3393232c86e9e1fea2153c237 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Jun 2018 10:11:56 -0700 Subject: netfilter: ipv6: nf_defrag: reduce struct net memory waste It is a waste of memory to use a full "struct netns_sysctl_ipv6" while only one pointer is really used, considering netns_sysctl_ipv6 keeps growing. Also, since "struct netns_frags" has cache line alignment, it is better to move the frags_hdr pointer outside, otherwise we spend a full cache line for this pointer. This saves 192 bytes of memory per netns. Fixes: c038a767cd69 ("ipv6: add a new namespace for nf_conntrack_reasm") Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/net_namespace.h | 1 + include/net/netns/ipv6.h | 1 - net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 47e35cce3b64..a71264d75d7f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -128,6 +128,7 @@ struct net { #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) struct netns_nf_frag nf_frag; + struct ctl_table_header *nf_frag_frags_hdr; #endif struct sock *nfnl; struct sock *nfnl_stash; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c978a31b0f84..762ac9931b62 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -109,7 +109,6 @@ struct netns_ipv6 { #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) struct netns_nf_frag { - struct netns_sysctl_ipv6 sysctl; struct netns_frags frags; }; #endif diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5e0332014c17..a452d99c9f52 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -107,7 +107,7 @@ static int nf_ct_frag6_sysctl_register(struct net *net) if (hdr == NULL) goto err_reg; - net->nf_frag.sysctl.frags_hdr = hdr; + net->nf_frag_frags_hdr = hdr; return 0; err_reg: @@ -121,8 +121,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct ctl_table *table; - table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr); + table = net->nf_frag_frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } -- cgit v1.2.3 From 6c3796d130ed2860489885a934dcb7bb334d5eb0 Mon Sep 17 00:00:00 2001 From: "bstroesser@ts.fujitsu.com" Date: Thu, 24 May 2018 18:49:41 +0200 Subject: scsi: target: tcmu: add read length support Generally target core and TCMUser seem to work fine for tape devices and media changers. But there is at least one situation where TCMUser is not able to support sequential access device emulation correctly. The situation is when an initiator sends a SCSI READ CDB with a length that is greater than the length of the tape block to read. We can distinguish two subcases: A) The initiator sent the READ CDB with the SILI bit being set. In this case the sequential access device has to transfer the data from the tape block (only the length of the tape block) and transmit a good status. The current interface between TCMUser and the userspace does not support reduction of the read data size by the userspace program. The patch below fixes this subcase by allowing the userspace program to specify a reduced data size in read direction. B) The initiator sent the READ CDB with the SILI bit not being set. In this case the sequential access device has to transfer the data from the tape block as in A), but additionally has to transmit CHECK CONDITION with the ILI bit set and NO SENSE in the sensebytes. The information field in the sensebytes must contain the residual count. With the below patch a user space program can specify the real read data length and appropriate sensebytes. TCMUser then uses the se_cmd flag SCF_TREAT_READ_AS_NORMAL, to force target core to transmit the real data size and the sensebytes. Note: the flag SCF_TREAT_READ_AS_NORMAL is introduced by Lee Duncan's patch "[PATCH v4] target: transport should handle st FM/EOM/ILI reads" from Tue, 15 May 2018 18:25:24 -0700. Signed-off-by: Bodo Stroesser Acked-by: Mike Christie Reviewed-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 44 ++++++++++++++++++++++++++++------- include/uapi/linux/target_core_user.h | 4 +++- 2 files changed, 39 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 7f96dfa32b9c..d8dc3d22051f 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -656,7 +656,7 @@ static void scatter_data_area(struct tcmu_dev *udev, } static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, - bool bidi) + bool bidi, uint32_t read_len) { struct se_cmd *se_cmd = cmd->se_cmd; int i, dbi; @@ -689,7 +689,7 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, for_each_sg(data_sg, sg, data_nents, i) { int sg_remaining = sg->length; to = kmap_atomic(sg_page(sg)) + sg->offset; - while (sg_remaining > 0) { + while (sg_remaining > 0 && read_len > 0) { if (block_remaining == 0) { if (from) kunmap_atomic(from); @@ -701,6 +701,8 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, } copy_bytes = min_t(size_t, sg_remaining, block_remaining); + if (read_len < copy_bytes) + copy_bytes = read_len; offset = DATA_BLOCK_SIZE - block_remaining; tcmu_flush_dcache_range(from, copy_bytes); memcpy(to + sg->length - sg_remaining, from + offset, @@ -708,8 +710,11 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, sg_remaining -= copy_bytes; block_remaining -= copy_bytes; + read_len -= copy_bytes; } kunmap_atomic(to - sg->offset); + if (read_len == 0) + break; } if (from) kunmap_atomic(from); @@ -1042,6 +1047,8 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * { struct se_cmd *se_cmd = cmd->se_cmd; struct tcmu_dev *udev = cmd->tcmu_dev; + bool read_len_valid = false; + uint32_t read_len = se_cmd->data_length; /* * cmd has been completed already from timeout, just reclaim @@ -1056,13 +1063,28 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n", cmd->se_cmd); entry->rsp.scsi_status = SAM_STAT_CHECK_CONDITION; - } else if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { + goto done; + } + + if (se_cmd->data_direction == DMA_FROM_DEVICE && + (entry->hdr.uflags & TCMU_UFLAG_READ_LEN) && entry->rsp.read_len) { + read_len_valid = true; + if (entry->rsp.read_len < read_len) + read_len = entry->rsp.read_len; + } + + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { transport_copy_sense_to_cmd(se_cmd, entry->rsp.sense_buffer); - } else if (se_cmd->se_cmd_flags & SCF_BIDI) { + if (!read_len_valid ) + goto done; + else + se_cmd->se_cmd_flags |= SCF_TREAT_READ_AS_NORMAL; + } + if (se_cmd->se_cmd_flags & SCF_BIDI) { /* Get Data-In buffer before clean up */ - gather_data_area(udev, cmd, true); + gather_data_area(udev, cmd, true, read_len); } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { - gather_data_area(udev, cmd, false); + gather_data_area(udev, cmd, false, read_len); } else if (se_cmd->data_direction == DMA_TO_DEVICE) { /* TODO: */ } else if (se_cmd->data_direction != DMA_NONE) { @@ -1070,7 +1092,13 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * se_cmd->data_direction); } - target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); +done: + if (read_len_valid) { + pr_debug("read_len = %d\n", read_len); + target_complete_cmd_with_length(cmd->se_cmd, + entry->rsp.scsi_status, read_len); + } else + target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); out: cmd->se_cmd = NULL; @@ -1740,7 +1768,7 @@ static int tcmu_configure_device(struct se_device *dev) /* Initialise the mailbox of the ring buffer */ mb = udev->mb_addr; mb->version = TCMU_MAILBOX_VERSION; - mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC; + mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC | TCMU_MAILBOX_FLAG_CAP_READ_LEN; mb->cmdr_off = CMDR_OFF; mb->cmdr_size = udev->cmdr_size; diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 6e299349b158..b7b57967d90f 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -44,6 +44,7 @@ #define TCMU_MAILBOX_VERSION 2 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ #define TCMU_MAILBOX_FLAG_CAP_OOOC (1 << 0) /* Out-of-order completions */ +#define TCMU_MAILBOX_FLAG_CAP_READ_LEN (1 << 1) /* Read data length */ struct tcmu_mailbox { __u16 version; @@ -71,6 +72,7 @@ struct tcmu_cmd_entry_hdr { __u16 cmd_id; __u8 kflags; #define TCMU_UFLAG_UNKNOWN_OP 0x1 +#define TCMU_UFLAG_READ_LEN 0x2 __u8 uflags; } __packed; @@ -119,7 +121,7 @@ struct tcmu_cmd_entry { __u8 scsi_status; __u8 __pad1; __u16 __pad2; - __u32 __pad3; + __u32 read_len; char sense_buffer[TCMU_SENSE_BUFFERSIZE]; } rsp; }; -- cgit v1.2.3 From 6362f0a290023bafd7f991089e81dd9278f154b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Jun 2018 10:12:48 -0600 Subject: libata: add command iterator helpers Now that we have the internal tag as a special (higher) value tag, it gets a bit tricky to iterate the internal commands as some loops will exceed ATA_MAX_QUEUE. Add explicit helpers for iterating pending commands, both inflight and internal. Signed-off-by: Jens Axboe Signed-off-by: Tejun Heo --- include/linux/libata.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 8b8946dd63b9..a2257e380789 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1495,6 +1495,29 @@ static inline bool ata_tag_valid(unsigned int tag) return tag < ATA_MAX_QUEUE || ata_tag_internal(tag); } +#define __ata_qc_for_each(ap, qc, tag, max_tag, fn) \ + for ((tag) = 0; (tag) < (max_tag) && \ + ({ qc = fn((ap), (tag)); 1; }); (tag)++) \ + +/* + * Internal use only, iterate commands ignoring error handling and + * status of 'qc'. + */ +#define ata_qc_for_each_raw(ap, qc, tag) \ + __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE, __ata_qc_from_tag) + +/* + * Iterate all potential commands that can be queued + */ +#define ata_qc_for_each(ap, qc, tag) \ + __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE, ata_qc_from_tag) + +/* + * Like ata_qc_for_each, but with the internal tag included + */ +#define ata_qc_for_each_with_internal(ap, qc, tag) \ + __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE + 1, ata_qc_from_tag) + /* * device helpers */ -- cgit v1.2.3 From 92397a6c38d139d50fabbe9e2dc09b61d53b2377 Mon Sep 17 00:00:00 2001 From: Phil Reid Date: Tue, 5 Jun 2018 14:15:10 +0800 Subject: iio: buffer: fix the function signature to match implementation linux/iio/buffer-dma.h was not updated to when length was changed to unsigned int. Fixes: c043ec1ca5ba ("iio:buffer: make length types match kfifo types") Signed-off-by: Phil Reid Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 767467d886de..67c75372b691 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -141,7 +141,7 @@ int iio_dma_buffer_read(struct iio_buffer *buffer, size_t n, char __user *user_buffer); size_t iio_dma_buffer_data_available(struct iio_buffer *buffer); int iio_dma_buffer_set_bytes_per_datum(struct iio_buffer *buffer, size_t bpd); -int iio_dma_buffer_set_length(struct iio_buffer *buffer, int length); +int iio_dma_buffer_set_length(struct iio_buffer *buffer, unsigned int length); int iio_dma_buffer_request_update(struct iio_buffer *buffer); int iio_dma_buffer_init(struct iio_dma_buffer_queue *queue, -- cgit v1.2.3 From 5e03aa61a7f3cb99852bc3ad6925f5fa3c0dcc93 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Jun 2018 10:53:34 +0530 Subject: PM / Domains: Fix return value of of_genpd_opp_to_performance_state() of_genpd_opp_to_performance_state() should return 0 for errors, but the dummy routine isn't doing that. Fix it. Signed-off-by: Viresh Kumar Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 9206a4fef9ac..139f79c8477a 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -276,7 +276,7 @@ static inline unsigned int of_genpd_opp_to_performance_state(struct device *dev, struct device_node *opp_node) { - return -ENODEV; + return 0; } static inline int genpd_dev_pm_attach(struct device *dev) -- cgit v1.2.3 From ad6384ba3ac98ad524194e37de379c1fe503870b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 20 Jun 2018 11:45:37 +0530 Subject: PM / Domains: Rename opp_node to np The DT node passed here isn't necessarily an OPP node, as this routine can also be used for cases where the "required-opps" property is present directly in the device's node. Rename it. This also removes a stale comment. Acked-by: Ulf Hansson Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 7 +++---- include/linux/pm_domain.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 4925af5c4cf0..c298de8a8308 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2487,10 +2487,9 @@ EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states); * power domain corresponding to a DT node's "required-opps" property. * * @dev: Device for which the performance-state needs to be found. - * @opp_node: DT node where the "required-opps" property is present. This can be + * @np: DT node where the "required-opps" property is present. This can be * the device node itself (if it doesn't have an OPP table) or a node * within the OPP table of a device (if device has an OPP table). - * @state: Pointer to return performance state. * * Returns performance state corresponding to the "required-opps" property of * a DT node. This calls platform specific genpd->opp_to_performance_state() @@ -2499,7 +2498,7 @@ EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states); * Returns performance state on success and 0 on failure. */ unsigned int of_genpd_opp_to_performance_state(struct device *dev, - struct device_node *opp_node) + struct device_node *np) { struct generic_pm_domain *genpd; struct dev_pm_opp *opp; @@ -2514,7 +2513,7 @@ unsigned int of_genpd_opp_to_performance_state(struct device *dev, genpd_lock(genpd); - opp = of_dev_pm_opp_find_required_opp(&genpd->dev, opp_node); + opp = of_dev_pm_opp_find_required_opp(&genpd->dev, np); if (IS_ERR(opp)) { dev_err(dev, "Failed to find required OPP: %ld\n", PTR_ERR(opp)); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 139f79c8477a..cb8d84090cfb 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -234,7 +234,7 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np); int of_genpd_parse_idle_states(struct device_node *dn, struct genpd_power_state **states, int *n); unsigned int of_genpd_opp_to_performance_state(struct device *dev, - struct device_node *opp_node); + struct device_node *np); int genpd_dev_pm_attach(struct device *dev); struct device *genpd_dev_pm_attach_by_id(struct device *dev, @@ -274,7 +274,7 @@ static inline int of_genpd_parse_idle_states(struct device_node *dn, static inline unsigned int of_genpd_opp_to_performance_state(struct device *dev, - struct device_node *opp_node) + struct device_node *np) { return 0; } -- cgit v1.2.3 From 8f732850df1b2b4d8d719f7e606dfb3050e7ea11 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Thu, 31 May 2018 13:49:29 +0200 Subject: HID: core: allow concurrent registration of drivers Detected on the Dell XPS 9365. The laptop has 2 devices that benefit from the hid-generic auto-unbinding. When those 2 devices are presented to the userspace, udev loads both wacom and hid-multitouch. When this happens, the code in __hid_bus_reprobe_drivers() is called concurrently and the second device gets reprobed twice. An other bug in the power_supply subsystem prevent to remove the wacom driver if it just finished its initialization, which basically kills the wacom node. [jkosina@suse.cz: reformat changelog a bit] Fixes c17a7476e4c4 ("HID: core: rewrite the hid-generic automatic unbind") Cc: stable@vger.kernel.org # v4.17 Tested-by: Mario Limonciello Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 5 ++++- include/linux/hid.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 355dc7e49562..a460ec147aee 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1949,6 +1949,8 @@ static int hid_device_probe(struct device *dev) } hdev->io_started = false; + clear_bit(ffs(HID_STAT_REPROBED), &hdev->status); + if (!hdev->driver) { id = hid_match_device(hdev, hdrv); if (id == NULL) { @@ -2212,7 +2214,8 @@ static int __hid_bus_reprobe_drivers(struct device *dev, void *data) struct hid_device *hdev = to_hid_device(dev); if (hdev->driver == hdrv && - !hdrv->match(hdev, hid_ignore_special_drivers)) + !hdrv->match(hdev, hid_ignore_special_drivers) && + !test_and_set_bit(ffs(HID_STAT_REPROBED), &hdev->status)) return device_reprobe(dev); return 0; diff --git a/include/linux/hid.h b/include/linux/hid.h index 41a3d5775394..773bcb1d4044 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -511,6 +511,7 @@ struct hid_output_fifo { #define HID_STAT_ADDED BIT(0) #define HID_STAT_PARSED BIT(1) #define HID_STAT_DUP_DETECTED BIT(2) +#define HID_STAT_REPROBED BIT(3) struct hid_input { struct list_head list; @@ -579,7 +580,7 @@ struct hid_device { /* device report descriptor */ bool battery_avoid_query; #endif - unsigned int status; /* see STAT flags above */ + unsigned long status; /* see STAT flags above */ unsigned claimed; /* Claimed by hidinput, hiddev? */ unsigned quirks; /* Various quirks the device can pull on us */ bool io_started; /* If IO has started */ -- cgit v1.2.3 From d2d2e3c46be5d6dd8001d0eebdf7cafb9bc7006b Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 21 Jun 2018 16:43:17 +0300 Subject: acpi: Add helper for deactivating memory region Sometimes memory resource may be overlapping with SystemMemory Operation Region by design, for example if the memory region is used as a mailbox for communication with a firmware in the system. One occasion of such mailboxes is USB Type-C Connector System Software Interface (UCSI). With regions like that, it is important that the driver is able to map the memory with the requirements it has. For example, the driver should be allowed to map the memory as non-cached memory. However, if the operation region has been accessed before the driver has mapped the memory, the memory has been marked as write-back by the time the driver is loaded. That means the driver will fail to map the memory if it expects non-cached memory. To work around the problem, introducing helper that the drivers can use to temporarily deactivate (unmap) SystemMemory Operation Regions that overlap with their IO memory. Fixes: 8243edf44152 ("usb: typec: ucsi: Add ACPI driver") Cc: stable@vger.kernel.org Reviewed-by: Rafael J. Wysocki Signed-off-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/osl.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 3 +++ 2 files changed, 75 insertions(+) (limited to 'include') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 7ca41bf023c9..8df9abfa947b 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -45,6 +45,8 @@ #include #include +#include "acpica/accommon.h" +#include "acpica/acnamesp.h" #include "internal.h" #define _COMPONENT ACPI_OS_SERVICES @@ -1490,6 +1492,76 @@ int acpi_check_region(resource_size_t start, resource_size_t n, } EXPORT_SYMBOL(acpi_check_region); +static acpi_status acpi_deactivate_mem_region(acpi_handle handle, u32 level, + void *_res, void **return_value) +{ + struct acpi_mem_space_context **mem_ctx; + union acpi_operand_object *handler_obj; + union acpi_operand_object *region_obj2; + union acpi_operand_object *region_obj; + struct resource *res = _res; + acpi_status status; + + region_obj = acpi_ns_get_attached_object(handle); + if (!region_obj) + return AE_OK; + + handler_obj = region_obj->region.handler; + if (!handler_obj) + return AE_OK; + + if (region_obj->region.space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) + return AE_OK; + + if (!(region_obj->region.flags & AOPOBJ_SETUP_COMPLETE)) + return AE_OK; + + region_obj2 = acpi_ns_get_secondary_object(region_obj); + if (!region_obj2) + return AE_OK; + + mem_ctx = (void *)®ion_obj2->extra.region_context; + + if (!(mem_ctx[0]->address >= res->start && + mem_ctx[0]->address < res->end)) + return AE_OK; + + status = handler_obj->address_space.setup(region_obj, + ACPI_REGION_DEACTIVATE, + NULL, (void **)mem_ctx); + if (ACPI_SUCCESS(status)) + region_obj->region.flags &= ~(AOPOBJ_SETUP_COMPLETE); + + return status; +} + +/** + * acpi_release_memory - Release any mappings done to a memory region + * @handle: Handle to namespace node + * @res: Memory resource + * @level: A level that terminates the search + * + * Walks through @handle and unmaps all SystemMemory Operation Regions that + * overlap with @res and that have already been activated (mapped). + * + * This is a helper that allows drivers to place special requirements on memory + * region that may overlap with operation regions, primarily allowing them to + * safely map the region as non-cached memory. + * + * The unmapped Operation Regions will be automatically remapped next time they + * are called, so the drivers do not need to do anything else. + */ +acpi_status acpi_release_memory(acpi_handle handle, struct resource *res, + u32 level) +{ + if (!(res->flags & IORESOURCE_MEM)) + return AE_TYPE; + + return acpi_walk_namespace(ACPI_TYPE_REGION, handle, level, + acpi_deactivate_mem_region, NULL, res, NULL); +} +EXPORT_SYMBOL_GPL(acpi_release_memory); + /* * Let drivers know whether the resource checks are effective */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4b35a66383f9..e54f40974eb0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -443,6 +443,9 @@ int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); +acpi_status acpi_release_memory(acpi_handle handle, struct resource *res, + u32 level); + int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION -- cgit v1.2.3 From 8793bb7f4a9dd1396575d2e9337d331662cb7555 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Jun 2018 13:14:56 -0700 Subject: kbuild: add macro for controlling warnings to linux/compiler.h I have occasionally run into a situation where it would make sense to control a compiler warning from a source file rather than doing so from a Makefile using the $(cc-disable-warning, ...) or $(cc-option, ...) helpers. The approach here is similar to what glibc uses, using __diag() and related macros to encapsulate a _Pragma("GCC diagnostic ...") statement that gets turned into the respective "#pragma GCC diagnostic ..." by the preprocessor when the macro gets expanded. Like glibc, I also have an argument to pass the affected compiler version, but decided to actually evaluate that one. For now, this supports GCC_4_6, GCC_4_7, GCC_4_8, GCC_4_9, GCC_5, GCC_6, GCC_7, GCC_8 and GCC_9. Adding support for CLANG_5 and other interesting versions is straightforward here. GNU compilers starting with gcc-4.2 could support it in principle, but "#pragma GCC diagnostic push" was only added in gcc-4.6, so it seems simpler to not deal with those at all. The same versions show a large number of warnings already, so it seems easier to just leave it at that and not do a more fine-grained control for them. The use cases I found so far include: - turning off the gcc-8 -Wattribute-alias warning inside of the SYSCALL_DEFINEx() macro without having to do it globally. - Reducing the build time for a simple re-make after a change, once we move the warnings from ./Makefile and ./scripts/Makefile.extrawarn into linux/compiler.h - More control over the warnings based on other configurations, using preprocessor syntax instead of Makefile syntax. This should make it easier for the average developer to understand and change things. - Adding an easy way to turn the W=1 option on unconditionally for a subdirectory or a specific file. This has been requested by several developers in the past that want to have their subsystems W=1 clean. - Integrating clang better into the build systems. Clang supports more warnings than GCC, and we probably want to classify them as default, W=1, W=2 etc, but there are cases in which the warnings should be classified differently due to excessive false positives from one or the other compiler. - Adding a way to turn the default warnings into errors (e.g. using a new "make E=0" tag) while not also turning the W=1 warnings into errors. This patch for now just adds the minimal infrastructure in order to do the first of the list above. As the #pragma GCC diagnostic takes precedence over command line options, the next step would be to convert a lot of the individual Makefiles that set nonstandard options to use __diag() instead. [paul.burton@mips.com: - Rebase atop current master. - Add __diag_GCC, or more generally __diag_, abstraction to avoid code outside of linux/compiler-gcc.h needing to duplicate knowledge about different GCC versions. - Add a comment argument to __diag_{ignore,warn,error} which isn't used in the expansion of the macros but serves to push people to document the reason for using them - per feedback from Kees Cook. - Translate severity to GCC-specific pragmas in linux/compiler-gcc.h rather than using GCC-specific in linux/compiler_types.h. - Drop all but GCC 8 macros, since we only need to define macros for versions that we need to introduce pragmas for, and as of this series that's just GCC 8. - Capitalize comments in linux/compiler-gcc.h to match the style of the rest of the file. - Line up macro definitions with tabs in linux/compiler-gcc.h.] Signed-off-by: Arnd Bergmann Signed-off-by: Paul Burton Tested-by: Christophe Leroy Tested-by: Stafford Horne Signed-off-by: Masahiro Yamada --- include/linux/compiler-gcc.h | 25 +++++++++++++++++++++++++ include/linux/compiler_types.h | 18 ++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f1a7492a5cc8..fd282c7d3e5e 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -347,3 +347,28 @@ #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif + +/* + * Turn individual warnings and errors on and off locally, depending + * on version. + */ +#define __diag_GCC(version, severity, s) \ + __diag_GCC_ ## version(__diag_GCC_ ## severity s) + +/* Severity used in pragma directives */ +#define __diag_GCC_ignore ignored +#define __diag_GCC_warn warning +#define __diag_GCC_error error + +/* Compilers before gcc-4.6 do not understand "#pragma GCC diagnostic push" */ +#if GCC_VERSION >= 40600 +#define __diag_str1(s) #s +#define __diag_str(s) __diag_str1(s) +#define __diag(s) _Pragma(__diag_str(GCC diagnostic s)) +#endif + +#if GCC_VERSION >= 80000 +#define __diag_GCC_8(s) __diag(s) +#else +#define __diag_GCC_8(s) +#endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6b79a9bba9a7..a8ba6b04152c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -271,4 +271,22 @@ struct ftrace_likely_data { # define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) #endif +#ifndef __diag +#define __diag(string) +#endif + +#ifndef __diag_GCC +#define __diag_GCC(version, severity, string) +#endif + +#define __diag_push() __diag(push) +#define __diag_pop() __diag(pop) + +#define __diag_ignore(compiler, version, option, comment) \ + __diag_ ## compiler(version, ignore, option) +#define __diag_warn(compiler, version, option, comment) \ + __diag_ ## compiler(version, warn, option) +#define __diag_error(compiler, version, option, comment) \ + __diag_ ## compiler(version, error, option) + #endif /* __LINUX_COMPILER_TYPES_H */ -- cgit v1.2.3 From bee20031772af3debe8cbaa234528f24c7892e8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Jun 2018 13:14:57 -0700 Subject: disable -Wattribute-alias warning for SYSCALL_DEFINEx() gcc-8 warns for every single definition of a system call entry point, e.g.: include/linux/compat.h:56:18: error: 'compat_sys_rt_sigprocmask' alias between functions of incompatible types 'long int(int, compat_sigset_t *, compat_sigset_t *, compat_size_t)' {aka 'long int(int, struct *, struct *, unsigned int)'} and 'long int(long int, long int, long int, long int)' [-Werror=attribute-alias] asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))\ ^~~~~~~~~~ include/linux/compat.h:45:2: note: in expansion of macro 'COMPAT_SYSCALL_DEFINEx' COMPAT_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) ^~~~~~~~~~~~~~~~~~~~~~ kernel/signal.c:2601:1: note: in expansion of macro 'COMPAT_SYSCALL_DEFINE4' COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, ^~~~~~~~~~~~~~~~~~~~~~ include/linux/compat.h:60:18: note: aliased declaration here asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ ^~~~~~~~~~ The new warning seems reasonable in principle, but it doesn't help us here, since we rely on the type mismatch to sanitize the system call arguments. After I reported this as GCC PR82435, a new -Wno-attribute-alias option was added that could be used to turn the warning off globally on the command line, but I'd prefer to do it a little more fine-grained. Interestingly, turning a warning off and on again inside of a single macro doesn't always work, in this case I had to add an extra statement inbetween and decided to copy the __SC_TEST one from the native syscall to the compat syscall macro. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83256 for more details about this. [paul.burton@mips.com: - Rebase atop current master. - Split GCC & version arguments to __diag_ignore() in order to match changes to the preceding patch. - Add the comment argument to match the preceding patch.] Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82435 Signed-off-by: Arnd Bergmann Signed-off-by: Paul Burton Tested-by: Christophe Leroy Tested-by: Stafford Horne Signed-off-by: Masahiro Yamada --- include/linux/compat.h | 8 +++++++- include/linux/syscalls.h | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index b1a5562b3215..c68acc47da57 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -72,6 +72,9 @@ */ #ifndef COMPAT_SYSCALL_DEFINEx #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ @@ -80,8 +83,11 @@ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ - return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ + long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + return ret; \ } \ + __diag_pop(); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* COMPAT_SYSCALL_DEFINEx */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 73810808cdf2..a368a68cb667 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -231,6 +231,9 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) */ #ifndef __SYSCALL_DEFINEx #define __SYSCALL_DEFINEx(x, name, ...) \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_sys##name)))); \ ALLOW_ERROR_INJECTION(sys##name, ERRNO); \ @@ -243,6 +246,7 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ return ret; \ } \ + __diag_pop(); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* __SYSCALL_DEFINEx */ -- cgit v1.2.3 From fdb5c4531c1e0e50e609df83f736b6f3a02896e2 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Jun 2018 00:04:24 +0100 Subject: bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF If the kernel is compiled with CONFIG_CGROUP_BPF not enabled, it is not possible to attach, detach or query IR BPF programs to /dev/lircN devices, making them impossible to use. For embedded devices, it should be possible to use IR decoding without cgroups or CONFIG_CGROUP_BPF enabled. This change requires some refactoring, since bpf_prog_{attach,detach,query} functions are now always compiled, but their code paths for cgroups need moving out. Rather than a #ifdef CONFIG_CGROUP_BPF in kernel/bpf/syscall.c, moving them to kernel/bpf/cgroup.c and kernel/bpf/sockmap.c does not require #ifdefs since that is already conditionally compiled. Fixes: f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2") Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- drivers/media/rc/bpf-lirc.c | 14 +------ include/linux/bpf-cgroup.h | 26 ++++++++++++ include/linux/bpf.h | 8 ++++ include/linux/bpf_lirc.h | 5 ++- kernel/bpf/cgroup.c | 54 +++++++++++++++++++++++++ kernel/bpf/sockmap.c | 18 +++++++++ kernel/bpf/syscall.c | 99 ++++++++++----------------------------------- 7 files changed, 132 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 40826bba06b6..fcfab6635f9c 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -207,29 +207,19 @@ void lirc_bpf_free(struct rc_dev *rcdev) bpf_prog_array_free(rcdev->raw->progs); } -int lirc_prog_attach(const union bpf_attr *attr) +int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - struct bpf_prog *prog; struct rc_dev *rcdev; int ret; if (attr->attach_flags) return -EINVAL; - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_LIRC_MODE2); - if (IS_ERR(prog)) - return PTR_ERR(prog); - rcdev = rc_dev_get_from_fd(attr->target_fd); - if (IS_ERR(rcdev)) { - bpf_prog_put(prog); + if (IS_ERR(rcdev)) return PTR_ERR(rcdev); - } ret = lirc_bpf_attach(rcdev, prog); - if (ret) - bpf_prog_put(prog); put_device(&rcdev->dev); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 975fb4cf1bb7..79795c5fa7c3 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, \ __ret; \ }) +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog); +int cgroup_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype); +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); #else +struct bpf_prog; struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EINVAL; +} + +static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7df32a3200f7..8827e797ff97 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -696,6 +696,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -714,6 +716,12 @@ static inline int sock_map_prog(struct bpf_map *map, { return -EOPNOTSUPP; } + +static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + return -EINVAL; +} #endif #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h index 5f8a4283092d..9d9ff755ec29 100644 --- a/include/linux/bpf_lirc.h +++ b/include/linux/bpf_lirc.h @@ -5,11 +5,12 @@ #include #ifdef CONFIG_BPF_LIRC_MODE2 -int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int lirc_prog_detach(const union bpf_attr *attr); int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -static inline int lirc_prog_attach(const union bpf_attr *attr) +static inline int lirc_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) { return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f7c00bd6f8e4..3d83ee7df381 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_query(cgrp, attr, uattr); + + cgroup_put(cgrp); + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 52a91d816c0e..81d0c55a77aa 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1915,6 +1915,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return 0; } +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = sock_map_prog(map, prog, attr->attach_type); + fdput(f); + return err; +} + static void *sock_map_lookup(struct bpf_map *map, void *key) { return NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..d10ecd78105f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1483,8 +1483,6 @@ out_free_tp: return err; } -#ifdef CONFIG_CGROUP_BPF - static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -1499,40 +1497,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, - int type, bool attach) -{ - struct bpf_prog *prog = NULL; - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, type); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); - } - } - - err = sock_map_prog(map, prog, attr->attach_type); - if (err) { - fdput(f); - if (prog) - bpf_prog_put(prog); - return err; - } - - fdput(f); - return 0; -} - #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1540,7 +1504,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - struct cgroup *cgrp; int ret; if (!capable(CAP_NET_ADMIN)) @@ -1577,12 +1540,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); + ptype = BPF_PROG_TYPE_SK_MSG; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + ptype = BPF_PROG_TYPE_SK_SKB; + break; case BPF_LIRC_MODE2: - return lirc_prog_attach(attr); + ptype = BPF_PROG_TYPE_LIRC_MODE2; + break; default: return -EINVAL; } @@ -1596,18 +1562,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); + switch (ptype) { + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + ret = sockmap_get_from_fd(attr, ptype, prog); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + ret = lirc_prog_attach(attr, prog); + break; + default: + ret = cgroup_bpf_prog_attach(attr, ptype, prog); } - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, - attr->attach_flags); if (ret) bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; } @@ -1616,9 +1584,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - struct bpf_prog *prog; - struct cgroup *cgrp; - int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1651,29 +1616,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog)) - prog = NULL; - - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); - if (prog) - bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; + return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -1681,9 +1634,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - struct cgroup *cgrp; - int ret; - if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) @@ -1711,14 +1661,9 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; + + return cgroup_bpf_prog_query(attr, uattr); } -#endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -2365,7 +2310,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; -#ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; @@ -2375,7 +2319,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr); break; -#endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; -- cgit v1.2.3 From 15bfd21fbc5d35834b9ea383dc458a1f0c9e3434 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 26 Jun 2018 09:14:58 -0600 Subject: block: Fix transfer when chunk sectors exceeds max A device may have boundary restrictions where the number of sectors between boundaries exceeds its max transfer size. In this case, we need to cap the max size to the smaller of the two limits. Reported-by: Jitendra Bhivare Tested-by: Jitendra Bhivare Cc: Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9154570edf29..79226ca8f80f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1119,8 +1119,8 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, if (!q->limits.chunk_sectors) return q->limits.max_sectors; - return q->limits.chunk_sectors - - (offset & (q->limits.chunk_sectors - 1)); + return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors - + (offset & (q->limits.chunk_sectors - 1)))); } static inline unsigned int blk_rq_get_max_sectors(struct request *rq, -- cgit v1.2.3 From 0efc8562491b7d36f6bbc4fbc8f3348cb6641e9c Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 31 May 2018 11:16:18 +0300 Subject: net/mlx5: E-Switch, Avoid setup attempt if not being e-switch manager In smartnic env, the host (PF) driver might not be an e-switch manager, hence the FW will err on driver attempts to deal with setting/unsetting the eswitch and as a result the overall setup of sriov will fail. Fix that by avoiding the operation if e-switch management is not allowed for this driver instance. While here, move to use the correct name for the esw manager capability name. Fixes: 81848731ff40 ('net/mlx5: E-Switch, Add SR-IOV (FDB) support') Signed-off-by: Or Gerlitz Reported-by: Guy Kushnir Reviewed-by: Eli Cohen Tested-by: Eli Cohen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 5 +++-- drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 7 ++++++- include/linux/mlx5/eswitch.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 2 +- 7 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 378ad74518ec..60236f73373c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -839,7 +839,7 @@ static bool mlx5e_is_vf_vport_rep(struct mlx5e_priv *priv) struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5_eswitch_rep *rep; - if (!MLX5_CAP_GEN(priv->mdev, eswitch_flow_table)) + if (!MLX5_ESWITCH_MANAGER(priv->mdev)) return false; rep = rpriv->rep; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index f63dfbcd29fe..103fd6a0cc65 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1604,7 +1604,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) if (!ESW_ALLOWED(esw)) return 0; - if (!MLX5_CAP_GEN(esw->dev, eswitch_flow_table) || + if (!MLX5_ESWITCH_MANAGER(esw->dev) || !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) { esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 49a75d31185e..f1a86cea86a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -32,6 +32,7 @@ #include #include +#include #include "mlx5_core.h" #include "fs_core.h" @@ -2652,7 +2653,7 @@ int mlx5_init_fs(struct mlx5_core_dev *dev) goto err; } - if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { + if (MLX5_ESWITCH_MANAGER(dev)) { if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, ft_support)) { err = init_fdb_root_ns(steering); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index afd9f4fa22f4..41ad24f0de2c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -32,6 +32,7 @@ #include #include +#include #include #include "mlx5_core.h" #include "../../mlxfw/mlxfw.h" @@ -159,13 +160,13 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) } if (MLX5_CAP_GEN(dev, vport_group_manager) && - MLX5_CAP_GEN(dev, eswitch_flow_table)) { + MLX5_ESWITCH_MANAGER(dev)) { err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE); if (err) return err; } - if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { + if (MLX5_ESWITCH_MANAGER(dev)) { err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 2a8b529ce6dd..a0674962f02c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -88,6 +88,9 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) return -EBUSY; } + if (!MLX5_ESWITCH_MANAGER(dev)) + goto enable_vfs_hca; + err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY); if (err) { mlx5_core_warn(dev, @@ -95,6 +98,7 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) return err; } +enable_vfs_hca: for (vf = 0; vf < num_vfs; vf++) { err = mlx5_core_enable_hca(dev, vf + 1); if (err) { @@ -140,7 +144,8 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev) } out: - mlx5_eswitch_disable_sriov(dev->priv.eswitch); + if (MLX5_ESWITCH_MANAGER(dev)) + mlx5_eswitch_disable_sriov(dev->priv.eswitch); if (mlx5_wait_for_vf_pages(dev)) mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index d3c9db492b30..fab5121ffb8f 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -8,6 +8,8 @@ #include +#define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager) + enum { SRIOV_NONE, SRIOV_LEGACY, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 27134c4fcb76..ac281f5ec9b8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -922,7 +922,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 vnic_env_queue_counters[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 eswitch_flow_table[0x1]; + u8 eswitch_manager[0x1]; u8 device_memory[0x1]; u8 mcam_reg[0x1]; u8 pcam_reg[0x1]; -- cgit v1.2.3 From 951a8ee6def39e25d0e60b9394e5a249ba8b2390 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 20:36:28 -0700 Subject: nfp: reject binding to shared blocks TC shared blocks allow multiple qdiscs to be grouped together and filters shared between them. Currently the chains of filters attached to a block are only flushed when the block is removed. If a qdisc is removed from a block but the block still exists, flow del messages are not passed to the callback registered for that qdisc. For the NFP, this presents the possibility of rules still existing in hw when they should be removed. Prevent binding to shared blocks until the kernel can send per qdisc del messages when block unbinds occur. tcf_block_shared() was not used outside of the core until now, so also add an empty implementation for builds with CONFIG_NET_CLS=n. Fixes: 4861738775d7 ("net: sched: introduce shared filter blocks infrastructure") Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 3 +++ drivers/net/ethernet/netronome/nfp/flower/offload.c | 3 +++ include/net/pkt_cls.h | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index fcdfb8e7fdea..6b15e3b11956 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -202,6 +202,9 @@ static int nfp_bpf_setup_tc_block(struct net_device *netdev, if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) return -EOPNOTSUPP; + if (tcf_block_shared(f->block)) + return -EOPNOTSUPP; + switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 477f584f6d28..525057bee0ed 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -631,6 +631,9 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) return -EOPNOTSUPP; + if (tcf_block_shared(f->block)) + return -EOPNOTSUPP; + switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3c1a2c47cd4..20b059574e60 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -111,6 +111,11 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, { } +static inline bool tcf_block_shared(struct tcf_block *block) +{ + return false; +} + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; -- cgit v1.2.3 From a11e1d432b51f63ba698d044441284a661f01144 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jun 2018 09:43:44 -0700 Subject: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL The poll() changes were not well thought out, and completely unexplained. They also caused a huge performance regression, because "->poll()" was no longer a trivial file operation that just called down to the underlying file operations, but instead did at least two indirect calls. Indirect calls are sadly slow now with the Spectre mitigation, but the performance problem could at least be largely mitigated by changing the "->get_poll_head()" operation to just have a per-file-descriptor pointer to the poll head instead. That gets rid of one of the new indirections. But that doesn't fix the new complexity that is completely unwarranted for the regular case. The (undocumented) reason for the poll() changes was some alleged AIO poll race fixing, but we don't make the common case slower and more complex for some uncommon special case, so this all really needs way more explanations and most likely a fundamental redesign. [ This revert is a revert of about 30 different commits, not reverted individually because that would just be unnecessarily messy - Linus ] Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 7 +- Documentation/filesystems/vfs.txt | 13 ---- crypto/af_alg.c | 13 +++- crypto/algif_aead.c | 4 +- crypto/algif_skcipher.c | 4 +- drivers/char/random.c | 29 ++++---- drivers/isdn/mISDN/socket.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- fs/aio.c | 148 +------------------------------------- fs/eventfd.c | 19 ++--- fs/eventpoll.c | 15 ++-- fs/pipe.c | 22 +++--- fs/select.c | 23 ------ fs/timerfd.c | 22 +++--- include/crypto/if_alg.h | 3 +- include/linux/fs.h | 2 - include/linux/net.h | 1 - include/linux/poll.h | 12 ++-- include/linux/skbuff.h | 3 +- include/net/bluetooth/bluetooth.h | 2 +- include/net/iucv/af_iucv.h | 2 + include/net/sctp/sctp.h | 3 +- include/net/tcp.h | 3 +- include/net/tls.h | 6 +- include/net/udp.h | 2 +- include/uapi/linux/aio_abi.h | 8 ++- net/appletalk/ddp.c | 2 +- net/atm/common.c | 11 ++- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 2 +- net/bluetooth/af_bluetooth.c | 7 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/l2cap_sock.c | 2 +- net/bluetooth/rfcomm/sock.c | 2 +- net/bluetooth/sco.c | 2 +- net/caif/caif_socket.c | 12 ++-- net/can/bcm.c | 2 +- net/can/raw.c | 2 +- net/core/datagram.c | 13 ++-- net/dccp/dccp.h | 3 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/dccp/proto.c | 13 +++- net/decnet/af_decnet.c | 6 +- net/ieee802154/socket.c | 4 +- net/ipv4/af_inet.c | 8 +-- net/ipv4/tcp.c | 23 ++++-- net/ipv4/udp.c | 10 +-- net/ipv6/af_inet6.c | 4 +- net/ipv6/raw.c | 4 +- net/iucv/af_iucv.c | 7 +- net/kcm/kcmsock.c | 10 +-- net/key/af_key.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- net/llc/af_llc.c | 2 +- net/netlink/af_netlink.c | 2 +- net/netrom/af_netrom.c | 2 +- net/nfc/llcp_sock.c | 9 ++- net/nfc/rawsock.c | 4 +- net/packet/af_packet.c | 9 +-- net/phonet/socket.c | 9 ++- net/qrtr/qrtr.c | 2 +- net/rose/af_rose.c | 2 +- net/rxrpc/af_rxrpc.c | 10 ++- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/sctp/socket.c | 4 +- net/smc/af_smc.c | 12 +++- net/socket.c | 48 ++----------- net/tipc/socket.c | 14 ++-- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 ++--- net/unix/af_unix.c | 30 +++++--- net/vmw_vsock/af_vsock.c | 19 +++-- net/x25/af_x25.c | 2 +- net/xdp/xsk.c | 7 +- 80 files changed, 301 insertions(+), 450 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2c391338c675..37bf0a9de75c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -441,8 +441,6 @@ prototypes: int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -473,7 +471,7 @@ prototypes: }; locking rules: - All except for ->poll_mask may block. + All may block. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -505,9 +503,6 @@ in sys_read() and friends. the lease within the individual filesystem to record the result of the operation -->poll_mask can be called with or without the waitqueue lock for the waitqueue -returned from ->get_poll_head. - --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 829a7b7857a4..f608180ad59d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -857,8 +857,6 @@ struct file_operations { ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); @@ -903,17 +901,6 @@ otherwise noted. activity on this file and (optionally) go to sleep until there is activity. Called by the select(2) and poll(2) system calls - get_poll_head: Returns the struct wait_queue_head that callers can - wait on. Callers need to check the returned events using ->poll_mask - once woken. Can return NULL to indicate polling is not supported, - or any error code using the ERR_PTR convention to indicate that a - grave error occured and ->poll_mask shall not be called. - - poll_mask: return the mask of EPOLL* values describing the file descriptor - state. Called either before going to sleep on the waitqueue returned by - get_poll_head, or after it has been woken. If ->get_poll_head and - ->poll_mask are implemented ->poll does not need to be implement. - unlocked_ioctl: called by the ioctl(2) system call. compat_ioctl: called by the ioctl(2) system call when 32 bit system calls diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 49fa8582138b..314c52c967e5 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1060,12 +1060,19 @@ void af_alg_async_cb(struct crypto_async_request *_req, int err) } EXPORT_SYMBOL_GPL(af_alg_async_cb); -__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) +/** + * af_alg_poll - poll system call handler + */ +__poll_t af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (!ctx->more || ctx->used) mask |= EPOLLIN | EPOLLRDNORM; @@ -1075,7 +1082,7 @@ __poll_t af_alg_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(af_alg_poll_mask); +EXPORT_SYMBOL_GPL(af_alg_poll); /** * af_alg_alloc_areq - allocate struct af_alg_async_req diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 825524f27438..c40a8c7ee8ae 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -375,7 +375,7 @@ static struct proto_ops algif_aead_ops = { .sendmsg = aead_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static int aead_check_key(struct socket *sock) @@ -471,7 +471,7 @@ static struct proto_ops algif_aead_ops_nokey = { .sendmsg = aead_sendmsg_nokey, .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static void *aead_bind(const char *name, u32 type, u32 mask) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 4c04eb9888ad..cfdaab2b7d76 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -206,7 +206,7 @@ static struct proto_ops algif_skcipher_ops = { .sendmsg = skcipher_sendmsg, .sendpage = af_alg_sendpage, .recvmsg = skcipher_recvmsg, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static int skcipher_check_key(struct socket *sock) @@ -302,7 +302,7 @@ static struct proto_ops algif_skcipher_ops_nokey = { .sendmsg = skcipher_sendmsg_nokey, .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, - .poll_mask = af_alg_poll_mask, + .poll = af_alg_poll, }; static void *skcipher_bind(const char *name, u32 type, u32 mask) diff --git a/drivers/char/random.c b/drivers/char/random.c index a8fb0020ba5c..cd888d4ee605 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -402,7 +402,8 @@ static struct poolinfo { /* * Static global variables */ -static DECLARE_WAIT_QUEUE_HEAD(random_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); +static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); static struct fasync_struct *fasync; static DEFINE_SPINLOCK(random_ready_list_lock); @@ -721,8 +722,8 @@ retry: /* should we wake readers? */ if (entropy_bits >= random_read_wakeup_bits && - wq_has_sleeper(&random_wait)) { - wake_up_interruptible_poll(&random_wait, POLLIN); + wq_has_sleeper(&random_read_wait)) { + wake_up_interruptible(&random_read_wait); kill_fasync(&fasync, SIGIO, POLL_IN); } /* If the input pool is getting full, send some @@ -1396,7 +1397,7 @@ retry: trace_debit_entropy(r->name, 8 * ibytes); if (ibytes && (r->entropy_count >> ENTROPY_SHIFT) < random_write_wakeup_bits) { - wake_up_interruptible_poll(&random_wait, POLLOUT); + wake_up_interruptible(&random_write_wait); kill_fasync(&fasync, SIGIO, POLL_OUT); } @@ -1838,7 +1839,7 @@ _random_read(int nonblock, char __user *buf, size_t nbytes) if (nonblock) return -EAGAIN; - wait_event_interruptible(random_wait, + wait_event_interruptible(random_read_wait, ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits); if (signal_pending(current)) @@ -1875,17 +1876,14 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) return ret; } -static struct wait_queue_head * -random_get_poll_head(struct file *file, __poll_t events) -{ - return &random_wait; -} - static __poll_t -random_poll_mask(struct file *file, __poll_t events) +random_poll(struct file *file, poll_table * wait) { - __poll_t mask = 0; + __poll_t mask; + poll_wait(file, &random_read_wait, wait); + poll_wait(file, &random_write_wait, wait); + mask = 0; if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits) mask |= EPOLLIN | EPOLLRDNORM; if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits) @@ -1992,8 +1990,7 @@ static int random_fasync(int fd, struct file *filp, int on) const struct file_operations random_fops = { .read = random_read, .write = random_write, - .get_poll_head = random_get_poll_head, - .poll_mask = random_poll_mask, + .poll = random_poll, .unlocked_ioctl = random_ioctl, .fasync = random_fasync, .llseek = noop_llseek, @@ -2326,7 +2323,7 @@ void add_hwgenerator_randomness(const char *buffer, size_t count, * We'll be woken up again once below random_write_wakeup_thresh, * or when the calling thread is about to terminate. */ - wait_event_interruptible(random_wait, kthread_should_stop() || + wait_event_interruptible(random_write_wait, kthread_should_stop() || ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits); mix_pool_bytes(poolp, buffer, count); credit_entropy_bits(poolp, entropy); diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 98f90aadd141..18c0a1281914 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -588,7 +588,7 @@ static const struct proto_ops data_sock_ops = { .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index de51e8f70f44..ce61231e96ea 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1107,7 +1107,7 @@ static const struct proto_ops pppoe_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/fs/aio.c b/fs/aio.c index e1d20124ec0e..210df9da1283 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -5,7 +5,6 @@ * Implements an efficient asynchronous io interface. * * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. - * Copyright 2018 Christoph Hellwig. * * See ../COPYING for licensing terms. */ @@ -165,22 +164,10 @@ struct fsync_iocb { bool datasync; }; -struct poll_iocb { - struct file *file; - __poll_t events; - struct wait_queue_head *head; - - union { - struct wait_queue_entry wait; - struct work_struct work; - }; -}; - struct aio_kiocb { union { struct kiocb rw; struct fsync_iocb fsync; - struct poll_iocb poll; }; struct kioctx *ki_ctx; @@ -1590,6 +1577,7 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) return -EINVAL; + req->file = fget(iocb->aio_fildes); if (unlikely(!req->file)) return -EBADF; @@ -1604,137 +1592,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) return 0; } -/* need to use list_del_init so we can check if item was present */ -static inline bool __aio_poll_remove(struct poll_iocb *req) -{ - if (list_empty(&req->wait.entry)) - return false; - list_del_init(&req->wait.entry); - return true; -} - -static inline void __aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - fput(iocb->poll.file); - aio_complete(iocb, mangle_poll(mask), 0); -} - -static void aio_poll_work(struct work_struct *work) -{ - struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, poll.work); - - if (!list_empty_careful(&iocb->ki_list)) - aio_remove_iocb(iocb); - __aio_poll_complete(iocb, iocb->poll.events); -} - -static int aio_poll_cancel(struct kiocb *iocb) -{ - struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw); - struct poll_iocb *req = &aiocb->poll; - struct wait_queue_head *head = req->head; - bool found = false; - - spin_lock(&head->lock); - found = __aio_poll_remove(req); - spin_unlock(&head->lock); - - if (found) { - req->events = 0; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); - } - return 0; -} - -static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); - struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); - struct file *file = req->file; - __poll_t mask = key_to_poll(key); - - assert_spin_locked(&req->head->lock); - - /* for instances that support it check for an event match first: */ - if (mask && !(mask & req->events)) - return 0; - - mask = file->f_op->poll_mask(file, req->events) & req->events; - if (!mask) - return 0; - - __aio_poll_remove(req); - - /* - * Try completing without a context switch if we can acquire ctx_lock - * without spinning. Otherwise we need to defer to a workqueue to - * avoid a deadlock due to the lock order. - */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del_init(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); - - __aio_poll_complete(iocb, mask); - } else { - req->events = mask; - INIT_WORK(&req->work, aio_poll_work); - schedule_work(&req->work); - } - - return 1; -} - -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) -{ - struct kioctx *ctx = aiocb->ki_ctx; - struct poll_iocb *req = &aiocb->poll; - __poll_t mask; - - /* reject any unknown events outside the normal event mask. */ - if ((u16)iocb->aio_buf != iocb->aio_buf) - return -EINVAL; - /* reject fields that are not defined for poll */ - if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags) - return -EINVAL; - - req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (!file_has_poll_mask(req->file)) - goto out_fail; - - req->head = req->file->f_op->get_poll_head(req->file, req->events); - if (!req->head) - goto out_fail; - if (IS_ERR(req->head)) { - mask = EPOLLERR; - goto done; - } - - init_waitqueue_func_entry(&req->wait, aio_poll_wake); - aiocb->ki_cancel = aio_poll_cancel; - - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - mask = req->file->f_op->poll_mask(req->file, req->events) & req->events; - if (!mask) { - __add_wait_queue(req->head, &req->wait); - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - } - spin_unlock(&req->head->lock); - spin_unlock_irq(&ctx->ctx_lock); -done: - if (mask) - __aio_poll_complete(aiocb, mask); - return 0; -out_fail: - fput(req->file); - return -EINVAL; /* same as no support for IOCB_CMD_POLL */ -} - static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { @@ -1808,9 +1665,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, case IOCB_CMD_FDSYNC: ret = aio_fsync(&req->fsync, &iocb, true); break; - case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); - break; default: pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); ret = -EINVAL; diff --git a/fs/eventfd.c b/fs/eventfd.c index ceb1031f1cac..08d3bd602f73 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -101,20 +101,14 @@ static int eventfd_release(struct inode *inode, struct file *file) return 0; } -static struct wait_queue_head * -eventfd_get_poll_head(struct file *file, __poll_t events) -{ - struct eventfd_ctx *ctx = file->private_data; - - return &ctx->wqh; -} - -static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; + poll_wait(file, &ctx->wqh, wait); + /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait @@ -156,11 +150,11 @@ static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask) count = READ_ONCE(ctx->count); if (count > 0) - events |= (EPOLLIN & eventmask); + events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) - events |= (EPOLLOUT & eventmask); + events |= EPOLLOUT; return events; } @@ -311,8 +305,7 @@ static const struct file_operations eventfd_fops = { .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, - .get_poll_head = eventfd_get_poll_head, - .poll_mask = eventfd_poll_mask, + .poll = eventfd_poll, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ea4436f409fb..67db22fe99c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -922,18 +922,14 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head return 0; } -static struct wait_queue_head *ep_eventpoll_get_poll_head(struct file *file, - __poll_t eventmask) -{ - struct eventpoll *ep = file->private_data; - return &ep->poll_wait; -} - -static __poll_t ep_eventpoll_poll_mask(struct file *file, __poll_t eventmask) +static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { struct eventpoll *ep = file->private_data; int depth = 0; + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + /* * Proceed to find out if wanted events are really available inside * the ready list. @@ -972,8 +968,7 @@ static const struct file_operations eventpoll_fops = { .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, - .get_poll_head = ep_eventpoll_get_poll_head, - .poll_mask = ep_eventpoll_poll_mask, + .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; diff --git a/fs/pipe.c b/fs/pipe.c index bb0840e234f3..39d6f431da83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -509,22 +509,19 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -static struct wait_queue_head * -pipe_get_poll_head(struct file *filp, __poll_t events) -{ - struct pipe_inode_info *pipe = filp->private_data; - - return &pipe->wait; -} - /* No kernel lock held - fine */ -static __poll_t pipe_poll_mask(struct file *filp, __poll_t events) +static __poll_t +pipe_poll(struct file *filp, poll_table *wait) { + __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs = pipe->nrbufs; - __poll_t mask = 0; + int nrbufs; + + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ + nrbufs = pipe->nrbufs; + mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) @@ -1023,8 +1020,7 @@ const struct file_operations pipefifo_fops = { .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, - .get_poll_head = pipe_get_poll_head, - .poll_mask = pipe_poll_mask, + .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, diff --git a/fs/select.c b/fs/select.c index 317891ff8165..4a6b6e4b21cb 100644 --- a/fs/select.c +++ b/fs/select.c @@ -34,29 +34,6 @@ #include -__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) -{ - if (file->f_op->poll) { - return file->f_op->poll(file, pt); - } else if (file_has_poll_mask(file)) { - unsigned int events = poll_requested_events(pt); - struct wait_queue_head *head; - - if (pt && pt->_qproc) { - head = file->f_op->get_poll_head(file, events); - if (!head) - return DEFAULT_POLLMASK; - if (IS_ERR(head)) - return EPOLLERR; - pt->_qproc(file, head, pt); - } - - return file->f_op->poll_mask(file, events); - } else { - return DEFAULT_POLLMASK; - } -} -EXPORT_SYMBOL_GPL(vfs_poll); /* * Estimate expected accuracy in ns from a timeval. diff --git a/fs/timerfd.c b/fs/timerfd.c index d84a2bee4f82..cdad49da3ff7 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -226,20 +226,21 @@ static int timerfd_release(struct inode *inode, struct file *file) kfree_rcu(ctx, rcu); return 0; } - -static struct wait_queue_head *timerfd_get_poll_head(struct file *file, - __poll_t eventmask) + +static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; + __poll_t events = 0; + unsigned long flags; - return &ctx->wqh; -} + poll_wait(file, &ctx->wqh, wait); -static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask) -{ - struct timerfd_ctx *ctx = file->private_data; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->ticks) + events |= EPOLLIN; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); - return ctx->ticks ? EPOLLIN : 0; + return events; } static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, @@ -363,8 +364,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg static const struct file_operations timerfd_fops = { .release = timerfd_release, - .get_poll_head = timerfd_get_poll_head, - .poll_mask = timerfd_poll_mask, + .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, .show_fdinfo = timerfd_show, diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index cc414db9da0a..482461d8931d 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -245,7 +245,8 @@ ssize_t af_alg_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(struct crypto_async_request *_req, int err); -__poll_t af_alg_poll_mask(struct socket *sock, __poll_t events); +__poll_t af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait); struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen); int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5c91108846db..d78d146a98da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1720,8 +1720,6 @@ struct file_operations { int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); - struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t); - __poll_t (*poll_mask) (struct file *, __poll_t); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); diff --git a/include/linux/net.h b/include/linux/net.h index 08b6eb964dd6..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -147,7 +147,6 @@ struct proto_ops { int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); - __poll_t (*poll_mask) (struct socket *sock, __poll_t events); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, diff --git a/include/linux/poll.h b/include/linux/poll.h index fdf86b4cbc71..7e0fdcf905d2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) pt->_key = ~(__poll_t)0; /* all events enabled */ } -static inline bool file_has_poll_mask(struct file *file) +static inline bool file_can_poll(struct file *file) { - return file->f_op->get_poll_head && file->f_op->poll_mask; + return file->f_op->poll; } -static inline bool file_can_poll(struct file *file) +static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { - return file->f_op->poll || file_has_poll_mask(file); + if (unlikely(!file->f_op->poll)) + return DEFAULT_POLLMASK; + return file->f_op->poll(file, pt); } -__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt); - struct poll_table_entry { struct file *filp; __poll_t key; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..164cdedf6012 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3252,7 +3252,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events); +__poll_t datagram_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 53ce8176c313..ec9d6bc65855 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -271,7 +271,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events); +__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index b0eaeb02d46d..f4c21b5a1242 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -153,6 +153,8 @@ struct iucv_sock_list { atomic_t autobind_name; }; +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait); void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s); void iucv_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 30b3e2fe240a..8c2caa370e0f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,7 +109,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events); +__poll_t sctp_poll(struct file *file, struct socket *sock, + poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); diff --git a/include/net/tcp.h b/include/net/tcp.h index 0448e7c5d2b4..800582b5dd54 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -388,7 +388,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events); +__poll_t tcp_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, diff --git a/include/net/tls.h b/include/net/tls.h index 7f84ea3e217c..70c273777fe9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -109,7 +109,8 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -224,7 +225,8 @@ void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/include/net/udp.h b/include/net/udp.h index b1ea8b0f5e6a..81afdacd4fff 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -285,7 +285,7 @@ int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); -__poll_t udp_poll_mask(struct socket *sock, __poll_t events); +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d00221345c19..d4e768d55d14 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -39,8 +39,10 @@ enum { IOCB_CMD_PWRITE = 1, IOCB_CMD_FSYNC = 2, IOCB_CMD_FDSYNC = 3, - /* 4 was the experimental IOCB_CMD_PREADX */ - IOCB_CMD_POLL = 5, + /* These two are experimental. + * IOCB_CMD_PREADX = 4, + * IOCB_CMD_POLL = 5, + */ IOCB_CMD_NOOP = 6, IOCB_CMD_PREADV = 7, IOCB_CMD_PWRITEV = 8, @@ -109,7 +111,7 @@ struct iocb { #undef IFLITTLE struct __aio_sigset { - const sigset_t __user *sigmask; + sigset_t __user *sigmask; size_t sigsetsize; }; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 55fdba05d7d9..9b6bc5abe946 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1869,7 +1869,7 @@ static const struct proto_ops atalk_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = atalk_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = atalk_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = atalk_compat_ioctl, diff --git a/net/atm/common.c b/net/atm/common.c index ff5748b2190f..a7a68e509628 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -647,11 +647,16 @@ out: return error; } -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events) +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - struct atm_vcc *vcc = ATM_SD(sock); - __poll_t mask = 0; + struct atm_vcc *vcc; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + vcc = ATM_SD(sock); /* exceptional events */ if (sk->sk_err) diff --git a/net/atm/common.h b/net/atm/common.h index 526796ad230f..5850649068bb 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -17,7 +17,7 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len); -__poll_t vcc_poll_mask(struct socket *sock, __poll_t events); +__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait); int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int vcc_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 9f75092fe778..2cb10af16afc 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -113,7 +113,7 @@ static const struct proto_ops pvc_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pvc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = vcc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vcc_compat_ioctl, diff --git a/net/atm/svc.c b/net/atm/svc.c index 53f4ad7087b1..2f91b766ac42 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -636,7 +636,7 @@ static const struct proto_ops svc_proto_ops = { .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, - .poll_mask = vcc_poll_mask, + .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d1d2442ce573..c603d33d5410 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1941,7 +1941,7 @@ static const struct proto_ops ax25_proto_ops = { .socketpair = sock_no_socketpair, .accept = ax25_accept, .getname = ax25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ax25_ioctl, .listen = ax25_listen, .shutdown = ax25_shutdown, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 510ab4f55df5..3264e1873219 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -437,13 +437,16 @@ static inline __poll_t bt_accept_poll(struct sock *parent) return 0; } -__poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t bt_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); @@ -475,7 +478,7 @@ __poll_t bt_sock_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(bt_sock_poll_mask); +EXPORT_SYMBOL(bt_sock_poll); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index d6c099861538..1506e1632394 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1975,7 +1975,7 @@ static const struct proto_ops hci_sock_ops = { .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, .ioctl = hci_sock_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = hci_sock_setsockopt, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 742a190034e6..686bdc6b35b0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1653,7 +1653,7 @@ static const struct proto_ops l2cap_sock_ops = { .getname = l2cap_sock_getname, .sendmsg = l2cap_sock_sendmsg, .recvmsg = l2cap_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 1cf57622473a..d606e9212291 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1049,7 +1049,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d60dbc61d170..413b8ee49fec 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1197,7 +1197,7 @@ static const struct proto_ops sco_sock_ops = { .getname = sco_sock_getname, .sendmsg = sco_sock_sendmsg, .recvmsg = sco_sock_recvmsg, - .poll_mask = bt_sock_poll_mask, + .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index c7991867d622..a6fb1b3bcad9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -934,11 +934,15 @@ static int caif_release(struct socket *sock) } /* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */ -static __poll_t caif_poll_mask(struct socket *sock, __poll_t events) +static __poll_t caif_poll(struct file *file, + struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - __poll_t mask = 0; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -972,7 +976,7 @@ static const struct proto_ops caif_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -993,7 +997,7 @@ static const struct proto_ops caif_stream_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = caif_poll_mask, + .poll = caif_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/bcm.c b/net/can/bcm.c index 9393f25df08d..0af8f0db892a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1660,7 +1660,7 @@ static const struct proto_ops bcm_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/can/raw.c b/net/can/raw.c index fd7e2f49ea6a..1051eee82581 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -843,7 +843,7 @@ static const struct proto_ops raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/core/datagram.c b/net/core/datagram.c index f19bf3dc2bd6..9938952c5c78 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,8 +819,9 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll + * @file: file struct * @sock: socket - * @events to wait for + * @wait: poll table * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue @@ -830,10 +831,14 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ -__poll_t datagram_poll_mask(struct socket *sock, __poll_t events) +__poll_t datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -866,4 +871,4 @@ __poll_t datagram_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(datagram_poll_mask); +EXPORT_SYMBOL(datagram_poll); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0ea2ee56ac1b..f91e3816806b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -316,7 +316,8 @@ int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events); +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a9e478cd3787..b08feb219b44 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -984,7 +984,7 @@ static const struct proto_ops inet_dccp_ops = { .accept = inet_accept, .getname = inet_getname, /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet_ioctl, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 17fc4e0166ba..6344f1b18a6a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1070,7 +1070,7 @@ static const struct proto_ops inet6_dccp_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet6_getname, - .poll_mask = dccp_poll_mask, + .poll = dccp_poll, .ioctl = inet6_ioctl, .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ca21c1c76da0..0d56e36a6db7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -312,11 +312,20 @@ int dccp_disconnect(struct sock *sk, int flags) EXPORT_SYMBOL_GPL(dccp_disconnect); -__poll_t dccp_poll_mask(struct socket *sock, __poll_t events) +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +__poll_t dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); @@ -358,7 +367,7 @@ __poll_t dccp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL_GPL(dccp_poll_mask); +EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9a686d890bfa..7d6ff983ba2c 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1207,11 +1207,11 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) } -static __poll_t dn_poll_mask(struct socket *sock, __poll_t events) +static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); if (!skb_queue_empty(&scp->other_receive_queue)) mask |= EPOLLRDBAND; @@ -2331,7 +2331,7 @@ static const struct proto_ops dn_proto_ops = { .socketpair = sock_no_socketpair, .accept = dn_accept, .getname = dn_getname, - .poll_mask = dn_poll_mask, + .poll = dn_poll, .ioctl = dn_ioctl, .listen = dn_listen, .shutdown = dn_shutdown, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a0768d2759b8..a60658c85a9a 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -423,7 +423,7 @@ static const struct proto_ops ieee802154_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -969,7 +969,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..b403499fdabe 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -986,7 +986,7 @@ const struct proto_ops inet_stream_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, - .poll_mask = tcp_poll_mask, + .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, @@ -1021,7 +1021,7 @@ const struct proto_ops inet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = udp_poll_mask, + .poll = udp_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, @@ -1042,7 +1042,7 @@ EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without - * udp_poll_mask + * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, @@ -1053,7 +1053,7 @@ static const struct proto_ops inet_sockraw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 141acd92e58a..e7b53d2a971f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -494,21 +494,32 @@ static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, } /* - * Socket is not locked. We are protected from async events by poll logic and - * correct handling of state changes made by other threads is impossible in - * any case. + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. */ -__poll_t tcp_poll_mask(struct socket *sock, __poll_t events) +__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { + __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); - __poll_t mask = 0; int state; + sock_poll_wait(file, sk_sleep(sk), wait); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); + /* Socket is not locked. We are protected from async events + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. + */ + + mask = 0; + /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a @@ -589,7 +600,7 @@ __poll_t tcp_poll_mask(struct socket *sock, __poll_t events) return mask; } -EXPORT_SYMBOL(tcp_poll_mask); +EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9bb27df4dac5..24e116ddae79 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2591,7 +2591,7 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket - * @events - events to wait for + * @wait - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd @@ -2600,23 +2600,23 @@ int compat_udp_getsockopt(struct sock *sk, int level, int optname, * but then block when reading it. Add special case code * to work around these arguably broken applications. */ -__poll_t udp_poll_mask(struct socket *sock, __poll_t events) +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ - if ((mask & EPOLLRDNORM) && !(sock->file->f_flags & O_NONBLOCK) && + if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); return mask; } -EXPORT_SYMBOL(udp_poll_mask); +EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 74f2a261e8df..9ed0eae91758 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -570,7 +570,7 @@ const struct proto_ops inet6_stream_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = inet_accept, /* ok */ .getname = inet6_getname, - .poll_mask = tcp_poll_mask, /* ok */ + .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ @@ -603,7 +603,7 @@ const struct proto_ops inet6_dgram_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = udp_poll_mask, /* ok */ + .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ce6f0d15b5dd..afc307c89d1a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1334,7 +1334,7 @@ void raw6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* Same as inet6_dgram_ops, sans udp_poll_mask. */ +/* Same as inet6_dgram_ops, sans udp_poll. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -1344,7 +1344,7 @@ const struct proto_ops inet6_sockraw_ops = { .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, - .poll_mask = datagram_poll_mask, /* ok */ + .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 68e86257a549..893a022f9620 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1488,11 +1488,14 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -static __poll_t iucv_sock_poll_mask(struct socket *sock, __poll_t events) +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); @@ -2385,7 +2388,7 @@ static const struct proto_ops iucv_sock_ops = { .getname = iucv_sock_getname, .sendmsg = iucv_sock_sendmsg, .recvmsg = iucv_sock_recvmsg, - .poll_mask = iucv_sock_poll_mask, + .poll = iucv_sock_poll, .ioctl = sock_no_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 84b7d5c6fec8..d3601d421571 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1336,9 +1336,9 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) struct list_head *head; int index = 0; - /* For SOCK_SEQPACKET sock type, datagram_poll_mask checks the sk_state, - * so we set sk_state, otherwise epoll_wait always returns right away - * with EPOLLHUP + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * EPOLLHUP */ kcm->sk.sk_state = TCP_ESTABLISHED; @@ -1903,7 +1903,7 @@ static const struct proto_ops kcm_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -1924,7 +1924,7 @@ static const struct proto_ops kcm_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = kcm_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/key/af_key.c b/net/key/af_key.c index 8bdc1cbe490a..5e1d2946ffbf 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3751,7 +3751,7 @@ static const struct proto_ops pfkey_ops = { /* Now the operations that really occur. */ .release = pfkey_release, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 181073bf6925..a9c05b2bc1b0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -613,7 +613,7 @@ static const struct proto_ops l2tp_ip_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 336e4c00abbc..957369192ca1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -754,7 +754,7 @@ static const struct proto_ops l2tp_ip6_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip6_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = inet6_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 55188382845c..e398797878a9 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1818,7 +1818,7 @@ static const struct proto_ops pppol2tp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 804de8490186..1beeea9549fa 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1192,7 +1192,7 @@ static const struct proto_ops llc_ui_ops = { .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1189b84413d5..393573a99a5a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2658,7 +2658,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 93fbcafbf388..03f37c4e64fe 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1355,7 +1355,7 @@ static const struct proto_ops nr_proto_ops = { .socketpair = sock_no_socketpair, .accept = nr_accept, .getname = nr_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = nr_ioctl, .listen = nr_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ab5bb14b49af..ea0c0c6f1874 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -548,13 +548,16 @@ static inline __poll_t llcp_accept_poll(struct sock *parent) return 0; } -static __poll_t llcp_sock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); @@ -896,7 +899,7 @@ static const struct proto_ops llcp_sock_ops = { .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, @@ -916,7 +919,7 @@ static const struct proto_ops llcp_rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, - .poll_mask = llcp_sock_poll_mask, + .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 60c322531c49..e2188deb08dc 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -284,7 +284,7 @@ static const struct proto_ops rawsock_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -304,7 +304,7 @@ static const struct proto_ops rawsock_raw_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ff8e7e245c37..57634bc3da74 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4076,11 +4076,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -static __poll_t packet_poll_mask(struct socket *sock, __poll_t events) +static __poll_t packet_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); - __poll_t mask = datagram_poll_mask(sock, events); + __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { @@ -4422,7 +4423,7 @@ static const struct proto_ops packet_ops_spkt = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -4443,7 +4444,7 @@ static const struct proto_ops packet_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, - .poll_mask = packet_poll_mask, + .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index c295c4e20f01..30187990257f 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -340,12 +340,15 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_pn); } -static __poll_t pn_socket_poll_mask(struct socket *sock, __poll_t events) +static __poll_t pn_socket_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); __poll_t mask = 0; + poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == TCP_CLOSE) return EPOLLERR; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -445,7 +448,7 @@ const struct proto_ops phonet_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -470,7 +473,7 @@ const struct proto_ops phonet_stream_ops = { .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, - .poll_mask = pn_socket_poll_mask, + .poll = pn_socket_poll, .ioctl = pn_socket_ioctl, .listen = pn_socket_listen, .shutdown = sock_no_shutdown, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 1b5025ea5b04..2aa07b547b16 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1023,7 +1023,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ebe42e7eb456..d00a0ef39a56 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1470,7 +1470,7 @@ static const struct proto_ops rose_proto_ops = { .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = rose_ioctl, .listen = rose_listen, .shutdown = sock_no_shutdown, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3b1ac93efee2..2b463047dd7b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -734,11 +734,15 @@ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, /* * permit an RxRPC socket to be polled */ -static __poll_t rxrpc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t rxrpc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ @@ -945,7 +949,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = rxrpc_poll_mask, + .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7339918a805d..0cd2e764f47f 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1010,7 +1010,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5dffbc493008..67f73d3a1356 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1016,7 +1016,7 @@ static const struct proto_ops inet_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ - .poll_mask = sctp_poll_mask, + .poll = sctp_poll, .ioctl = inet_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d20f7addee19..ce620e878538 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7717,12 +7717,14 @@ out: * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ -__poll_t sctp_poll_mask(struct socket *sock, __poll_t events) +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; + poll_wait(file, sk_sleep(sk), wait); + sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..973b4471b532 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1273,7 +1273,8 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -1289,7 +1290,7 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ release_sock(sk); - mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); lock_sock(sk); sk->sk_err = smc->clcsock->sk->sk_err; if (sk->sk_err) { @@ -1307,6 +1308,11 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) } } } else { + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1619,7 +1625,7 @@ static const struct proto_ops smc_sock_ops = { .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, - .poll_mask = smc_poll_mask, + .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, diff --git a/net/socket.c b/net/socket.c index 8a109012608a..a564c6ed19d5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -117,10 +117,8 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from); static int sock_mmap(struct file *file, struct vm_area_struct *vma); static int sock_close(struct inode *inode, struct file *file); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events); -static __poll_t sock_poll_mask(struct file *file, __poll_t); -static __poll_t sock_poll(struct file *file, struct poll_table_struct *wait); +static __poll_t sock_poll(struct file *file, + struct poll_table_struct *wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT static long compat_sock_ioctl(struct file *file, @@ -143,8 +141,6 @@ static const struct file_operations socket_file_ops = { .llseek = no_llseek, .read_iter = sock_read_iter, .write_iter = sock_write_iter, - .get_poll_head = sock_get_poll_head, - .poll_mask = sock_poll_mask, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, #ifdef CONFIG_COMPAT @@ -1130,48 +1126,14 @@ out_release: } EXPORT_SYMBOL(sock_create_lite); -static struct wait_queue_head *sock_get_poll_head(struct file *file, - __poll_t events) -{ - struct socket *sock = file->private_data; - - if (!sock->ops->poll_mask) - return NULL; - sock_poll_busy_loop(sock, events); - return sk_sleep(sock->sk); -} - -static __poll_t sock_poll_mask(struct file *file, __poll_t events) -{ - struct socket *sock = file->private_data; - - /* - * We need to be sure we are in sync with the socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - - /* this socket can poll_ll so tell the system call */ - return sock->ops->poll_mask(sock, events) | - (sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0); -} - /* No kernel lock held - perfect */ static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; - __poll_t events = poll_requested_events(wait), mask = 0; - - if (sock->ops->poll) { - sock_poll_busy_loop(sock, events); - mask = sock->ops->poll(file, sock, wait); - } else if (sock->ops->poll_mask) { - sock_poll_wait(file, sock_get_poll_head(file, events), wait); - mask = sock->ops->poll_mask(sock, events); - } + __poll_t events = poll_requested_events(wait); - return mask | sock_poll_busy_flag(sock); + sock_poll_busy_loop(sock, events); + return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 14a5d055717d..930852c54d7a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -692,9 +692,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, } /** - * tipc_poll - read pollmask + * tipc_poll - read and possibly block on pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits + * @wait: ??? * * Returns pollmask value * @@ -708,12 +709,15 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, * imply that the operation will succeed, merely that it should be performed * and will not block. */ -static __poll_t tipc_poll_mask(struct socket *sock, __poll_t events) +static __poll_t tipc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) @@ -3033,7 +3037,7 @@ static const struct proto_ops msg_ops = { .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = sock_no_listen, .shutdown = tipc_shutdown, @@ -3054,7 +3058,7 @@ static const struct proto_ops packet_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, @@ -3075,7 +3079,7 @@ static const struct proto_ops stream_ops = { .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, - .poll_mask = tipc_poll_mask, + .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a127d61e8af9..301f22430469 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; + tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f127fac88acf..d2380548f8f6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -919,22 +919,23 @@ splice_read_end: return copied ? : err; } -__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { + unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - __poll_t mask; - /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ - mask = ctx->sk_poll_mask(sock, events); + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); - /* Clear EPOLLIN bits, and set based on recv_pkt */ - mask &= ~(EPOLLIN | EPOLLRDNORM); + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); if (ctx->recv_pkt) - mask |= EPOLLIN | EPOLLRDNORM; + ret |= POLLIN | POLLRDNORM; - return mask; + return ret; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1191,7 +1192,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; strp_check_rcv(&sw_ctx_rx->strp); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 95b02a71fd47..e5473c03d667 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll_mask(struct socket *, __poll_t); -static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); +static __poll_t unix_poll(struct file *, struct socket *, poll_table *); +static __poll_t unix_dgram_poll(struct file *, struct socket *, + poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_poll_mask, + .poll = unix_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) return mask; } -static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk, *other; - int writable; - __poll_t mask = 0; + unsigned int writable; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) } /* No write status requested, avoid expensive OUT tests. */ - if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) + if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bb5d5fa68c35..c1076c19b858 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -850,11 +850,18 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } -static __poll_t vsock_poll_mask(struct socket *sock, __poll_t events) +static __poll_t vsock_poll(struct file *file, struct socket *sock, + poll_table *wait) { - struct sock *sk = sock->sk; - struct vsock_sock *vsk = vsock_sk(sk); - __poll_t mask = 0; + struct sock *sk; + __poll_t mask; + struct vsock_sock *vsk; + + sk = sock->sk; + vsk = vsock_sk(sk); + + poll_wait(file, sk_sleep(sk), wait); + mask = 0; if (sk->sk_err) /* Signify that there has been an error on this socket. */ @@ -1084,7 +1091,7 @@ static const struct proto_ops vsock_dgram_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, @@ -1842,7 +1849,7 @@ static const struct proto_ops vsock_stream_ops = { .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, - .poll_mask = vsock_poll_mask, + .poll = vsock_poll, .ioctl = sock_no_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index f93365ae0fdd..d49aa79b7997 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1750,7 +1750,7 @@ static const struct proto_ops x25_proto_ops = { .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, - .poll_mask = datagram_poll_mask, + .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b3410ada097..59fb7d3c36a3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -303,9 +303,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } -static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events) +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) { - __poll_t mask = datagram_poll_mask(sock, events); + unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -696,7 +697,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll_mask = xsk_poll_mask, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit v1.2.3 From d50d82faa0c964e31f7a946ba8aba7c715ca7ab0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 27 Jun 2018 23:26:09 -0700 Subject: slub: fix failure when we delete and create a slab cache In kernel 4.17 I removed some code from dm-bufio that did slab cache merging (commit 21bb13276768: "dm bufio: remove code that merges slab caches") - both slab and slub support merging caches with identical attributes, so dm-bufio now just calls kmem_cache_create and relies on implicit merging. This uncovered a bug in the slub subsystem - if we delete a cache and immediatelly create another cache with the same attributes, it fails because of duplicate filename in /sys/kernel/slab/. The slub subsystem offloads freeing the cache to a workqueue - and if we create the new cache before the workqueue runs, it complains because of duplicate filename in sysfs. This patch fixes the bug by moving the call of kobject_del from sysfs_slab_remove_workfn to shutdown_cache. kobject_del must be called while we hold slab_mutex - so that the sysfs entry is deleted before a cache with the same attributes could be created. Running device-mapper-test-suite with: dmtest run --suite thin-provisioning -n /commit_failure_causes_fallback/ triggered: Buffer I/O error on dev dm-0, logical block 1572848, async page read device-mapper: thin: 253:1: metadata operation 'dm_pool_alloc_data_block' failed: error = -5 device-mapper: thin: 253:1: aborting current metadata transaction sysfs: cannot create duplicate filename '/kernel/slab/:a-0000144' CPU: 2 PID: 1037 Comm: kworker/u48:1 Not tainted 4.17.0.snitm+ #25 Hardware name: Supermicro SYS-1029P-WTR/X11DDW-L, BIOS 2.0a 12/06/2017 Workqueue: dm-thin do_worker [dm_thin_pool] Call Trace: dump_stack+0x5a/0x73 sysfs_warn_dup+0x58/0x70 sysfs_create_dir_ns+0x77/0x80 kobject_add_internal+0xba/0x2e0 kobject_init_and_add+0x70/0xb0 sysfs_slab_add+0xb1/0x250 __kmem_cache_create+0x116/0x150 create_cache+0xd9/0x1f0 kmem_cache_create_usercopy+0x1c1/0x250 kmem_cache_create+0x18/0x20 dm_bufio_client_create+0x1ae/0x410 [dm_bufio] dm_block_manager_create+0x5e/0x90 [dm_persistent_data] __create_persistent_data_objects+0x38/0x940 [dm_thin_pool] dm_pool_abort_metadata+0x64/0x90 [dm_thin_pool] metadata_operation_failed+0x59/0x100 [dm_thin_pool] alloc_data_block.isra.53+0x86/0x180 [dm_thin_pool] process_cell+0x2a3/0x550 [dm_thin_pool] do_worker+0x28d/0x8f0 [dm_thin_pool] process_one_work+0x171/0x370 worker_thread+0x49/0x3f0 kthread+0xf8/0x130 ret_from_fork+0x35/0x40 kobject_add_internal failed for :a-0000144 with -EEXIST, don't try to register things with the same name in the same directory. kmem_cache_create(dm_bufio_buffer-16) failed with error -17 Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1806151817130.6333@file01.intranet.prod.int.rdu2.redhat.com Signed-off-by: Mikulas Patocka Reported-by: Mike Snitzer Tested-by: Mike Snitzer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 4 ++++ mm/slab_common.c | 4 ++++ mm/slub.c | 7 ++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 09fa2c6f0e68..3a1a1dbc6f49 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -155,8 +155,12 @@ struct kmem_cache { #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS +void sysfs_slab_unlink(struct kmem_cache *); void sysfs_slab_release(struct kmem_cache *); #else +static inline void sysfs_slab_unlink(struct kmem_cache *s) +{ +} static inline void sysfs_slab_release(struct kmem_cache *s) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index 890b1f04a03a..2296caf87bfb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -567,10 +567,14 @@ static int shutdown_cache(struct kmem_cache *s) list_del(&s->list); if (s->flags & SLAB_TYPESAFE_BY_RCU) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_unlink(s); +#endif list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { #ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_unlink(s); sysfs_slab_release(s); #else slab_kmem_cache_release(s); diff --git a/mm/slub.c b/mm/slub.c index a3b8467c14af..51258eff4178 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5667,7 +5667,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); out: kobject_put(&s->kobj); } @@ -5752,6 +5751,12 @@ static void sysfs_slab_remove(struct kmem_cache *s) schedule_work(&s->kobj_remove_work); } +void sysfs_slab_unlink(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_del(&s->kobj); +} + void sysfs_slab_release(struct kmem_cache *s) { if (slab_state >= FULL) -- cgit v1.2.3 From f77bc3a82ce58fe7977a11f28e0b6cac8a9e087c Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Wed, 27 Jun 2018 23:26:17 -0700 Subject: include/linux/dax.h: dax_iomap_fault() returns vm_fault_t Commit 1c8f422059ae ("mm: change return type to vm_fault_t") missed a conversion. It's not a big problem at present because mainline is still using typedef int vm_fault_t; Fixes: 1c8f422059ae ("mm: change return type to vm_fault_t") Link: http://lkml.kernel.org/r/20180620172046.GA27894@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dax.h b/include/linux/dax.h index 3855e3800f48..deb0f663252f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -135,7 +135,7 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t pfn); -- cgit v1.2.3 From 4c79579b44b1876444f4d04de31c1a37098a0350 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jun 2018 16:21:18 -0700 Subject: bpf: Change bpf_fib_lookup to return lookup status For ACLs implemented using either FIB rules or FIB entries, the BPF program needs the FIB lookup status to be able to drop the packet. Since the bpf_fib_lookup API has not reached a released kernel yet, change the return code to contain an encoding of the FIB lookup result and return the nexthop device index in the params struct. In addition, inform the BPF program of any post FIB lookup reason as to why the packet needs to go up the stack. The fib result for unicast routes must have an egress device, so remove the check that it is non-NULL. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 28 ++++++++++++--- net/core/filter.c | 86 +++++++++++++++++++++++++++++----------------- samples/bpf/xdp_fwd_kern.c | 8 ++--- 3 files changed, 81 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59b19b6a40d7..b7db3261c62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1857,7 +1857,8 @@ union bpf_attr { * is resolved), the nexthop address is returned in ipv4_dst * or ipv6_dst based on family, smac is set to mac address of * egress device, dmac is set to nexthop mac address, rt_metric - * is set to metric from route (IPv4/IPv6 only). + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -1873,9 +1874,10 @@ union bpf_attr { * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. * Return - * Egress device index on success, 0 if packet needs to continue - * up the stack for further processing or a negative error in case - * of failure. + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * * packet is not forwarded or needs assist from full stack * * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * Description @@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_OUTPUT BIT(1) +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + struct bpf_fib_lookup { /* input: network family for lookup (AF_INET, AF_INET6) * output: network family of egress nexthop @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { /* total length of packet from network header - used for MTU check */ __u16 tot_len; - __u32 ifindex; /* L3 device index for lookup */ + + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; union { /* inputs to lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index e7f12e9f598c..0ca6907d7efe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + params->ifindex = dev->ifindex; - return dev->ifindex; + return 0; } #endif @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = fib_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } - if (err || res.type != RTN_UNICAST) - return 0; + if (err) { + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); @@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ if (nh->nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nh->nh_dev; - if (unlikely(!dev)) - return 0; - if (nh->nh_gw) params->ipv4_dst = nh->nh_gw; @@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; @@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); } else { @@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, } if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; + + if (unlikely(f6i->fib6_flags & RTF_REJECT)) { + switch (f6i->fib6_type) { + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + } - if (unlikely(f6i->fib6_flags & RTF_REJECT || - f6i->fib6_type != RTN_UNICAST)) - return 0; + if (f6i->fib6_type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, @@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } if (f6i->fib6_nh.nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (f6i->fib6_flags & RTF_GATEWAY) *dst = f6i->fib6_nh.nh_gw; @@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); - int index = -EAFNOSUPPORT; + int rc = -EAFNOSUPPORT; if (plen < sizeof(*params)) return -EINVAL; @@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - index = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, false); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - index = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, false); break; #endif } - if (index > 0) { + if (!rc) { struct net_device *dev; - dev = dev_get_by_index_rcu(net, index); + dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) - index = 0; + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; } - return index; + return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c index 6673cdb9f55c..a7e94e7ff87d 100644 --- a/samples/bpf/xdp_fwd_kern.c +++ b/samples/bpf/xdp_fwd_kern.c @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) struct ethhdr *eth = data; struct ipv6hdr *ip6h; struct iphdr *iph; - int out_index; u16 h_proto; u64 nh_off; + int rc; nh_off = sizeof(*eth); if (data + nh_off > data_end) @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) fib_params.ifindex = ctx->ingress_ifindex; - out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); /* verify egress index has xdp support * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) * NOTE: without verification that egress index supports XDP * forwarding packets are dropped. */ - if (out_index > 0) { + if (rc == 0) { if (h_proto == htons(ETH_P_IP)) ip_decrease_ttl(iph); else if (h_proto == htons(ETH_P_IPV6)) @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); memcpy(eth->h_source, fib_params.smac, ETH_ALEN); - return bpf_redirect_map(&tx_port, out_index, 0); + return bpf_redirect_map(&tx_port, fib_params.ifindex, 0); } return XDP_PASS; -- cgit v1.2.3 From 2cd3ae2129736f9019130064d09713a375870941 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 29 Jun 2018 15:37:25 +0200 Subject: aio: mark __aio_sigset::sigmask const io_pgetevents() will not change the signal mask. Mark it const to make it clear and to reduce the need for casts in user code. Reviewed-by: Christoph Hellwig Signed-off-by: Avi Kivity Signed-off-by: Al Viro [hch: reapply the patch that got incorrectly reverted] Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/uapi/linux/aio_abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d4e768d55d14..3c5038b587ba 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -111,7 +111,7 @@ struct iocb { #undef IFLITTLE struct __aio_sigset { - sigset_t __user *sigmask; + const sigset_t __user *sigmask; size_t sigsetsize; }; -- cgit v1.2.3 From 9544bc5347207a68eb308cc8aaaed6c3a687cabd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Jun 2018 08:48:06 -0600 Subject: sg: remove ->sg_magic member This was introduced more than a decade ago when sg chaining was added, but we never really caught anything with it. The scatterlist entry size can be critical, since drivers allocate it, so remove the magic member. Recently it's been triggering allocation stalls and failures in NVMe. Tested-by: Jordan Glover Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/gpu/drm/i915/i915_drv.h | 3 --- include/linux/scatterlist.h | 18 ------------------ lib/scatterlist.c | 6 ------ tools/virtio/linux/scatterlist.h | 18 ------------------ 4 files changed, 45 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 34c125e2d90c..9180f67746b4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2238,9 +2238,6 @@ static inline struct scatterlist *____sg_next(struct scatterlist *sg) **/ static inline struct scatterlist *__sg_next(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif return sg_is_last(sg) ? NULL : ____sg_next(sg); } diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 51f52020ad5f..093aa57120b0 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -9,9 +9,6 @@ #include struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif unsigned long page_link; unsigned int offset; unsigned int length; @@ -64,7 +61,6 @@ struct sg_table { * */ -#define SG_MAGIC 0x87654321 #define SG_CHAIN 0x01UL #define SG_END 0x02UL @@ -98,7 +94,6 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) */ BUG_ON((unsigned long) page & (SG_CHAIN | SG_END)); #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; @@ -129,7 +124,6 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page, static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END)); @@ -195,9 +189,6 @@ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, **/ static inline void sg_mark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif /* * Set termination bit, clear potential chain bit */ @@ -215,9 +206,6 @@ static inline void sg_mark_end(struct scatterlist *sg) **/ static inline void sg_unmark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif sg->page_link &= ~SG_END; } @@ -260,12 +248,6 @@ static inline void *sg_virt(struct scatterlist *sg) static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { -#ifdef CONFIG_DEBUG_SG - unsigned int i; - - for (i = 0; i < nents; i++) - sgl[i].sg_magic = SG_MAGIC; -#endif sg_mark_end(&sgl[nents - 1]); } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 06dad7a072fd..d4ae67d6cd1e 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -24,9 +24,6 @@ **/ struct scatterlist *sg_next(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif if (sg_is_last(sg)) return NULL; @@ -111,10 +108,7 @@ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) for_each_sg(sgl, sg, nents, i) ret = sg; -#ifdef CONFIG_DEBUG_SG - BUG_ON(sgl[0].sg_magic != SG_MAGIC); BUG_ON(!sg_is_last(ret)); -#endif return ret; } EXPORT_SYMBOL(sg_last); diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h index 9a45f90e2d08..369ee308b668 100644 --- a/tools/virtio/linux/scatterlist.h +++ b/tools/virtio/linux/scatterlist.h @@ -36,7 +36,6 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) */ BUG_ON((unsigned long) page & 0x03); #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; @@ -67,7 +66,6 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page, static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~0x3); @@ -116,9 +114,6 @@ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, **/ static inline void sg_mark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif /* * Set termination bit, clear potential chain bit */ @@ -136,17 +131,11 @@ static inline void sg_mark_end(struct scatterlist *sg) **/ static inline void sg_unmark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif sg->page_link &= ~0x02; } static inline struct scatterlist *sg_next(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif if (sg_is_last(sg)) return NULL; @@ -160,13 +149,6 @@ static inline struct scatterlist *sg_next(struct scatterlist *sg) static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); -#ifdef CONFIG_DEBUG_SG - { - unsigned int i; - for (i = 0; i < nents; i++) - sgl[i].sg_magic = SG_MAGIC; - } -#endif sg_mark_end(&sgl[nents - 1]); } -- cgit v1.2.3 From 85782e037f8aba8922dadb24a1523ca0b82ab8bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 28 Jun 2018 23:34:59 +0200 Subject: bpf: undo prog rejection on read-only lock failure Partially undo commit 9facc336876f ("bpf: reject any prog that failed read-only lock") since it caused a regression, that is, syzkaller was able to manage to cause a panic via fault injection deep in set_memory_ro() path by letting an allocation fail: In x86's __change_page_attr_set_clr() it was able to change the attributes of the primary mapping but not in the alias mapping via cpa_process_alias(), so the second, inner call to the __change_page_attr() via __change_page_attr_set_clr() had to split a larger page and failed in the alloc_pages() with the artifically triggered allocation error which is then propagated down to the call site. Thus, for set_memory_ro() this means that it returned with an error, but from debugging a probe_kernel_write() revealed EFAULT on that memory since the primary mapping succeeded to get changed. Therefore the subsequent hdr->locked = 0 reset triggered the panic as it was performed on read-only memory, so call-site assumptions were infact wrong to assume that it would either succeed /or/ not succeed at all since there's no such rollback in set_memory_*() calls from partial change of mappings, in other words, we're left in a state that is "half done". A later undo via set_memory_rw() is succeeding though due to matching permissions on that part (aka due to the try_preserve_large_page() succeeding). While reproducing locally with explicitly triggering this error, the initial splitting only happens on rare occasions and in real world it would additionally need oom conditions, but that said, it could partially fail. Therefore, it is definitely wrong to bail out on set_memory_ro() error and reject the program with the set_memory_*() semantics we have today. Shouldn't have gone the extra mile since no other user in tree today infact checks for any set_memory_*() errors, e.g. neither module_enable_ro() / module_disable_ro() for module RO/NX handling which is mostly default these days nor kprobes core with alloc_insn_page() / free_insn_page() as examples that could be invoked long after bootup and original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") did neither when it got first introduced to BPF so "improving" with bailing out was clearly not right when set_memory_*() cannot handle it today. Kees suggested that if set_memory_*() can fail, we should annotate it with __must_check, and all callers need to deal with it gracefully given those set_memory_*() markings aren't "advisory", but they're expected to actually do what they say. This might be an option worth to move forward in future but would at the same time require that set_memory_*() calls from supporting archs are guaranteed to be "atomic" in that they provide rollback if part of the range fails, once that happened, the transition from RW -> RO could be made more robust that way, while subsequent RO -> RW transition /must/ continue guaranteeing to always succeed the undo part. Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock") Cc: Laura Abbott Cc: Kees Cook Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 56 ++++++++------------------------------------------ kernel/bpf/core.c | 30 +-------------------------- 2 files changed, 9 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20f2659dd829..300baad62c88 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -470,9 +470,7 @@ struct sock_fprog_kern { }; struct bpf_binary_header { - u16 pages; - u16 locked:1; - + u32 pages; /* Some arches need word alignment for their instructions */ u8 image[] __aligned(4); }; @@ -481,7 +479,7 @@ struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ - locked:1, /* Program image locked? */ + undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ @@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - fp->locked = 1; - if (set_memory_ro((unsigned long)fp, fp->pages)) - fp->locked = 0; -#endif + fp->undo_set_mem = 1; + set_memory_ro((unsigned long)fp, fp->pages); } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - if (fp->locked) { - WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); - /* In case set_memory_rw() fails, we want to be the first - * to crash here instead of some random place later on. - */ - fp->locked = 0; - } -#endif + if (fp->undo_set_mem) + set_memory_rw((unsigned long)fp, fp->pages); } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - hdr->locked = 1; - if (set_memory_ro((unsigned long)hdr, hdr->pages)) - hdr->locked = 0; -#endif + set_memory_ro((unsigned long)hdr, hdr->pages); } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - if (hdr->locked) { - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); - /* In case set_memory_rw() fails, we want to be the first - * to crash here instead of some random place later on. - */ - hdr->locked = 0; - } -#endif + set_memory_rw((unsigned long)hdr, hdr->pages); } static inline struct bpf_binary_header * @@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp) return (void *)addr; } -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) -{ - if (!fp->locked) - return -ENOLCK; - if (fp->jited) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - - if (!hdr->locked) - return -ENOLCK; - } - - return 0; -} -#endif - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a9e6c04d0f4a..1e5625d46414 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; - hdr->locked = 0; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) -{ -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - int i, err; - - for (i = 0; i < fp->aux->func_cnt; i++) { - err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); - if (err) - return err; - } - - return bpf_prog_check_pages_ro_single(fp); -#endif - return 0; -} - static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -1524,17 +1506,7 @@ finalize: * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - if (*err) - return fp; - - /* Checkpoint: at this point onwards any cBPF -> eBPF or - * native eBPF program is read-only. If we failed to change - * the page attributes (e.g. allocation failure from - * splitting large pages), then reject the whole program - * in order to guarantee not ending up with any W+X pages - * from BPF side in kernel. - */ - *err = bpf_prog_check_pages_ro_locked(fp); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -- cgit v1.2.3 From 55c5e0c602c20cb6f350e5ae357cfd7e04ebb189 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 2 Jun 2018 11:02:02 -0300 Subject: dt-bindings: clock: imx6ul: Do not change the clock definition order Commit f5a4670de966 ("clk: imx: Add new clo01 and clo2 controlled by CCOSR") introduced the CLK_CLKO definitions, but didn't put them at the end of the list, which may cause dtb breakage when running an old dtb with a newer kernel. In order to avoid that, simply add the new CLK_CKO clock definitions at the end of the list. Fixes: f5a4670de966 ("clk: imx: Add new clo01 and clo2 controlled by CCOSR") Reported-by: Stefan Wahren Signed-off-by: Fabio Estevam Acked-by: Rob Herring Reviewed-by: Stefan Agner Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/imx6ul-clock.h | 40 +++++++++++++++----------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/imx6ul-clock.h b/include/dt-bindings/clock/imx6ul-clock.h index 9564597cbfac..0aa1d9c3e0b9 100644 --- a/include/dt-bindings/clock/imx6ul-clock.h +++ b/include/dt-bindings/clock/imx6ul-clock.h @@ -235,27 +235,25 @@ #define IMX6UL_CLK_CSI_PODF 222 #define IMX6UL_CLK_PLL3_120M 223 #define IMX6UL_CLK_KPP 224 -#define IMX6UL_CLK_CKO1_SEL 225 -#define IMX6UL_CLK_CKO1_PODF 226 -#define IMX6UL_CLK_CKO1 227 -#define IMX6UL_CLK_CKO2_SEL 228 -#define IMX6UL_CLK_CKO2_PODF 229 -#define IMX6UL_CLK_CKO2 230 -#define IMX6UL_CLK_CKO 231 - -/* For i.MX6ULL */ -#define IMX6ULL_CLK_ESAI_PRED 232 -#define IMX6ULL_CLK_ESAI_PODF 233 -#define IMX6ULL_CLK_ESAI_EXTAL 234 -#define IMX6ULL_CLK_ESAI_MEM 235 -#define IMX6ULL_CLK_ESAI_IPG 236 -#define IMX6ULL_CLK_DCP_CLK 237 -#define IMX6ULL_CLK_EPDC_PRE_SEL 238 -#define IMX6ULL_CLK_EPDC_SEL 239 -#define IMX6ULL_CLK_EPDC_PODF 240 -#define IMX6ULL_CLK_EPDC_ACLK 241 -#define IMX6ULL_CLK_EPDC_PIX 242 -#define IMX6ULL_CLK_ESAI_SEL 243 +#define IMX6ULL_CLK_ESAI_PRED 225 +#define IMX6ULL_CLK_ESAI_PODF 226 +#define IMX6ULL_CLK_ESAI_EXTAL 227 +#define IMX6ULL_CLK_ESAI_MEM 228 +#define IMX6ULL_CLK_ESAI_IPG 229 +#define IMX6ULL_CLK_DCP_CLK 230 +#define IMX6ULL_CLK_EPDC_PRE_SEL 231 +#define IMX6ULL_CLK_EPDC_SEL 232 +#define IMX6ULL_CLK_EPDC_PODF 233 +#define IMX6ULL_CLK_EPDC_ACLK 234 +#define IMX6ULL_CLK_EPDC_PIX 235 +#define IMX6ULL_CLK_ESAI_SEL 236 +#define IMX6UL_CLK_CKO1_SEL 237 +#define IMX6UL_CLK_CKO1_PODF 238 +#define IMX6UL_CLK_CKO1 239 +#define IMX6UL_CLK_CKO2_SEL 240 +#define IMX6UL_CLK_CKO2_PODF 241 +#define IMX6UL_CLK_CKO2 242 +#define IMX6UL_CLK_CKO 243 #define IMX6UL_CLK_END 244 #endif /* __DT_BINDINGS_CLOCK_IMX6UL_H */ -- cgit v1.2.3 From 603d4cf8fe095b1ee78f423d514427be507fb513 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Sat, 30 Jun 2018 17:38:55 +0200 Subject: net: fix use-after-free in GRO with ESP Since the addition of GRO for ESP, gro_receive can consume the skb and return -EINPROGRESS. In that case, the lower layer GRO handler cannot touch the skb anymore. Commit 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") converted some of the gro_receive handlers that can lead to ESP's gro_receive so that they wouldn't access the skb when -EINPROGRESS is returned, but missed other spots, mainly in tunneling protocols. This patch finishes the conversion to using skb_gro_flush_final(), and adds a new helper, skb_gro_flush_final_remcsum(), used in VXLAN and GUE. Fixes: 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 4 +--- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/8021q/vlan.c | 2 +- net/ipv4/fou.c | 4 +--- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- 7 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 750eaa53bf0c..ada33c2d9ac2 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -476,7 +476,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aee0e60471f1..f6bb1d54d4bd 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -623,9 +623,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, flush = 0; out: - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..3d0cc0b5cec2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2789,11 +2789,31 @@ static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff **pp, + int flush, + struct gro_remcsum *grc) +{ + if (PTR_ERR(pp) != -EINPROGRESS) { + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; + } +} #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff **pp, + int flush, + struct gro_remcsum *grc) +{ + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; +} #endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 73a65789271b..8ccee3d01822 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -693,7 +693,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..c9ec1603666b 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -448,9 +448,7 @@ next_proto: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..6a7d980105f6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..69c54540d5b4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -394,7 +394,7 @@ unflush: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); -- cgit v1.2.3 From 240630e61870e62e39a97225048f9945848fa5f5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 1 Jul 2018 12:15:46 +0200 Subject: ahci: Disable LPM on Lenovo 50 series laptops with a too old BIOS There have been several reports of LPM related hard freezes about once a day on multiple Lenovo 50 series models. Strange enough these reports where not disk model specific as LPM issues usually are and some users with the exact same disk + laptop where seeing them while other users where not seeing these issues. It turns out that enabling LPM triggers a firmware bug somewhere, which has been fixed in later BIOS versions. This commit adds a new ahci_broken_lpm() function and a new ATA_FLAG_NO_LPM for dealing with this. The ahci_broken_lpm() function contains DMI match info for the 4 models which are known to be affected by this and the DMI BIOS date field for known good BIOS versions. If the BIOS date is older then the one in the table LPM will be disabled and a warning will be printed. Note the BIOS dates are for known good versions, some older versions may work too, but we don't know for sure, the table is using dates from BIOS versions for which users have confirmed that upgrading to that version makes the problem go away. Unfortunately I've been unable to get hold of the reporter who reported that BIOS version 2.35 fixed the problems on the W541 for him. I've been able to verify the DMI_SYS_VENDOR and DMI_PRODUCT_VERSION from an older dmidecode, but I don't know the exact BIOS date as reported in the DMI. Lenovo keeps a changelog with dates in their release notes, but the dates there are the release dates not the build dates which are in DMI. So I've chosen to set the date to which we compare to one day past the release date of the 2.34 BIOS. I plan to fix this with a follow up commit once I've the necessary info. Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/ata/libata-core.c | 3 +++ include/linux/libata.h | 1 + 3 files changed, 63 insertions(+) (limited to 'include') diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 738fb22978dd..fdeb3b4d0f4a 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1280,6 +1280,59 @@ static bool ahci_broken_suspend(struct pci_dev *pdev) return strcmp(buf, dmi->driver_data) < 0; } +static bool ahci_broken_lpm(struct pci_dev *pdev) +{ + static const struct dmi_system_id sysids[] = { + /* Various Lenovo 50 series have LPM issues with older BIOSen */ + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X250"), + }, + .driver_data = "20180406", /* 1.31 */ + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad L450"), + }, + .driver_data = "20180420", /* 1.28 */ + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T450s"), + }, + .driver_data = "20180315", /* 1.33 */ + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W541"), + }, + /* + * Note date based on release notes, 2.35 has been + * reported to be good, but I've been unable to get + * a hold of the reporter to get the DMI BIOS date. + * TODO: fix this. + */ + .driver_data = "20180310", /* 2.35 */ + }, + { } /* terminate list */ + }; + const struct dmi_system_id *dmi = dmi_first_match(sysids); + int year, month, date; + char buf[9]; + + if (!dmi) + return false; + + dmi_get_date(DMI_BIOS_DATE, &year, &month, &date); + snprintf(buf, sizeof(buf), "%04d%02d%02d", year, month, date); + + return strcmp(buf, dmi->driver_data) < 0; +} + static bool ahci_broken_online(struct pci_dev *pdev) { #define ENCODE_BUSDEVFN(bus, slot, func) \ @@ -1694,6 +1747,12 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "quirky BIOS, skipping spindown on poweroff\n"); } + if (ahci_broken_lpm(pdev)) { + pi.flags |= ATA_FLAG_NO_LPM; + dev_warn(&pdev->dev, + "BIOS update required for Link Power Management support\n"); + } + if (ahci_broken_suspend(pdev)) { hpriv->flags |= AHCI_HFLAG_NO_SUSPEND; dev_warn(&pdev->dev, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 27d15ed7fa3d..cc71c63df381 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2493,6 +2493,9 @@ int ata_dev_configure(struct ata_device *dev) (id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2) dev->horkage |= ATA_HORKAGE_NOLPM; + if (ap->flags & ATA_FLAG_NO_LPM) + dev->horkage |= ATA_HORKAGE_NOLPM; + if (dev->horkage & ATA_HORKAGE_NOLPM) { ata_dev_warn(dev, "LPM support broken, forcing max_power\n"); dev->link->ap->target_lpm_policy = ATA_LPM_MAX_POWER; diff --git a/include/linux/libata.h b/include/linux/libata.h index a2257e380789..32f247cb5e9e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -210,6 +210,7 @@ enum { ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ /* (doesn't imply presence) */ ATA_FLAG_SATA = (1 << 1), + ATA_FLAG_NO_LPM = (1 << 2), /* host not happy with LPM */ ATA_FLAG_NO_LOG_PAGE = (1 << 5), /* do not issue log page read */ ATA_FLAG_NO_ATAPI = (1 << 6), /* No ATAPI support */ ATA_FLAG_PIO_DMA = (1 << 7), /* PIO cmds via DMA */ -- cgit v1.2.3 From a9744f7ca200c756e6f8c65b633770a2da711651 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:20 +0200 Subject: xsk: fix potential race in SKB TX completion code There is a potential race in the TX completion code for the SKB case. One process enters the sendmsg code of an AF_XDP socket in order to send a frame. The execution eventually trickles down to the driver that is told to send the packet. However, it decides to drop the packet due to some error condition (e.g., rings full) and frees the SKB. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. At the same time a TX interrupt has fired on another core and it dispatches the TX completion code in the driver. It does its HW specific things and ends up freeing the SKB associated with the transmitted packet. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. With a pseudo call stack, it would look like this: Core 1: sendmsg() being called in the application netdev_start_xmit() Driver entered through ndo_start_xmit Driver decides to free the SKB for some reason (e.g., rings full) Destructor of SKB called xskq_produce_addr() is called to signal completion to user space Core 2: TX completion irq NAPI loop Driver irq handler for TX completions Frees the SKB Destructor of SKB called xskq_produce_addr() is called to signal completion to user space We now have a violation of the single-producer/single-consumer principle for our queues as there are two threads trying to produce at the same time on the same queue. Fixed by introducing a spin_lock in the destructor. In regards to the performance, I get around 1.74 Mpps for txonly before and after the introduction of the spinlock. There is of course some impact due to the spin lock but it is in the less significant digits that are too noisy for me to measure. But let us say that the version without the spin lock got 1.745 Mpps in the best case and the version with 1.735 Mpps in the worst case, then that would mean a maximum drop in performance of 0.5%. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ net/xdp/xsk.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9fe472f2ac95..7161856bcf9c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -60,6 +60,10 @@ struct xdp_sock { bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + /* Mutual exclusion of NAPI TX thread and sendmsg error paths + * in the SKB destructor callback. + */ + spinlock_t tx_completion_lock; u64 rx_dropped; }; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 15aca73805fc..7d220cbd09b6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -199,8 +199,11 @@ static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); + unsigned long flags; + spin_lock_irqsave(&xs->tx_completion_lock, flags); WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); } @@ -755,6 +758,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); mutex_init(&xs->mutex); + spin_lock_init(&xs->tx_completion_lock); local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1); -- cgit v1.2.3 From 1cef1150ef40ec52f507436a14230cbc2623299c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 11:45:49 +0200 Subject: kthread, sched/core: Fix kthread_parkme() (again...) Gaurav reports that commit: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") isn't working for him. Because of the following race: > controller Thread CPUHP Thread > takedown_cpu > kthread_park > kthread_parkme > Set KTHREAD_SHOULD_PARK > smpboot_thread_fn > set Task interruptible > > > wake_up_process > if (!(p->state & state)) > goto out; > > Kthread_parkme > SET TASK_PARKED > schedule > raw_spin_lock(&rq->lock) > ttwu_remote > waiting for __task_rq_lock > context_switch > > finish_lock_switch > > > > Case TASK_PARKED > kthread_park_complete > > > SET Running Furthermore, Oleg noticed that the whole scheduler TASK_PARKED handling is buggered because the TASK_DEAD thing is done with preemption disabled, the current code can still complete early on preemption :/ So basically revert that earlier fix and go with a variant of the alternative mentioned in the commit. Promote TASK_PARKED to special state to avoid the store-store issue on task->state leading to the WARN in kthread_unpark() -> __kthread_bind(). But in addition, add wait_task_inactive() to kthread_park() to ensure the task really is PARKED when we return from kthread_park(). This avoids the whole kthread still gets migrated nonsense -- although it would be really good to get this done differently. Reported-by: Gaurav Kohli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") Signed-off-by: Ingo Molnar --- include/linux/kthread.h | 1 - include/linux/sched.h | 2 +- kernel/kthread.c | 30 ++++++++++++++++++++++++------ kernel/sched/core.c | 31 +++++++++++-------------------- 4 files changed, 36 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2803264c512f..c1961761311d 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_park_complete(struct task_struct *k); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9256118bd40c..43731fe51c97 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,7 +118,7 @@ struct task_group; * the comment with set_special_state(). */ #define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD)) + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #define __set_current_state(state_value) \ do { \ diff --git a/kernel/kthread.c b/kernel/kthread.c index 481951bf091d..750cb8082694 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { for (;;) { - set_current_state(TASK_PARKED); + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + + complete_all(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -191,11 +202,6 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -void kthread_park_complete(struct task_struct *k) -{ - complete_all(&to_kthread(k)->parked); -} - static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k) reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. + */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k) set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); + /* + * Wait for __kthread_parkme() to complete(), this means we + * _will_ have TASK_PARKED and are about to call schedule(). + */ wait_for_completion(&kthread->parked); + /* + * Now wait for that schedule() to complete and the task to + * get scheduled out. + */ + WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22fce36426c0..fe365c9a08e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,7 +7,6 @@ */ #include "sched.h" -#include #include #include @@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { - switch (prev_state) { - case TASK_DEAD: - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - put_task_struct(prev); - break; + /* Task is done with its stack. */ + put_task_stack(prev); - case TASK_PARKED: - kthread_park_complete(prev); - break; - } + put_task_struct(prev); } tick_nohz_task_switch(); -- cgit v1.2.3 From d03db2bc26f0e4a6849ad649a09c9c73fccdc656 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 21 Jun 2018 09:23:22 -0700 Subject: compiler-gcc.h: Add __attribute__((gnu_inline)) to all inline declarations Functions marked extern inline do not emit an externally visible function when the gnu89 C standard is used. Some KBUILD Makefiles overwrite KBUILD_CFLAGS. This is an issue for GCC 5.1+ users as without an explicit C standard specified, the default is gnu11. Since c99, the semantics of extern inline have changed such that an externally visible function is always emitted. This can lead to multiple definition errors of extern inline functions at link time of compilation units whose build files have removed an explicit C standard compiler flag for users of GCC 5.1+ or Clang. Suggested-by: Arnd Bergmann Suggested-by: H. Peter Anvin Suggested-by: Joe Perches Signed-off-by: Nick Desaulniers Acked-by: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: akataria@vmware.com Cc: akpm@linux-foundation.org Cc: andrea.parri@amarulasolutions.com Cc: ard.biesheuvel@linaro.org Cc: aryabinin@virtuozzo.com Cc: astrachan@google.com Cc: boris.ostrovsky@oracle.com Cc: brijesh.singh@amd.com Cc: caoj.fnst@cn.fujitsu.com Cc: geert@linux-m68k.org Cc: ghackmann@google.com Cc: gregkh@linuxfoundation.org Cc: jan.kiszka@siemens.com Cc: jarkko.sakkinen@linux.intel.com Cc: jpoimboe@redhat.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: kstewart@linuxfoundation.org Cc: linux-efi@vger.kernel.org Cc: linux-kbuild@vger.kernel.org Cc: manojgupta@google.com Cc: mawilcox@microsoft.com Cc: michal.lkml@markovi.net Cc: mjg59@google.com Cc: mka@chromium.org Cc: pombredanne@nexb.com Cc: rientjes@google.com Cc: rostedt@goodmis.org Cc: sedat.dilek@gmail.com Cc: thomas.lendacky@amd.com Cc: tstellar@redhat.com Cc: tweek@google.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: yamada.masahiro@socionext.com Link: http://lkml.kernel.org/r/20180621162324.36656-2-ndesaulniers@google.com Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index fd282c7d3e5e..573f5a7d42d4 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -65,6 +65,18 @@ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #endif +/* + * Feature detection for gnu_inline (gnu89 extern inline semantics). Either + * __GNUC_STDC_INLINE__ is defined (not using gnu89 extern inline semantics, + * and we opt in to the gnu89 semantics), or __GNUC_STDC_INLINE__ is not + * defined so the gnu89 semantics are the default. + */ +#ifdef __GNUC_STDC_INLINE__ +# define __gnu_inline __attribute__((gnu_inline)) +#else +# define __gnu_inline +#endif + /* * Force always-inline if the user requests it so via the .config, * or if gcc is too old. @@ -72,19 +84,22 @@ * -Wunused-function. This turns out to avoid the need for complex #ifdef * directives. Suppress the warning in clang as well by using "unused" * function attribute, which is redundant but not harmful for gcc. + * Prefer gnu_inline, so that extern inline functions do not emit an + * externally visible function. This makes extern inline behave as per gnu89 + * semantics rather than c99. This prevents multiple symbol definition errors + * of extern inline functions at link time. + * A lot of inline functions can cause havoc with function tracing. */ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline,unused)) notrace -#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace -#define __inline __inline __attribute__((always_inline,unused)) notrace +#define inline \ + inline __attribute__((always_inline, unused)) notrace __gnu_inline #else -/* A lot of inline functions can cause havoc with function tracing */ -#define inline inline __attribute__((unused)) notrace -#define __inline__ __inline__ __attribute__((unused)) notrace -#define __inline __inline __attribute__((unused)) notrace +#define inline inline __attribute__((unused)) notrace __gnu_inline #endif +#define __inline__ inline +#define __inline inline #define __always_inline inline __attribute__((always_inline)) #define noinline __attribute__((noinline)) -- cgit v1.2.3 From 5ccba64a560fa6ca06008d4001f5d46ebeb34b41 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 2 Feb 2018 10:14:49 +0800 Subject: ftrace: Nuke clear_ftrace_function clear_ftrace_function is not used outside of ftrace.c and is not help to use a function, so nuke it per Steve's suggestion. Link: http://lkml.kernel.org/r/1517537689-34947-1-git-send-email-xieyisheng1@huawei.com Suggested-by: Steven Rostedt Signed-off-by: Yisheng Xie Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 -- kernel/trace/ftrace.c | 13 +------------ 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8154f4920fcb..ebb77674be90 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,7 +223,6 @@ extern enum ftrace_tracing_type_t ftrace_tracing_type; */ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); -void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct pt_regs *regs); @@ -239,7 +238,6 @@ static inline int ftrace_nr_registered_ops(void) { return 0; } -static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index efed9c1cfb7e..caf9cbf35816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -6689,7 +6678,7 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; - clear_ftrace_function(); + ftrace_trace_function = ftrace_stub; } /** -- cgit v1.2.3 From 077772468ec141b22e1e7c0c58bc09e2f9dc8762 Mon Sep 17 00:00:00 2001 From: Wang Dongsheng Date: Sun, 1 Jul 2018 23:15:46 -0700 Subject: net: phy: marvell: change default m88e1510 LED configuration The m88e1121 LED default configuration does not apply m88e151x. So add a function to relpace m88e1121 LED configuration. Signed-off-by: Wang Dongsheng Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 54 ++++++++++++++++++++++++++++++--------------- include/linux/marvell_phy.h | 2 ++ 2 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index b8f57e9b9379..1cd439bdf608 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -130,8 +130,9 @@ #define MII_88E1318S_PHY_WOL_CTRL_CLEAR_WOL_STATUS BIT(12) #define MII_88E1318S_PHY_WOL_CTRL_MAGIC_PACKET_MATCH_ENABLE BIT(14) -#define MII_88E1121_PHY_LED_CTRL 16 +#define MII_PHY_LED_CTRL 16 #define MII_88E1121_PHY_LED_DEF 0x0030 +#define MII_88E1510_PHY_LED_DEF 0x1177 #define MII_M1011_PHY_STATUS 0x11 #define MII_M1011_PHY_STATUS_1000 0x8000 @@ -632,8 +633,40 @@ error: return err; } +static void marvell_config_led(struct phy_device *phydev) +{ + u16 def_config; + int err; + + switch (MARVELL_PHY_FAMILY_ID(phydev->phy_id)) { + /* Default PHY LED config: LED[0] .. Link, LED[1] .. Activity */ + case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1121R): + case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1318S): + def_config = MII_88E1121_PHY_LED_DEF; + break; + /* Default PHY LED config: + * LED[0] .. 1000Mbps Link + * LED[1] .. 100Mbps Link + * LED[2] .. Blink, Activity + */ + case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1510): + def_config = MII_88E1510_PHY_LED_DEF; + break; + default: + return; + } + + err = phy_write_paged(phydev, MII_MARVELL_LED_PAGE, MII_PHY_LED_CTRL, + def_config); + if (err < 0) + pr_warn("Fail to config marvell phy LED.\n"); +} + static int marvell_config_init(struct phy_device *phydev) { + /* Set defalut LED */ + marvell_config_led(phydev); + /* Set registers from marvell,reg-init DT property */ return marvell_of_reg_init(phydev); } @@ -813,21 +846,6 @@ static int m88e1111_config_init(struct phy_device *phydev) return genphy_soft_reset(phydev); } -static int m88e1121_config_init(struct phy_device *phydev) -{ - int err; - - /* Default PHY LED config: LED[0] .. Link, LED[1] .. Activity */ - err = phy_write_paged(phydev, MII_MARVELL_LED_PAGE, - MII_88E1121_PHY_LED_CTRL, - MII_88E1121_PHY_LED_DEF); - if (err < 0) - return err; - - /* Set marvell,reg-init configuration from device tree */ - return marvell_config_init(phydev); -} - static int m88e1318_config_init(struct phy_device *phydev) { if (phy_interrupt_is_valid(phydev)) { @@ -841,7 +859,7 @@ static int m88e1318_config_init(struct phy_device *phydev) return err; } - return m88e1121_config_init(phydev); + return marvell_config_init(phydev); } static int m88e1510_config_init(struct phy_device *phydev) @@ -2087,7 +2105,7 @@ static struct phy_driver marvell_drivers[] = { .features = PHY_GBIT_FEATURES, .flags = PHY_HAS_INTERRUPT, .probe = &m88e1121_probe, - .config_init = &m88e1121_config_init, + .config_init = &marvell_config_init, .config_aneg = &m88e1121_config_aneg, .read_status = &marvell_read_status, .ack_interrupt = &marvell_ack_interrupt, diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 4f5f8c21e283..1eb6f244588d 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -27,6 +27,8 @@ */ #define MARVELL_PHY_ID_88E6390 0x01410f90 +#define MARVELL_PHY_FAMILY_ID(id) ((id) >> 4) + /* struct phy_device dev_flags definitions */ #define MARVELL_PHY_M1145_FLAGS_RESISTANCE 0x00000001 #define MARVELL_PHY_M1118_DNS323_LEDS 0x00000002 -- cgit v1.2.3 From 33bd5ac54dc47e002da4a395aaf9bf158dd17709 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 3 Jul 2018 14:36:21 -0700 Subject: net/ipv6: Revert attempt to simplify route replace and append NetworkManager likes to manage linklocal prefix routes and does so with the NLM_F_APPEND flag, breaking attempts to simplify the IPv6 route code and by extension enable multipath routes with device only nexthops. Revert f34436a43092 and these followup patches: 6eba08c3626b ("ipv6: Only emit append events for appended routes"). ce45bded6435 ("mlxsw: spectrum_router: Align with new route replace logic") 53b562df8c20 ("mlxsw: spectrum_router: Allow appending to dev-only routes") Update the fib_tests cases to reflect the old behavior. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: David Ahern --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 48 +++---- include/net/ip6_route.h | 6 + net/ipv6/ip6_fib.c | 156 ++++++++++++--------- net/ipv6/route.c | 3 +- tools/testing/selftests/net/fib_tests.sh | 41 ------ 5 files changed, 117 insertions(+), 137 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 6aaaf3d9ba31..77b2adb29341 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4756,6 +4756,12 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } +static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) +{ + /* RTF_CACHE routes are ignored */ + return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; +} + static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { @@ -4765,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct fib6_info *nrt, bool append) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; - if (!append) + if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace) return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { @@ -4784,7 +4790,8 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, break; if (rt->fib6_metric < nrt->fib6_metric) continue; - if (rt->fib6_metric == nrt->fib6_metric) + if (rt->fib6_metric == nrt->fib6_metric && + mlxsw_sp_fib6_rt_can_mp(rt)) return fib6_entry; if (rt->fib6_metric > nrt->fib6_metric) break; @@ -5163,7 +5170,7 @@ static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, const struct fib6_info *nrt, bool replace) { - struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); @@ -5172,13 +5179,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, continue; if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (replace && rt->fib6_metric == nrt->fib6_metric) - return fib6_entry; + if (replace && rt->fib6_metric == nrt->fib6_metric) { + if (mlxsw_sp_fib6_rt_can_mp(rt) == + mlxsw_sp_fib6_rt_can_mp(nrt)) + return fib6_entry; + if (mlxsw_sp_fib6_rt_can_mp(nrt)) + fallback = fallback ?: fib6_entry; + } if (rt->fib6_metric > nrt->fib6_metric) - return fib6_entry; + return fallback ?: fib6_entry; } - return NULL; + return fallback; } static int @@ -5304,8 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, - struct fib6_info *rt, bool replace, - bool append) + struct fib6_info *rt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5331,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, /* Before creating a new entry, try to append route to an existing * multipath entry. */ - fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append); + fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace); if (fib6_entry) { err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt); if (err) @@ -5339,14 +5350,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, return 0; } - /* We received an append event, yet did not find any route to - * append to. - */ - if (WARN_ON(append)) { - err = -EINVAL; - goto err_fib6_entry_append; - } - fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt); if (IS_ERR(fib6_entry)) { err = PTR_ERR(fib6_entry); @@ -5364,7 +5367,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, err_fib6_node_entry_link: mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); err_fib6_entry_create: -err_fib6_entry_append: err_fib6_entry_nexthop_add: mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); return err; @@ -5715,7 +5717,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) struct mlxsw_sp_fib_event_work *fib_work = container_of(work, struct mlxsw_sp_fib_event_work, work); struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; - bool replace, append; + bool replace; int err; rtnl_lock(); @@ -5726,10 +5728,8 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) case FIB_EVENT_ENTRY_APPEND: /* fall through */ case FIB_EVENT_ENTRY_ADD: replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; - append = fib_work->event == FIB_EVENT_ENTRY_APPEND; err = mlxsw_sp_router_fib6_add(mlxsw_sp, - fib_work->fen6_info.rt, replace, - append); + fib_work->fen6_info.rt, replace); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); mlxsw_sp_rt6_release(fib_work->fen6_info.rt); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59656fc580df..7b9c82de11cc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) +{ + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1fb2f3118d60..d212738e9d10 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; - struct fib6_info *iter = NULL, *match = NULL; + struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); - int append = (info->nlh && - (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (append) + if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - - /* first route that matches */ - if (!match) - match = iter; + /* If we have the same destination and the same metric, + * but not the same gateway, then the route we try to + * add is sibling to this route, increment our counter + * of siblings, and later we will add our route to the + * list. + * Only static routes (which don't have flag + * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. + */ + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) + rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; +next_iter: ins = &iter->fib6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (append && match) { + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; - if (rt->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append a REJECT route"); - return -EINVAL; - } else if (match->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append to a REJECT route"); - return -EINVAL; + /* Find the first route that have the same metric */ + sibling = leaf; + while (sibling) { + if (sibling->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); + break; + } + sibling = rcu_dereference_protected(sibling->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } - event = FIB_EVENT_ENTRY_APPEND; - rt->fib6_nsiblings = match->fib6_nsiblings; - list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); - match->fib6_nsiblings++; - /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &match->fib6_siblings, fib6_siblings) { + &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - - rt6_multipath_rebalance(match); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, event, rt, - extack); + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); if (err) return err; @@ -1062,7 +1087,7 @@ add: } } else { - struct fib6_info *tmp; + int nsiblings; if (!found) { if (add) @@ -1077,57 +1102,48 @@ add: if (err) return err; - /* if route being replaced has siblings, set tmp to - * last one, otherwise tmp is current route. this is - * used to set fib6_next for new route - */ - if (iter->fib6_nsiblings) - tmp = list_last_entry(&iter->fib6_siblings, - struct fib6_info, - fib6_siblings); - else - tmp = iter; - - /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = tmp->fib6_next; + rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); - if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); - /* delete old route */ - rt = iter; - - if (rt->fib6_nsiblings) { - struct fib6_info *tmp; - + if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, - fib6_siblings) { - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - - rt->fib6_nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + ins = &rt->fib6_next; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + while (iter) { + if (iter->fib6_metric > rt->fib6_metric) + break; + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->fib6_next; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + } else { + ins = &iter->fib6_next; + } + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } + WARN_ON(nsiblings != 0); } - - WARN_ON(rt->fib6_nsiblings != 0); - - rt->fib6_node = NULL; - fib6_purge_rt(rt, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == rt) - fn->rr_ptr = NULL; - fib6_info_release(rt); } return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 86a0e4333d42..63f99411f0de 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3842,7 +3842,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - iter->fib6_nsiblings) + rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4439,7 +4439,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); - cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 78245d60d8bc..0f45633bd634 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -740,13 +740,6 @@ ipv6_rt_add() run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" log_test $? 2 "Attempt to add duplicate route - reject route" - # iproute2 prepend only sets NLM_F_CREATE - # - adds a new route; does NOT convert existing route to ECMP - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro prepend 2001:db8:104::/64 via 2001:db8:103::2" - check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024" - log_test $? 0 "Add new route for existing prefix (w/o NLM_F_EXCL)" - # route append with same prefix adds a new route # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" @@ -754,27 +747,6 @@ ipv6_rt_add() check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" log_test $? 0 "Append nexthop to existing route - gw" - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3" - check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop dev veth3 weight 1" - log_test $? 0 "Append nexthop to existing route - dev only" - - # multipath route can not have a nexthop that is a reject route - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro append unreachable 2001:db8:104::/64" - log_test $? 2 "Append nexthop to existing route - reject route" - - # reject route can not be converted to multipath route - run_cmd "$IP -6 ro flush 2001:db8:104::/64" - run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" - run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2" - log_test $? 2 "Append nexthop to existing reject route - gw" - - run_cmd "$IP -6 ro flush 2001:db8:104::/64" - run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" - run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3" - log_test $? 2 "Append nexthop to existing reject route - dev only" - # insert mpath directly add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" @@ -819,13 +791,6 @@ ipv6_rt_replace_single() check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" log_test $? 0 "Single path with multipath" - # single path with reject - # - add_initial_route6 "nexthop via 2001:db8:101::2" - run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64" - check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024" - log_test $? 0 "Single path with reject route" - # single path with single path using MULTIPATH attribute # add_initial_route6 "via 2001:db8:101::2" @@ -873,12 +838,6 @@ ipv6_rt_replace_mpath() check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024" log_test $? 0 "Multipath with single path via multipath attribute" - # multipath with reject - add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" - run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64" - check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024" - log_test $? 0 "Multipath with reject route" - # route replace fails - invalid nexthop 1 add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3" -- cgit v1.2.3 From a9ba23d48dbc6ffd08426bb10f05720e0b9f5c14 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 4 Jul 2018 09:58:05 -0400 Subject: ipv6: make ipv6_renew_options() interrupt/kernel safe At present the ipv6_renew_options_kern() function ends up calling into access_ok() which is problematic if done from inside an interrupt as access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures (x86-64 is affected). Example warning/backtrace is shown below: WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90 ... Call Trace: ipv6_renew_option+0xb2/0xf0 ipv6_renew_options+0x26a/0x340 ipv6_renew_options_kern+0x2c/0x40 calipso_req_setattr+0x72/0xe0 netlbl_req_setattr+0x126/0x1b0 selinux_netlbl_inet_conn_request+0x80/0x100 selinux_inet_conn_request+0x6d/0xb0 security_inet_conn_request+0x32/0x50 tcp_conn_request+0x35f/0xe00 ? __lock_acquire+0x250/0x16c0 ? selinux_socket_sock_rcv_skb+0x1ae/0x210 ? tcp_rcv_state_process+0x289/0x106b tcp_rcv_state_process+0x289/0x106b ? tcp_v6_do_rcv+0x1a7/0x3c0 tcp_v6_do_rcv+0x1a7/0x3c0 tcp_v6_rcv+0xc82/0xcf0 ip6_input_finish+0x10d/0x690 ip6_input+0x45/0x1e0 ? ip6_rcv_finish+0x1d0/0x1d0 ipv6_rcv+0x32b/0x880 ? ip6_make_skb+0x1e0/0x1e0 __netif_receive_skb_core+0x6f2/0xdf0 ? process_backlog+0x85/0x250 ? process_backlog+0x85/0x250 ? process_backlog+0xec/0x250 process_backlog+0xec/0x250 net_rx_action+0x153/0x480 __do_softirq+0xd9/0x4f7 do_softirq_own_stack+0x2a/0x40 ... While not present in the backtrace, ipv6_renew_option() ends up calling access_ok() via the following chain: access_ok() _copy_from_user() copy_from_user() ipv6_renew_option() The fix presented in this patch is to perform the userspace copy earlier in the call chain such that it is only called when the option data is actually coming from userspace; that place is do_ipv6_setsockopt(). Not only does this solve the problem seen in the backtrace above, it also allows us to simplify the code quite a bit by removing ipv6_renew_options_kern() completely. We also take this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option() a small amount as well. This patch is heavily based on a rough patch by Al Viro. I've taken his original patch, converted a kmemdup() call in do_ipv6_setsockopt() to a memdup_user() call, made better use of the e_inval jump target in the same function, and cleaned up the use ipv6_renew_option() by ipv6_renew_options(). CC: Al Viro Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- include/net/ipv6.h | 9 +--- net/ipv6/calipso.c | 9 ++-- net/ipv6/exthdrs.c | 111 +++++++++++++---------------------------------- net/ipv6/ipv6_sockglue.c | 27 ++++++++---- 4 files changed, 53 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 16475c269749..d02881e4ad1f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, - struct ipv6_opt_hdr __user *newopt, - int newoptlen); -struct ipv6_txoptions * -ipv6_renew_options_kern(struct sock *sk, - struct ipv6_txoptions *opt, - int newtype, - struct ipv6_opt_hdr *newopt, - int newoptlen); + struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 1323b9679cf7..1c0bb9fb76e6 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; - txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, - hop, hop ? ipv6_optlen(hop) : 0); + txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); @@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req, if (IS_ERR(new)) return PTR_ERR(new); - txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, - new, new ? ipv6_optlen(new) : 0); + txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); @@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req) if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ - txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, - new, new ? ipv6_optlen(new) : 0); + txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 5bc2bf3733ab..20291c2036fc 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) } EXPORT_SYMBOL_GPL(ipv6_dup_options); -static int ipv6_renew_option(void *ohdr, - struct ipv6_opt_hdr __user *newopt, int newoptlen, - int inherit, - struct ipv6_opt_hdr **hdr, - char **p) +static void ipv6_renew_option(int renewtype, + struct ipv6_opt_hdr **dest, + struct ipv6_opt_hdr *old, + struct ipv6_opt_hdr *new, + int newtype, char **p) { - if (inherit) { - if (ohdr) { - memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); - *hdr = (struct ipv6_opt_hdr *)*p; - *p += CMSG_ALIGN(ipv6_optlen(*hdr)); - } - } else { - if (newopt) { - if (copy_from_user(*p, newopt, newoptlen)) - return -EFAULT; - *hdr = (struct ipv6_opt_hdr *)*p; - if (ipv6_optlen(*hdr) > newoptlen) - return -EINVAL; - *p += CMSG_ALIGN(newoptlen); - } - } - return 0; + struct ipv6_opt_hdr *src; + + src = (renewtype == newtype ? new : old); + if (!src) + return; + + memcpy(*p, src, ipv6_optlen(src)); + *dest = (struct ipv6_opt_hdr *)*p; + *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** @@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr, */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, - int newtype, - struct ipv6_opt_hdr __user *newopt, int newoptlen) + int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; - int err; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) @@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } - if (newopt && newoptlen) - tot_len += CMSG_ALIGN(newoptlen); + if (newopt) + tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; @@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, opt2->tot_len = tot_len; p = (char *)(opt2 + 1); - err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen, - newtype != IPV6_HOPOPTS, - &opt2->hopopt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen, - newtype != IPV6_RTHDRDSTOPTS, - &opt2->dst0opt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen, - newtype != IPV6_RTHDR, - (struct ipv6_opt_hdr **)&opt2->srcrt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen, - newtype != IPV6_DSTOPTS, - &opt2->dst1opt, &p); - if (err) - goto out; + ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, + (opt ? opt->hopopt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, + (opt ? opt->dst0opt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_RTHDR, + (struct ipv6_opt_hdr **)&opt2->srcrt, + (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, + (opt ? opt->dst1opt : NULL), + newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + @@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; -out: - sock_kfree_s(sk, opt2, opt2->tot_len); - return ERR_PTR(err); -} - -/** - * ipv6_renew_options_kern - replace a specific ext hdr with a new one. - * - * @sk: sock from which to allocate memory - * @opt: original options - * @newtype: option type to replace in @opt - * @newopt: new option of type @newtype to replace (kernel-mem) - * @newoptlen: length of @newopt - * - * See ipv6_renew_options(). The difference is that @newopt is - * kernel memory, rather than user memory. - */ -struct ipv6_txoptions * -ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt, - int newtype, struct ipv6_opt_hdr *newopt, - int newoptlen) -{ - struct ipv6_txoptions *ret_val; - const mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret_val = ipv6_renew_options(sk, opt, newtype, - (struct ipv6_opt_hdr __user *)newopt, - newoptlen); - set_fs(old_fs); - return ret_val; } struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4d780c7f0130..c95c3486d904 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; + struct ipv6_opt_hdr *new = NULL; + + /* hop-by-hop / destination options are privileged option */ + retv = -EPERM; + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + break; /* remove any sticky options header with a zero option * length, per RFC3542. @@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, else if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) goto e_inval; - - /* hop-by-hop / destination options are privileged option */ - retv = -EPERM; - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) - break; + else { + new = memdup_user(optval, optlen); + if (IS_ERR(new)) { + retv = PTR_ERR(new); + break; + } + if (unlikely(ipv6_optlen(new) > optlen)) { + kfree(new); + goto e_inval; + } + } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - opt = ipv6_renew_options(sk, opt, optname, - (struct ipv6_opt_hdr __user *)optval, - optlen); + opt = ipv6_renew_options(sk, opt, optname, new); + kfree(new); if (IS_ERR(opt)) { retv = PTR_ERR(opt); break; -- cgit v1.2.3 From 5711b4e89319c2912f20b2a4f371c1525fc9551d Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 5 Jul 2018 12:01:53 +0200 Subject: netfilter: nf_tproxy: fix possible non-linear access to transport header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a silent out-of-bound read possibility that was present because of the misuse of this function. Mostly it was called with a struct udphdr *hp which had only the udphdr part linearized by the skb_header_pointer, however nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for tcp specific attributes may be invalid. Fixes: a583636a83ea ("inet: refactor inet[6]_lookup functions to take skb") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 4 ++-- net/ipv4/netfilter/nf_tproxy_ipv4.c | 18 ++++++++++++------ net/ipv6/netfilter/nf_tproxy_ipv6.c | 18 ++++++++++++------ net/netfilter/xt_TPROXY.c | 8 ++++---- 4 files changed, 30 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 9754a50ecde9..4cc64c8446eb 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -64,7 +64,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * belonging to established connections going through that one. */ struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -103,7 +103,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, struct sock *sk); struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 805e83ec3ad9..164714104965 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet_lookup_listener(net, &tcp_hashinfo, skb, ip_hdrlen(skb) + - __tcp_hdrlen(tcph), + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index bf1d6c421e3b..5dfd33af6451 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto, + sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto, &iph->saddr, nf_tproxy_laddr6(skb, laddr, &iph->daddr), hp->source, @@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6); struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, @@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, thoff, + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, - thoff + __tcp_hdrlen(tcph), + thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 58fce4e749a9..d76550a8b642 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -61,7 +61,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); @@ -77,7 +77,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -150,7 +150,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, + sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED); @@ -171,7 +171,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, + sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, laddr, hp->source, lport, xt_in(par), NF_TPROXY_LOOKUP_LISTENER); -- cgit v1.2.3 From e240cd0df48185a28c153f83a39ba3940e3e9b86 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Jul 2018 19:06:43 +0200 Subject: netfilter: nf_tables: place all set backends in one single module This patch disallows rbtree with single elements, which is causing problems with the recent timeout support. Before this patch, you could opt out individual set representations per module, which is just adding extra complexity. Fixes: 8d8540c4f5e0("netfilter: nft_set_rbtree: add timeout support") Reported-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 6 ++++++ net/netfilter/Kconfig | 25 +++++++------------------ net/netfilter/Makefile | 7 ++++--- net/netfilter/nf_tables_set_core.c | 28 ++++++++++++++++++++++++++++ net/netfilter/nft_set_bitmap.c | 19 +------------------ net/netfilter/nft_set_hash.c | 29 +++-------------------------- net/netfilter/nft_set_rbtree.c | 19 +------------------ 7 files changed, 50 insertions(+), 83 deletions(-) create mode 100644 net/netfilter/nf_tables_set_core.c (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index e0c0c2558ec4..a05134507e7b 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -65,4 +65,10 @@ extern const struct nft_expr_ops nft_payload_fast_ops; extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; +extern struct nft_set_type nft_set_rhash_type; +extern struct nft_set_type nft_set_hash_type; +extern struct nft_set_type nft_set_hash_fast_type; +extern struct nft_set_type nft_set_rbtree_type; +extern struct nft_set_type nft_set_bitmap_type; + #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index dbd7d1fad277..f0a1c536ef15 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -460,6 +460,13 @@ config NF_TABLES if NF_TABLES +config NF_TABLES_SET + tristate "Netfilter nf_tables set infrastructure" + help + This option enables the nf_tables set infrastructure that allows to + look up for elements in a set and to build one-way mappings between + matchings and actions. + config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 @@ -493,24 +500,6 @@ config NFT_FLOW_OFFLOAD This option adds the "flow_offload" expression that you can use to choose what flows are placed into the hardware. -config NFT_SET_RBTREE - tristate "Netfilter nf_tables rbtree set module" - help - This option adds the "rbtree" set type (Red Black tree) that is used - to build interval-based sets. - -config NFT_SET_HASH - tristate "Netfilter nf_tables hash set module" - help - This option adds the "hash" set type that is used to build one-way - mappings between matchings and actions. - -config NFT_SET_BITMAP - tristate "Netfilter nf_tables bitmap set module" - help - This option adds the "bitmap" set type that is used to build sets - whose keys are smaller or equal to 16 bits. - config NFT_COUNTER tristate "Netfilter nf_tables counter module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 44449389e527..8a76dced974d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,7 +78,11 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o +nf_tables_set-objs := nf_tables_set_core.o \ + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + obj-$(CONFIG_NF_TABLES) += nf_tables.o +obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o @@ -91,9 +95,6 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o -obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o -obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o -obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c new file mode 100644 index 000000000000..814789644bd3 --- /dev/null +++ b/net/netfilter/nf_tables_set_core.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +static int __init nf_tables_set_module_init(void) +{ + nft_register_set(&nft_set_hash_fast_type); + nft_register_set(&nft_set_hash_type); + nft_register_set(&nft_set_rhash_type); + nft_register_set(&nft_set_bitmap_type); + nft_register_set(&nft_set_rbtree_type); + + return 0; +} + +static void __exit nf_tables_set_module_exit(void) +{ + nft_unregister_set(&nft_set_rbtree_type); + nft_unregister_set(&nft_set_bitmap_type); + nft_unregister_set(&nft_set_rhash_type); + nft_unregister_set(&nft_set_hash_type); + nft_unregister_set(&nft_set_hash_fast_type); +} + +module_init(nf_tables_set_module_init); +module_exit(nf_tables_set_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index d6626e01c7ee..128bc16f52dd 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,7 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type __read_mostly = { +struct nft_set_type nft_set_bitmap_type __read_mostly = { .owner = THIS_MODULE, .ops = { .privsize = nft_bitmap_privsize, @@ -314,20 +314,3 @@ static struct nft_set_type nft_bitmap_type __read_mostly = { .get = nft_bitmap_get, }, }; - -static int __init nft_bitmap_module_init(void) -{ - return nft_register_set(&nft_bitmap_type); -} - -static void __exit nft_bitmap_module_exit(void) -{ - nft_unregister_set(&nft_bitmap_type); -} - -module_init(nft_bitmap_module_init); -module_exit(nft_bitmap_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6f9a1365a09f..72ef35b51cac 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -654,7 +654,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return true; } -static struct nft_set_type nft_rhash_type __read_mostly = { +struct nft_set_type nft_set_rhash_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT | NFT_SET_EVAL, @@ -677,7 +677,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = { }, }; -static struct nft_set_type nft_hash_type __read_mostly = { +struct nft_set_type nft_set_hash_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { @@ -697,7 +697,7 @@ static struct nft_set_type nft_hash_type __read_mostly = { }, }; -static struct nft_set_type nft_hash_fast_type __read_mostly = { +struct nft_set_type nft_set_hash_fast_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { @@ -716,26 +716,3 @@ static struct nft_set_type nft_hash_fast_type __read_mostly = { .get = nft_hash_get, }, }; - -static int __init nft_hash_module_init(void) -{ - if (nft_register_set(&nft_hash_fast_type) || - nft_register_set(&nft_hash_type) || - nft_register_set(&nft_rhash_type)) - return 1; - return 0; -} - -static void __exit nft_hash_module_exit(void) -{ - nft_unregister_set(&nft_rhash_type); - nft_unregister_set(&nft_hash_type); - nft_unregister_set(&nft_hash_fast_type); -} - -module_init(nft_hash_module_init); -module_exit(nft_hash_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7f3a9a211034..1f8f257cb518 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -462,7 +462,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_rbtree_type __read_mostly = { +struct nft_set_type nft_set_rbtree_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -481,20 +481,3 @@ static struct nft_set_type nft_rbtree_type __read_mostly = { .get = nft_rbtree_get, }, }; - -static int __init nft_rbtree_module_init(void) -{ - return nft_register_set(&nft_rbtree_type); -} - -static void __exit nft_rbtree_module_exit(void) -{ - nft_unregister_set(&nft_rbtree_type); -} - -module_init(nft_rbtree_module_init); -module_exit(nft_rbtree_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_SET(); -- cgit v1.2.3 From 000244d3dc1f8114e38fe9ee2d9a0986404d9cbe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Jul 2018 14:44:02 +0200 Subject: net: bridge: fix br_vlan_get_{pvid,info} return values These two functions return the regular -EINVAL failure in the normal code path, but return a nonstandard '-1' error otherwise, which gets interpreted as -EPERM. Let's change it to -EINVAL for the dummy functions as well. Fixes: 4d4fd36126d6 ("net: bridge: Publish bridge accessor functions") Signed-off-by: Arnd Bergmann Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 7843b98e1c6e..c20c7e197d07 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -105,13 +105,13 @@ static inline bool br_vlan_enabled(const struct net_device *dev) static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { - return -1; + return -EINVAL; } static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { - return -1; + return -EINVAL; } #endif -- cgit v1.2.3 From 11a245e2f7bf25fc21f47e4c9c8491841b128890 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Jul 2018 21:01:05 +0200 Subject: net/sched: act_csum: fix NULL dereference when 'goto chain' is used the control action in the common member of struct tcf_csum must be a valid value, as it can contain the chain index when 'goto chain' is used. Ensure that the control action can be read as x->tcfa_action, when x is a pointer to struct tc_action and x->ops->type is TCA_ACT_CSUM, to prevent the following command: # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ > $tcflags dst_mac $h2mac action csum ip or tcp or udp or sctp goto chain 1 from triggering a NULL pointer dereference when a matching packet is received. BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000010416b067 P4D 800000010416b067 PUD 1041be067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3072 Comm: mausezahn Tainted: G E 4.18.0-rc2.auguri+ #421 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300 FS: 00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0 Call Trace: fl_classify+0x1ad/0x1c0 [cls_flower] ? arp_rcv+0x121/0x1b0 ? __x2apic_send_IPI_dest+0x40/0x40 ? smp_reschedule_interrupt+0x1c/0xd0 ? reschedule_interrupt+0xf/0x20 ? reschedule_interrupt+0xa/0x20 ? device_is_rmrr_locked+0xe/0x50 ? iommu_should_identity_map+0x49/0xd0 ? __intel_map_single+0x30/0x140 ? e1000e_update_rdt_wa.isra.52+0x22/0xb0 [e1000e] ? e1000_alloc_rx_buffers+0x233/0x250 [e1000e] ? kmem_cache_alloc+0x38/0x1c0 tcf_classify+0x89/0x140 __netif_receive_skb_core+0x5ea/0xb70 ? enqueue_task_fair+0xb6/0x7d0 ? process_backlog+0x97/0x150 process_backlog+0x97/0x150 net_rx_action+0x14b/0x3e0 __do_softirq+0xde/0x2b4 do_softirq_own_stack+0x2a/0x40 do_softirq.part.18+0x49/0x50 __local_bh_enable_ip+0x49/0x50 __dev_queue_xmit+0x4ab/0x8a0 ? wait_woken+0x80/0x80 ? packet_sendmsg+0x38f/0x810 ? __dev_queue_xmit+0x8a0/0x8a0 packet_sendmsg+0x38f/0x810 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x630 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x22a/0x290 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f5a45cbec93 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24 RSP: 002b:00007ffd0ee6d748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000001161010 RCX: 00007f5a45cbec93 RDX: 0000000000000062 RSI: 0000000001161322 RDI: 0000000000000003 RBP: 00007ffd0ee6d780 R08: 00007ffd0ee6d760 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062 R13: 0000000001161322 R14: 00007ffd0ee6d760 R15: 0000000000000003 Modules linked in: act_csum act_gact cls_flower sch_ingress vrf veth act_tunnel_key(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel snd_hda_codec_hdmi snd_hda_codec_realtek kvm snd_hda_codec_generic hp_wmi iTCO_wdt sparse_keymap rfkill mei_wdt iTCO_vendor_support wmi_bmof gpio_ich irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel snd_hda_intel crypto_simd cryptd snd_hda_codec glue_helper snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm pcspkr i2c_i801 snd_timer snd sg lpc_ich soundcore wmi mei_me mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom sd_mod ahci libahci crc32c_intel i915 ixgbe serio_raw libata video dca i2c_algo_bit sfc drm_kms_helper syscopyarea mtd sysfillrect mdio sysimgblt fb_sys_fops drm e1000e i2c_core CR2: 0000000000000000 ---[ end trace 3c9e9d1a77df4026 ]--- RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300 FS: 00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x26400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: 9c5f69bbd75a ("net/sched: act_csum: don't use spinlock in the fast path") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_csum.h | 1 - net/sched/act_csum.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 9470fd7e4350..32d2454c0479 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -7,7 +7,6 @@ #include struct tcf_csum_params { - int action; u32 update_flags; struct rcu_head rcu; }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 526a8e491626..6e7124e57918 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -91,7 +91,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } params_old = rtnl_dereference(p->params); - params_new->action = parm->action; + p->tcf_action = parm->action; params_new->update_flags = parm->update_flags; rcu_assign_pointer(p->params, params_new); if (params_old) @@ -561,7 +561,7 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&p->tcf_tm); bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); - action = params->action; + action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) goto drop_stats; @@ -599,11 +599,11 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, .index = p->tcf_index, .refcnt = p->tcf_refcnt - ref, .bindcnt = p->tcf_bindcnt - bind, + .action = p->tcf_action, }; struct tcf_t t; params = rtnl_dereference(p->params); - opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) -- cgit v1.2.3 From 38230a3e0e0933bbcf5df6fa469ba0667f667568 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Jul 2018 21:01:06 +0200 Subject: net/sched: act_tunnel_key: fix NULL dereference when 'goto chain' is used the control action in the common member of struct tcf_tunnel_key must be a valid value, as it can contain the chain index when 'goto chain' is used. Ensure that the control action can be read as x->tcfa_action, when x is a pointer to struct tc_action and x->ops->type is TCA_ACT_TUNNEL_KEY, to prevent the following command: # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ > $tcflags dst_mac $h2mac action tunnel_key unset goto chain 1 from causing a NULL dereference when a matching packet is received: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 80000001097ac067 P4D 80000001097ac067 PUD 103b0a067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3491 Comm: mausezahn Tainted: G E 4.18.0-rc2.auguri+ #421 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40 FS: 00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0 Call Trace: fl_classify+0x1ad/0x1c0 [cls_flower] ? __update_load_avg_se.isra.47+0x1ca/0x1d0 ? __update_load_avg_se.isra.47+0x1ca/0x1d0 ? update_load_avg+0x665/0x690 ? update_load_avg+0x665/0x690 ? kmem_cache_alloc+0x38/0x1c0 tcf_classify+0x89/0x140 __netif_receive_skb_core+0x5ea/0xb70 ? enqueue_entity+0xd0/0x270 ? process_backlog+0x97/0x150 process_backlog+0x97/0x150 net_rx_action+0x14b/0x3e0 __do_softirq+0xde/0x2b4 do_softirq_own_stack+0x2a/0x40 do_softirq.part.18+0x49/0x50 __local_bh_enable_ip+0x49/0x50 __dev_queue_xmit+0x4ab/0x8a0 ? wait_woken+0x80/0x80 ? packet_sendmsg+0x38f/0x810 ? __dev_queue_xmit+0x8a0/0x8a0 packet_sendmsg+0x38f/0x810 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x630 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x22a/0x290 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fd67e18dc93 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24 RSP: 002b:00007ffe0189b748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000020ca010 RCX: 00007fd67e18dc93 RDX: 0000000000000062 RSI: 00000000020ca322 RDI: 0000000000000003 RBP: 00007ffe0189b780 R08: 00007ffe0189b760 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062 R13: 00000000020ca322 R14: 00007ffe0189b760 R15: 0000000000000003 Modules linked in: act_tunnel_key act_gact cls_flower sch_ingress vrf veth act_csum(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl snd_hda_codec_hdmi x86_pkg_temp_thermal intel_powerclamp snd_hda_codec_realtek coretemp snd_hda_codec_generic kvm_intel kvm irqbypass snd_hda_intel crct10dif_pclmul crc32_pclmul hp_wmi ghash_clmulni_intel pcbc snd_hda_codec aesni_intel sparse_keymap rfkill snd_hda_core snd_hwdep snd_seq crypto_simd iTCO_wdt gpio_ich iTCO_vendor_support wmi_bmof cryptd mei_wdt glue_helper snd_seq_device snd_pcm pcspkr snd_timer snd i2c_i801 lpc_ich sg soundcore wmi mei_me mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod sr_mod cdrom i915 video i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ahci crc32c_intel libahci serio_raw sfc libata mtd drm ixgbe mdio i2c_core e1000e dca CR2: 0000000000000000 ---[ end trace 1ab8b5b5d4639dfc ]--- RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40 FS: 00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x11400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: d0f6dd8a914f ("net/sched: Introduce act_tunnel_key") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 1 - net/sched/act_tunnel_key.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index efef0b4b1b2b..46b8c7f1c8d5 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -18,7 +18,6 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; - int action; struct metadata_dst *tcft_enc_metadata; }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 626dac81a48a..9bc6c2ae98a5 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -36,7 +36,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&t->tcf_tm); bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); - action = params->action; + action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -182,7 +182,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_old = rtnl_dereference(t->params); - params_new->action = parm->action; + t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; @@ -254,13 +254,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .index = t->tcf_index, .refcnt = t->tcf_refcnt - ref, .bindcnt = t->tcf_bindcnt - bind, + .action = t->tcf_action, }; struct tcf_t tm; params = rtnl_dereference(t->params); opt.t_action = params->tcft_action; - opt.action = params->action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From 543af5861f41af0a5d2432f6fb5976af50f9cee5 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 6 Jul 2018 22:05:38 -0400 Subject: uio: change to use the mutex lock instead of the spin lock We are hitting a regression with the following commit: commit a93e7b331568227500186a465fee3c2cb5dffd1f Author: Hamish Martin Date: Mon May 14 13:32:23 2018 +1200 uio: Prevent device destruction while fds are open The problem is the addition of spin_lock_irqsave in uio_write. This leads to hitting uio_write -> copy_from_user -> _copy_from_user -> might_fault and the logs filling up with sleeping warnings. I also noticed some uio drivers allocate memory, sleep, grab mutexes from callouts like open() and release and uio is now doing spin_lock_irqsave while calling them. Reported-by: Mike Christie CC: Hamish Martin Reviewed-by: Hamish Martin Signed-off-by: Xiubo Li Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 32 +++++++++++++------------------- include/linux/uio_driver.h | 2 +- 2 files changed, 14 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index b4b2ae1e0473..655ade4fb3b1 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -433,7 +433,6 @@ static int uio_open(struct inode *inode, struct file *filep) struct uio_device *idev; struct uio_listener *listener; int ret = 0; - unsigned long flags; mutex_lock(&minor_lock); idev = idr_find(&uio_idr, iminor(inode)); @@ -460,10 +459,10 @@ static int uio_open(struct inode *inode, struct file *filep) listener->event_count = atomic_read(&idev->event); filep->private_data = listener; - spin_lock_irqsave(&idev->info_lock, flags); + mutex_lock(&idev->info_lock); if (idev->info && idev->info->open) ret = idev->info->open(idev->info, inode); - spin_unlock_irqrestore(&idev->info_lock, flags); + mutex_unlock(&idev->info_lock); if (ret) goto err_infoopen; @@ -495,12 +494,11 @@ static int uio_release(struct inode *inode, struct file *filep) int ret = 0; struct uio_listener *listener = filep->private_data; struct uio_device *idev = listener->dev; - unsigned long flags; - spin_lock_irqsave(&idev->info_lock, flags); + mutex_lock(&idev->info_lock); if (idev->info && idev->info->release) ret = idev->info->release(idev->info, inode); - spin_unlock_irqrestore(&idev->info_lock, flags); + mutex_unlock(&idev->info_lock); module_put(idev->owner); kfree(listener); @@ -513,12 +511,11 @@ static __poll_t uio_poll(struct file *filep, poll_table *wait) struct uio_listener *listener = filep->private_data; struct uio_device *idev = listener->dev; __poll_t ret = 0; - unsigned long flags; - spin_lock_irqsave(&idev->info_lock, flags); + mutex_lock(&idev->info_lock); if (!idev->info || !idev->info->irq) ret = -EIO; - spin_unlock_irqrestore(&idev->info_lock, flags); + mutex_unlock(&idev->info_lock); if (ret) return ret; @@ -537,12 +534,11 @@ static ssize_t uio_read(struct file *filep, char __user *buf, DECLARE_WAITQUEUE(wait, current); ssize_t retval = 0; s32 event_count; - unsigned long flags; - spin_lock_irqsave(&idev->info_lock, flags); + mutex_lock(&idev->info_lock); if (!idev->info || !idev->info->irq) retval = -EIO; - spin_unlock_irqrestore(&idev->info_lock, flags); + mutex_unlock(&idev->info_lock); if (retval) return retval; @@ -592,9 +588,8 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, struct uio_device *idev = listener->dev; ssize_t retval; s32 irq_on; - unsigned long flags; - spin_lock_irqsave(&idev->info_lock, flags); + mutex_lock(&idev->info_lock); if (!idev->info || !idev->info->irq) { retval = -EIO; goto out; @@ -618,7 +613,7 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, retval = idev->info->irqcontrol(idev->info, irq_on); out: - spin_unlock_irqrestore(&idev->info_lock, flags); + mutex_unlock(&idev->info_lock); return retval ? retval : sizeof(s32); } @@ -865,7 +860,7 @@ int __uio_register_device(struct module *owner, idev->owner = owner; idev->info = info; - spin_lock_init(&idev->info_lock); + mutex_init(&idev->info_lock); init_waitqueue_head(&idev->wait); atomic_set(&idev->event, 0); @@ -929,7 +924,6 @@ EXPORT_SYMBOL_GPL(__uio_register_device); void uio_unregister_device(struct uio_info *info) { struct uio_device *idev; - unsigned long flags; if (!info || !info->uio_dev) return; @@ -943,9 +937,9 @@ void uio_unregister_device(struct uio_info *info) if (info->irq && info->irq != UIO_IRQ_CUSTOM) free_irq(info->irq, idev); - spin_lock_irqsave(&idev->info_lock, flags); + mutex_lock(&idev->info_lock); idev->info = NULL; - spin_unlock_irqrestore(&idev->info_lock, flags); + mutex_unlock(&idev->info_lock); device_unregister(&idev->dev); diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 6c5f2074e14f..6f8b68cd460f 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -75,7 +75,7 @@ struct uio_device { struct fasync_struct *async_queue; wait_queue_head_t wait; struct uio_info *info; - spinlock_t info_lock; + struct mutex info_lock; struct kobject *map_dir; struct kobject *portio_dir; }; -- cgit v1.2.3 From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:15 -0700 Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb In commit 'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512) we added the routine bpf_compute_data_end_sk_skb() to compute the correct data_end values, but this has since been lost. In kernel v4.14 this was correct and the above patch was applied in it entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree we lost the piece that renamed bpf_compute_data_pointers to the new function bpf_compute_data_end_sk_skb. This was done here, e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") When it conflicted with the following rename patch, 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Finally, after a refactor I thought even the function bpf_compute_data_end_sk_skb() was no longer needed and it was erroneously removed. However, we never reverted the sk_skb_convert_ctx_access() usage of tcp_skb_cb which had been committed and survived the merge conflict. Here we fix this by adding back the helper and *_data_end_sk_skb() usage. Using the bpf_skc_data_end mapping is not correct because it expects a qdisc_skb_cb object but at the sock layer this is not the case. Even though it happens to work here because we don't overwrite any data in-use at the socket layer and the cb structure is cleared later this has potential to create some subtle issues. But, even more concretely the filter.c access check uses tcp_skb_cb. And by some act of chance though, struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; /* 0 28 */ /* XXX 4 bytes hole, try to pack */ void * data_meta; /* 32 8 */ void * data_end; /* 40 8 */ /* size: 48, cachelines: 1, members: 3 */ /* sum members: 44, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; and then tcp_skb_cb, struct tcp_skb_cb { [...] struct { __u32 flags; /* 24 4 */ struct sock * sk_redir; /* 32 8 */ void * data_end; /* 40 8 */ } bpf; /* 24 */ }; So when we use offset_of() to track down the byte offset we get 40 in either case and everything continues to work. Fix this mess and use correct structures its unclear how long this might actually work for until someone moves the structs around. Reported-by: Martin KaFai Lau Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 +++ kernel/bpf/sockmap.c | 4 +-- net/core/filter.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 800582b5dd54..af3ec72d5d41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -828,6 +828,10 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index dfc8a8a07c1f..98fb7938beea 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/core/filter.c b/net/core/filter.c index 3095f1ba7015..470268024a40 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1762,6 +1762,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end_sk_skb(skb); + return err; +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -2864,8 +2895,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); @@ -2901,6 +2932,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; @@ -2915,8 +2953,26 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 new_len = skb->len + head_room; @@ -2942,8 +2998,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + bpf_compute_data_pointers(skb); - return 0; + return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2955,6 +3019,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -4618,9 +4699,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || + func == sk_skb_change_head || func == bpf_skb_change_tail || + func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || + func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || @@ -4872,11 +4956,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; + return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &bpf_skb_change_tail_proto; + return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &bpf_skb_change_head_proto; + return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: -- cgit v1.2.3 From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 6 Jul 2018 11:49:00 +0900 Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU Otherwise we end up with attempting to send packets from down devices or to send oversized packets, which may cause unexpected driver/device behaviour. Generic XDP has already done this check, so reuse the logic in native XDP. Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function") Signed-off-by: Toshiaki Makita Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 6 +++--- kernel/bpf/devmap.c | 7 ++++++- net/core/filter.c | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 300baad62c88..c73dd7396886 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); -static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, - struct net_device *fwd) +static inline int xdp_ok_fwd_dev(const struct net_device *fwd, + unsigned int pktlen) { unsigned int len; @@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) + if (pktlen > len) return -EMSGSIZE; return 0; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/net/core/filter.c b/net/core/filter.c index 470268024a40..5fa66a33927f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3128,12 +3128,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3367,7 +3371,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; -- cgit v1.2.3 From f292b87d3ac020418644d8a4bbf29814890505cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 6 Jul 2018 14:34:29 -0700 Subject: bpf: include errno.h from bpf-cgroup.h Commit fdb5c4531c1e ("bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF") caused some build issues, detected by 0-DAY kernel test infrastructure. The problem is that cgroup_bpf_prog_attach/detach/query() functions can return -EINVAL error code, which is not defined. Fix this adding errno.h to includes. Fixes: fdb5c4531c1e ("bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF") Signed-off-by: Roman Gushchin Cc: Sean Young Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 79795c5fa7c3..d50c2f0a655a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -2,6 +2,7 @@ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H +#include #include #include -- cgit v1.2.3 From e96d71359e9bbea846a2111e4469a03a055dfa6f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:50 -0400 Subject: rseq: Use __u64 for rseq_cs fields, validate user inputs Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather that ignoring the 32 upper bits on 32-bit kernels. This ensures we have a consistent behavior for a 32-bit binary executed on 32-bit kernels and in compat mode on 64-bit kernels. Validating the value of abort_ip field to be below TASK_SIZE ensures the kernel don't return to an invalid address when returning to userspace after an abort. I don't fully trust each architecture code to consistently deal with invalid return addresses. Validating the value of the start_ip and post_commit_offset fields prevents overflow on arithmetic performed on those values, used to check whether abort_ip is within the rseq critical section. If validation fails, the process is killed with a segmentation fault. When the signature encountered before abort_ip does not match the expected signature, return -EINVAL rather than -EPERM to be consistent with other input validation return codes from rseq_get_rseq_cs(). Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-2-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 6 +++--- kernel/rseq.c | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index d620fa43756c..519ad6e176d1 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -52,10 +52,10 @@ struct rseq_cs { __u32 version; /* enum rseq_cs_flags */ __u32 flags; - LINUX_FIELD_u32_u64(start_ip); + __u64 start_ip; /* Offset from start_ip. */ - LINUX_FIELD_u32_u64(post_commit_offset); - LINUX_FIELD_u32_u64(abort_ip); + __u64 post_commit_offset; + __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); /* diff --git a/kernel/rseq.c b/kernel/rseq.c index 22b6acf1ad63..16b38c5342f9 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -130,14 +130,20 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) urseq_cs = (struct rseq_cs __user *)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; - if (rseq_cs->version > 0) - return -EINVAL; + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; - usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; @@ -146,7 +152,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); - return -EPERM; + return -EINVAL; } return 0; } -- cgit v1.2.3 From 0fb9a1abc8c97f858997e962694eb36b4517144e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:52 -0400 Subject: rseq: uapi: Update uapi comments Update rseq uapi header comments to reflect that user-space need to do thread-local loads/stores from/to the struct rseq fields. As a consequence of this added requirement, the kernel does not need to perform loads/stores with single-copy atomicity. Update the comment associated to the "flags" fields to describe more accurately that it's only useful to facilitate single-stepping through rseq critical sections with debuggers. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-4-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 69 ++++++++++++++++++++++++----------------------- kernel/rseq.c | 2 +- 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 519ad6e176d1..bf4188c13bec 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -67,28 +67,30 @@ struct rseq_cs { struct rseq { /* * Restartable sequences cpu_id_start field. Updated by the - * kernel, and read by user-space with single-copy atomicity - * semantics. Aligned on 32-bit. Always contains a value in the - * range of possible CPUs, although the value may not be the - * actual current CPU (e.g. if rseq is not initialized). This - * CPU number value should always be compared against the value - * of the cpu_id field before performing a rseq commit or - * returning a value read from a data structure indexed using - * the cpu_id_start value. + * kernel. Read by user-space with single-copy atomicity + * semantics. This field should only be read by the thread which + * registered this data structure. Aligned on 32-bit. Always + * contains a value in the range of possible CPUs, although the + * value may not be the actual current CPU (e.g. if rseq is not + * initialized). This CPU number value should always be compared + * against the value of the cpu_id field before performing a rseq + * commit or returning a value read from a data structure indexed + * using the cpu_id_start value. */ __u32 cpu_id_start; /* - * Restartable sequences cpu_id field. Updated by the kernel, - * and read by user-space with single-copy atomicity semantics. - * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and - * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the - * former means "rseq uninitialized", and latter means "rseq - * initialization failed". This value is meant to be read within - * rseq critical sections and compared with the cpu_id_start - * value previously read, before performing the commit instruction, - * or read and compared with the cpu_id_start value before returning - * a value loaded from a data structure indexed using the - * cpu_id_start value. + * Restartable sequences cpu_id field. Updated by the kernel. + * Read by user-space with single-copy atomicity semantics. This + * field should only be read by the thread which registered this + * data structure. Aligned on 32-bit. Values + * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED + * have a special semantic: the former means "rseq uninitialized", + * and latter means "rseq initialization failed". This value is + * meant to be read within rseq critical sections and compared + * with the cpu_id_start value previously read, before performing + * the commit instruction, or read and compared with the + * cpu_id_start value before returning a value loaded from a data + * structure indexed using the cpu_id_start value. */ __u32 cpu_id; /* @@ -105,27 +107,28 @@ struct rseq { * targeted by the rseq_cs. Also needs to be set to NULL by user-space * before reclaiming memory that contains the targeted struct rseq_cs. * - * Read and set by the kernel with single-copy atomicity semantics. - * Set by user-space with single-copy atomicity semantics. Aligned - * on 64-bit. + * Read and set by the kernel. Set by user-space with single-copy + * atomicity semantics. This field should only be updated by the + * thread which registered this data structure. Aligned on 64-bit. */ LINUX_FIELD_u32_u64(rseq_cs); /* - * - RSEQ_DISABLE flag: + * Restartable sequences flags field. + * + * This field should only be updated by the thread which + * registered this data structure. Read by the kernel. + * Mainly used for single-stepping through rseq critical sections + * with debuggers. * - * Fallback fast-track flag for single-stepping. - * Set by user-space if lack of progress is detected. - * Cleared by user-space after rseq finish. - * Read by the kernel. * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart and event - * counter increment on preemption for this thread. + * Inhibit instruction sequence block restart on preemption + * for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart and event - * counter increment on signal delivery for this thread. + * Inhibit instruction sequence block restart on signal + * delivery for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart and event - * counter increment on migration for this thread. + * Inhibit instruction sequence block restart on migration for + * this thread. */ __u32 flags; } __attribute__((aligned(4 * sizeof(__u64)))); diff --git a/kernel/rseq.c b/kernel/rseq.c index 2c8463acb50d..2a7748675be7 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -201,7 +201,7 @@ static int clear_rseq_cs(struct task_struct *t) * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * - * Set rseq_cs to NULL with single-copy atomicity. + * Set rseq_cs to NULL. */ return put_user(0UL, &t->rseq->rseq_cs); } -- cgit v1.2.3 From ec9c82e03a744e5698bd95eab872855861a821fa Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:53 -0400 Subject: rseq: uapi: Declare rseq_cs field as union, update includes Declaring the rseq_cs field as a union between __u64 and two __u32 allows both 32-bit and 64-bit kernels to read the full __u64, and therefore validate that a 32-bit user-space cleared the upper 32 bits, thus ensuring a consistent behavior between native 32-bit kernels and 32-bit compat tasks on 64-bit kernels. Check that the rseq_cs value read is < TASK_SIZE. The asm/byteorder.h header needs to be included by rseq.h, now that it is not using linux/types_32_64.h anymore. Considering that only __32 and __u64 types are declared in linux/rseq.h, the linux/types.h header should always be included for both kernel and user-space code: including stdint.h is just for u64 and u32, which are not used in this header at all. Use copy_from_user()/clear_user() to interact with a 64-bit field, because arm32 does not implement 64-bit __get_user, and ppc32 does not 64-bit get_user. Considering that the rseq_cs pointer does not need to be loaded/stored with single-copy atomicity from the kernel anymore, we can simply use copy_from_user()/clear_user(). Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-5-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 27 +++++++++++++++++++-------- kernel/rseq.c | 15 +++++++++------ tools/testing/selftests/rseq/rseq.h | 11 ++++++++++- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index bf4188c13bec..9a402fdb60e9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -10,13 +10,8 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#ifdef __KERNEL__ -# include -#else -# include -#endif - -#include +#include +#include enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, @@ -111,7 +106,23 @@ struct rseq { * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. */ - LINUX_FIELD_u32_u64(rseq_cs); + union { + __u64 ptr64; +#ifdef __LP64__ + __u64 ptr; +#else + struct { +#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) + __u32 padding; /* Initialized to zero. */ + __u32 ptr32; +#else /* LITTLE */ + __u32 ptr32; + __u32 padding; /* Initialized to zero. */ +#endif /* ENDIAN */ + } ptr; +#endif + } rseq_cs; + /* * Restartable sequences flags field. * diff --git a/kernel/rseq.c b/kernel/rseq.c index 2a7748675be7..c6242d8594dc 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -115,19 +115,20 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; - unsigned long ptr; + u64 ptr; u32 __user *usig; u32 sig; int ret; - ret = get_user(ptr, &t->rseq->rseq_cs); - if (ret) - return ret; + if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + return -EFAULT; if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } - urseq_cs = (struct rseq_cs __user *)ptr; + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; @@ -203,7 +204,9 @@ static int clear_rseq_cs(struct task_struct *t) * * Set rseq_cs to NULL. */ - return put_user(0UL, &t->rseq->rseq_cs); + if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + return -EFAULT; + return 0; } /* diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index a4684112676c..f2073cfa4448 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -133,6 +133,15 @@ static inline uint32_t rseq_current_cpu(void) return cpu; } +static inline void rseq_clear_rseq_cs(void) +{ +#ifdef __LP64__ + __rseq_abi.rseq_cs.ptr = 0; +#else + __rseq_abi.rseq_cs.ptr.ptr32 = 0; +#endif +} + /* * rseq_prepare_unload() should be invoked by each thread using rseq_finish*() * at least once between their last rseq_finish*() and library unload of the @@ -143,7 +152,7 @@ static inline uint32_t rseq_current_cpu(void) */ static inline void rseq_prepare_unload(void) { - __rseq_abi.rseq_cs = 0; + rseq_clear_rseq_cs(); } #endif /* RSEQ_H_ */ -- cgit v1.2.3 From 4f4c0acdf4652a964da869d578a3c8bf6df14ce2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:54 -0400 Subject: rseq: Remove unused types_32_64.h uapi header This header was introduced in the 4.18 merge window, and rseq does not need it anymore. Nuke it before the final release. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-6-mathieu.desnoyers@efficios.com --- include/uapi/linux/types_32_64.h | 50 ---------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 include/uapi/linux/types_32_64.h (limited to 'include') diff --git a/include/uapi/linux/types_32_64.h b/include/uapi/linux/types_32_64.h deleted file mode 100644 index 0a87ace34a57..000000000000 --- a/include/uapi/linux/types_32_64.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_TYPES_32_64_H -#define _UAPI_LINUX_TYPES_32_64_H - -/* - * linux/types_32_64.h - * - * Integer type declaration for pointers across 32-bit and 64-bit systems. - * - * Copyright (c) 2015-2018 Mathieu Desnoyers - */ - -#ifdef __KERNEL__ -# include -#else -# include -#endif - -#include - -#ifdef __BYTE_ORDER -# if (__BYTE_ORDER == __BIG_ENDIAN) -# define LINUX_BYTE_ORDER_BIG_ENDIAN -# else -# define LINUX_BYTE_ORDER_LITTLE_ENDIAN -# endif -#else -# ifdef __BIG_ENDIAN -# define LINUX_BYTE_ORDER_BIG_ENDIAN -# else -# define LINUX_BYTE_ORDER_LITTLE_ENDIAN -# endif -#endif - -#ifdef __LP64__ -# define LINUX_FIELD_u32_u64(field) __u64 field -# define LINUX_FIELD_u32_u64_INIT_ONSTACK(field, v) field = (intptr_t)v -#else -# ifdef LINUX_BYTE_ORDER_BIG_ENDIAN -# define LINUX_FIELD_u32_u64(field) __u32 field ## _padding, field -# define LINUX_FIELD_u32_u64_INIT_ONSTACK(field, v) \ - field ## _padding = 0, field = (intptr_t)v -# else -# define LINUX_FIELD_u32_u64(field) __u32 field, field ## _padding -# define LINUX_FIELD_u32_u64_INIT_ONSTACK(field, v) \ - field = (intptr_t)v, field ## _padding = 0 -# endif -#endif - -#endif /* _UAPI_LINUX_TYPES_32_64_H */ -- cgit v1.2.3 From b4e7a7a88b5d060650094b8d3454bc521d669f6a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Jun 2018 11:17:54 -0400 Subject: drm_mode_create_lease_ioctl(): fix open-coded filp_clone_open() Failure of ->open() should *not* be followed by fput(). Fixed by using filp_clone_open(), which gets the cleanups right. Cc: stable@vger.kernel.org Acked-by: Linus Torvalds Signed-off-by: Al Viro --- drivers/gpu/drm/drm_lease.c | 16 +--------------- fs/internal.h | 1 - include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c index 50c73c0a20b9..d638c0fb3418 100644 --- a/drivers/gpu/drm/drm_lease.c +++ b/drivers/gpu/drm/drm_lease.c @@ -553,24 +553,13 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, /* Clone the lessor file to create a new file for us */ DRM_DEBUG_LEASE("Allocating lease file\n"); - path_get(&lessor_file->f_path); - lessee_file = alloc_file(&lessor_file->f_path, - lessor_file->f_mode, - fops_get(lessor_file->f_inode->i_fop)); - + lessee_file = filp_clone_open(lessor_file); if (IS_ERR(lessee_file)) { ret = PTR_ERR(lessee_file); goto out_lessee; } - /* Initialize the new file for DRM */ - DRM_DEBUG_LEASE("Initializing the file with %p\n", lessee_file->f_op->open); - ret = lessee_file->f_op->open(lessee_file->f_inode, lessee_file); - if (ret) - goto out_lessee_file; - lessee_priv = lessee_file->private_data; - /* Change the file to a master one */ drm_master_put(&lessee_priv->master); lessee_priv->master = lessee; @@ -588,9 +577,6 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, DRM_DEBUG_LEASE("drm_mode_create_lease_ioctl succeeded\n"); return 0; -out_lessee_file: - fput(lessee_file); - out_lessee: drm_master_put(&lessee); diff --git a/fs/internal.h b/fs/internal.h index 980d005b21b4..5645b4ebf494 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -127,7 +127,6 @@ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); -extern struct file *filp_clone_open(struct file *); /* * inode.c diff --git a/include/linux/fs.h b/include/linux/fs.h index 5c91108846db..aa9b4c169ed2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2422,6 +2422,7 @@ extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file *filp_clone_open(struct file *); extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int, int *); -- cgit v1.2.3 From 8b7008620b8452728cadead460a36f64ed78c460 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 11 Jul 2018 14:39:42 +0200 Subject: net: Don't copy pfmemalloc flag in __copy_skb_header() The pfmemalloc flag indicates that the skb was allocated from the PFMEMALLOC reserves, and the flag is currently copied on skb copy and clone. However, an skb copied from an skb flagged with pfmemalloc wasn't necessarily allocated from PFMEMALLOC reserves, and on the other hand an skb allocated that way might be copied from an skb that wasn't. So we should not copy the flag on skb copy, and rather decide whether to allow an skb to be associated with sockets unrelated to page reclaim depending only on how it was allocated. Move the pfmemalloc flag before headers_start[0] using an existing 1-bit hole, so that __copy_skb_header() doesn't copy it. When cloning, we'll now take care of this flag explicitly, contravening to the warning comment of __skb_clone(). While at it, restore the newline usage introduced by commit b19372273164 ("net: reorganize sk_buff for faster __copy_skb_header()") to visually separate bytes used in bitfields after headers_start[0], that was gone after commit a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area"), and describe the pfmemalloc flag in the kernel-doc structure comment. This doesn't change the size of sk_buff or cacheline boundaries, but consolidates the 15 bits hole before tc_index into a 2 bytes hole before csum, that could now be filled more easily. Reported-by: Patrick Talbert Fixes: c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC reserves") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 +++++----- net/core/skbuff.c | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 164cdedf6012..610a201126ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -735,7 +736,7 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, - __unused:1; /* one bit hole */ + pfmemalloc:1; /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -754,31 +755,30 @@ struct sk_buff { __u8 __pkt_type_offset[0]; __u8 pkt_type:3; - __u8 pfmemalloc:1; __u8 ignore_df:1; - __u8 nf_trace:1; __u8 ip_summed:2; __u8 ooo_okay:1; + __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; - __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; + __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; - __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; + __u8 inner_protocol_type:1; __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eba8dae22c25..4df3164bb5fc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -858,6 +858,8 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->peeked = 0; + if (skb->pfmemalloc) + n->pfmemalloc = 1; n->destructor = NULL; C(tail); C(end); -- cgit v1.2.3 From c3086637b0d7dbee0925697f8dbee2bcf9637b9f Mon Sep 17 00:00:00 2001 From: Michael Heimpold Date: Wed, 11 Jul 2018 23:10:55 +0200 Subject: net: ethtool: fix spelling mistake: "tubale" -> "tunable" Signed-off-by: Michael Heimpold Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 4ca65b56084f..7363f18e65a5 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -226,7 +226,7 @@ enum tunable_id { ETHTOOL_TX_COPYBREAK, ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ /* - * Add your fresh new tubale attribute above and remember to update + * Add your fresh new tunable attribute above and remember to update * tunable_strings[] in net/core/ethtool.c */ __ETHTOOL_TUNABLE_COUNT, -- cgit v1.2.3 From a69258f7aa2623e0930212f09c586fd06674ad79 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jul 2018 06:04:53 -0700 Subject: tcp: remove DELAYED ACK events in DCTCP After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK related callbacks are no longer needed Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 2 -- net/ipv4/tcp_dctcp.c | 25 ------------------------- net/ipv4/tcp_output.c | 4 ---- 3 files changed, 31 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index af3ec72d5d41..3482d13d655b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -912,8 +912,6 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ - CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ - CA_EVENT_NON_DELAYED_ACK, }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 89f88b0d8167..5869f89ca656 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -55,7 +55,6 @@ struct dctcp { u32 dctcp_alpha; u32 next_seq; u32 ce_state; - u32 delayed_ack_reserved; u32 loss_cwnd; }; @@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); - ca->delayed_ack_reserved = 0; ca->loss_cwnd = 0; ca->ce_state = 0; @@ -250,25 +248,6 @@ static void dctcp_state(struct sock *sk, u8 new_state) } } -static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) -{ - struct dctcp *ca = inet_csk_ca(sk); - - switch (ev) { - case CA_EVENT_DELAYED_ACK: - if (!ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 1; - break; - case CA_EVENT_NON_DELAYED_ACK: - if (ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 0; - break; - default: - /* Don't care for the rest. */ - break; - } -} - static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { switch (ev) { @@ -278,10 +257,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) case CA_EVENT_ECN_NO_CE: dctcp_ce_state_1_to_0(sk); break; - case CA_EVENT_DELAYED_ACK: - case CA_EVENT_NON_DELAYED_ACK: - dctcp_update_ack_reserved(sk, ev); - break; default: /* Don't care for the rest. */ break; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e08b409c71e..00e5a300ddb9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3523,8 +3523,6 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; - tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); - if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3581,8 +3579,6 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; - tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); - /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. -- cgit v1.2.3 From a90744bac57c3c07d0d4422af62f3e44549ade30 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 13 Jul 2018 16:59:03 -0700 Subject: mm: allow arch to supply p??_free_tlb functions The mmu_gather APIs keep track of the invalidated address range including the span covered by invalidated page table pages. Ranges covered by page tables but not ptes (and therefore no TLBs) still need to be invalidated because some architectures (x86) can cache intermediate page table entries, and invalidate those with normal TLB invalidation instructions to be almost-backward-compatible. Architectures which don't cache intermediate page table entries, or which invalidate these caches separately from TLB invalidation, do not require TLB invalidation range expanded over page tables. Allow architectures to supply their own p??_free_tlb functions, which can avoid the __tlb_adjust_range. Link: http://lkml.kernel.org/r/20180703013131.2807-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Andrew Morton Cc: "Aneesh Kumar K. V" Cc: Minchan Kim Cc: Mel Gorman Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index faddde44de8c..3063125197ad 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -265,33 +265,41 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ +#ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pte_free_tlb(tlb, ptep, address); \ } while (0) +#endif +#ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) +#endif #ifndef __ARCH_HAS_4LEVEL_HACK +#ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif +#endif #ifndef __ARCH_HAS_5LEVEL_HACK +#ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif +#endif #define tlb_migrate_finish(mm) do {} while (0) -- cgit v1.2.3 From d1b47a7c9efcf3c3384b70f6e3c8f1423b44d8c7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 16 Jul 2018 11:16:30 -0400 Subject: mm: don't do zero_resv_unavail if memmap is not allocated Moving zero_resv_unavail before memmap_init_zone(), caused a regression on x86-32. The cause is that we access struct pages before they are allocated when CONFIG_FLAT_NODE_MEM_MAP is used. free_area_init_nodes() zero_resv_unavail() mm_zero_struct_page(pfn_to_page(pfn)); <- struct page is not alloced free_area_init_node() if CONFIG_FLAT_NODE_MEM_MAP alloc_node_mem_map() memblock_virt_alloc_node_nopanic() <- struct page alloced here On the other hand memblock_virt_alloc_node_nopanic() zeroes all the memory that it returns, so we do not need to do zero_resv_unavail() here. Fixes: e181ae0c5db9 ("mm: zero unavailable pages before memmap init") Signed-off-by: Pavel Tatashin Tested-by: Matt Hart Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a0fbb9ffe380..3982c83fdcbf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,7 +2132,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif -#ifdef CONFIG_HAVE_MEMBLOCK +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) void zero_resv_unavail(void); #else static inline void zero_resv_unavail(void) {} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d800d61ddb7..a790ef4be74e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6383,7 +6383,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -#ifdef CONFIG_HAVE_MEMBLOCK +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). But, there are some @@ -6421,7 +6421,7 @@ void __paginginit zero_resv_unavail(void) if (pgcnt) pr_info("Reserved but unavailable: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK */ +#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v1.2.3 From 6e2059b53f9885f202b086d7b4ca10a98926e974 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Jul 2018 22:41:26 +0800 Subject: ipv4/igmp: init group mode as INCLUDE when join source group Based on RFC3376 5.1 If no interface state existed for that multicast address before the change (i.e., the change consisted of creating a new per-interface record), or if no state exists after the change (i.e., the change consisted of deleting a per-interface record), then the "non-existent" state is considered to have a filter mode of INCLUDE and an empty source list. Which means a new multicast group should start with state IN(). Function ip_mc_join_group() works correctly for IGMP ASM(Any-Source Multicast) mode. It adds a group with state EX() and inits crcount to mc_qrv, so the kernel will send a TO_EX() report message after adding group. But for IGMPv3 SSM(Source-specific multicast) JOIN_SOURCE_GROUP mode, we split the group joining into two steps. First we join the group like ASM, i.e. via ip_mc_join_group(). So the state changes from IN() to EX(). Then we add the source-specific address with INCLUDE mode. So the state changes from EX() to IN(A). Before the first step sends a group change record, we finished the second step. So we will only send the second change record. i.e. TO_IN(A). Regarding the RFC stands, we should actually send an ALLOW(A) message for SSM JOIN_SOURCE_GROUP as the state should mimic the 'IN() to IN(A)' transition. The issue was exposed by commit a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change"). Before this change, we used to send both ALLOW(A) and TO_IN(A). After this change we only send TO_IN(A). Fix it by adding a new parameter to init group mode. Also add new wrapper functions so we don't need to change too much code. v1 -> v2: In my first version I only cleared the group change record. But this is not enough. Because when a new group join, it will init as EXCLUDE and trigger an filter mode change in ip/ip6_mc_add_src(), which will clear all source addresses' sf_crcount. This will prevent early joined address sending state change records if multi source addressed joined at the same time. In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM JOIN_SOURCE_GROUP. I also split the original patch into two separated patches for IPv4 and IPv6. Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change") Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ net/ipv4/igmp.c | 58 ++++++++++++++++++++++++++++++++++++-------------- net/ipv4/ip_sockglue.c | 4 ++-- 3 files changed, 46 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f8231854b5d6..119f53941c12 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -109,6 +109,8 @@ struct ip_mc_list { extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); +extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85b617b655bc..b3c899a630a0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1200,13 +1200,14 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; im->sfmode = pmc->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; im->sources = pmc->sources; for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->crcount; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + } else { + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); kfree(pmc); @@ -1288,7 +1289,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #endif } -static void igmp_group_added(struct ip_mc_list *im) +static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1316,7 +1317,13 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should + * not send filter-mode change record as the mode should be from + * IN() to IN(A). + */ + if (mode == MCAST_EXCLUDE) + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + igmp_ifc_event(in_dev); #endif } @@ -1381,8 +1388,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. */ - -void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode) { struct ip_mc_list *im; #ifdef CONFIG_IP_MULTICAST @@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; - ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } @@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ - im->sfmode = MCAST_EXCLUDE; - im->sfcount[MCAST_EXCLUDE] = 1; + im->sfmode = mode; + im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST @@ -1426,12 +1432,17 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif - igmp_group_added(im); + igmp_group_added(im, mode); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) @@ -1688,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -1751,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) /* Join a multicast group */ - -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; @@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; - iml->sfmode = MCAST_EXCLUDE; + iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - ip_mc_inc_group(in_dev, addr); + __ip_mc_inc_group(in_dev, addr, mode); err = 0; done: return err; } + +/* Join ASM (Any-Source Multicast) group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_join_group); +/* Join SSM (Source-Specific Multicast) group + */ +int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) +{ + return __ip_mc_join_group(sk, imr, mode); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc32fdbeefa6..64c76dcf7386 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -984,7 +984,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; @@ -1061,7 +1061,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; -- cgit v1.2.3 From c7ea20c9da5b94e400c8dcc0adb99411f2e430a6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Jul 2018 22:41:27 +0800 Subject: ipv6/mcast: init as INCLUDE when join SSM INCLUDE group This an IPv6 version patch of "ipv4/igmp: init group mode as INCLUDE when join source group". From RFC3810, part 6.1: If no per-interface state existed for that multicast address before the change (i.e., the change consisted of creating a new per-interface record), or if no state exists after the change (i.e., the change consisted of deleting a per-interface record), then the "non-existent" state is considered to have an INCLUDE filter mode and an empty source list. Which means a new multicast group should start with state IN(). Currently, for MLDv2 SSM JOIN_SOURCE_GROUP mode, we first call ipv6_sock_mc_join(), then ip6_mc_source(), which will trigger a TO_IN() message instead of ALLOW(). The issue was exposed by commit a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change"). Before this change, we sent both ALLOW(A) and TO_IN(A). Now, we only send TO_IN(A). Fix it by adding a new parameter to init group mode. Also add some wrapper functions to avoid changing too much code. v1 -> v2: In the first version I only cleared the group change record. But this is not enough. Because when a new group join, it will init as EXCLUDE and trigger a filter mode change in ip/ip6_mc_add_src(), which will clear all source addresses sf_crcount. This will prevent early joined address sending state change records if multi source addressed joined at the same time. In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM JOIN_SOURCE_GROUP. I also split the original patch into two separated patches for IPv4 and IPv6. There is also a difference between v4 and v6 version. For IPv6, when the interface goes down and up, we will send correct state change record with unspecified IPv6 address (::) with function ipv6_mc_up(). But after DAD is completed, we resend the change record TO_IN() in mld_send_initial_cr(). Fix it by sending ALLOW() for INCLUDE mode in mld_send_initial_cr(). Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change") Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ net/ipv6/ipv6_sockglue.c | 5 ++-- net/ipv6/mcast.c | 64 ++++++++++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d02881e4ad1f..7528632bcf2a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1100,6 +1100,8 @@ void ipv6_sysctl_unregister(void); int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); +int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c95c3486d904..568ca4187cd1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -729,8 +729,9 @@ done: struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; - retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, - &psin6->sin6_addr); + retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, + &psin6->sin6_addr, + MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index c0c74088f2af..2699be7202be 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); +static int __ipv6_dev_mc_inc(struct net_device *dev, + const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ @@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? iv : 1; } -int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; @@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } mc_lst->ifindex = dev->ifindex; - mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sfmode = mode; rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; @@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) * now add/increase the group membership on the device */ - err = ipv6_dev_mc_inc(dev, addr); + err = __ipv6_dev_mc_inc(dev, addr, mode); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); @@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ipv6_sock_mc_join); +int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode) +{ + return __ipv6_sock_mc_join(sk, ifindex, addr, mode); +} + /* * socket leave on multicast group */ @@ -646,7 +660,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } -static void igmp6_group_added(struct ifmcaddr6 *mc) +static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; @@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) } /* else v2 */ - mc->mca_crcount = mc->idev->mc_qrv; + /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we + * should not send filter-mode change record as the mode + * should be from IN() to IN(A). + */ + if (mode == MCAST_EXCLUDE) + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); } @@ -770,13 +790,14 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) spin_lock_bh(&im->mca_lock); if (pmc) { im->idev = pmc->idev; - im->mca_crcount = idev->mc_qrv; im->mca_sfmode = pmc->mca_sfmode; if (pmc->mca_sfmode == MCAST_INCLUDE) { im->mca_tomb = pmc->mca_tomb; im->mca_sources = pmc->mca_sources; for (psf = im->mca_sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->mca_crcount; + psf->sf_crcount = idev->mc_qrv; + } else { + im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); kfree(pmc); @@ -831,7 +852,8 @@ static void ma_put(struct ifmcaddr6 *mc) } static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, - const struct in6_addr *addr) + const struct in6_addr *addr, + unsigned int mode) { struct ifmcaddr6 *mc; @@ -849,9 +871,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, refcount_set(&mc->mca_refcnt, 1); spin_lock_init(&mc->mca_lock); - /* initial mode is (EX, empty) */ - mc->mca_sfmode = MCAST_EXCLUDE; - mc->mca_sfcount[MCAST_EXCLUDE] = 1; + mc->mca_sfmode = mode; + mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) @@ -863,7 +884,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, /* * device multicast group inc (add if not found) */ -int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +static int __ipv6_dev_mc_inc(struct net_device *dev, + const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; struct inet6_dev *idev; @@ -887,14 +909,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; write_unlock_bh(&idev->lock); - ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, - NULL, 0); + ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); in6_dev_put(idev); return 0; } } - mc = mca_alloc(idev, addr); + mc = mca_alloc(idev, addr, mode); if (!mc) { write_unlock_bh(&idev->lock); in6_dev_put(idev); @@ -911,11 +932,16 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); mld_del_delrec(idev, mc); - igmp6_group_added(mc); + igmp6_group_added(mc, mode); ma_put(mc); return 0; } +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +{ + return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); +} + /* * device multicast group del */ @@ -1751,7 +1777,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, psf_next = psf->sf_next; - if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } @@ -2066,7 +2092,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else - type = MLD2_CHANGE_TO_INCLUDE; + type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); spin_unlock_bh(&pmc->mca_lock); } @@ -2546,7 +2572,7 @@ void ipv6_mc_up(struct inet6_dev *idev) ipv6_mc_reset(idev); for (i = idev->mc_list; i; i = i->next) { mld_del_delrec(idev, i); - igmp6_group_added(i); + igmp6_group_added(i, i->mca_sfmode); } read_unlock_bh(&idev->lock); } -- cgit v1.2.3 From c133459765fae249ba482f62e12f987aec4376f0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 13 Jul 2018 21:25:19 -0700 Subject: net/ethernet/freescale/fman: fix cross-build error CC [M] drivers/net/ethernet/freescale/fman/fman.o In file included from ../drivers/net/ethernet/freescale/fman/fman.c:35: ../include/linux/fsl/guts.h: In function 'guts_set_dmacr': ../include/linux/fsl/guts.h:165:2: error: implicit declaration of function 'clrsetbits_be32' [-Werror=implicit-function-declaration] clrsetbits_be32(&guts->dmacr, 3 << shift, device << shift); ^~~~~~~~~~~~~~~ Signed-off-by: Randy Dunlap Cc: Madalin Bucur Cc: netdev@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: David S. Miller --- include/linux/fsl/guts.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fsl/guts.h b/include/linux/fsl/guts.h index 3efa3b861d44..941b11811f85 100644 --- a/include/linux/fsl/guts.h +++ b/include/linux/fsl/guts.h @@ -16,6 +16,7 @@ #define __FSL_GUTS_H__ #include +#include /** * Global Utility Registers. -- cgit v1.2.3 From 31048d7aedf31bf0f69c54a662944632f29d82f2 Mon Sep 17 00:00:00 2001 From: Stefan Baranoff Date: Sun, 15 Jul 2018 11:36:37 -0400 Subject: tcp: Fix broken repair socket window probe patch Correct previous bad attempt at allowing sockets to come out of TCP repair without sending window probes. To avoid changing size of the repair variable in struct tcp_sock, this lets the decision for sending probes or not to be made when coming out of repair by introducing two ways to turn it off. v2: * Remove erroneous comment; defines now make behavior clear Fixes: 70b7ff130224 ("tcp: allow user to create repair socket without window probes") Signed-off-by: Stefan Baranoff Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 4 ++++ net/ipv4/tcp.c | 13 +++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 29eb659aa77a..e3f6ed8a7064 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -127,6 +127,10 @@ enum { #define TCP_CM_INQ TCP_INQ +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + struct tcp_repair_opt { __u32 opt_code; __u32 opt_val; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e5e2ca9ab1b..ec2186e3087f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2823,16 +2823,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - /* 1 for normal repair, 2 for no window probes */ - else if (val == 1 || val == 2) { - tp->repair = val; + else if (val == TCP_REPAIR_ON) { + tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; - } else if (val == 0) { + } else if (val == TCP_REPAIR_OFF) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; + tcp_send_window_probe(sk); + } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; - if (tp->repair == 1) - tcp_send_window_probe(sk); } else err = -EINVAL; -- cgit v1.2.3 From 9ba546c01976a426292af99e682a557075d6c010 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jul 2018 15:48:46 +0200 Subject: aio: don't expose __aio_sigset in uapi glibc uses a different defintion of sigset_t than the kernel does, and the current version would pull in both. To fix this just do not expose the type at all - this somewhat mirrors pselect() where we do not even have a type for the magic sigmask argument, but just use pointer arithmetics. Fixes: 7a074e96 ("aio: implement io_pgetevents") Signed-off-by: Christoph Hellwig Reported-by: Adrian Reber Signed-off-by: Al Viro --- fs/aio.c | 5 +++++ include/linux/syscalls.h | 1 + include/uapi/linux/aio_abi.h | 6 ------ 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/aio.c b/fs/aio.c index e1d20124ec0e..b1a42e45698b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2042,6 +2042,11 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, return ret; } +struct __aio_sigset { + const sigset_t __user *sigmask; + size_t sigsetsize; +}; + SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 73810808cdf2..b06b5eeda8e8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -11,6 +11,7 @@ #ifndef _LINUX_SYSCALLS_H #define _LINUX_SYSCALLS_H +struct __aio_sigset; struct epoll_event; struct iattr; struct inode; diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d00221345c19..ce43d340f010 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -29,7 +29,6 @@ #include #include -#include #include typedef __kernel_ulong_t aio_context_t; @@ -108,10 +107,5 @@ struct iocb { #undef IFBIG #undef IFLITTLE -struct __aio_sigset { - const sigset_t __user *sigmask; - size_t sigsetsize; -}; - #endif /* __LINUX__AIO_ABI_H */ -- cgit v1.2.3 From a5fb9fb023a1435f2b42bccd7f547560f3a21dc3 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 18 Jul 2018 15:40:26 -0500 Subject: PCI: OF: Fix I/O space page leak When testing the R-Car PCIe driver on the Condor board, if the PCIe PHY driver was left disabled, the kernel crashed with this BUG: kernel BUG at lib/ioremap.c:72! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 39 Comm: kworker/0:1 Not tainted 4.17.0-dirty #1092 Hardware name: Renesas Condor board based on r8a77980 (DT) Workqueue: events deferred_probe_work_func pstate: 80000005 (Nzcv daif -PAN -UAO) pc : ioremap_page_range+0x370/0x3c8 lr : ioremap_page_range+0x40/0x3c8 sp : ffff000008da39e0 x29: ffff000008da39e0 x28: 00e8000000000f07 x27: ffff7dfffee00000 x26: 0140000000000000 x25: ffff7dfffef00000 x24: 00000000000fe100 x23: ffff80007b906000 x22: ffff000008ab8000 x21: ffff000008bb1d58 x20: ffff7dfffef00000 x19: ffff800009c30fb8 x18: 0000000000000001 x17: 00000000000152d0 x16: 00000000014012d0 x15: 0000000000000000 x14: 0720072007200720 x13: 0720072007200720 x12: 0720072007200720 x11: 0720072007300730 x10: 00000000000000ae x9 : 0000000000000000 x8 : ffff7dffff000000 x7 : 0000000000000000 x6 : 0000000000000100 x5 : 0000000000000000 x4 : 000000007b906000 x3 : ffff80007c61a880 x2 : ffff7dfffeefffff x1 : 0000000040000000 x0 : 00e80000fe100f07 Process kworker/0:1 (pid: 39, stack limit = 0x (ptrval)) Call trace: ioremap_page_range+0x370/0x3c8 pci_remap_iospace+0x7c/0xac pci_parse_request_of_pci_ranges+0x13c/0x190 rcar_pcie_probe+0x4c/0xb04 platform_drv_probe+0x50/0xbc driver_probe_device+0x21c/0x308 __device_attach_driver+0x98/0xc8 bus_for_each_drv+0x54/0x94 __device_attach+0xc4/0x12c device_initial_probe+0x10/0x18 bus_probe_device+0x90/0x98 deferred_probe_work_func+0xb0/0x150 process_one_work+0x12c/0x29c worker_thread+0x200/0x3fc kthread+0x108/0x134 ret_from_fork+0x10/0x18 Code: f9004ba2 54000080 aa0003fb 17ffff48 (d4210000) It turned out that pci_remap_iospace() wasn't undone when the driver's probe failed, and since devm_phy_optional_get() returned -EPROBE_DEFER, the probe was retried, finally causing the BUG due to trying to remap already remapped pages. Introduce the devm_pci_remap_iospace() managed API and replace the pci_remap_iospace() call with it to fix the bug. Fixes: dbf9826d5797 ("PCI: generic: Convert to DT resource parsing API") Signed-off-by: Sergei Shtylyov [lorenzo.pieralisi@arm.com: split commit/updated the commit log] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Linus Walleij --- drivers/pci/of.c | 2 +- drivers/pci/pci.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 2 ++ 3 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/pci/of.c b/drivers/pci/of.c index d088c9147f10..69a60d6ebd73 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -612,7 +612,7 @@ int pci_parse_request_of_pci_ranges(struct device *dev, switch (resource_type(res)) { case IORESOURCE_IO: - err = pci_remap_iospace(res, iobase); + err = devm_pci_remap_iospace(dev, res, iobase); if (err) { dev_warn(dev, "error %d: failed to map resource %pR\n", err, res); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 97acba712e4e..316496e99da9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3579,6 +3579,44 @@ void pci_unmap_iospace(struct resource *res) } EXPORT_SYMBOL(pci_unmap_iospace); +static void devm_pci_unmap_iospace(struct device *dev, void *ptr) +{ + struct resource **res = ptr; + + pci_unmap_iospace(*res); +} + +/** + * devm_pci_remap_iospace - Managed pci_remap_iospace() + * @dev: Generic device to remap IO address for + * @res: Resource describing the I/O space + * @phys_addr: physical address of range to be mapped + * + * Managed pci_remap_iospace(). Map is automatically unmapped on driver + * detach. + */ +int devm_pci_remap_iospace(struct device *dev, const struct resource *res, + phys_addr_t phys_addr) +{ + const struct resource **ptr; + int error; + + ptr = devres_alloc(devm_pci_unmap_iospace, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + error = pci_remap_iospace(res, phys_addr); + if (error) { + devres_free(ptr); + } else { + *ptr = res; + devres_add(dev, ptr); + } + + return error; +} +EXPORT_SYMBOL(devm_pci_remap_iospace); + /** * devm_pci_remap_cfgspace - Managed pci_remap_cfgspace() * @dev: Generic device to remap IO address for diff --git a/include/linux/pci.h b/include/linux/pci.h index 340029b2fb38..abd5d5e17aee 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1240,6 +1240,8 @@ int pci_register_io_range(struct fwnode_handle *fwnode, phys_addr_t addr, unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); +int devm_pci_remap_iospace(struct device *dev, const struct resource *res, + phys_addr_t phys_addr); void pci_unmap_iospace(struct resource *res); void __iomem *devm_pci_remap_cfgspace(struct device *dev, resource_size_t offset, -- cgit v1.2.3 From 169dc027fb02492ea37a0575db6a658cf922b854 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 17:12:39 +0100 Subject: ipv6: fix useless rol32 call on hash The rol32 call is currently rotating hash but the rol'd value is being discarded. I believe the current code is incorrect and hash should be assigned the rotated value returned from rol32. Thanks to David Lebrun for spotting this. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7528632bcf2a..8f73be494503 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -823,7 +823,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ - rol32(hash, 16); + hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; -- cgit v1.2.3 From 2db1581e1f432ac6b4efe152c57fdfb4de85c154 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 8 Jul 2018 14:23:21 +0800 Subject: Revert "iommu/vt-d: Clean up pasid quirk for pre-production devices" This reverts commit ab96746aaa344fb720a198245a837e266fad3b62. The commit ab96746aaa34 ("iommu/vt-d: Clean up pasid quirk for pre-production devices") triggers ECS mode on some platforms which have broken ECS support. As the result, graphic device will be inoperable on boot. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107017 Cc: Ashok Raj Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 32 ++++++++++++++++++++++++++++++-- include/linux/intel-iommu.h | 1 + 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b344a883f116..115ff26e9ced 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -484,14 +484,37 @@ static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int intel_iommu_ecs = 1; +static int intel_iommu_pasid28; static int iommu_identity_mapping; #define IDENTMAP_ALL 1 #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 -#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap)) -#define pasid_enabled(iommu) (ecs_enabled(iommu) && ecap_pasid(iommu->ecap)) +/* Broadwell and Skylake have broken ECS support — normal so-called "second + * level" translation of DMA requests-without-PASID doesn't actually happen + * unless you also set the NESTE bit in an extended context-entry. Which of + * course means that SVM doesn't work because it's trying to do nested + * translation of the physical addresses it finds in the process page tables, + * through the IOVA->phys mapping found in the "second level" page tables. + * + * The VT-d specification was retroactively changed to change the definition + * of the capability bits and pretend that Broadwell/Skylake never happened... + * but unfortunately the wrong bit was changed. It's ECS which is broken, but + * for some reason it was the PASID capability bit which was redefined (from + * bit 28 on BDW/SKL to bit 40 in future). + * + * So our test for ECS needs to eschew those implementations which set the old + * PASID capabiity bit 28, since those are the ones on which ECS is broken. + * Unless we are working around the 'pasid28' limitations, that is, by putting + * the device into passthrough mode for normal DMA and thus masking the bug. + */ +#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \ + (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap))) +/* PASID support is thus enabled if ECS is enabled and *either* of the old + * or new capability bits are set. */ +#define pasid_enabled(iommu) (ecs_enabled(iommu) && \ + (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap))) int intel_iommu_gfx_mapped; EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); @@ -554,6 +577,11 @@ static int __init intel_iommu_setup(char *str) printk(KERN_INFO "Intel-IOMMU: disable extended context table support\n"); intel_iommu_ecs = 0; + } else if (!strncmp(str, "pasid28", 7)) { + printk(KERN_INFO + "Intel-IOMMU: enable pre-production PASID support\n"); + intel_iommu_pasid28 = 1; + iommu_identity_mapping |= IDENTMAP_GFX; } else if (!strncmp(str, "tboot_noforce", 13)) { printk(KERN_INFO "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1df940196ab2..ef169d67df92 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -121,6 +121,7 @@ #define ecap_srs(e) ((e >> 31) & 0x1) #define ecap_ers(e) ((e >> 30) & 0x1) #define ecap_prs(e) ((e >> 29) & 0x1) +#define ecap_broken_pasid(e) ((e >> 28) & 0x1) #define ecap_dis(e) ((e >> 27) & 0x1) #define ecap_nest(e) ((e >> 26) & 0x1) #define ecap_mts(e) ((e >> 25) & 0x1) -- cgit v1.2.3 From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 13:48:51 -0700 Subject: mm: use helper functions for allocating and freeing vm_area structs The vm_area_struct is one of the most fundamental memory management objects, but the management of it is entirely open-coded evertwhere, ranging from allocation and freeing (using kmem_cache_[z]alloc and kmem_cache_free) to initializing all the fields. We want to unify this in order to end up having some unified initialization of the vmas, and the first step to this is to at least have basic allocation functions. Right now those functions are literally just wrappers around the kmem_cache_*() calls. This is a purely mechanical conversion: # new vma: kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc() # copy old vma kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old) # free vma kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma) to the point where the old vma passed in to the vm_area_dup() function isn't even used yet (because I've left all the old manual initialization alone). Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/mm/init.c | 8 ++++---- fs/exec.c | 4 ++-- include/linux/mm.h | 4 +++- kernel/fork.c | 21 ++++++++++++++++++--- mm/mmap.c | 22 +++++++++++----------- mm/nommu.c | 8 ++++---- 7 files changed, 44 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 3b38c717008a..e859246badca 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,7 +2278,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; @@ -2346,7 +2346,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t return 0; error: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); error_kmem: pfm_rvfree(smpl_buf, size); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 18278b448530..3f2321bffb72 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,7 +114,7 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -125,7 +125,7 @@ ia64_init_addr_space (void) down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(¤t->mm->mmap_sem); @@ -133,7 +133,7 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -144,7 +144,7 @@ ia64_init_addr_space (void) down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(¤t->mm->mmap_sem); diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..9bd83989ea25 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + bprm->vma = vma = vm_area_alloc(); if (!vma) return -ENOMEM; @@ -326,7 +326,7 @@ err: up_write(&mm->mmap_sem); err_free: bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return err; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 3982c83fdcbf..de2fd86c6154 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). */ -extern struct kmem_cache *vm_area_cachep; +struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *); +void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..0e23deb5acfc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -303,11 +303,26 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; +static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +struct vm_area_struct *vm_area_alloc(void) +{ + return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +} + +void vm_area_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + static void account_kernel_stack(struct task_struct *tsk, int account) { void *stack = task_stack_page(tsk); @@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -539,7 +554,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); + vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); diff --git a/mm/mmap.c b/mm/mmap.c index 5801b5f0a634..4286ad2dd1f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return next; } @@ -911,7 +911,7 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + vm_area_free(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1729,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1832,7 +1832,7 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -2620,7 +2620,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return err; } - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) return -ENOMEM; @@ -2669,7 +2669,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: - kmem_cache_free(vm_area_cachep, new); + vm_area_free(new); return err; } @@ -2984,7 +2984,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; @@ -3202,7 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { - new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new_vma = vm_area_dup(vma); if (!new_vma) goto out; *new_vma = *vma; @@ -3226,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: - kmem_cache_free(vm_area_cachep, new_vma); + vm_area_free(new_vma); out: return NULL; } @@ -3350,7 +3350,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3376,7 +3376,7 @@ static struct vm_area_struct *__install_special_mapping( return vma; out: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ERR_PTR(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 4452d8bd9ae4..006e3fe65017 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -769,7 +769,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); } /* @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) goto error_getting_vma; @@ -1368,7 +1368,7 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ret; sharing_violation: @@ -1469,7 +1469,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!region) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) { kmem_cache_free(vm_region_jar, region); return -ENOMEM; -- cgit v1.2.3 From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 15:24:03 -0700 Subject: mm: make vm_area_alloc() initialize core fields Like vm_area_dup(), it initializes the anon_vma_chain head, and the basic mm pointer. The rest of the fields end up being different for different users, although the plan is to also initialize the 'vm_ops' field to a dummy entry. Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 +--- arch/ia64/mm/init.c | 8 ++------ fs/exec.c | 4 +--- include/linux/mm.h | 2 +- kernel/fork.c | 10 ++++++++-- mm/mmap.c | 12 +++--------- mm/nommu.c | 3 +-- 7 files changed, 17 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e859246badca..46bff1661836 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,17 +2278,15 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; } - INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer */ - vma->vm_mm = mm; vma->vm_file = get_file(filp); vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 3f2321bffb72..bdb14a369137 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,10 +114,8 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. */ - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; @@ -133,10 +131,8 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | diff --git a/fs/exec.c b/fs/exec.c index 9bd83989ea25..72e961a62adb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = vm_area_alloc(); + bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; @@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) err = -EINTR; goto err_free; } - vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture @@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) diff --git a/include/linux/mm.h b/include/linux/mm.h index de2fd86c6154..d3a3842316b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). */ -struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); diff --git a/kernel/fork.c b/kernel/fork.c index 67253e41bfb0..a191c05e757d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(void) +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + + if (vma) { + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); + } + return vma; } struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) diff --git a/mm/mmap.c b/mm/mmap.c index b0ed8ce1b67e..ff1944d8d458 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -2979,14 +2977,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; @@ -3343,12 +3339,10 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; diff --git a/mm/nommu.c b/mm/nommu.c index c2560e9cc803..1d22fdbf7d7c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (!vma) goto error_getting_vma; @@ -1212,7 +1212,6 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; -- cgit v1.2.3 From f88a333b44318643282b8acc92af90deda441f5e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Jul 2018 15:07:11 +0100 Subject: alpha: fix osf_wait4() breakage kernel_wait4() expects a userland address for status - it's only rusage that goes as a kernel one (and needs a copyout afterwards) [ Also, fix the prototype of kernel_wait4() to have that __user annotation - Linus ] Fixes: 92ebce5ac55d ("osf_wait4: switch to kernel_wait4()") Cc: stable@kernel.org # v4.13+ Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/kernel/osf_sys.c | 5 +---- include/linux/sched/task.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 6e921754c8fc..c210a25dd6da 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1180,13 +1180,10 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options, struct rusage32 __user *, ur) { - unsigned int status = 0; struct rusage r; - long err = kernel_wait4(pid, &status, options, &r); + long err = kernel_wait4(pid, ustatus, options, &r); if (err <= 0) return err; - if (put_user(status, ustatus)) - return -EFAULT; if (!ur) return err; if (put_tv_to_tv32(&ur->ru_utime, &r.ru_utime)) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 5be31eb7b266..108ede99e533 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -75,7 +75,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern long kernel_wait4(pid_t, int *, int, struct rusage *); +extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); extern void free_task(struct task_struct *tsk); -- cgit v1.2.3 From 6cbc304f2f360f25cc8607817239d6f4a2fd3dc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 May 2018 15:48:41 +0200 Subject: perf/x86/intel: Fix unwind errors from PEBS entries (mk-II) Vince reported the perf_fuzzer giving various unwinder warnings and Josh reported: > Deja vu. Most of these are related to perf PEBS, similar to the > following issue: > > b8000586c90b ("perf/x86/intel: Cure bogus unwind from PEBS entries") > > This is basically the ORC version of that. setup_pebs_sample_data() is > assembling a franken-pt_regs which ORC isn't happy about. RIP is > inconsistent with some of the other registers (like RSP and RBP). And where the previous unwinder only needed BP,SP ORC also requires IP. But we cannot spoof IP because then the sample will get displaced, entirely negating the point of PEBS. So cure the whole thing differently by doing the unwind early; this does however require a means to communicate we did the unwind early. We (ab)use an unused sample_type bit for this, which we set on events that fill out the data->callchain before the normal perf_prepare_sample(). Debugged-by: Josh Poimboeuf Reported-by: Vince Weaver Tested-by: Josh Poimboeuf Tested-by: Prashant Bhole Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 3 +++ arch/x86/events/intel/ds.c | 25 +++++++++++-------------- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 2 ++ kernel/events/core.c | 6 ++++-- 5 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 707b2a96e516..86f0c15dcc2d 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2997,6 +2997,9 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8cf03f101938..8dbba77e0518 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1185,17 +1185,21 @@ static void setup_pebs_sample_data(struct perf_event *event, data->data_src.val = val; } + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happend between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + /* * We use the interrupt regs as a base because the PEBS record does not * contain a full regs set, specifically it seems to lack segment * descriptors, which get used by things like user_mode(). * * In the simple case fix up only the IP for PERF_SAMPLE_IP. - * - * We must however always use BP,SP from iregs for the unwinder to stay - * sane; the record BP,SP can point into thin air when the record is - * from a previous PMI context or an (I)RET happend between the record - * and PMI. */ *regs = *iregs; @@ -1214,15 +1218,8 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->si = pebs->si; regs->di = pebs->di; - /* - * Per the above; only set BP,SP if we don't need callchains. - * - * XXX: does this make sense? - */ - if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) { - regs->bp = pebs->bp; - regs->sp = pebs->sp; - } + regs->bp = pebs->bp; + regs->sp = pebs->sp; #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1fa12887ec02..87f6db437e4a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1130,6 +1130,7 @@ extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); +extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b8e288a1f740..eeb787b1c53c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,6 +143,8 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ + + __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..cdb32cf8e33c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6343,7 +6343,7 @@ static u64 perf_virt_to_phys(u64 virt) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; -static struct perf_callchain_entry * +struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; @@ -6382,7 +6382,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(event, regs); + if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + data->callchain = perf_callchain(event, regs); + size += data->callchain->nr; header->size += size * sizeof(u64); -- cgit v1.2.3