From 8d50cdf8b8341770bc6367bce40c0c1bb0e1d5b3 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 19 May 2022 20:32:13 -0700 Subject: x86/speculation/mmio: Add sysfs reporting for Processor MMIO Stale Data Add the sysfs reporting file for Processor MMIO Stale Data vulnerability. It exposes the vulnerability and mitigation state similar to the existing files for the other hardware vulnerabilities. Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov --- include/linux/cpu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 54dc2f9a2d56..2c7477354744 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -65,6 +65,9 @@ extern ssize_t cpu_show_tsx_async_abort(struct device *dev, extern ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_mmio_stale_data(struct device *dev, + struct device_attribute *attr, + char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From 7b6c7a877cc616bc7dc9cd39646fe454acbed48b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Jun 2022 08:04:44 -0700 Subject: x86/ftrace: Remove OBJECT_FILES_NON_STANDARD usage The file-wide OBJECT_FILES_NON_STANDARD annotation is used with CONFIG_FRAME_POINTER to tell objtool to skip the entire file when frame pointers are enabled. However that annotation is now deprecated because it doesn't work with IBT, where objtool runs on vmlinux.o instead of individual translation units. Instead, use more fine-grained function-specific annotations: - The 'save_mcount_regs' macro does funny things with the frame pointer. Use STACK_FRAME_NON_STANDARD_FP to tell objtool to ignore the functions using it. - The return_to_handler() "function" isn't actually a callable function. Instead of being called, it's returned to. The real return address isn't on the stack, so unwinding is already doomed no matter which unwinder is used. So just remove the STT_FUNC annotation, telling objtool to ignore it. That also removes the implicit ANNOTATE_NOENDBR, which now needs to be made explicit. Fixes the following warning: vmlinux.o: warning: objtool: __fentry__+0x16: return with modified stack frame Fixes: ed53a0d97192 ("x86/alternative: Use .ibt_endbr_seal to seal indirect calls") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/b7a7a42fe306aca37826043dac89e113a1acdbac.1654268610.git.jpoimboe@kernel.org --- arch/x86/kernel/Makefile | 4 ---- arch/x86/kernel/ftrace_64.S | 11 ++++++++--- include/linux/objtool.h | 6 ++++++ tools/include/linux/objtool.h | 6 ++++++ 4 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 03364dc40d8d..4c8b6ae802ac 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,10 +36,6 @@ KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD_test_nx.o := y -ifdef CONFIG_FRAME_POINTER -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y -endif - # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 4ec13608d3c6..dfeb227de561 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -175,6 +175,7 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue SYM_FUNC_END(ftrace_caller); +STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_epilogue) /* @@ -282,6 +283,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) +STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -311,10 +313,14 @@ trace: jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) +STACK_FRAME_NON_STANDARD_FP(__fentry__) + #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(return_to_handler) +SYM_CODE_START(return_to_handler) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR subq $16, %rsp /* Save the return values */ @@ -339,7 +345,6 @@ SYM_FUNC_START(return_to_handler) int3 .Ldo_rop: mov %rdi, (%rsp) - UNWIND_HINT_FUNC RET -SYM_FUNC_END(return_to_handler) +SYM_CODE_END(return_to_handler) #endif diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 6491fa8fba6d..15b940ec1eac 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -143,6 +143,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD_FP func:req +#ifdef CONFIG_FRAME_POINTER + STACK_FRAME_NON_STANDARD \func +#endif +.endm + .macro ANNOTATE_NOENDBR .Lhere_\@: .pushsection .discard.noendbr diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 6491fa8fba6d..15b940ec1eac 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -143,6 +143,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD_FP func:req +#ifdef CONFIG_FRAME_POINTER + STACK_FRAME_NON_STANDARD \func +#endif +.endm + .macro ANNOTATE_NOENDBR .Lhere_\@: .pushsection .discard.noendbr -- cgit v1.2.3 From 4527d47bb63a134c4483a1a478d0ff5874b466c7 Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" Date: Tue, 7 Jun 2022 19:08:48 +0800 Subject: drm/atomic: fix warning of unused variable Fix the `unused-but-set-variable` warning as how other iteration wrappers do. Link: https://lore.kernel.org/all/202206071049.pofHsRih-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: GONG, Ruiqi Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20220607110848.941486-1-gongruiqi1@huawei.com --- include/drm/drm_atomic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 0777725085df..10b1990bc1f6 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -1022,6 +1022,7 @@ void drm_state_dump(struct drm_device *dev, struct drm_printer *p); for ((__i) = 0; \ (__i) < (__state)->num_private_objs && \ ((obj) = (__state)->private_objs[__i].ptr, \ + (void)(obj) /* Only to avoid unused-but-set-variable warning */, \ (new_obj_state) = (__state)->private_objs[__i].new_state, 1); \ (__i)++) -- cgit v1.2.3 From cfab87c2c2715763dc7e43d9968bdaa01cde4bc3 Mon Sep 17 00:00:00 2001 From: Vijaya Krishna Nivarthi Date: Wed, 8 Jun 2022 00:22:44 +0530 Subject: serial: core: Introduce callback for start_rx and do stop_rx in suspend only if this callback implementation is present. In suspend sequence there is a need to perform stop_rx during suspend sequence to prevent any asynchronous data over rx line. However this can cause problem to drivers which dont do re-start_rx during set_termios. Add new callback start_rx and perform stop_rx only when implementation of start_rx is present. Also add call to start_rx in resume sequence so that drivers who come across this problem can make use of this framework. Fixes: c9d2325cdb92 ("serial: core: Do stop_rx in suspend path for console if console_suspend is disabled") Reviewed-by: Douglas Anderson Signed-off-by: Vijaya Krishna Nivarthi Link: https://lore.kernel.org/r/1654627965-1461-2-git-send-email-quic_vnivarth@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 9 ++++++--- include/linux/serial_core.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 9a85b41caa0a..338ebadfd44b 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -2214,11 +2214,12 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) /* * Nothing to do if the console is not suspending * except stop_rx to prevent any asynchronous data - * over RX line. Re-start_rx, when required, is - * done by set_termios in resume sequence + * over RX line. However ensure that we will be + * able to Re-start_rx later. */ if (!console_suspend_enabled && uart_console(uport)) { - uport->ops->stop_rx(uport); + if (uport->ops->start_rx) + uport->ops->stop_rx(uport); goto unlock; } @@ -2310,6 +2311,8 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) if (console_suspend_enabled) uart_change_pm(state, UART_PM_STATE_ON); uport->ops->set_termios(uport, &termios, NULL); + if (!console_suspend_enabled && uport->ops->start_rx) + uport->ops->start_rx(uport); if (console_suspend_enabled) console_start(uport->cons); } diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index cbd5070bc87f..657a0fc68a3f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -45,6 +45,7 @@ struct uart_ops { void (*unthrottle)(struct uart_port *); void (*send_xchar)(struct uart_port *, char ch); void (*stop_rx)(struct uart_port *); + void (*start_rx)(struct uart_port *); void (*enable_ms)(struct uart_port *); void (*break_ctl)(struct uart_port *, int ctl); int (*startup)(struct uart_port *); -- cgit v1.2.3 From cd756dafd86ee3a4969906086f3c2537e0c6d9d0 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Mon, 6 Jun 2022 14:22:00 +0100 Subject: staging: Also remove the Unisys visorbus.h The commit that removed the Unisys s-Par and visorbus drivers left around the include/linux/visorbus.h file mentioned in the MAINTAINERS entry, we can also remove that too. Fixes: e5f45b011e4a ("staging: Remove the drivers for the Unisys s-Par") Reviewed-by: Fabio M. De Francesco Signed-off-by: Peter Robinson Link: https://lore.kernel.org/r/20220606132200.2873243-1-pbrobinson@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/visorbus.h | 344 ----------------------------------------------- 1 file changed, 344 deletions(-) delete mode 100644 include/linux/visorbus.h (limited to 'include') diff --git a/include/linux/visorbus.h b/include/linux/visorbus.h deleted file mode 100644 index 0d8bd6769b13..000000000000 --- a/include/linux/visorbus.h +++ /dev/null @@ -1,344 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (C) 2010 - 2013 UNISYS CORPORATION - * All rights reserved. - */ - -/* - * This header file is to be included by other kernel mode components that - * implement a particular kind of visor_device. Each of these other kernel - * mode components is called a visor device driver. Refer to visortemplate - * for a minimal sample visor device driver. - * - * There should be nothing in this file that is private to the visorbus - * bus implementation itself. - */ - -#ifndef __VISORBUS_H__ -#define __VISORBUS_H__ - -#include - -#define VISOR_CHANNEL_SIGNATURE ('L' << 24 | 'N' << 16 | 'C' << 8 | 'E') - -/* - * enum channel_serverstate - * @CHANNELSRV_UNINITIALIZED: Channel is in an undefined state. - * @CHANNELSRV_READY: Channel has been initialized by server. - */ -enum channel_serverstate { - CHANNELSRV_UNINITIALIZED = 0, - CHANNELSRV_READY = 1 -}; - -/* - * enum channel_clientstate - * @CHANNELCLI_DETACHED: - * @CHANNELCLI_DISABLED: Client can see channel but is NOT allowed to use it - * unless given TBD* explicit request - * (should actually be < DETACHED). - * @CHANNELCLI_ATTACHING: Legacy EFI client request for EFI server to attach. - * @CHANNELCLI_ATTACHED: Idle, but client may want to use channel any time. - * @CHANNELCLI_BUSY: Client either wants to use or is using channel. - * @CHANNELCLI_OWNED: "No worries" state - client can access channel - * anytime. - */ -enum channel_clientstate { - CHANNELCLI_DETACHED = 0, - CHANNELCLI_DISABLED = 1, - CHANNELCLI_ATTACHING = 2, - CHANNELCLI_ATTACHED = 3, - CHANNELCLI_BUSY = 4, - CHANNELCLI_OWNED = 5 -}; - -/* - * Values for VISOR_CHANNEL_PROTOCOL.Features: This define exists so that - * a guest can look at the FeatureFlags in the io channel, and configure the - * driver to use interrupts or not based on this setting. All feature bits for - * all channels should be defined here. The io channel feature bits are defined - * below. - */ -#define VISOR_DRIVER_ENABLES_INTS (0x1ULL << 1) -#define VISOR_CHANNEL_IS_POLLING (0x1ULL << 3) -#define VISOR_IOVM_OK_DRIVER_DISABLING_INTS (0x1ULL << 4) -#define VISOR_DRIVER_DISABLES_INTS (0x1ULL << 5) -#define VISOR_DRIVER_ENHANCED_RCVBUF_CHECKING (0x1ULL << 6) - -/* - * struct channel_header - Common Channel Header - * @signature: Signature. - * @legacy_state: DEPRECATED - being replaced by. - * @header_size: sizeof(struct channel_header). - * @size: Total size of this channel in bytes. - * @features: Flags to modify behavior. - * @chtype: Channel type: data, bus, control, etc.. - * @partition_handle: ID of guest partition. - * @handle: Device number of this channel in client. - * @ch_space_offset: Offset in bytes to channel specific area. - * @version_id: Struct channel_header Version ID. - * @partition_index: Index of guest partition. - * @zone_uuid: Guid of Channel's zone. - * @cli_str_offset: Offset from channel header to null-terminated - * ClientString (0 if ClientString not present). - * @cli_state_boot: CHANNEL_CLIENTSTATE of pre-boot EFI client of this - * channel. - * @cmd_state_cli: CHANNEL_COMMANDSTATE (overloaded in Windows drivers, see - * ServerStateUp, ServerStateDown, etc). - * @cli_state_os: CHANNEL_CLIENTSTATE of Guest OS client of this channel. - * @ch_characteristic: CHANNEL_CHARACTERISTIC_. - * @cmd_state_srv: CHANNEL_COMMANDSTATE (overloaded in Windows drivers, see - * ServerStateUp, ServerStateDown, etc). - * @srv_state: CHANNEL_SERVERSTATE. - * @cli_error_boot: Bits to indicate err states for boot clients, so err - * messages can be throttled. - * @cli_error_os: Bits to indicate err states for OS clients, so err - * messages can be throttled. - * @filler: Pad out to 128 byte cacheline. - * @recover_channel: Please add all new single-byte values below here. - */ -struct channel_header { - u64 signature; - u32 legacy_state; - /* SrvState, CliStateBoot, and CliStateOS below */ - u32 header_size; - u64 size; - u64 features; - guid_t chtype; - u64 partition_handle; - u64 handle; - u64 ch_space_offset; - u32 version_id; - u32 partition_index; - guid_t zone_guid; - u32 cli_str_offset; - u32 cli_state_boot; - u32 cmd_state_cli; - u32 cli_state_os; - u32 ch_characteristic; - u32 cmd_state_srv; - u32 srv_state; - u8 cli_error_boot; - u8 cli_error_os; - u8 filler[1]; - u8 recover_channel; -} __packed; - -#define VISOR_CHANNEL_ENABLE_INTS (0x1ULL << 0) - -/* - * struct signal_queue_header - Subheader for the Signal Type variation of the - * Common Channel. - * @version: SIGNAL_QUEUE_HEADER Version ID. - * @chtype: Queue type: storage, network. - * @size: Total size of this queue in bytes. - * @sig_base_offset: Offset to signal queue area. - * @features: Flags to modify behavior. - * @num_sent: Total # of signals placed in this queue. - * @num_overflows: Total # of inserts failed due to full queue. - * @signal_size: Total size of a signal for this queue. - * @max_slots: Max # of slots in queue, 1 slot is always empty. - * @max_signals: Max # of signals in queue (MaxSignalSlots-1). - * @head: Queue head signal #. - * @num_received: Total # of signals removed from this queue. - * @tail: Queue tail signal. - * @reserved1: Reserved field. - * @reserved2: Reserved field. - * @client_queue: - * @num_irq_received: Total # of Interrupts received. This is incremented by the - * ISR in the guest windows driver. - * @num_empty: Number of times that visor_signal_remove is called and - * returned Empty Status. - * @errorflags: Error bits set during SignalReinit to denote trouble with - * client's fields. - * @filler: Pad out to 64 byte cacheline. - */ -struct signal_queue_header { - /* 1st cache line */ - u32 version; - u32 chtype; - u64 size; - u64 sig_base_offset; - u64 features; - u64 num_sent; - u64 num_overflows; - u32 signal_size; - u32 max_slots; - u32 max_signals; - u32 head; - /* 2nd cache line */ - u64 num_received; - u32 tail; - u32 reserved1; - u64 reserved2; - u64 client_queue; - u64 num_irq_received; - u64 num_empty; - u32 errorflags; - u8 filler[12]; -} __packed; - -/* VISORCHANNEL Guids */ -/* {414815ed-c58c-11da-95a9-00e08161165f} */ -#define VISOR_VHBA_CHANNEL_GUID \ - GUID_INIT(0x414815ed, 0xc58c, 0x11da, \ - 0x95, 0xa9, 0x0, 0xe0, 0x81, 0x61, 0x16, 0x5f) -#define VISOR_VHBA_CHANNEL_GUID_STR \ - "414815ed-c58c-11da-95a9-00e08161165f" -struct visorchipset_state { - u32 created:1; - u32 attached:1; - u32 configured:1; - u32 running:1; - /* Remaining bits in this 32-bit word are reserved. */ -}; - -/** - * struct visor_device - A device type for things "plugged" into the visorbus - * bus - * @visorchannel: Points to the channel that the device is - * associated with. - * @channel_type_guid: Identifies the channel type to the bus driver. - * @device: Device struct meant for use by the bus driver - * only. - * @list_all: Used by the bus driver to enumerate devices. - * @timer: Timer fired periodically to do interrupt-type - * activity. - * @being_removed: Indicates that the device is being removed from - * the bus. Private bus driver use only. - * @visordriver_callback_lock: Used by the bus driver to lock when adding and - * removing devices. - * @pausing: Indicates that a change towards a paused state. - * is in progress. Only modified by the bus driver. - * @resuming: Indicates that a change towards a running state - * is in progress. Only modified by the bus driver. - * @chipset_bus_no: Private field used by the bus driver. - * @chipset_dev_no: Private field used the bus driver. - * @state: Used to indicate the current state of the - * device. - * @inst: Unique GUID for this instance of the device. - * @name: Name of the device. - * @pending_msg_hdr: For private use by bus driver to respond to - * hypervisor requests. - * @vbus_hdr_info: A pointer to header info. Private use by bus - * driver. - * @partition_guid: Indicates client partion id. This should be the - * same across all visor_devices in the current - * guest. Private use by bus driver only. - */ -struct visor_device { - struct visorchannel *visorchannel; - guid_t channel_type_guid; - /* These fields are for private use by the bus driver only. */ - struct device device; - struct list_head list_all; - struct timer_list timer; - bool timer_active; - bool being_removed; - struct mutex visordriver_callback_lock; /* synchronize probe/remove */ - bool pausing; - bool resuming; - u32 chipset_bus_no; - u32 chipset_dev_no; - struct visorchipset_state state; - guid_t inst; - u8 *name; - struct controlvm_message_header *pending_msg_hdr; - void *vbus_hdr_info; - guid_t partition_guid; - struct dentry *debugfs_dir; - struct dentry *debugfs_bus_info; -}; - -#define to_visor_device(x) container_of(x, struct visor_device, device) - -typedef void (*visorbus_state_complete_func) (struct visor_device *dev, - int status); - -/* - * This struct describes a specific visor channel, by providing its GUID, name, - * and sizes. - */ -struct visor_channeltype_descriptor { - const guid_t guid; - const char *name; - u64 min_bytes; - u32 version; -}; - -/** - * struct visor_driver - Information provided by each visor driver when it - * registers with the visorbus driver - * @name: Name of the visor driver. - * @owner: The module owner. - * @channel_types: Types of channels handled by this driver, ending with - * a zero GUID. Our specialized BUS.match() method knows - * about this list, and uses it to determine whether this - * driver will in fact handle a new device that it has - * detected. - * @probe: Called when a new device comes online, by our probe() - * function specified by driver.probe() (triggered - * ultimately by some call to driver_register(), - * bus_add_driver(), or driver_attach()). - * @remove: Called when a new device is removed, by our remove() - * function specified by driver.remove() (triggered - * ultimately by some call to device_release_driver()). - * @channel_interrupt: Called periodically, whenever there is a possiblity - * that "something interesting" may have happened to the - * channel. - * @pause: Called to initiate a change of the device's state. If - * the return valu`e is < 0, there was an error and the - * state transition will NOT occur. If the return value - * is >= 0, then the state transition was INITIATED - * successfully, and complete_func() will be called (or - * was just called) with the final status when either the - * state transition fails or completes successfully. - * @resume: Behaves similar to pause. - * @driver: Private reference to the device driver. For use by bus - * driver only. - */ -struct visor_driver { - const char *name; - struct module *owner; - struct visor_channeltype_descriptor *channel_types; - int (*probe)(struct visor_device *dev); - void (*remove)(struct visor_device *dev); - void (*channel_interrupt)(struct visor_device *dev); - int (*pause)(struct visor_device *dev, - visorbus_state_complete_func complete_func); - int (*resume)(struct visor_device *dev, - visorbus_state_complete_func complete_func); - - /* These fields are for private use by the bus driver only. */ - struct device_driver driver; -}; - -#define to_visor_driver(x) (container_of(x, struct visor_driver, driver)) - -int visor_check_channel(struct channel_header *ch, struct device *dev, - const guid_t *expected_uuid, char *chname, - u64 expected_min_bytes, u32 expected_version, - u64 expected_signature); - -int visorbus_register_visor_driver(struct visor_driver *drv); -void visorbus_unregister_visor_driver(struct visor_driver *drv); -int visorbus_read_channel(struct visor_device *dev, - unsigned long offset, void *dest, - unsigned long nbytes); -int visorbus_write_channel(struct visor_device *dev, - unsigned long offset, void *src, - unsigned long nbytes); -int visorbus_enable_channel_interrupts(struct visor_device *dev); -void visorbus_disable_channel_interrupts(struct visor_device *dev); - -int visorchannel_signalremove(struct visorchannel *channel, u32 queue, - void *msg); -int visorchannel_signalinsert(struct visorchannel *channel, u32 queue, - void *msg); -bool visorchannel_signalempty(struct visorchannel *channel, u32 queue); -const guid_t *visorchannel_get_guid(struct visorchannel *channel); - -#define BUS_ROOT_DEVICE UINT_MAX -struct visor_device *visorbus_get_device_by_id(u32 bus_no, u32 dev_no, - struct visor_device *from); -#endif -- cgit v1.2.3 From 4314f9f4f85832b5082f4e38b07b63b11baa538c Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 8 Jun 2022 10:55:28 +0100 Subject: firmware: arm_scmi: Avoid using extended string-buffers sizes if not necessary Commit b260fccaebdc2 ("firmware: arm_scmi: Add SCMI v3.1 protocol extended names support") moved all the name string buffers to use the extended buffer size of 64 instead of the required 16 bytes. While that should be fine if the firmware terminates the string before 16 bytes, there is possibility of copying random data if the name is not NULL terminated by the firmware. SCMI base protocol agent_name/vendor_id/sub_vendor_id are defined by the specification as NULL-terminated ASCII strings up to 16-bytes in length. The underlying buffers and message descriptors are currently bigger than needed; resize them to fit only the strictly needed 16 bytes to avoid any possible leaks when reading data from the firmware. Change the size argument of strlcpy to use SCMI_SHORT_NAME_MAX_SIZE always when dealing with short domain names, so as to limit the possibility that an ill-formed non-NULL terminated short reply from the SCMI platform firmware can leak stale content laying in the underlying transport shared memory area. While at that, convert all strings handling routines to use the preferred strscpy. Link: https://lore.kernel.org/r/20220608095530.497879-1-cristian.marussi@arm.com Fixes: b260fccaebdc2 ("firmware: arm_scmi: Add SCMI v3.1 protocol extended names support") Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/base.c | 8 ++++---- drivers/firmware/arm_scmi/clock.c | 2 +- drivers/firmware/arm_scmi/perf.c | 2 +- drivers/firmware/arm_scmi/power.c | 2 +- drivers/firmware/arm_scmi/protocols.h | 2 -- drivers/firmware/arm_scmi/reset.c | 2 +- drivers/firmware/arm_scmi/sensors.c | 4 ++-- drivers/firmware/arm_scmi/voltage.c | 2 +- include/linux/scmi_protocol.h | 9 +++++---- 9 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/firmware/arm_scmi/base.c b/drivers/firmware/arm_scmi/base.c index d0ac96da1ddf..a52f084a6a87 100644 --- a/drivers/firmware/arm_scmi/base.c +++ b/drivers/firmware/arm_scmi/base.c @@ -36,7 +36,7 @@ struct scmi_msg_resp_base_attributes { struct scmi_msg_resp_base_discover_agent { __le32 agent_id; - u8 name[SCMI_MAX_STR_SIZE]; + u8 name[SCMI_SHORT_NAME_MAX_SIZE]; }; @@ -119,7 +119,7 @@ scmi_base_vendor_id_get(const struct scmi_protocol_handle *ph, bool sub_vendor) ret = ph->xops->do_xfer(ph, t); if (!ret) - memcpy(vendor_id, t->rx.buf, size); + strscpy(vendor_id, t->rx.buf, size); ph->xops->xfer_put(ph, t); @@ -276,7 +276,7 @@ static int scmi_base_discover_agent_get(const struct scmi_protocol_handle *ph, ret = ph->xops->do_xfer(ph, t); if (!ret) { agent_info = t->rx.buf; - strlcpy(name, agent_info->name, SCMI_MAX_STR_SIZE); + strscpy(name, agent_info->name, SCMI_SHORT_NAME_MAX_SIZE); } ph->xops->xfer_put(ph, t); @@ -375,7 +375,7 @@ static int scmi_base_protocol_init(const struct scmi_protocol_handle *ph) int id, ret; u8 *prot_imp; u32 version; - char name[SCMI_MAX_STR_SIZE]; + char name[SCMI_SHORT_NAME_MAX_SIZE]; struct device *dev = ph->dev; struct scmi_revision_info *rev = scmi_revision_area_get(ph); diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c index 1a718faa4192..c7a83f6e38e5 100644 --- a/drivers/firmware/arm_scmi/clock.c +++ b/drivers/firmware/arm_scmi/clock.c @@ -153,7 +153,7 @@ static int scmi_clock_attributes_get(const struct scmi_protocol_handle *ph, if (!ret) { u32 latency = 0; attributes = le32_to_cpu(attr->attributes); - strlcpy(clk->name, attr->name, SCMI_MAX_STR_SIZE); + strscpy(clk->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE); /* clock_enable_latency field is present only since SCMI v3.1 */ if (PROTOCOL_REV_MAJOR(version) >= 0x2) latency = le32_to_cpu(attr->clock_enable_latency); diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c index c1f701623058..bbb0331801ff 100644 --- a/drivers/firmware/arm_scmi/perf.c +++ b/drivers/firmware/arm_scmi/perf.c @@ -252,7 +252,7 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph, dom_info->mult_factor = (dom_info->sustained_freq_khz * 1000) / dom_info->sustained_perf_level; - strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); + strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE); } ph->xops->xfer_put(ph, t); diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c index 964882cc8747..356e83631664 100644 --- a/drivers/firmware/arm_scmi/power.c +++ b/drivers/firmware/arm_scmi/power.c @@ -122,7 +122,7 @@ scmi_power_domain_attributes_get(const struct scmi_protocol_handle *ph, dom_info->state_set_notify = SUPPORTS_STATE_SET_NOTIFY(flags); dom_info->state_set_async = SUPPORTS_STATE_SET_ASYNC(flags); dom_info->state_set_sync = SUPPORTS_STATE_SET_SYNC(flags); - strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); + strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE); } ph->xops->xfer_put(ph, t); diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h index 73304af5ec4a..c679f3fb8718 100644 --- a/drivers/firmware/arm_scmi/protocols.h +++ b/drivers/firmware/arm_scmi/protocols.h @@ -24,8 +24,6 @@ #include -#define SCMI_SHORT_NAME_MAX_SIZE 16 - #define PROTOCOL_REV_MINOR_MASK GENMASK(15, 0) #define PROTOCOL_REV_MAJOR_MASK GENMASK(31, 16) #define PROTOCOL_REV_MAJOR(x) ((u16)(FIELD_GET(PROTOCOL_REV_MAJOR_MASK, (x)))) diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c index a420a9102094..673f3eb498f4 100644 --- a/drivers/firmware/arm_scmi/reset.c +++ b/drivers/firmware/arm_scmi/reset.c @@ -116,7 +116,7 @@ scmi_reset_domain_attributes_get(const struct scmi_protocol_handle *ph, dom_info->latency_us = le32_to_cpu(attr->latency); if (dom_info->latency_us == U32_MAX) dom_info->latency_us = 0; - strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); + strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE); } ph->xops->xfer_put(ph, t); diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 8a93dd944c49..7288c6117838 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -412,7 +412,7 @@ iter_axes_desc_process_response(const struct scmi_protocol_handle *ph, attrh = le32_to_cpu(adesc->attributes_high); a->scale = S32_EXT(SENSOR_SCALE(attrh)); a->type = SENSOR_TYPE(attrh); - strscpy(a->name, adesc->name, SCMI_MAX_STR_SIZE); + strscpy(a->name, adesc->name, SCMI_SHORT_NAME_MAX_SIZE); if (a->extended_attrs) { unsigned int ares = le32_to_cpu(adesc->resolution); @@ -634,7 +634,7 @@ iter_sens_descr_process_response(const struct scmi_protocol_handle *ph, SUPPORTS_AXIS(attrh) ? SENSOR_AXIS_NUMBER(attrh) : 0, SCMI_MAX_NUM_SENSOR_AXIS); - strscpy(s->name, sdesc->name, SCMI_MAX_STR_SIZE); + strscpy(s->name, sdesc->name, SCMI_SHORT_NAME_MAX_SIZE); /* * If supported overwrite short name with the extended diff --git a/drivers/firmware/arm_scmi/voltage.c b/drivers/firmware/arm_scmi/voltage.c index 97df6d3dd131..5de93f637bd4 100644 --- a/drivers/firmware/arm_scmi/voltage.c +++ b/drivers/firmware/arm_scmi/voltage.c @@ -233,7 +233,7 @@ static int scmi_voltage_descriptors_get(const struct scmi_protocol_handle *ph, v = vinfo->domains + dom; v->id = dom; attributes = le32_to_cpu(resp_dom->attr); - strlcpy(v->name, resp_dom->name, SCMI_MAX_STR_SIZE); + strscpy(v->name, resp_dom->name, SCMI_SHORT_NAME_MAX_SIZE); /* * If supported overwrite short name with the extended one; diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 1c58646ba381..704111f63993 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -13,8 +13,9 @@ #include #include -#define SCMI_MAX_STR_SIZE 64 -#define SCMI_MAX_NUM_RATES 16 +#define SCMI_MAX_STR_SIZE 64 +#define SCMI_SHORT_NAME_MAX_SIZE 16 +#define SCMI_MAX_NUM_RATES 16 /** * struct scmi_revision_info - version information structure @@ -36,8 +37,8 @@ struct scmi_revision_info { u8 num_protocols; u8 num_agents; u32 impl_ver; - char vendor_id[SCMI_MAX_STR_SIZE]; - char sub_vendor_id[SCMI_MAX_STR_SIZE]; + char vendor_id[SCMI_SHORT_NAME_MAX_SIZE]; + char sub_vendor_id[SCMI_SHORT_NAME_MAX_SIZE]; }; struct scmi_clock_info { -- cgit v1.2.3 From 993d0b287e2ef7bee2e8b13b0ce4d2b5066f278e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 12 Jun 2022 22:32:25 +0100 Subject: usercopy: Handle vm_map_ram() areas vmalloc does not allocate a vm_struct for vm_map_ram() areas. That causes us to deny usercopies from those areas. This affects XFS which uses vm_map_ram() for its directories. Fix this by calling find_vmap_area() instead of find_vm_area(). Fixes: 0aef499f3172 ("mm/usercopy: Detect vmalloc overruns") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Uladzislau Rezki (Sony) Tested-by: Zorro Lang Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220612213227.3881769-2-willy@infradead.org --- include/linux/vmalloc.h | 1 + mm/usercopy.c | 10 ++++------ mm/vmalloc.c | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b159c2789961..096d48aa3437 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -215,6 +215,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); +struct vmap_area *find_vmap_area(unsigned long addr); static inline bool is_vm_area_hugepages(const void *addr) { diff --git a/mm/usercopy.c b/mm/usercopy.c index baeacc735b83..cd4b41d9bf76 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -173,16 +173,14 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } if (is_vmalloc_addr(ptr)) { - struct vm_struct *area = find_vm_area(ptr); + struct vmap_area *area = find_vmap_area((unsigned long)ptr); unsigned long offset; - if (!area) { + if (!area) usercopy_abort("vmalloc", "no area", to_user, 0, n); - return; - } - offset = ptr - area->addr; - if (offset + n > get_vm_area_size(area)) + offset = (unsigned long)ptr - area->va_start; + if ((unsigned long)ptr + n > area->va_end) usercopy_abort("vmalloc", NULL, to_user, offset, n); return; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 07db42455dd4..effd1ff6a4b4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1798,7 +1798,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) free_vmap_area_noflush(va); } -static struct vmap_area *find_vmap_area(unsigned long addr) +struct vmap_area *find_vmap_area(unsigned long addr) { struct vmap_area *va; -- cgit v1.2.3 From 0f9cd1ea10d307cad221d6693b648a8956e812b0 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 13 Jun 2022 09:37:03 +0200 Subject: drm/ttm: fix bulk move handling v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resource must be on the LRU before ttm_lru_bulk_move_add() is called and we need to check if the BO is pinned or not before adding it. Additional to that we missed taking the LRU spinlock in ttm_bo_unpin(). Signed-off-by: Christian König Reviewed-by: Arunpravin Paneer Selvam Acked-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20220613080816.4965-1-christian.koenig@amd.com Fixes: fee2ede15542 ("drm/ttm: rework bulk move handling v5") --- drivers/gpu/drm/ttm/ttm_bo.c | 22 ++++++++++------ drivers/gpu/drm/ttm/ttm_resource.c | 52 ++++++++++++++++++++++++++------------ include/drm/ttm/ttm_resource.h | 8 +++--- 3 files changed, 54 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 75d308ec173d..406e9c324e76 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -109,11 +109,11 @@ void ttm_bo_set_bulk_move(struct ttm_buffer_object *bo, return; spin_lock(&bo->bdev->lru_lock); - if (bo->bulk_move && bo->resource) - ttm_lru_bulk_move_del(bo->bulk_move, bo->resource); + if (bo->resource) + ttm_resource_del_bulk_move(bo->resource, bo); bo->bulk_move = bulk; - if (bo->bulk_move && bo->resource) - ttm_lru_bulk_move_add(bo->bulk_move, bo->resource); + if (bo->resource) + ttm_resource_add_bulk_move(bo->resource, bo); spin_unlock(&bo->bdev->lru_lock); } EXPORT_SYMBOL(ttm_bo_set_bulk_move); @@ -689,8 +689,11 @@ void ttm_bo_pin(struct ttm_buffer_object *bo) { dma_resv_assert_held(bo->base.resv); WARN_ON_ONCE(!kref_read(&bo->kref)); - if (!(bo->pin_count++) && bo->bulk_move && bo->resource) - ttm_lru_bulk_move_del(bo->bulk_move, bo->resource); + spin_lock(&bo->bdev->lru_lock); + if (bo->resource) + ttm_resource_del_bulk_move(bo->resource, bo); + ++bo->pin_count; + spin_unlock(&bo->bdev->lru_lock); } EXPORT_SYMBOL(ttm_bo_pin); @@ -707,8 +710,11 @@ void ttm_bo_unpin(struct ttm_buffer_object *bo) if (WARN_ON_ONCE(!bo->pin_count)) return; - if (!(--bo->pin_count) && bo->bulk_move && bo->resource) - ttm_lru_bulk_move_add(bo->bulk_move, bo->resource); + spin_lock(&bo->bdev->lru_lock); + --bo->pin_count; + if (bo->resource) + ttm_resource_add_bulk_move(bo->resource, bo); + spin_unlock(&bo->bdev->lru_lock); } EXPORT_SYMBOL(ttm_bo_unpin); diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 65889b3caf50..20f9adcc3235 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -91,8 +91,8 @@ static void ttm_lru_bulk_move_pos_tail(struct ttm_lru_bulk_move_pos *pos, } /* Add the resource to a bulk_move cursor */ -void ttm_lru_bulk_move_add(struct ttm_lru_bulk_move *bulk, - struct ttm_resource *res) +static void ttm_lru_bulk_move_add(struct ttm_lru_bulk_move *bulk, + struct ttm_resource *res) { struct ttm_lru_bulk_move_pos *pos = ttm_lru_bulk_move_pos(bulk, res); @@ -105,8 +105,8 @@ void ttm_lru_bulk_move_add(struct ttm_lru_bulk_move *bulk, } /* Remove the resource from a bulk_move range */ -void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk, - struct ttm_resource *res) +static void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk, + struct ttm_resource *res) { struct ttm_lru_bulk_move_pos *pos = ttm_lru_bulk_move_pos(bulk, res); @@ -122,6 +122,22 @@ void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk, } } +/* Add the resource to a bulk move if the BO is configured for it */ +void ttm_resource_add_bulk_move(struct ttm_resource *res, + struct ttm_buffer_object *bo) +{ + if (bo->bulk_move && !bo->pin_count) + ttm_lru_bulk_move_add(bo->bulk_move, res); +} + +/* Remove the resource from a bulk move if the BO is configured for it */ +void ttm_resource_del_bulk_move(struct ttm_resource *res, + struct ttm_buffer_object *bo) +{ + if (bo->bulk_move && !bo->pin_count) + ttm_lru_bulk_move_del(bo->bulk_move, res); +} + /* Move a resource to the LRU or bulk tail */ void ttm_resource_move_to_lru_tail(struct ttm_resource *res) { @@ -169,15 +185,14 @@ void ttm_resource_init(struct ttm_buffer_object *bo, res->bus.is_iomem = false; res->bus.caching = ttm_cached; res->bo = bo; - INIT_LIST_HEAD(&res->lru); man = ttm_manager_type(bo->bdev, place->mem_type); spin_lock(&bo->bdev->lru_lock); - man->usage += res->num_pages << PAGE_SHIFT; - if (bo->bulk_move) - ttm_lru_bulk_move_add(bo->bulk_move, res); + if (bo->pin_count) + list_add_tail(&res->lru, &bo->bdev->pinned); else - ttm_resource_move_to_lru_tail(res); + list_add_tail(&res->lru, &man->lru[bo->priority]); + man->usage += res->num_pages << PAGE_SHIFT; spin_unlock(&bo->bdev->lru_lock); } EXPORT_SYMBOL(ttm_resource_init); @@ -210,8 +225,16 @@ int ttm_resource_alloc(struct ttm_buffer_object *bo, { struct ttm_resource_manager *man = ttm_manager_type(bo->bdev, place->mem_type); + int ret; + + ret = man->func->alloc(man, bo, place, res_ptr); + if (ret) + return ret; - return man->func->alloc(man, bo, place, res_ptr); + spin_lock(&bo->bdev->lru_lock); + ttm_resource_add_bulk_move(*res_ptr, bo); + spin_unlock(&bo->bdev->lru_lock); + return 0; } void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res) @@ -221,12 +244,9 @@ void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res) if (!*res) return; - if (bo->bulk_move) { - spin_lock(&bo->bdev->lru_lock); - ttm_lru_bulk_move_del(bo->bulk_move, *res); - spin_unlock(&bo->bdev->lru_lock); - } - + spin_lock(&bo->bdev->lru_lock); + ttm_resource_del_bulk_move(*res, bo); + spin_unlock(&bo->bdev->lru_lock); man = ttm_manager_type(bo->bdev, (*res)->mem_type); man->func->free(man, *res); *res = NULL; diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index 441653693970..ca89a48c2460 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -311,12 +311,12 @@ ttm_resource_manager_cleanup(struct ttm_resource_manager *man) } void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk); -void ttm_lru_bulk_move_add(struct ttm_lru_bulk_move *bulk, - struct ttm_resource *res); -void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk, - struct ttm_resource *res); void ttm_lru_bulk_move_tail(struct ttm_lru_bulk_move *bulk); +void ttm_resource_add_bulk_move(struct ttm_resource *res, + struct ttm_buffer_object *bo); +void ttm_resource_del_bulk_move(struct ttm_resource *res, + struct ttm_buffer_object *bo); void ttm_resource_move_to_lru_tail(struct ttm_resource *res); void ttm_resource_init(struct ttm_buffer_object *bo, -- cgit v1.2.3 From d884b6498d2f022098502e106d5a45ab635f2e9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 14 Jun 2022 17:51:18 +0100 Subject: io_uring: remove IORING_CLOSE_FD_AND_FILE_SLOT This partially reverts a7c41b4687f5902af70cd559806990930c8a307b Even though IORING_CLOSE_FD_AND_FILE_SLOT might save cycles for some users, but it tries to do two things at a time and it's not clear how to handle errors and what to return in a single result field when one part fails and another completes well. Kill it for now. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/837c745019b3795941eee4fcfd7de697886d645b.1655224415.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++--------- include/uapi/linux/io_uring.h | 6 ------ 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1b95c6750a81..1b0b6099e717 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -576,7 +576,6 @@ struct io_close { struct file *file; int fd; u32 file_slot; - u32 flags; }; struct io_timeout_data { @@ -5966,18 +5965,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->off || sqe->addr || sqe->len || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); req->close.file_slot = READ_ONCE(sqe->file_index); - req->close.flags = READ_ONCE(sqe->close_flags); - if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT) - return -EINVAL; - if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) && - req->close.file_slot && req->close.fd) + if (req->close.file_slot && req->close.fd) return -EINVAL; return 0; @@ -5993,8 +5988,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) if (req->close.file_slot) { ret = io_close_fixed(req, issue_flags); - if (ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT)) - goto err; + goto err; } spin_lock(&files->file_lock); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 776e0278f9dd..53e7dae92e42 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -47,7 +47,6 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; - __u32 close_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -259,11 +258,6 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) -/* - * close flags, store in sqe->close_flags - */ -#define IORING_CLOSE_FD_AND_FILE_SLOT (1U << 0) - /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 018ab4fabddd94f1c96f3b59e180691b9e88d5d8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Jun 2022 10:36:11 -0700 Subject: netfs: fix up netfs_inode_init() docbook comment Commit e81fb4198e27 ("netfs: Further cleanups after struct netfs_inode wrapper introduced") changed the argument types and names, and actually updated the comment too (although that was thanks to David Howells, not me: my original patch only changed the code). But the comment fixup didn't go quite far enough, and didn't change the argument name in the comment, resulting in include/linux/netfs.h:314: warning: Function parameter or member 'ctx' not described in 'netfs_inode_init' include/linux/netfs.h:314: warning: Excess function parameter 'inode' description in 'netfs_inode_init' during htmldoc generation. Fixes: e81fb4198e27 ("netfs: Further cleanups after struct netfs_inode wrapper introduced") Reported-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 097cdd644665..1773e5df8e65 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -304,7 +304,7 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) /** * netfs_inode_init - Initialise a netfslib inode context - * @inode: The netfs inode to initialise + * @ctx: The netfs inode to initialise * @ops: The netfs's operations list * * Initialise the netfs library context struct. This is expected to follow on -- cgit v1.2.3 From b87f02307d3cfbda768520f0687c51ca77e14fc3 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 15 Jun 2022 18:28:05 +0200 Subject: printk: Wait for the global console lock when the system is going down There are reports that the console kthreads block the global console lock when the system is going down, for example, reboot, panic. First part of the solution was to block kthreads in these problematic system states so they stopped handling newly added messages. Second part of the solution is to wait when for the kthreads when they are actively printing. It solves the problem when a message was printed before the system entered the problematic state and the kthreads managed to step in. A busy waiting has to be used because panic() can be called in any context and in an unknown state of the scheduler. There must be a timeout because the kthread might get stuck or sleeping and never release the lock. The timeout 10s is an arbitrary value inspired by the softlockup timeout. Link: https://lore.kernel.org/r/20220610205038.GA3050413@paulmck-ThinkPad-P17-Gen-1 Link: https://lore.kernel.org/r/CAMdYzYpF4FNTBPZsEFeWRuEwSies36QM_As8osPWZSr2q-viEA@mail.gmail.com Signed-off-by: Petr Mladek Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220615162805.27962-3-pmladek@suse.com --- include/linux/printk.h | 5 +++++ kernel/panic.c | 2 ++ kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 4 ++++ kernel/printk/printk_safe.c | 32 ++++++++++++++++++++++++++++++++ kernel/reboot.c | 2 ++ 6 files changed, 47 insertions(+) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index cd26aab0ab2a..c1e07c0652c7 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -174,6 +174,7 @@ extern void printk_prefer_direct_enter(void); extern void printk_prefer_direct_exit(void); extern bool pr_flush(int timeout_ms, bool reset_on_progress); +extern void try_block_console_kthreads(int timeout_ms); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -238,6 +239,10 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) return true; } +static inline void try_block_console_kthreads(int timeout_ms) +{ +} + static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/panic.c b/kernel/panic.c index 6737b2332275..fe73d18ecdf0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -273,6 +273,7 @@ void panic(const char *fmt, ...) * unfortunately means it may not be hardened to work in a * panic situation. */ + try_block_console_kthreads(10000); smp_send_stop(); } else { /* @@ -280,6 +281,7 @@ void panic(const char *fmt, ...) * kmsg_dump, we will need architecture dependent extra * works in addition to stopping other CPUs. */ + try_block_console_kthreads(10000); crash_smp_send_stop(); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index d947ca6c84f9..e7d8578860ad 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -20,6 +20,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +extern bool block_console_kthreads; + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 45c6c2b0b104..b095fb5f5f61 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -250,6 +250,9 @@ static atomic_t console_kthreads_active = ATOMIC_INIT(0); #define console_kthread_printing_exit() \ atomic_dec(&console_kthreads_active) +/* Block console kthreads to avoid processing new messages. */ +bool block_console_kthreads; + /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -3730,6 +3733,7 @@ static bool printer_should_wake(struct console *con, u64 seq) if (con->blocked || console_kthreads_atomically_blocked() || + block_console_kthreads || system_state > SYSTEM_RUNNING || oops_in_progress) { return false; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index ef0f9a2044da..caac4de1ea59 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include "internal.h" @@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args) return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); + +/** + * try_block_console_kthreads() - Try to block console kthreads and + * make the global console_lock() avaialble + * + * @timeout_ms: The maximum time (in ms) to wait. + * + * Prevent console kthreads from starting processing new messages. Wait + * until the global console_lock() become available. + * + * Context: Can be called in any context. + */ +void try_block_console_kthreads(int timeout_ms) +{ + block_console_kthreads = true; + + /* Do not wait when the console lock could not be safely taken. */ + if (this_cpu_read(printk_context) || in_nmi()) + return; + + while (timeout_ms > 0) { + if (console_trylock()) { + console_unlock(); + return; + } + + udelay(1000); + timeout_ms -= 1; + } +} diff --git a/kernel/reboot.c b/kernel/reboot.c index 4177645e74d6..310363685502 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -74,6 +74,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; + try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } @@ -262,6 +263,7 @@ static void kernel_shutdown_prepare(enum system_states state) blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; + try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } -- cgit v1.2.3 From 4bca7e80b6455772b4bf3f536dcbc19aac424d6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Jun 2022 15:22:29 +0200 Subject: init: Initialize noop_backing_dev_info early noop_backing_dev_info is used by superblocks of various pseudofilesystems such as kdevtmpfs. After commit 10e14073107d ("writeback: Fix inode->i_io_list not be protected by inode->i_lock error") this broke because __mark_inode_dirty() started to access more fields from noop_backing_dev_info and this led to crashes inside locked_inode_to_wb_and_lock_list() called from __mark_inode_dirty(). Fix the problem by initializing noop_backing_dev_info before the filesystems get mounted. Fixes: 10e14073107d ("writeback: Fix inode->i_io_list not be protected by inode->i_lock error") Reported-and-tested-by: Suzuki K Poulose Reported-and-tested-by: Alexandru Elisei Reported-and-tested-by: Guenter Roeck Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- drivers/base/init.c | 2 ++ include/linux/backing-dev.h | 2 ++ mm/backing-dev.c | 11 ++--------- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/base/init.c b/drivers/base/init.c index d8d0fe687111..397eb9880cec 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "base.h" @@ -20,6 +21,7 @@ void __init driver_init(void) { /* These are the core pieces */ + bdi_init(&noop_backing_dev_info); devtmpfs_init(); devices_init(); buses_init(); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2bd073fa6bb5..d452071db572 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -119,6 +119,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); extern struct backing_dev_info noop_backing_dev_info; +int bdi_init(struct backing_dev_info *bdi); + /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ff60bd7d74e0..95550b8fa7fe 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -231,20 +231,13 @@ static __init int bdi_class_init(void) } postcore_initcall(bdi_class_init); -static int bdi_init(struct backing_dev_info *bdi); - static int __init default_bdi_init(void) { - int err; - bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; - - err = bdi_init(&noop_backing_dev_info); - - return err; + return 0; } subsys_initcall(default_bdi_init); @@ -781,7 +774,7 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ -static int bdi_init(struct backing_dev_info *bdi) +int bdi_init(struct backing_dev_info *bdi) { int ret; -- cgit v1.2.3 From 593d1ebe00a45af5cb7bda1235c0790987c2a2b2 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 15 Jun 2022 12:32:13 -0700 Subject: Revert "net: Add a second bind table hashed by port and address" This reverts: commit d5a42de8bdbe ("net: Add a second bind table hashed by port and address") commit 538aaf9b2383 ("selftests: Add test for timing a bind request to a port with a populated bhash entry") Link: https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/ There are a few things that need to be fixed here: * Updating bhash2 in cases where the socket's rcv saddr changes * Adding bhash2 hashbucket locks Links to syzbot reports: https://lore.kernel.org/netdev/00000000000022208805e0df247a@google.com/ https://lore.kernel.org/netdev/0000000000003f33bc05dfaf44fe@google.com/ Fixes: d5a42de8bdbe ("net: Add a second bind table hashed by port and address") Reported-by: syzbot+015d756bbd1f8b5c8f09@syzkaller.appspotmail.com Reported-by: syzbot+98fd2d1422063b0f8c44@syzkaller.appspotmail.com Reported-by: syzbot+0a847a982613c6438fba@syzkaller.appspotmail.com Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20220615193213.2419568-1-joannelkoong@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 - include/net/inet_hashtables.h | 68 +------ include/net/sock.h | 14 -- net/dccp/proto.c | 33 +--- net/ipv4/inet_connection_sock.c | 247 +++++++------------------- net/ipv4/inet_hashtables.c | 193 ++------------------ net/ipv4/tcp.c | 14 +- tools/testing/selftests/net/.gitignore | 1 - tools/testing/selftests/net/Makefile | 2 - tools/testing/selftests/net/bind_bhash_test.c | 119 ------------- 10 files changed, 83 insertions(+), 611 deletions(-) delete mode 100644 tools/testing/selftests/net/bind_bhash_test.c (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 077cd730ce2f..85cd695e7fd1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -25,7 +25,6 @@ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; -struct inet_bind2_bucket; struct tcp_congestion_ops; /* @@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops { * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node - * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout @@ -85,7 +83,6 @@ struct inet_connection_sock { struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; - struct inet_bind2_bucket *icsk_bind2_hash; unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a0887b70967b..ebfa3df6f8dc 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -90,32 +90,11 @@ struct inet_bind_bucket { struct hlist_head owners; }; -struct inet_bind2_bucket { - possible_net_t ib_net; - int l3mdev; - unsigned short port; - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; -#endif - __be32 rcv_saddr; - }; - /* Node in the inet2_bind_hashbucket chain */ - struct hlist_node node; - /* List of sockets hashed to this bucket */ - struct hlist_head owners; -}; - static inline struct net *ib_net(struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } -static inline struct net *ib2_net(struct inet_bind2_bucket *ib) -{ - return read_pnet(&ib->ib_net); -} - #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) @@ -124,15 +103,6 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; -/* This is synchronized using the inet_bind_hashbucket's spinlock. - * Instead of having separate spinlocks, the inet_bind2_hashbucket can share - * the inet_bind_hashbucket's given that in every case where the bhash2 table - * is useful, a lookup in the bhash table also occurs. - */ -struct inet_bind2_hashbucket { - struct hlist_head chain; -}; - /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without @@ -164,12 +134,6 @@ struct inet_hashinfo { */ struct kmem_cache *bind_bucket_cachep; struct inet_bind_hashbucket *bhash; - /* The 2nd binding table hashed by port and address. - * This is used primarily for expediting the resolution of bind - * conflicts. - */ - struct kmem_cache *bind2_bucket_cachep; - struct inet_bind2_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ @@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); -static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb, - struct net *net, - const unsigned short port, - int l3mdev) -{ - return net_eq(ib_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev; -} - -struct inet_bind2_bucket * -inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, - struct inet_bind2_hashbucket *head, - const unsigned short port, int l3mdev, - const struct sock *sk); - -void inet_bind2_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind2_bucket *tb); - -struct inet_bind2_bucket * -inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, - const unsigned short port, int l3mdev, - struct sock *sk, - struct inet_bind2_hashbucket **head); - -bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, - struct net *net, - const unsigned short port, - int l3mdev, - const struct sock *sk); - static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { @@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, const unsigned short snum); + const unsigned short snum); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); diff --git a/include/net/sock.h b/include/net/sock.h index c585ef6565d9..72ca97ccb460 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -348,7 +348,6 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -538,7 +537,6 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e78458900f2..eb8e128e43e8 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1120,12 +1120,6 @@ static int __init dccp_init(void) SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; /* * Size and allocate the main established and bind bucket @@ -1156,7 +1150,7 @@ static int __init dccp_init(void) if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; + goto out_free_bind_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) @@ -1182,23 +1176,14 @@ static int __init dccp_init(void) goto out_free_dccp_locks; } - dccp_hashinfo.bhash2 = (struct inet_bind2_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } rc = dccp_mib_init(); if (rc) - goto out_free_dccp_bhash2; + goto out_free_dccp_bhash; rc = dccp_ackvec_init(); if (rc) @@ -1222,38 +1207,30 @@ out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - ccid_cleanup_builtins(); dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); + free_pages((unsigned long)dccp_hashinfo.bhash, + get_order(dccp_hashinfo.bhash_size * + sizeof(struct inet_bind_hashbucket))); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c0b7e6c21360..53f5f956d948 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,32 +117,6 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -static bool use_bhash2_on_bind(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - int addr_type; - - if (sk->sk_family == AF_INET6) { - addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - return addr_type != IPV6_ADDR_ANY && - addr_type != IPV6_ADDR_MAPPED; - } -#endif - return sk->sk_rcv_saddr != htonl(INADDR_ANY); -} - -static u32 get_bhash2_nulladdr_hash(const struct sock *sk, struct net *net, - int port) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr nulladdr = {}; - - if (sk->sk_family == AF_INET6) - return ipv6_portaddr_hash(net, &nulladdr, port); -#endif - return ipv4_portaddr_hash(net, 0, port); -} - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -156,71 +130,16 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); -static bool bind_conflict_exist(const struct sock *sk, struct sock *sk2, - kuid_t sk_uid, bool relax, - bool reuseport_cb_ok, bool reuseport_ok) -{ - int bound_dev_if2; - - if (sk == sk2) - return false; - - bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); - - if (!sk->sk_bound_dev_if || !bound_dev_if2 || - sk->sk_bound_dev_if == bound_dev_if2) { - if (sk->sk_reuse && sk2->sk_reuse && - sk2->sk_state != TCP_LISTEN) { - if (!relax || (!reuseport_ok && sk->sk_reuseport && - sk2->sk_reuseport && reuseport_cb_ok && - (sk2->sk_state == TCP_TIME_WAIT || - uid_eq(sk_uid, sock_i_uid(sk2))))) - return true; - } else if (!reuseport_ok || !sk->sk_reuseport || - !sk2->sk_reuseport || !reuseport_cb_ok || - (sk2->sk_state != TCP_TIME_WAIT && - !uid_eq(sk_uid, sock_i_uid(sk2)))) { - return true; - } - } - return false; -} - -static bool check_bhash2_conflict(const struct sock *sk, - struct inet_bind2_bucket *tb2, kuid_t sk_uid, - bool relax, bool reuseport_cb_ok, - bool reuseport_ok) -{ - struct sock *sk2; - - sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - continue; - - if (bind_conflict_exist(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) - return true; - } - return false; -} - -/* This should be called only when the corresponding inet_bind_bucket spinlock - * is held - */ -static int inet_csk_bind_conflict(const struct sock *sk, int port, - struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, /* may be null */ +static int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax, bool reuseport_ok) { - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - kuid_t uid = sock_i_uid((struct sock *)sk); - struct sock_reuseport *reuseport_cb; - struct inet_bind2_hashbucket *head2; - bool reuseport_cb_ok; struct sock *sk2; - struct net *net; - int l3mdev; - u32 hash; + bool reuseport_cb_ok; + bool reuse = sk->sk_reuse; + bool reuseport = !!sk->sk_reuseport; + struct sock_reuseport *reuseport_cb; + kuid_t uid = sock_i_uid((struct sock *)sk); rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); @@ -231,42 +150,40 @@ static int inet_csk_bind_conflict(const struct sock *sk, int port, /* * Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed - * in tb->owners and tb2->owners list belong - * to the same net + * in tb->owners list belong to the same net - the + * one this bucket belongs to. */ - if (!use_bhash2_on_bind(sk)) { - sk_for_each_bound(sk2, &tb->owners) - if (bind_conflict_exist(sk, sk2, uid, relax, - reuseport_cb_ok, reuseport_ok) && - inet_rcv_saddr_equal(sk, sk2, true)) - return true; + sk_for_each_bound(sk2, &tb->owners) { + int bound_dev_if2; - return false; + if (sk == sk2) + continue; + bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); + if ((!sk->sk_bound_dev_if || + !bound_dev_if2 || + sk->sk_bound_dev_if == bound_dev_if2)) { + if (reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN) { + if ((!relax || + (!reuseport_ok && + reuseport && sk2->sk_reuseport && + reuseport_cb_ok && + (sk2->sk_state == TCP_TIME_WAIT || + uid_eq(uid, sock_i_uid(sk2))))) && + inet_rcv_saddr_equal(sk, sk2, true)) + break; + } else if (!reuseport_ok || + !reuseport || !sk2->sk_reuseport || + !reuseport_cb_ok || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, sock_i_uid(sk2)))) { + if (inet_rcv_saddr_equal(sk, sk2, true)) + break; + } + } } - - if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) - return true; - - net = sock_net(sk); - - /* check there's no conflict with an existing IPV6_ADDR_ANY (if ipv6) or - * INADDR_ANY (if ipv4) socket. - */ - hash = get_bhash2_nulladdr_hash(sk, net, port); - head2 = &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; - - l3mdev = inet_sk_bound_l3mdev(sk); - inet_bind_bucket_for_each(tb2, &head2->chain) - if (check_bind2_bucket_match_nulladdr(tb2, net, port, l3mdev, sk)) - break; - - if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) - return true; - - return false; + return sk2 != NULL; } /* @@ -274,20 +191,16 @@ static int inet_csk_bind_conflict(const struct sock *sk, int port, * inet_bind_hashbucket lock held. */ static struct inet_bind_hashbucket * -inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, - struct inet_bind2_bucket **tb2_ret, - struct inet_bind2_hashbucket **head2_ret, int *port_ret) +inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret) { struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - struct inet_bind2_hashbucket *head2; + int port = 0; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); + bool relax = false; int i, low, high, attempt_half; - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; - bool relax = false; - int port = 0; int l3mdev; l3mdev = inet_sk_bound_l3mdev(sk); @@ -326,12 +239,10 @@ other_parity_scan: head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); - tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, - &head2); inet_bind_bucket_for_each(tb, &head->chain) - if (check_bind_bucket_match(tb, net, port, l3mdev)) { - if (!inet_csk_bind_conflict(sk, port, tb, tb2, - relax, false)) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { + if (!inet_csk_bind_conflict(sk, tb, relax, false)) goto success; goto next_port; } @@ -361,8 +272,6 @@ next_port: success: *port_ret = port; *tb_ret = tb; - *tb2_ret = tb2; - *head2_ret = head2; return head; } @@ -458,81 +367,54 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - bool bhash_created = false, bhash2_created = false; - struct inet_bind2_bucket *tb2 = NULL; - struct inet_bind2_hashbucket *head2; - struct inet_bind_bucket *tb = NULL; + int ret = 1, port = snum; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); - int ret = 1, port = snum; - bool found_port = false; + struct inet_bind_bucket *tb = NULL; int l3mdev; l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { - head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); + head = inet_csk_find_open_port(sk, &tb, &port); if (!head) return ret; - if (tb && tb2) - goto success; - found_port = true; - } else { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock_bh(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (check_bind_bucket_match(tb, net, port, l3mdev)) - break; - - tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, - &head2); - } - - if (!tb) { - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, - head, port, l3mdev); if (!tb) - goto fail_unlock; - bhash_created = true; - } - - if (!tb2) { - tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, - net, head2, port, l3mdev, sk); - if (!tb2) - goto fail_unlock; - bhash2_created = true; + goto tb_not_found; + goto success; } - - /* If we had to find an open port, we already checked for conflicts */ - if (!found_port && !hlist_empty(&tb->owners)) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) + goto tb_found; +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port, l3mdev); + if (!tb) + goto fail_unlock; +tb_found: + if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE) goto success; if ((tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) goto success; - if (inet_csk_bind_conflict(sk, port, tb, tb2, true, true)) + if (inet_csk_bind_conflict(sk, tb, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(tb, sk); if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, tb2, port); + inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); - WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: - if (ret) { - if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - if (bhash2_created) - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - tb2); - } spin_unlock_bh(&head->lock); return ret; } @@ -1079,7 +961,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; - newicsk->icsk_bind2_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 545f91b6cb5e..b9d995b5ce24 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -81,41 +81,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, return tb; } -struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, - struct net *net, - struct inet_bind2_hashbucket *head, - const unsigned short port, - int l3mdev, - const struct sock *sk) -{ - struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); - - if (tb) { - write_pnet(&tb->ib_net, net); - tb->l3mdev = l3mdev; - tb->port = port; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; - else -#endif - tb->rcv_saddr = sk->sk_rcv_saddr; - INIT_HLIST_HEAD(&tb->owners); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -} - -static bool bind2_bucket_addr_match(struct inet_bind2_bucket *tb2, struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - return ipv6_addr_equal(&tb2->v6_rcv_saddr, - &sk->sk_v6_rcv_saddr); -#endif - return tb2->rcv_saddr == sk->sk_rcv_saddr; -} - /* * Caller must hold hashbucket lock for this tb with local BH disabled */ @@ -127,25 +92,12 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket } } -/* Caller must hold the lock for the corresponding hashbucket in the bhash table - * with local BH disabled - */ -void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) -{ - if (hlist_empty(&tb->owners)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); - } -} - void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, const unsigned short snum) + const unsigned short snum) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; - sk_add_bind2_node(sk, &tb2->owners); - inet_csk(sk)->icsk_bind2_hash = tb2; } /* @@ -157,7 +109,6 @@ static void __inet_put_port(struct sock *sk) const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; spin_lock(&head->lock); @@ -166,13 +117,6 @@ static void __inet_put_port(struct sock *sk) inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - - if (inet_csk(sk)->icsk_bind2_hash) { - tb2 = inet_csk(sk)->icsk_bind2_hash; - __sk_del_bind2_node(sk); - inet_csk(sk)->icsk_bind2_hash = NULL; - inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - } spin_unlock(&head->lock); } @@ -189,19 +133,14 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; const int bhash = inet_bhashfn(sock_net(sk), port, - table->bhash_size); + table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; - struct inet_bind2_hashbucket *head_bhash2; - bool created_inet_bind_bucket = false; - struct net *net = sock_net(sk); - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; - tb2 = inet_csk(sk)->icsk_bind2_hash; - if (unlikely(!tb || !tb2)) { + if (unlikely(!tb)) { spin_unlock(&head->lock); return -ENOENT; } @@ -214,45 +153,25 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (check_bind_bucket_match(tb, net, port, l3mdev)) + if (net_eq(ib_net(tb), sock_net(sk)) && + tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - net, head, port, l3mdev); + sock_net(sk), head, port, + l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; } - created_inet_bind_bucket = true; } inet_csk_update_fastreuse(tb, child); - - goto bhash2_find; - } else if (!bind2_bucket_addr_match(tb2, child)) { - l3mdev = inet_sk_bound_l3mdev(sk); - -bhash2_find: - tb2 = inet_bind2_bucket_find(table, net, port, l3mdev, child, - &head_bhash2); - if (!tb2) { - tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, - net, head_bhash2, port, - l3mdev, child); - if (!tb2) - goto error; - } } - inet_bind_hash(child, tb, tb2, port); + inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; - -error: - if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); - spin_unlock(&head->lock); - return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); @@ -756,76 +675,6 @@ void inet_unhash(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_unhash); -static bool check_bind2_bucket_match(struct inet_bind2_bucket *tb, - struct net *net, unsigned short port, - int l3mdev, struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_equal(&tb->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); - else -#endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == sk->sk_rcv_saddr; -} - -bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, - struct net *net, const unsigned short port, - int l3mdev, const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr nulladdr = {}; - - if (sk->sk_family == AF_INET6) - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && - ipv6_addr_equal(&tb->v6_rcv_saddr, &nulladdr); - else -#endif - return net_eq(ib2_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev && tb->rcv_saddr == 0; -} - -static struct inet_bind2_hashbucket * -inet_bhashfn_portaddr(struct inet_hashinfo *hinfo, const struct sock *sk, - const struct net *net, unsigned short port) -{ - u32 hash; - -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) - hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); - else -#endif - hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); - return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; -} - -/* This should only be called when the spinlock for the socket's corresponding - * bind_hashbucket is held - */ -struct inet_bind2_bucket * -inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, - const unsigned short port, int l3mdev, struct sock *sk, - struct inet_bind2_hashbucket **head) -{ - struct inet_bind2_bucket *bhash2 = NULL; - struct inet_bind2_hashbucket *h; - - h = inet_bhashfn_portaddr(hinfo, sk, net, port); - inet_bind_bucket_for_each(bhash2, &h->chain) { - if (check_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) - break; - } - - if (head) - *head = h; - - return bhash2; -} - /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this @@ -846,13 +695,10 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_timewait_sock *tw = NULL; - struct inet_bind2_hashbucket *head2; struct inet_bind_hashbucket *head; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); - struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; - bool tb_created = false; u32 remaining, offset; int ret, i, low, high; int l3mdev; @@ -909,7 +755,8 @@ other_parity_scan: * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { - if (check_bind_bucket_match(tb, net, port, l3mdev)) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -927,7 +774,6 @@ other_parity_scan: spin_unlock_bh(&head->lock); return -ENOMEM; } - tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; @@ -943,17 +789,6 @@ next_port: return -EADDRNOTAVAIL; ok: - /* Find the corresponding tb2 bucket since we need to - * add the socket to the bhash2 table as well - */ - tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk, &head2); - if (!tb2) { - tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, - head2, port, l3mdev, sk); - if (!tb2) - goto error; - } - /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention @@ -963,7 +798,7 @@ ok: WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, tb2, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); @@ -975,12 +810,6 @@ ok: inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; - -error: - if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - spin_unlock_bh(&head->lock); - return -ENOMEM; } /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9984d23a7f3e..028513d3e2a2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4604,12 +4604,6 @@ void __init tcp_init(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); - tcp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("tcp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT, - NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -4632,9 +4626,8 @@ void __init tcp_init(void) if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = - alloc_large_system_hash("TCP bind bhash tables", - sizeof(struct inet_bind_hashbucket) + - sizeof(struct inet_bind2_hashbucket), + alloc_large_system_hash("TCP bind", + sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, @@ -4643,12 +4636,9 @@ void __init tcp_init(void) 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; - tcp_hashinfo.bhash2 = - (struct inet_bind2_hashbucket *)(tcp_hashinfo.bhash + tcp_hashinfo.bhash_size); for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); - INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index b984f8c8d523..a29f79618934 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -37,4 +37,3 @@ gro ioam6_parser toeplitz cmsg_sender -bind_bhash_test diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 464df13831f2..7ea54af55490 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -59,7 +59,6 @@ TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh -TEST_GEN_FILES += bind_bhash_test TEST_FILES := settings @@ -70,5 +69,4 @@ include bpf/Makefile $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -$(OUTPUT)/bind_bhash_test: LDLIBS += -lpthread $(OUTPUT)/tcp_inq: LDLIBS += -lpthread diff --git a/tools/testing/selftests/net/bind_bhash_test.c b/tools/testing/selftests/net/bind_bhash_test.c deleted file mode 100644 index 252e73754e76..000000000000 --- a/tools/testing/selftests/net/bind_bhash_test.c +++ /dev/null @@ -1,119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This times how long it takes to bind to a port when the port already - * has multiple sockets in its bhash table. - * - * In the setup(), we populate the port's bhash table with - * MAX_THREADS * MAX_CONNECTIONS number of entries. - */ - -#include -#include -#include -#include - -#define MAX_THREADS 600 -#define MAX_CONNECTIONS 40 - -static const char *bind_addr = "::1"; -static const char *port; - -static int fd_array[MAX_THREADS][MAX_CONNECTIONS]; - -static int bind_socket(int opt, const char *addr) -{ - struct addrinfo *res, hint = {}; - int sock_fd, reuse = 1, err; - - sock_fd = socket(AF_INET6, SOCK_STREAM, 0); - if (sock_fd < 0) { - perror("socket fd err"); - return -1; - } - - hint.ai_family = AF_INET6; - hint.ai_socktype = SOCK_STREAM; - - err = getaddrinfo(addr, port, &hint, &res); - if (err) { - perror("getaddrinfo failed"); - return -1; - } - - if (opt) { - err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse)); - if (err) { - perror("setsockopt failed"); - return -1; - } - } - - err = bind(sock_fd, res->ai_addr, res->ai_addrlen); - if (err) { - perror("failed to bind to port"); - return -1; - } - - return sock_fd; -} - -static void *setup(void *arg) -{ - int sock_fd, i; - int *array = (int *)arg; - - for (i = 0; i < MAX_CONNECTIONS; i++) { - sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr); - if (sock_fd < 0) - return NULL; - array[i] = sock_fd; - } - - return NULL; -} - -int main(int argc, const char *argv[]) -{ - int listener_fd, sock_fd, i, j; - pthread_t tid[MAX_THREADS]; - clock_t begin, end; - - if (argc != 2) { - printf("Usage: listener \n"); - return -1; - } - - port = argv[1]; - - listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, bind_addr); - if (listen(listener_fd, 100) < 0) { - perror("listen failed"); - return -1; - } - - /* Set up threads to populate the bhash table entry for the port */ - for (i = 0; i < MAX_THREADS; i++) - pthread_create(&tid[i], NULL, setup, fd_array[i]); - - for (i = 0; i < MAX_THREADS; i++) - pthread_join(tid[i], NULL); - - begin = clock(); - - /* Bind to the same port on a different address */ - sock_fd = bind_socket(0, "2001:0db8:0:f101::1"); - - end = clock(); - - printf("time spent = %f\n", (double)(end - begin) / CLOCKS_PER_SEC); - - /* clean up */ - close(sock_fd); - close(listener_fd); - for (i = 0; i < MAX_THREADS; i++) { - for (j = 0; i < MAX_THREADS; i++) - close(fd_array[i][j]); - } - - return 0; -} -- cgit v1.2.3 From 4d337cebcb1c27d9b48c48b9a98e939d4552d584 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jun 2022 09:44:00 +0800 Subject: blk-mq: avoid to touch q->elevator without any protection q->elevator is referred in blk_mq_has_sqsched() without any protection, no .q_usage_counter is held, no queue srcu and rcu read lock is held, so potential use-after-free may be triggered. Fix the issue by adding one queue flag for checking if the elevator uses single queue style dispatch. Meantime the elevator feature flag of ELEVATOR_F_MQ_AWARE isn't needed any more. Cc: Jan Kara Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220616014401.817001-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +++ block/blk-mq-sched.c | 1 + block/blk-mq.c | 18 ++---------------- block/kyber-iosched.c | 3 ++- block/mq-deadline.c | 3 +++ include/linux/blkdev.h | 4 ++-- 6 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0d46cb728bbf..caa55a5624bc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7188,6 +7188,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + wbt_disable_default(q); return 0; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 9e56a69422b6..eb3c65a21362 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -564,6 +564,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) int ret; if (!e) { + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = NULL; q->nr_requests = q->tag_set->queue_depth; return 0; diff --git a/block/blk-mq.c b/block/blk-mq.c index c13d03b2e17c..4cdb08a70912 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2142,20 +2142,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queue); -/* - * Is the request queue handled by an IO scheduler that does not respect - * hardware queues when dispatching? - */ -static bool blk_mq_has_sqsched(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.dispatch_request && - !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) - return true; - return false; -} - /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. @@ -2188,7 +2174,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) unsigned long i; sq_hctx = NULL; - if (blk_mq_has_sqsched(q)) + if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) @@ -2216,7 +2202,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) unsigned long i; sq_hctx = NULL; - if (blk_mq_has_sqsched(q)) + if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 70ff2a599ef6..8f7c745b4a57 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -421,6 +421,8 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) blk_stat_enable_accounting(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + eq->elevator_data = kqd; q->elevator = eq; @@ -1033,7 +1035,6 @@ static struct elevator_type kyber_sched = { #endif .elevator_attrs = kyber_sched_attrs, .elevator_name = "kyber", - .elevator_features = ELEVATOR_F_MQ_AWARE, .elevator_owner = THIS_MODULE, }; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 6ed602b2f80a..1a9e835e816c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -642,6 +642,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = eq; return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 608d577734c2..bb6e3c31b3b7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -575,6 +575,7 @@ struct request_queue { #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ +#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -616,6 +617,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) +#define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); @@ -1006,8 +1008,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk, */ /* Supports zoned block devices sequential write constraint */ #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) -/* Supports scheduling on multiple hardware queues */ -#define ELEVATOR_F_MQ_AWARE (1U << 1) extern void blk_queue_required_elevator_features(struct request_queue *q, unsigned int features); -- cgit v1.2.3 From 034e5afad921f1c08c001bf147fb1ba76ae33498 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 10 Jun 2022 16:35:13 -0600 Subject: mm: re-allow pinning of zero pfns The commit referenced below subtly and inadvertently changed the logic to disallow pinning of zero pfns. This breaks device assignment with vfio and potentially various other users of gup. Exclude the zero page test from the negation. Link: https://lkml.kernel.org/r/165490039431.944052.12458624139225785964.stgit@omen Fixes: 1c563432588d ("mm: fix is_pinnable_page against a cma page") Signed-off-by: Alex Williamson Acked-by: Minchan Kim Acked-by: David Hildenbrand Reported-by: Yishai Hadas Cc: Paul E. McKenney Cc: John Hubbard Cc: John Dias Cc: Jason Gunthorpe Cc: Zhangfei Gao Cc: Matthew Wilcox Cc: Joao Martins Cc: Yi Liu Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bc8f326be0ce..781fae17177d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1600,7 +1600,7 @@ static inline bool is_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page))); + return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); } #else static inline bool is_pinnable_page(struct page *page) -- cgit v1.2.3 From 67f22ba7750f940bcd7e1b12720896c505c2d63f Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 15 Jun 2022 17:32:09 +0800 Subject: mm/memory-failure: disable unpoison once hw error happens Currently unpoison_memory(unsigned long pfn) is designed for soft poison(hwpoison-inject) only. Since 17fae1294ad9d, the KPTE gets cleared on a x86 platform once hardware memory corrupts. Unpoisoning a hardware corrupted page puts page back buddy only, the kernel has a chance to access the page with *NOT PRESENT* KPTE. This leads BUG during accessing on the corrupted KPTE. Suggested by David&Naoya, disable unpoison mechanism when a real HW error happens to avoid BUG like this: Unpoison: Software-unpoisoned page 0x61234 BUG: unable to handle page fault for address: ffff888061234000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 2c01067 P4D 2c01067 PUD 107267063 PMD 10382b063 PTE 800fffff9edcb062 Oops: 0002 [#1] PREEMPT SMP NOPTI CPU: 4 PID: 26551 Comm: stress Kdump: loaded Tainted: G M OE 5.18.0.bm.1-amd64 #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ... RIP: 0010:clear_page_erms+0x7/0x10 Code: ... RSP: 0000:ffffc90001107bc8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000901 RCX: 0000000000001000 RDX: ffffea0001848d00 RSI: ffffea0001848d40 RDI: ffff888061234000 RBP: ffffea0001848d00 R08: 0000000000000901 R09: 0000000000001276 R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001 R13: 0000000000000000 R14: 0000000000140dca R15: 0000000000000001 FS: 00007fd8b2333740(0000) GS:ffff88813fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888061234000 CR3: 00000001023d2005 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: prep_new_page+0x151/0x170 get_page_from_freelist+0xca0/0xe20 ? sysvec_apic_timer_interrupt+0xab/0xc0 ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 __alloc_pages+0x17e/0x340 __folio_alloc+0x17/0x40 vma_alloc_folio+0x84/0x280 __handle_mm_fault+0x8d4/0xeb0 handle_mm_fault+0xd5/0x2a0 do_user_addr_fault+0x1d0/0x680 ? kvm_read_and_reset_apf_flags+0x3b/0x50 exc_page_fault+0x78/0x170 asm_exc_page_fault+0x27/0x30 Link: https://lkml.kernel.org/r/20220615093209.259374-2-pizhenwei@bytedance.com Fixes: 847ce401df392 ("HWPOISON: Add unpoisoning support") Fixes: 17fae1294ad9d ("x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned") Signed-off-by: zhenwei pi Acked-by: David Hildenbrand Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Oscar Salvador Cc: Greg Kroah-Hartman Cc: [5.8+] Signed-off-by: Andrew Morton --- Documentation/vm/hwpoison.rst | 3 ++- drivers/base/memory.c | 2 +- include/linux/mm.h | 1 + mm/hwpoison-inject.c | 2 +- mm/madvise.c | 2 +- mm/memory-failure.c | 12 ++++++++++++ 6 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst index c742de1769d1..b9d5253c1305 100644 --- a/Documentation/vm/hwpoison.rst +++ b/Documentation/vm/hwpoison.rst @@ -120,7 +120,8 @@ Testing unpoison-pfn Software-unpoison page at PFN echoed into this file. This way a page can be reused again. This only works for Linux - injected failures, not for real memory failures. + injected failures, not for real memory failures. Once any hardware + memory failure happens, this feature is disabled. Note these injection interfaces are not stable and might change between kernel versions diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 084d67fd55cc..bc60c9cd3230 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -558,7 +558,7 @@ static ssize_t hard_offline_page_store(struct device *dev, if (kstrtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - ret = memory_failure(pfn, 0); + ret = memory_failure(pfn, MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; return ret ? ret : count; diff --git a/include/linux/mm.h b/include/linux/mm.h index 781fae17177d..cf3d0d673f6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3232,6 +3232,7 @@ enum mf_flags { MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, + MF_SW_SIMULATED = 1 << 5, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 5c0cddd81505..65e242b5a432 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -48,7 +48,7 @@ static int hwpoison_inject(void *data, u64 val) inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); - err = memory_failure(pfn, 0); + err = memory_failure(pfn, MF_SW_SIMULATED); return (err == -EOPNOTSUPP) ? 0 : err; } diff --git a/mm/madvise.c b/mm/madvise.c index d7b4f2602949..0316bbc6441b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1112,7 +1112,7 @@ static int madvise_inject_error(int behavior, } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); - ret = memory_failure(pfn, MF_COUNT_INCREASED); + ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b85661cbdc4a..da39ec8afca8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -69,6 +69,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); +static bool hw_memory_failure __read_mostly = false; + static bool __page_handle_poison(struct page *page) { int ret; @@ -1768,6 +1770,9 @@ int memory_failure(unsigned long pfn, int flags) mutex_lock(&mf_mutex); + if (!(flags & MF_SW_SIMULATED)) + hw_memory_failure = true; + p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); @@ -2103,6 +2108,13 @@ int unpoison_memory(unsigned long pfn) mutex_lock(&mf_mutex); + if (hw_memory_failure) { + unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n", + pfn, &unpoison_rs); + ret = -EOPNOTSUPP; + goto unlock_mutex; + } + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); -- cgit v1.2.3 From 540a92bfe6dab7310b9df2e488ba247d784d0163 Mon Sep 17 00:00:00 2001 From: Edward Wu Date: Fri, 17 Jun 2022 11:32:20 +0800 Subject: ata: libata: add qc->flags in ata_qc_complete_template tracepoint Add flags value to check the result of ata completion Fixes: 255c03d15a29 ("libata: Add tracepoints") Cc: stable@vger.kernel.org Signed-off-by: Edward Wu Signed-off-by: Damien Le Moal --- include/trace/events/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/libata.h b/include/trace/events/libata.h index d4e631aa976f..6025dd8ba4aa 100644 --- a/include/trace/events/libata.h +++ b/include/trace/events/libata.h @@ -288,6 +288,7 @@ DECLARE_EVENT_CLASS(ata_qc_complete_template, __entry->hob_feature = qc->result_tf.hob_feature; __entry->nsect = qc->result_tf.nsect; __entry->hob_nsect = qc->result_tf.hob_nsect; + __entry->flags = qc->flags; ), TP_printk("ata_port=%u ata_dev=%u tag=%d flags=%s status=%s " \ -- cgit v1.2.3 From 5cf9c91ba927119fc6606b938b1895bb2459d3bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Jun 2022 09:48:25 +0200 Subject: block: serialize all debugfs operations using q->debugfs_mutex Various places like I/O schedulers or the QOS infrastructure try to register debugfs files on demans, which can race with creating and removing the main queue debugfs directory. Use the existing debugfs_mutex to serialize all debugfs operations that rely on q->debugfs_dir or the directories hanging off it. To make the teardown code a little simpler declare all debugfs dentry pointers and not just the main one uncoditionally in blkdev.h. Move debugfs_mutex next to the dentries that it protects and document what it is used for. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220614074827.458955-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 25 ++++++++++++++++++++----- block/blk-mq-debugfs.h | 5 ----- block/blk-mq-sched.c | 11 +++++++++++ block/blk-rq-qos.c | 2 ++ block/blk-rq-qos.h | 7 ++++++- block/blk-sysfs.c | 20 +++++++++----------- include/linux/blkdev.h | 8 ++++---- kernel/trace/blktrace.c | 3 --- 8 files changed, 52 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7e4136a60e1c..f0fcfe1387cb 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -711,11 +711,6 @@ void blk_mq_debugfs_register(struct request_queue *q) } } -void blk_mq_debugfs_unregister(struct request_queue *q) -{ - q->sched_debugfs_dir = NULL; -} - static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { @@ -746,6 +741,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; @@ -773,6 +770,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. @@ -790,6 +789,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) void blk_mq_debugfs_unregister_sched(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } @@ -811,6 +812,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { + lockdep_assert_held(&rqos->q->debugfs_mutex); + + if (!rqos->q->debugfs_dir) + return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; } @@ -820,6 +825,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) struct request_queue *q = rqos->q; const char *dir_name = rq_qos_id_to_name(rqos->id); + lockdep_assert_held(&q->debugfs_mutex); + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; @@ -835,6 +842,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->rqos_debugfs_dir); q->rqos_debugfs_dir = NULL; } @@ -844,6 +853,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs @@ -863,6 +874,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { + lockdep_assert_held(&hctx->queue->debugfs_mutex); + + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 69918f4170d6..771d45832878 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); void blk_mq_debugfs_register(struct request_queue *q); -void blk_mq_debugfs_unregister(struct request_queue *q); void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); @@ -42,10 +41,6 @@ static inline void blk_mq_debugfs_register(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister(struct request_queue *q) -{ -} - static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eb3c65a21362..a4f7c101b53b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -594,7 +594,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err_free_map_and_rqs; + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); + mutex_unlock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { @@ -607,7 +609,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } return 0; @@ -648,14 +652,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched_hctx(hctx); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } + + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index e83af7bc7591..249a6f05dd3b 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -294,7 +294,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_queue_rqos(q); + mutex_unlock(&q->debugfs_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 68267007da1c..0e46052b018a 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); - if (rqos->ops->debugfs_attrs) + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) @@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); } typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 88bd41d4cb59..6e4801b217a7 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -779,14 +779,13 @@ static void blk_release_queue(struct kobject *kobj) if (queue_is_mq(q)) blk_mq_release(q); - blk_trace_shutdown(q); mutex_lock(&q->debugfs_mutex); + blk_trace_shutdown(q); debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; mutex_unlock(&q->debugfs_mutex); - if (queue_is_mq(q)) - blk_mq_debugfs_unregister(q); - bioset_exit(&q->bio_split); if (blk_queue_has_srcu(q)) @@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + if (queue_is_mq(q)) + __blk_mq_register_dev(dev, q); + mutex_lock(&q->sysfs_lock); + mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); - mutex_unlock(&q->debugfs_mutex); - - if (queue_is_mq(q)) { - __blk_mq_register_dev(dev, q); + if (queue_is_mq(q)) blk_mq_debugfs_register(q); - } - - mutex_lock(&q->sysfs_lock); + mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk, NULL); if (ret) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb6e3c31b3b7..73c886eba8e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -482,7 +482,6 @@ struct request_queue { #endif /* CONFIG_BLK_DEV_ZONED */ int node; - struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif @@ -526,11 +525,12 @@ struct request_queue { struct bio_set bio_split; struct dentry *debugfs_dir; - -#ifdef CONFIG_BLK_DEBUG_FS struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; -#endif + /* + * Serializes all debugfs metadata operations using the above dentries. + */ + struct mutex debugfs_mutex; bool mq_sysfs_init_done; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 10a32b0f2deb..fe04c6f96ca5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -770,14 +770,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - - mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From c01d4d0a82b71857be7449380338bc53dde2da92 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 16 Jun 2022 15:00:51 +0200 Subject: random: quiet urandom warning ratelimit suppression message random.c ratelimits how much it warns about uninitialized urandom reads using __ratelimit(). When the RNG is finally initialized, it prints the number of missed messages due to ratelimiting. It has been this way since that functionality was introduced back in 2018. Recently, cc1e127bfa95 ("random: remove ratelimiting for in-kernel unseeded randomness") put a bit more stress on the urandom ratelimiting, which teased out a bug in the implementation. Specifically, when under pressure, __ratelimit() will print its own message and reset the count back to 0, making the final message at the end less useful. Secondly, it does so as a pr_warn(), which apparently is undesirable for people's CI. Fortunately, __ratelimit() has the RATELIMIT_MSG_ON_RELEASE flag exactly for this purpose, so we set the flag. Fixes: 4e00b339e264 ("random: rate limit unseeded randomness warnings") Cc: stable@vger.kernel.org Reported-by: Jon Hunter Reported-by: Ron Economos Tested-by: Ron Economos Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 2 +- include/linux/ratelimit_types.h | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index d0e4c89c4fcb..07a022e24057 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -87,7 +87,7 @@ static struct fasync_struct *fasync; /* Control how we warn userspace. */ static struct ratelimit_state urandom_warning = - RATELIMIT_STATE_INIT("warn_urandom_randomness", HZ, 3); + RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE); static int ratelimit_disable __read_mostly = IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM); module_param_named(ratelimit_disable, ratelimit_disable, int, 0644); diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index c21c7f8103e2..002266693e50 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -23,12 +23,16 @@ struct ratelimit_state { unsigned long flags; }; -#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ - .interval = interval_init, \ - .burst = burst_init, \ +#define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .interval = interval_init, \ + .burst = burst_init, \ + .flags = flags_init, \ } +#define RATELIMIT_STATE_INIT(name, interval_init, burst_init) \ + RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, 0) + #define RATELIMIT_STATE_INIT_DISABLED \ RATELIMIT_STATE_INIT(ratelimit_state, 0, DEFAULT_RATELIMIT_BURST) -- cgit v1.2.3 From 9243fc4cd28c8bdddd7fe0abd5bbec3c4fdf5052 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 3 Jun 2022 14:35:29 +0900 Subject: block: remove queue from struct blk_independent_access_range The request queue pointer in struct blk_independent_access_range is unused. Remove it. Signed-off-by: Damien Le Moal Fixes: 41e46b3c2aa2 ("block: Fix potential deadlock in blk_ia_range_sysfs_show()") Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220603053529.76405-1-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- block/blk-ia-ranges.c | 1 - include/linux/blkdev.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 56ed48d2954e..47c89e65b57f 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -144,7 +144,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk, } for (i = 0; i < iars->nr_ia_ranges; i++) { - iars->ia_range[i].queue = q; ret = kobject_init_and_add(&iars->ia_range[i].kobj, &blk_ia_range_ktype, &iars->kobj, "%d", i); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 73c886eba8e1..2f7b43444c5f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -342,7 +342,6 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, */ struct blk_independent_access_range { struct kobject kobj; - struct request_queue *queue; sector_t sector; sector_t nr_sectors; }; -- cgit v1.2.3 From 60050ffe3d770dd1df5b641aa48f49d07a54bd84 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 May 2022 23:48:09 +0100 Subject: certs: Move load_certificate_list() to be with the asymmetric keys code Move load_certificate_list(), which loads a series of binary X.509 certificates from a blob and inserts them as keys into a keyring, to be with the asymmetric keys code that it drives. This makes it easier to add FIPS selftest code in which we need to load up a private keyring for the tests to use. Signed-off-by: David Howells Reviewed-by: Simo Sorce Reviewed-by: Herbert Xu cc: keyrings@vger.kernel.org cc: linux-crypto@vger.kernel.org Link: https://lore.kernel.org/r/165515742145.1554877.13488098107542537203.stgit@warthog.procyon.org.uk/ --- certs/Makefile | 4 +-- certs/blacklist.c | 8 ++--- certs/common.c | 57 ------------------------------------ certs/common.h | 9 ------ certs/system_keyring.c | 6 ++-- crypto/asymmetric_keys/Makefile | 1 + crypto/asymmetric_keys/x509_loader.c | 57 ++++++++++++++++++++++++++++++++++++ include/keys/asymmetric-type.h | 3 ++ 8 files changed, 70 insertions(+), 75 deletions(-) delete mode 100644 certs/common.c delete mode 100644 certs/common.h create mode 100644 crypto/asymmetric_keys/x509_loader.c (limited to 'include') diff --git a/certs/Makefile b/certs/Makefile index cb1a9da3fc58..3aac9f33ee22 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -3,8 +3,8 @@ # Makefile for the linux kernel signature checking certificates. # -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o common.o -obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o common.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o +obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o obj-$(CONFIG_SYSTEM_REVOCATION_LIST) += revocation_certificates.o ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),) quiet_cmd_check_blacklist_hashes = CHECK $(patsubst "%",%,$(2)) diff --git a/certs/blacklist.c b/certs/blacklist.c index 25094ea73600..41f10601cc72 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -15,10 +15,9 @@ #include #include #include -#include +#include #include #include "blacklist.h" -#include "common.h" /* * According to crypto/asymmetric_keys/x509_cert_parser.c:x509_note_pkey_algo(), @@ -365,8 +364,9 @@ static __init int load_revocation_certificate_list(void) if (revocation_certificate_list_size) pr_notice("Loading compiled-in revocation X.509 certificates\n"); - return load_certificate_list(revocation_certificate_list, revocation_certificate_list_size, - blacklist_keyring); + return x509_load_certificate_list(revocation_certificate_list, + revocation_certificate_list_size, + blacklist_keyring); } late_initcall(load_revocation_certificate_list); #endif diff --git a/certs/common.c b/certs/common.c deleted file mode 100644 index 16a220887a53..000000000000 --- a/certs/common.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include "common.h" - -int load_certificate_list(const u8 cert_list[], - const unsigned long list_size, - const struct key *keyring) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - p = cert_list; - end = p + list_size; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(keyring, 1), - "asymmetric", - NULL, - p, - plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_BUILT_IN | - KEY_ALLOC_BYPASS_RESTRICTION); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} diff --git a/certs/common.h b/certs/common.h deleted file mode 100644 index abdb5795936b..000000000000 --- a/certs/common.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -#ifndef _CERT_COMMON_H -#define _CERT_COMMON_H - -int load_certificate_list(const u8 cert_list[], const unsigned long list_size, - const struct key *keyring); - -#endif diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 05b66ce9d1c9..5042cc54fa5e 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -16,7 +16,6 @@ #include #include #include -#include "common.h" static struct key *builtin_trusted_keys; #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING @@ -183,7 +182,8 @@ __init int load_module_cert(struct key *keyring) pr_notice("Loading compiled-in module X.509 certificates\n"); - return load_certificate_list(system_certificate_list, module_cert_size, keyring); + return x509_load_certificate_list(system_certificate_list, + module_cert_size, keyring); } /* @@ -204,7 +204,7 @@ static __init int load_system_certificate_list(void) size = system_certificate_list_size - module_cert_size; #endif - return load_certificate_list(p, size, builtin_trusted_keys); + return x509_load_certificate_list(p, size, builtin_trusted_keys); } late_initcall(load_system_certificate_list); diff --git a/crypto/asymmetric_keys/Makefile b/crypto/asymmetric_keys/Makefile index c38424f55b08..0f190500dd87 100644 --- a/crypto/asymmetric_keys/Makefile +++ b/crypto/asymmetric_keys/Makefile @@ -20,6 +20,7 @@ x509_key_parser-y := \ x509.asn1.o \ x509_akid.asn1.o \ x509_cert_parser.o \ + x509_loader.o \ x509_public_key.o $(obj)/x509_cert_parser.o: \ diff --git a/crypto/asymmetric_keys/x509_loader.c b/crypto/asymmetric_keys/x509_loader.c new file mode 100644 index 000000000000..1bc169dee22e --- /dev/null +++ b/crypto/asymmetric_keys/x509_loader.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +int x509_load_certificate_list(const u8 cert_list[], + const unsigned long list_size, + const struct key *keyring) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + p = cert_list; + end = p + list_size; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(keyring, 1), + "asymmetric", + NULL, + p, + plen, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h index 6c5d4963e15b..69a13e1e5b2e 100644 --- a/include/keys/asymmetric-type.h +++ b/include/keys/asymmetric-type.h @@ -84,6 +84,9 @@ extern struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_2, bool partial); +int x509_load_certificate_list(const u8 cert_list[], const unsigned long list_size, + const struct key *keyring); + /* * The payload is at the discretion of the subtype. */ -- cgit v1.2.3 From e34a07c0ae3906f97eb18df50902e2a01c1015b6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Jun 2022 12:13:53 -0700 Subject: sock: redo the psock vs ULP protection check Commit 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") has moved the inet_csk_has_ulp(sk) check from sk_psock_init() to the new tcp_bpf_update_proto() function. I'm guessing that this was done to allow creating psocks for non-inet sockets. Unfortunately the destruction path for psock includes the ULP unwind, so we need to fail the sk_psock_init() itself. Otherwise if ULP is already present we'll notice that later, and call tcp_update_ulp() with the sk_proto of the ULP itself, which will most likely result in the ULP looping its callbacks. Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") Signed-off-by: Jakub Kicinski Reviewed-by: John Fastabend Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220620191353.1184629-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/net/inet_sock.h | 5 +++++ net/core/skmsg.c | 5 +++++ net/ipv4/tcp_bpf.c | 3 --- net/tls/tls_main.c | 2 ++ 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index c1b5dcd6597c..daead5fb389a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,6 +253,11 @@ struct inet_sock { #define IP_CMSG_CHECKSUM BIT(7) #define IP_CMSG_RECVFRAGSIZE BIT(8) +static inline bool sk_is_inet(struct sock *sk) +{ + return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; +} + /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 22b983ade0e7..b0fcd0200e84 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -699,6 +699,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index be3947e70fec..0d3f68bb51c0 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -611,9 +611,6 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) return 0; } - if (inet_csk_has_ulp(sk)) - return -EINVAL; - if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index da176411c1b5..2ffede463e4a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -921,6 +921,8 @@ static void tls_update(struct sock *sk, struct proto *p, { struct tls_context *ctx; + WARN_ON_ONCE(sk->sk_prot == p); + ctx = tls_get_ctx(sk); if (likely(ctx)) { ctx->sk_write_space = write_space; -- cgit v1.2.3 From 23c9cd56007e90b2c2317c5eab6ab12921b4314a Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 21 Jun 2022 09:05:00 +0200 Subject: nvme: fix the CRIMS and CRWMS definitions to match the spec Adjust the values of NVME_CAP_CRMS_CRIMS and NVME_CAP_CRMS_CRWMS masks as they are different from the ones in TP4084 - Time-to-ready. Fixes: 354201c53e61 ("nvme: add support for TP4084 - Time-to-Ready Enhancements"). Signed-off-by: Joel Granados Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 29ec3e3481ff..e3934003f239 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -233,8 +233,8 @@ enum { }; enum { - NVME_CAP_CRMS_CRIMS = 1ULL << 59, - NVME_CAP_CRMS_CRWMS = 1ULL << 60, + NVME_CAP_CRMS_CRWMS = 1ULL << 59, + NVME_CAP_CRMS_CRIMS = 1ULL << 60, }; struct nvme_id_power_state { -- cgit v1.2.3 From e70b64a3f28b9f54602ae3e706b1dc1338de3df7 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 23 Jun 2022 01:37:43 -0700 Subject: io_uring: move io_uring_get_opcode out of TP_printk The TP_printk macro's are not supposed to use custom code ([1]) or else tools such as perf cannot use these events. Convert the opcode string representation to use the __string wiring that the event framework provides ([2]). [1]: https://lwn.net/Articles/379903/ [2]: https://lwn.net/Articles/381064/ Fixes: 033b87d24f72 ("io_uring: use the text representation of ops in trace") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220623083743.2648321-1-dylany@fb.com [axboe: fixup spurious removal of sq_thread assignment] Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 42 ++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 66fcc5a1a5b1..aa2f951b07cd 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -158,6 +158,8 @@ TRACE_EVENT(io_uring_queue_async_work, __field( unsigned int, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) + + __string( op_str, io_uring_get_opcode(opcode) ) ), TP_fast_assign( @@ -168,11 +170,13 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->opcode = opcode; __entry->work = work; __entry->rw = rw; + + __assign_str(op_str, io_uring_get_opcode(opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, - io_uring_get_opcode(__entry->opcode), + __get_str(op_str), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); @@ -198,6 +202,8 @@ TRACE_EVENT(io_uring_defer, __field( void *, req ) __field( unsigned long long, data ) __field( u8, opcode ) + + __string( op_str, io_uring_get_opcode(opcode) ) ), TP_fast_assign( @@ -205,11 +211,13 @@ TRACE_EVENT(io_uring_defer, __entry->req = req; __entry->data = user_data; __entry->opcode = opcode; + + __assign_str(op_str, io_uring_get_opcode(opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", __entry->ctx, __entry->req, __entry->data, - io_uring_get_opcode(__entry->opcode)) + __get_str(op_str)) ); /** @@ -298,6 +306,8 @@ TRACE_EVENT(io_uring_fail_link, __field( unsigned long long, user_data ) __field( u8, opcode ) __field( void *, link ) + + __string( op_str, io_uring_get_opcode(opcode) ) ), TP_fast_assign( @@ -306,11 +316,13 @@ TRACE_EVENT(io_uring_fail_link, __entry->user_data = user_data; __entry->opcode = opcode; __entry->link = link; + + __assign_str(op_str, io_uring_get_opcode(opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", __entry->ctx, __entry->req, __entry->user_data, - io_uring_get_opcode(__entry->opcode), __entry->link) + __get_str(op_str), __entry->link) ); /** @@ -390,6 +402,8 @@ TRACE_EVENT(io_uring_submit_sqe, __field( u32, flags ) __field( bool, force_nonblock ) __field( bool, sq_thread ) + + __string( op_str, io_uring_get_opcode(opcode) ) ), TP_fast_assign( @@ -400,11 +414,13 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->flags = flags; __entry->force_nonblock = force_nonblock; __entry->sq_thread = sq_thread; + + __assign_str(op_str, io_uring_get_opcode(opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " "non block %d, sq_thread %d", __entry->ctx, __entry->req, - __entry->user_data, io_uring_get_opcode(__entry->opcode), + __entry->user_data, __get_str(op_str), __entry->flags, __entry->force_nonblock, __entry->sq_thread) ); @@ -435,6 +451,8 @@ TRACE_EVENT(io_uring_poll_arm, __field( u8, opcode ) __field( int, mask ) __field( int, events ) + + __string( op_str, io_uring_get_opcode(opcode) ) ), TP_fast_assign( @@ -444,11 +462,13 @@ TRACE_EVENT(io_uring_poll_arm, __entry->opcode = opcode; __entry->mask = mask; __entry->events = events; + + __assign_str(op_str, io_uring_get_opcode(opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", __entry->ctx, __entry->req, __entry->user_data, - io_uring_get_opcode(__entry->opcode), + __get_str(op_str), __entry->mask, __entry->events) ); @@ -474,6 +494,8 @@ TRACE_EVENT(io_uring_task_add, __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) + + __string( op_str, io_uring_get_opcode(opcode) ) ), TP_fast_assign( @@ -482,11 +504,13 @@ TRACE_EVENT(io_uring_task_add, __entry->user_data = user_data; __entry->opcode = opcode; __entry->mask = mask; + + __assign_str(op_str, io_uring_get_opcode(opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", __entry->ctx, __entry->req, __entry->user_data, - io_uring_get_opcode(__entry->opcode), + __get_str(op_str), __entry->mask) ); @@ -523,6 +547,8 @@ TRACE_EVENT(io_uring_req_failed, __field( u64, pad1 ) __field( u64, addr3 ) __field( int, error ) + + __string( op_str, io_uring_get_opcode(sqe->opcode) ) ), TP_fast_assign( @@ -542,6 +568,8 @@ TRACE_EVENT(io_uring_req_failed, __entry->pad1 = sqe->__pad2[0]; __entry->addr3 = sqe->addr3; __entry->error = error; + + __assign_str(op_str, io_uring_get_opcode(sqe->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, " @@ -550,7 +578,7 @@ TRACE_EVENT(io_uring_req_failed, "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " "error=%d", __entry->ctx, __entry->req, __entry->user_data, - io_uring_get_opcode(__entry->opcode), + __get_str(op_str), __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, -- cgit v1.2.3 From 20fb0c8272bbb102d15bdd11aa64f828619dd7cc Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:52 +0200 Subject: Revert "printk: Wait for the global console lock when the system is going down" This reverts commit b87f02307d3cfbda768520f0687c51ca77e14fc3. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-2-pmladek@suse.com --- include/linux/printk.h | 5 ----- kernel/panic.c | 2 -- kernel/printk/internal.h | 2 -- kernel/printk/printk.c | 4 ---- kernel/printk/printk_safe.c | 32 -------------------------------- kernel/reboot.c | 2 -- 6 files changed, 47 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index c1e07c0652c7..cd26aab0ab2a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -174,7 +174,6 @@ extern void printk_prefer_direct_enter(void); extern void printk_prefer_direct_exit(void); extern bool pr_flush(int timeout_ms, bool reset_on_progress); -extern void try_block_console_kthreads(int timeout_ms); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -239,10 +238,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) return true; } -static inline void try_block_console_kthreads(int timeout_ms) -{ -} - static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/panic.c b/kernel/panic.c index fe73d18ecdf0..6737b2332275 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -273,7 +273,6 @@ void panic(const char *fmt, ...) * unfortunately means it may not be hardened to work in a * panic situation. */ - try_block_console_kthreads(10000); smp_send_stop(); } else { /* @@ -281,7 +280,6 @@ void panic(const char *fmt, ...) * kmsg_dump, we will need architecture dependent extra * works in addition to stopping other CPUs. */ - try_block_console_kthreads(10000); crash_smp_send_stop(); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index e7d8578860ad..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -20,8 +20,6 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -extern bool block_console_kthreads; - __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b095fb5f5f61..45c6c2b0b104 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -250,9 +250,6 @@ static atomic_t console_kthreads_active = ATOMIC_INIT(0); #define console_kthread_printing_exit() \ atomic_dec(&console_kthreads_active) -/* Block console kthreads to avoid processing new messages. */ -bool block_console_kthreads; - /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -3733,7 +3730,6 @@ static bool printer_should_wake(struct console *con, u64 seq) if (con->blocked || console_kthreads_atomically_blocked() || - block_console_kthreads || system_state > SYSTEM_RUNNING || oops_in_progress) { return false; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index caac4de1ea59..ef0f9a2044da 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -8,9 +8,7 @@ #include #include #include -#include #include -#include #include "internal.h" @@ -52,33 +50,3 @@ asmlinkage int vprintk(const char *fmt, va_list args) return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); - -/** - * try_block_console_kthreads() - Try to block console kthreads and - * make the global console_lock() avaialble - * - * @timeout_ms: The maximum time (in ms) to wait. - * - * Prevent console kthreads from starting processing new messages. Wait - * until the global console_lock() become available. - * - * Context: Can be called in any context. - */ -void try_block_console_kthreads(int timeout_ms) -{ - block_console_kthreads = true; - - /* Do not wait when the console lock could not be safely taken. */ - if (this_cpu_read(printk_context) || in_nmi()) - return; - - while (timeout_ms > 0) { - if (console_trylock()) { - console_unlock(); - return; - } - - udelay(1000); - timeout_ms -= 1; - } -} diff --git a/kernel/reboot.c b/kernel/reboot.c index 310363685502..4177645e74d6 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -74,7 +74,6 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; - try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } @@ -263,7 +262,6 @@ static void kernel_shutdown_prepare(enum system_states state) blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; - try_block_console_kthreads(10000); usermodehelper_disable(); device_shutdown(); } -- cgit v1.2.3 From 2d9ef940f89e0ab4fde7ba6f769d82f2a450c35a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:55 +0200 Subject: Revert "printk: extend console_lock for per-console locking" This reverts commit 8e274732115f63c1d09136284431b3555bd5cc56. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-5-pmladek@suse.com --- include/linux/console.h | 15 --- kernel/printk/printk.c | 261 +++++++++++------------------------------------- 2 files changed, 56 insertions(+), 220 deletions(-) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 143653090c48..9a251e70c090 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,7 +16,6 @@ #include #include -#include struct vc_data; struct console_font_op; @@ -155,20 +154,6 @@ struct console { u64 seq; unsigned long dropped; struct task_struct *thread; - bool blocked; - - /* - * The per-console lock is used by printing kthreads to synchronize - * this console with callers of console_lock(). This is necessary in - * order to allow printing kthreads to run in parallel to each other, - * while each safely accessing the @blocked field and synchronizing - * against direct printing via console_lock/console_unlock. - * - * Note: For synchronizing against direct printing via - * console_trylock/console_unlock, see the static global - * variable @console_kthreads_active. - */ - struct mutex lock; void *data; struct console *next; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dfd1a19b95d6..ae489dc685a1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -223,33 +223,6 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; -/* - * Used to synchronize printing kthreads against direct printing via - * console_trylock/console_unlock. - * - * Values: - * -1 = console kthreads atomically blocked (via global trylock) - * 0 = no kthread printing, console not locked (via trylock) - * >0 = kthread(s) actively printing - * - * Note: For synchronizing against direct printing via - * console_lock/console_unlock, see the @lock variable in - * struct console. - */ -static atomic_t console_kthreads_active = ATOMIC_INIT(0); - -#define console_kthreads_atomic_tryblock() \ - (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) -#define console_kthreads_atomic_unblock() \ - atomic_cmpxchg(&console_kthreads_active, -1, 0) -#define console_kthreads_atomically_blocked() \ - (atomic_read(&console_kthreads_active) == -1) - -#define console_kthread_printing_tryenter() \ - atomic_inc_unless_negative(&console_kthreads_active) -#define console_kthread_printing_exit() \ - atomic_dec(&console_kthreads_active) - /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -297,49 +270,6 @@ static bool panic_in_progress(void) return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } -/* - * Tracks whether kthread printers are all blocked. A value of true implies - * that the console is locked via console_lock() or the console is suspended. - * Writing to this variable requires holding @console_sem. - */ -static bool console_kthreads_blocked; - -/* - * Block all kthread printers from a schedulable context. - * - * Requires holding @console_sem. - */ -static void console_kthreads_block(void) -{ - struct console *con; - - for_each_console(con) { - mutex_lock(&con->lock); - con->blocked = true; - mutex_unlock(&con->lock); - } - - console_kthreads_blocked = true; -} - -/* - * Unblock all kthread printers from a schedulable context. - * - * Requires holding @console_sem. - */ -static void console_kthreads_unblock(void) -{ - struct console *con; - - for_each_console(con) { - mutex_lock(&con->lock); - con->blocked = false; - mutex_unlock(&con->lock); - } - - console_kthreads_blocked = false; -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2673,6 +2603,13 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + + /* + * While suspended, new records may have been added to the + * ringbuffer. Wake up the kthread printers to print them. + */ + wake_up_klogd(); + pr_flush(1000, true); } @@ -2691,14 +2628,9 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - else { - /* - * If a new CPU comes online, the conditions for - * printer_should_wake() may have changed for some - * kthread printer with !CON_ANYTIME. - */ - wake_up_klogd(); - } + + /* Wake kthread printers. Some may have become usable. */ + wake_up_klogd(); } return 0; } @@ -2718,7 +2650,6 @@ void console_lock(void) down_console_sem(); if (console_suspended) return; - console_kthreads_block(); console_locked = 1; console_may_schedule = 1; } @@ -2740,10 +2671,6 @@ int console_trylock(void) up_console_sem(); return 0; } - if (!console_kthreads_atomic_tryblock()) { - up_console_sem(); - return 0; - } console_locked = 1; console_may_schedule = 0; return 1; @@ -2752,7 +2679,7 @@ EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { - return (console_locked || atomic_read(&console_kthreads_active)); + return console_locked; } EXPORT_SYMBOL(is_console_locked); @@ -2796,7 +2723,7 @@ static inline bool __console_is_usable(short flags) * Check if the given console is currently capable and allowed to print * records. * - * Requires holding the console_lock. + * Requires the console_lock. */ static inline bool console_is_usable(struct console *con) { @@ -2809,22 +2736,6 @@ static inline bool console_is_usable(struct console *con) static void __console_unlock(void) { console_locked = 0; - - /* - * Depending on whether console_lock() or console_trylock() was used, - * appropriately allow the kthread printers to continue. - */ - if (console_kthreads_blocked) - console_kthreads_unblock(); - else - console_kthreads_atomic_unblock(); - - /* - * New records may have arrived while the console was locked. - * Wake the kthread printers to print them. - */ - wake_up_klogd(); - up_console_sem(); } @@ -2842,19 +2753,17 @@ static void __console_unlock(void) * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the - * console_lock. Otherwise it is set to false. A NULL pointer may be provided - * to disable allowing the console_lock to be taken over by a printk waiter. + * console_lock. Otherwise it is set to false. * * Returns false if the given console has no next record to print, otherwise * true. * - * Requires the console_lock if @handover is non-NULL. - * Requires con->lock otherwise. + * Requires the console_lock. */ -static bool __console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) { - static atomic_t panic_console_dropped = ATOMIC_INIT(0); + static int panic_console_dropped; struct printk_info info; struct printk_record r; unsigned long flags; @@ -2863,8 +2772,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); - if (handover) - *handover = false; + *handover = false; if (!prb_read_valid(prb, con->seq, &r)) return false; @@ -2872,8 +2780,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex if (con->seq != r.info->seq) { con->dropped += r.info->seq - con->seq; con->seq = r.info->seq; - if (panic_in_progress() && - atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) { + if (panic_in_progress() && panic_console_dropped++ > 10) { suppress_panic_printk = 1; pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); } @@ -2895,61 +2802,31 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } - if (handover) { - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - /* don't trace irqsoff print latency */ - stop_critical_timings(); - } + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + stop_critical_timings(); /* don't trace print latency */ call_console_driver(con, write_text, len, dropped_text); + start_critical_timings(); con->seq++; - if (handover) { - start_critical_timings(); - *handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - } + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); skip: return true; } -/* - * Print a record for a given console, but allow another printk() caller to - * take over the console_lock and continue printing. - * - * Requires the console_lock, but depending on @handover after the call, the - * caller may no longer have the console_lock. - * - * See __console_emit_next_record() for argument and return details. - */ -static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) -{ - /* - * Handovers are only supported if threaded printers are atomically - * blocked. The context taking over the console_lock may be atomic. - */ - if (!console_kthreads_atomically_blocked()) { - *handover = false; - handover = NULL; - } - - return __console_emit_next_record(con, text, ext_text, dropped_text, handover); -} - /* * Print out all remaining records to all consoles. * @@ -3001,11 +2878,13 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove if (con->flags & CON_EXTENDED) { /* Extended consoles do not print "dropped messages". */ - progress = console_emit_next_record_transferable(con, &text[0], - &ext_text[0], NULL, handover); + progress = console_emit_next_record(con, &text[0], + &ext_text[0], NULL, + handover); } else { - progress = console_emit_next_record_transferable(con, &text[0], - NULL, &dropped_text[0], handover); + progress = console_emit_next_record(con, &text[0], + NULL, &dropped_text[0], + handover); } if (*handover) return false; @@ -3120,10 +2999,6 @@ void console_unblank(void) if (oops_in_progress) { if (down_trylock_console_sem() != 0) return; - if (!console_kthreads_atomic_tryblock()) { - up_console_sem(); - return; - } } else console_lock(); @@ -3206,6 +3081,10 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + + /* Wake the newly enabled kthread printer. */ + wake_up_klogd(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3407,8 +3286,6 @@ void register_console(struct console *newcon) newcon->dropped = 0; newcon->thread = NULL; - newcon->blocked = true; - mutex_init(&newcon->lock); if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ @@ -3709,19 +3586,6 @@ static void printk_fallback_preferred_direct(void) console_unlock(); } -/* - * Print a record for a given console, not allowing another printk() caller - * to take over. This is appropriate for contexts that do not have the - * console_lock. - * - * See __console_emit_next_record() for argument and return details. - */ -static bool console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text) -{ - return __console_emit_next_record(con, text, ext_text, dropped_text, NULL); -} - static bool printer_should_wake(struct console *con, u64 seq) { short flags; @@ -3729,10 +3593,8 @@ static bool printer_should_wake(struct console *con, u64 seq) if (kthread_should_stop() || !printk_kthreads_available) return true; - if (con->blocked || - console_kthreads_atomically_blocked()) { + if (console_suspended) return false; - } /* * This is an unsafe read from con->flags, but a false positive is @@ -3753,6 +3615,7 @@ static int printk_kthread_func(void *data) struct console *con = data; char *dropped_text = NULL; char *ext_text = NULL; + bool handover; u64 seq = 0; char *text; int error; @@ -3802,27 +3665,15 @@ static int printk_kthread_func(void *data) if (error) continue; - error = mutex_lock_interruptible(&con->lock); - if (error) - continue; + console_lock(); - if (con->blocked || - !console_kthread_printing_tryenter()) { - /* Another context has locked the console_lock. */ - mutex_unlock(&con->lock); + if (console_suspended) { + up_console_sem(); continue; } - /* - * Although this context has not locked the console_lock, it - * is known that the console_lock is not locked and it is not - * possible for any other context to lock the console_lock. - * Therefore it is safe to read con->flags. - */ - - if (!__console_is_usable(con->flags)) { - console_kthread_printing_exit(); - mutex_unlock(&con->lock); + if (!console_is_usable(con)) { + __console_unlock(); continue; } @@ -3835,13 +3686,13 @@ static int printk_kthread_func(void *data) * which can conditionally invoke cond_resched(). */ console_may_schedule = 0; - console_emit_next_record(con, text, ext_text, dropped_text); + console_emit_next_record(con, text, ext_text, dropped_text, &handover); + if (handover) + continue; seq = con->seq; - console_kthread_printing_exit(); - - mutex_unlock(&con->lock); + __console_unlock(); } con_printk(KERN_INFO, con, "printing thread stopped\n"); -- cgit v1.2.3 From 5831788afb17b89c5b531fb60cbd798613ccbb63 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:56 +0200 Subject: Revert "printk: add kthread console printers" This reverts commit 09c5ba0aa2fcfdadb17d045c3ee6f86d69270df7. This reverts commit b87f02307d3cfbda768520f0687c51ca77e14fc3. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-6-pmladek@suse.com --- include/linux/console.h | 2 - kernel/printk/printk.c | 329 ++++-------------------------------------------- 2 files changed, 22 insertions(+), 309 deletions(-) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 9a251e70c090..8c1686e2c233 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,8 +153,6 @@ struct console { uint ospeed; u64 seq; unsigned long dropped; - struct task_struct *thread; - void *data; struct console *next; }; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ae489dc685a1..5a6273e56b4e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -361,13 +361,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); -/* - * A flag to signify if printk_activate_kthreads() has already started the - * kthread printers. If true, any later registered consoles must start their - * own kthread directly. The flag is write protected by the console_lock. - */ -static bool printk_kthreads_available; - #ifdef CONFIG_PRINTK static atomic_t printk_prefer_direct = ATOMIC_INIT(0); @@ -397,39 +390,6 @@ void printk_prefer_direct_exit(void) WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); } -/* - * Calling printk() always wakes kthread printers so that they can - * flush the new message to their respective consoles. Also, if direct - * printing is allowed, printk() tries to flush the messages directly. - * - * Direct printing is allowed in situations when the kthreads - * are not available or the system is in a problematic state. - * - * See the implementation about possible races. - */ -static inline bool allow_direct_printing(void) -{ - /* - * Checking kthread availability is a possible race because the - * kthread printers can become permanently disabled during runtime. - * However, doing that requires holding the console_lock, so any - * pending messages will be direct printed by console_unlock(). - */ - if (!printk_kthreads_available) - return true; - - /* - * Prefer direct printing when the system is in a problematic state. - * The context that sets this state will always see the updated value. - * The other contexts do not care. Anyway, direct printing is just a - * best effort. The direct output is only possible when console_lock - * is not already taken and no kthread printers are actively printing. - */ - return (system_state > SYSTEM_RUNNING || - oops_in_progress || - atomic_read(&printk_prefer_direct)); -} - DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -2320,10 +2280,10 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ - if (!in_sched && allow_direct_printing()) { + if (!in_sched) { /* * The caller may be holding system-critical or - * timing-sensitive locks. Disable preemption during direct + * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. @@ -2366,8 +2326,6 @@ EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); -static void printk_start_kthread(struct console *con); - #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 @@ -2401,8 +2359,6 @@ static void call_console_driver(struct console *con, const char *text, size_t le } static bool suppress_message_printing(int level) { return false; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } -static void printk_start_kthread(struct console *con) { } -static bool allow_direct_printing(void) { return true; } #endif /* CONFIG_PRINTK */ @@ -2603,13 +2559,6 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); - - /* - * While suspended, new records may have been added to the - * ringbuffer. Wake up the kthread printers to print them. - */ - wake_up_klogd(); - pr_flush(1000, true); } @@ -2628,9 +2577,6 @@ static int console_cpu_notify(unsigned int cpu) /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - - /* Wake kthread printers. Some may have become usable. */ - wake_up_klogd(); } return 0; } @@ -2702,9 +2648,18 @@ static bool abandon_console_lock_in_panic(void) return atomic_read(&panic_cpu) != raw_smp_processor_id(); } -static inline bool __console_is_usable(short flags) +/* + * Check if the given console is currently capable and allowed to print + * records. + * + * Requires the console_lock. + */ +static inline bool console_is_usable(struct console *con) { - if (!(flags & CON_ENABLED)) + if (!(con->flags & CON_ENABLED)) + return false; + + if (!con->write) return false; /* @@ -2713,26 +2668,12 @@ static inline bool __console_is_usable(short flags) * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ if (!cpu_online(raw_smp_processor_id()) && - !(flags & CON_ANYTIME)) + !(con->flags & CON_ANYTIME)) return false; return true; } -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_lock. - */ -static inline bool console_is_usable(struct console *con) -{ - if (!con->write) - return false; - - return __console_is_usable(con->flags); -} - static void __console_unlock(void) { console_locked = 0; @@ -2845,8 +2786,8 @@ skip: * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this - * is not the panic CPU or direct printing is not preferred). Regardless the - * reason, the caller should assume it is not useful to immediately try again. + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. * * Requires the console_lock. */ @@ -2863,10 +2804,6 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove *handover = false; do { - /* Let the kthread printers do the work if they can. */ - if (!allow_direct_printing()) - return false; - any_progress = false; for_each_console(con) { @@ -3081,10 +3018,6 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); - - /* Wake the newly enabled kthread printer. */ - wake_up_klogd(); - __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -3285,8 +3218,6 @@ void register_console(struct console *newcon) nr_ext_console_drivers++; newcon->dropped = 0; - newcon->thread = NULL; - if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); @@ -3296,10 +3227,6 @@ void register_console(struct console *newcon) /* Begin with next message. */ newcon->seq = prb_next_seq(prb); } - - if (printk_kthreads_available) - printk_start_kthread(newcon); - console_unlock(); console_sysfs_notify(); @@ -3326,7 +3253,6 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { - struct task_struct *thd; struct console *con; int res; @@ -3367,20 +3293,7 @@ int unregister_console(struct console *console) console_drivers->flags |= CON_CONSDEV; console->flags &= ~CON_ENABLED; - - /* - * console->thread can only be cleared under the console lock. But - * stopping the thread must be done without the console lock. The - * task that clears @thread is the task that stops the kthread. - */ - thd = console->thread; - console->thread = NULL; - console_unlock(); - - if (thd) - kthread_stop(thd); - console_sysfs_notify(); if (console->exit) @@ -3476,20 +3389,6 @@ static int __init printk_late_init(void) } late_initcall(printk_late_init); -static int __init printk_activate_kthreads(void) -{ - struct console *con; - - console_lock(); - printk_kthreads_available = true; - for_each_console(con) - printk_start_kthread(con); - console_unlock(); - - return 0; -} -early_initcall(printk_activate_kthreads); - #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) @@ -3564,180 +3463,11 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) } EXPORT_SYMBOL(pr_flush); -static void __printk_fallback_preferred_direct(void) -{ - printk_prefer_direct_enter(); - pr_err("falling back to preferred direct printing\n"); - printk_kthreads_available = false; -} - -/* - * Enter preferred direct printing, but never exit. Mark console threads as - * unavailable. The system is then forever in preferred direct printing and - * any printing threads will exit. - * - * Must *not* be called under console_lock. Use - * __printk_fallback_preferred_direct() if already holding console_lock. - */ -static void printk_fallback_preferred_direct(void) -{ - console_lock(); - __printk_fallback_preferred_direct(); - console_unlock(); -} - -static bool printer_should_wake(struct console *con, u64 seq) -{ - short flags; - - if (kthread_should_stop() || !printk_kthreads_available) - return true; - - if (console_suspended) - return false; - - /* - * This is an unsafe read from con->flags, but a false positive is - * not a problem. Worst case it would allow the printer to wake up - * although it is disabled. But the printer will notice that when - * attempting to print and instead go back to sleep. - */ - flags = data_race(READ_ONCE(con->flags)); - - if (!__console_is_usable(flags)) - return false; - - return prb_read_valid(prb, seq, NULL); -} - -static int printk_kthread_func(void *data) -{ - struct console *con = data; - char *dropped_text = NULL; - char *ext_text = NULL; - bool handover; - u64 seq = 0; - char *text; - int error; - - text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); - if (!text) { - con_printk(KERN_ERR, con, "failed to allocate text buffer\n"); - printk_fallback_preferred_direct(); - goto out; - } - - if (con->flags & CON_EXTENDED) { - ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); - if (!ext_text) { - con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n"); - printk_fallback_preferred_direct(); - goto out; - } - } else { - dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL); - if (!dropped_text) { - con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n"); - printk_fallback_preferred_direct(); - goto out; - } - } - - con_printk(KERN_INFO, con, "printing thread started\n"); - - for (;;) { - /* - * Guarantee this task is visible on the waitqueue before - * checking the wake condition. - * - * The full memory barrier within set_current_state() of - * prepare_to_wait_event() pairs with the full memory barrier - * within wq_has_sleeper(). - * - * This pairs with __wake_up_klogd:A. - */ - error = wait_event_interruptible(log_wait, - printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */ - - if (kthread_should_stop() || !printk_kthreads_available) - break; - - if (error) - continue; - - console_lock(); - - if (console_suspended) { - up_console_sem(); - continue; - } - - if (!console_is_usable(con)) { - __console_unlock(); - continue; - } - - /* - * Even though the printk kthread is always preemptible, it is - * still not allowed to call cond_resched() from within - * console drivers. The task may become non-preemptible in the - * console driver call chain. For example, vt_console_print() - * takes a spinlock and then can call into fbcon_redraw(), - * which can conditionally invoke cond_resched(). - */ - console_may_schedule = 0; - console_emit_next_record(con, text, ext_text, dropped_text, &handover); - if (handover) - continue; - - seq = con->seq; - - __console_unlock(); - } - - con_printk(KERN_INFO, con, "printing thread stopped\n"); -out: - kfree(dropped_text); - kfree(ext_text); - kfree(text); - - console_lock(); - /* - * If this kthread is being stopped by another task, con->thread will - * already be NULL. That is fine. The important thing is that it is - * NULL after the kthread exits. - */ - con->thread = NULL; - console_unlock(); - - return 0; -} - -/* Must be called under console_lock. */ -static void printk_start_kthread(struct console *con) -{ - /* - * Do not start a kthread if there is no write() callback. The - * kthreads assume the write() callback exists. - */ - if (!con->write) - return; - - con->thread = kthread_run(printk_kthread_func, con, - "pr/%s%d", con->name, con->index); - if (IS_ERR(con->thread)) { - con->thread = NULL; - con_printk(KERN_ERR, con, "unable to start printing thread\n"); - __printk_fallback_preferred_direct(); - return; - } -} - /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_DIRECT_OUTPUT 0x02 +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); @@ -3745,14 +3475,10 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_DIRECT_OUTPUT) { - printk_prefer_direct_enter(); - + if (pending & PRINTK_PENDING_OUTPUT) { /* If trylock fails, someone else is doing the printing */ if (console_trylock()) console_unlock(); - - printk_prefer_direct_exit(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -3777,11 +3503,10 @@ static void __wake_up_klogd(int val) * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * - * This pairs with devkmsg_read:A, syslog_print:A, and - * printk_kthread_func:A. + * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ - (val & PRINTK_PENDING_DIRECT_OUTPUT)) { + (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } @@ -3799,17 +3524,7 @@ void defer_console_output(void) * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ - int val = PRINTK_PENDING_WAKEUP; - - /* - * Make sure that some context will print the messages when direct - * printing is allowed. This happens in situations when the kthreads - * may not be as reliable or perhaps unusable. - */ - if (allow_direct_printing()) - val |= PRINTK_PENDING_DIRECT_OUTPUT; - - __wake_up_klogd(val); + __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) -- cgit v1.2.3 From 07a22b61946f0b80065b0ddcc703b715f84355f5 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 23 Jun 2022 16:51:57 +0200 Subject: Revert "printk: add functions to prefer direct printing" This reverts commit 2bb2b7b57f81255c13f4395ea911d6bdc70c9fe2. The testing of 5.19 release candidates revealed missing synchronization between early and regular console functionality. It would be possible to start the console kthreads later as a workaround. But it is clear that console lock serialized console drivers between each other. It opens a big area of possible problems that were not considered by people involved in the development and review. printk() is crucial for debugging kernel issues and console output is very important part of it. The number of consoles is huge and a proper review would take some time. As a result it need to be reverted for 5.19. Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220623145157.21938-7-pmladek@suse.com --- drivers/tty/sysrq.c | 2 -- include/linux/printk.h | 11 ----------- kernel/hung_task.c | 11 +---------- kernel/panic.c | 4 ---- kernel/printk/printk.c | 28 ---------------------------- kernel/rcu/tree_stall.h | 2 -- kernel/reboot.c | 14 +------------- kernel/watchdog.c | 4 ---- kernel/watchdog_hld.c | 4 ---- 9 files changed, 2 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 2884cd638d64..bbfd004449b5 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -578,7 +578,6 @@ void __handle_sysrq(int key, bool check_mask) rcu_sysrq_start(); rcu_read_lock(); - printk_prefer_direct_enter(); /* * Raise the apparent loglevel to maximum so that the sysrq header * is shown to provide the user with positive feedback. We do not @@ -620,7 +619,6 @@ void __handle_sysrq(int key, bool check_mask) pr_cont("\n"); console_loglevel = orig_log_level; } - printk_prefer_direct_exit(); rcu_read_unlock(); rcu_sysrq_end(); diff --git a/include/linux/printk.h b/include/linux/printk.h index cd26aab0ab2a..091fba7283e1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,9 +170,6 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit -extern void printk_prefer_direct_enter(void); -extern void printk_prefer_direct_exit(void); - extern bool pr_flush(int timeout_ms, bool reset_on_progress); /* @@ -225,14 +222,6 @@ static inline void printk_deferred_exit(void) { } -static inline void printk_prefer_direct_enter(void) -{ -} - -static inline void printk_prefer_direct_exit(void) -{ -} - static inline bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 02a65d554340..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -127,8 +127,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - printk_prefer_direct_enter(); - if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", @@ -144,8 +142,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_all_cpu_backtrace) hung_task_show_all_bt = true; - - printk_prefer_direct_exit(); } touch_nmi_watchdog(); @@ -208,17 +204,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); - if (hung_task_show_lock) { - printk_prefer_direct_enter(); + if (hung_task_show_lock) debug_show_all_locks(); - printk_prefer_direct_exit(); - } if (hung_task_show_all_bt) { hung_task_show_all_bt = false; - printk_prefer_direct_enter(); trigger_all_cpu_backtrace(); - printk_prefer_direct_exit(); } if (hung_task_call_panic) diff --git a/kernel/panic.c b/kernel/panic.c index 6737b2332275..8355b19676f8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -579,8 +579,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - printk_prefer_direct_enter(); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -610,8 +608,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); - - printk_prefer_direct_exit(); } #ifndef __WARN_FLAGS diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5a6273e56b4e..b49c6ff6dca0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -362,34 +362,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK -static atomic_t printk_prefer_direct = ATOMIC_INIT(0); - -/** - * printk_prefer_direct_enter - cause printk() calls to attempt direct - * printing to all enabled consoles - * - * Since it is not possible to call into the console printing code from any - * context, there is no guarantee that direct printing will occur. - * - * This globally effects all printk() callers. - * - * Context: Any context. - */ -void printk_prefer_direct_enter(void) -{ - atomic_inc(&printk_prefer_direct); -} - -/** - * printk_prefer_direct_exit - restore printk() behavior - * - * Context: Any context. - */ -void printk_prefer_direct_exit(void) -{ - WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0); -} - DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4995c078cff9..a001e1e7a992 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -647,7 +647,6 @@ static void print_cpu_stall(unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ - printk_prefer_direct_enter(); trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); @@ -685,7 +684,6 @@ static void print_cpu_stall(unsigned long gps) */ set_tsk_need_resched(current); set_preempt_need_resched(); - printk_prefer_direct_exit(); } static void check_cpu_stall(struct rcu_data *rdp) diff --git a/kernel/reboot.c b/kernel/reboot.c index 4177645e74d6..6bcc5d6a6572 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -447,11 +447,9 @@ static int __orderly_reboot(void) ret = run_cmd(reboot_cmd); if (ret) { - printk_prefer_direct_enter(); pr_warn("Failed to start orderly reboot: forcing the issue\n"); emergency_sync(); kernel_restart(NULL); - printk_prefer_direct_exit(); } return ret; @@ -464,7 +462,6 @@ static int __orderly_poweroff(bool force) ret = run_cmd(poweroff_cmd); if (ret && force) { - printk_prefer_direct_enter(); pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* @@ -474,7 +471,6 @@ static int __orderly_poweroff(bool force) */ emergency_sync(); kernel_power_off(); - printk_prefer_direct_exit(); } return ret; @@ -532,8 +528,6 @@ EXPORT_SYMBOL_GPL(orderly_reboot); */ static void hw_failure_emergency_poweroff_func(struct work_struct *work) { - printk_prefer_direct_enter(); - /* * We have reached here after the emergency shutdown waiting period has * expired. This means orderly_poweroff has not been able to shut off @@ -550,8 +544,6 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work) */ pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); emergency_restart(); - - printk_prefer_direct_exit(); } static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, @@ -590,13 +582,11 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) { static atomic_t allow_proceed = ATOMIC_INIT(1); - printk_prefer_direct_enter(); - pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); /* Shutdown should be initiated only once. */ if (!atomic_dec_and_test(&allow_proceed)) - goto out; + return; /* * Queue a backup emergency shutdown in the event of @@ -604,8 +594,6 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) */ hw_failure_emergency_poweroff(ms_until_forced); orderly_poweroff(true); -out: - printk_prefer_direct_exit(); } EXPORT_SYMBOL_GPL(hw_protection_shutdown); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 40024e03d422..9166220457bc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -424,8 +424,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); - printk_prefer_direct_enter(); - pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -444,8 +442,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); - - printk_prefer_direct_exit(); } return HRTIMER_RESTART; diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 701f35f0e2d4..247bf0b1582c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,8 +135,6 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - printk_prefer_direct_enter(); - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu); print_modules(); @@ -157,8 +155,6 @@ static void watchdog_overflow_callback(struct perf_event *event, if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); - printk_prefer_direct_exit(); - __this_cpu_write(hard_watchdog_warn, true); return; } -- cgit v1.2.3 From c7e1c443584ddd7facffca00e9e23b0d084e5bb3 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Mon, 6 Jun 2022 13:44:24 +0900 Subject: gpio: Fix kernel-doc comments to nested union Commit 48ec13d36d3f ("gpio: Properly document parent data union") is supposed to have fixed a warning from "make htmldocs" regarding kernel-doc comments to union members. However, the same warning still remains [1]. Fix the issue by following the example found in section "Nested structs/unions" of Documentation/doc-guide/kernel-doc.rst. Signed-off-by: Akira Yokosawa Reported-by: Stephen Rothwell Fixes: 48ec13d36d3f ("gpio: Properly document parent data union") Link: https://lore.kernel.org/r/20220606093302.21febee3@canb.auug.org.au/ [1] Cc: Linus Walleij Cc: Bartosz Golaszewski Cc: Joey Gouly Cc: Marc Zyngier Tested-by: Stephen Rothwell Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index b1e0f1f8ee2e..54c3c6506503 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -167,21 +167,24 @@ struct gpio_irq_chip { */ irq_flow_handler_t parent_handler; - /** - * @parent_handler_data: - * - * If @per_parent_data is false, @parent_handler_data is a single - * pointer used as the data associated with every parent interrupt. - * - * @parent_handler_data_array: - * - * If @per_parent_data is true, @parent_handler_data_array is - * an array of @num_parents pointers, and is used to associate - * different data for each parent. This cannot be NULL if - * @per_parent_data is true. - */ union { + /** + * @parent_handler_data: + * + * If @per_parent_data is false, @parent_handler_data is a + * single pointer used as the data associated with every + * parent interrupt. + */ void *parent_handler_data; + + /** + * @parent_handler_data_array: + * + * If @per_parent_data is true, @parent_handler_data_array is + * an array of @num_parents pointers, and is used to associate + * different data for each parent. This cannot be NULL if + * @per_parent_data is true. + */ void **parent_handler_data_array; }; -- cgit v1.2.3