From 7fbd166a8f2d697c3e2b4c8432d33253f00266b3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 14 Apr 2022 19:13:24 -0700 Subject: MAINTAINERS: Broadcom internal lists aren't maintainers Convert the broadcom internal list M: and L: entries to R: as exploder email addresses are neither maintainers nor mailing lists. Reorder the entries as necessary. Link: https://lkml.kernel.org/r/04eb301f5b3adbefdd78e76657eff0acb3e3d87f.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 64 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7341667e7313..d76c9aa1d38a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3743,7 +3743,7 @@ F: include/linux/platform_data/b53.h BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE M: Nicolas Saenz Julienne -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -3758,7 +3758,7 @@ BROADCOM BCM281XX/BCM11XXX/BCM216XX ARM ARCHITECTURE M: Florian Fainelli M: Ray Jui M: Scott Branden -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team S: Maintained T: git git://github.com/broadcom/mach-bcm F: arch/arm/mach-bcm/ @@ -3778,7 +3778,7 @@ F: arch/mips/include/asm/mach-bcm47xx/* BROADCOM BCM4908 ETHERNET DRIVER M: Rafał Miłecki -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/brcm,bcm4908-enet.yaml @@ -3787,7 +3787,7 @@ F: drivers/net/ethernet/broadcom/unimac.h BROADCOM BCM4908 PINMUX DRIVER M: Rafał Miłecki -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-gpio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml @@ -3797,7 +3797,7 @@ BROADCOM BCM5301X ARM ARCHITECTURE M: Florian Fainelli M: Hauke Mehrtens M: Rafał Miłecki -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/boot/dts/bcm470* @@ -3808,7 +3808,7 @@ F: arch/arm/mach-bcm/bcm_5301x.c BROADCOM BCM53573 ARM ARCHITECTURE M: Florian Fainelli M: Rafał Miłecki -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/boot/dts/bcm47189* @@ -3816,7 +3816,7 @@ F: arch/arm/boot/dts/bcm53573* BROADCOM BCM63XX ARM ARCHITECTURE M: Florian Fainelli -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://github.com/broadcom/stblinux.git @@ -3830,7 +3830,7 @@ F: drivers/usb/gadget/udc/bcm63xx_udc.* BROADCOM BCM7XXX ARM ARCHITECTURE M: Florian Fainelli -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://github.com/broadcom/stblinux.git @@ -3848,21 +3848,21 @@ N: bcm7120 BROADCOM BDC DRIVER M: Al Cooper L: linux-usb@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team S: Maintained F: Documentation/devicetree/bindings/usb/brcm,bdc.yaml F: drivers/usb/gadget/udc/bdc/ BROADCOM BMIPS CPUFREQ DRIVER M: Markus Mayer -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-pm@vger.kernel.org S: Maintained F: drivers/cpufreq/bmips-cpufreq.c BROADCOM BMIPS MIPS ARCHITECTURE M: Florian Fainelli -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-mips@vger.kernel.org S: Maintained T: git git://github.com/broadcom/stblinux.git @@ -3928,53 +3928,53 @@ F: drivers/net/wireless/broadcom/brcm80211/ BROADCOM BRCMSTB GPIO DRIVER M: Doug Berger M: Florian Fainelli -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team S: Supported F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml F: drivers/gpio/gpio-brcmstb.c BROADCOM BRCMSTB I2C DRIVER M: Kamal Dasu +R: Broadcom Kernel Team L: linux-i2c@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com S: Supported F: Documentation/devicetree/bindings/i2c/brcm,brcmstb-i2c.yaml F: drivers/i2c/busses/i2c-brcmstb.c BROADCOM BRCMSTB UART DRIVER M: Al Cooper +R: Broadcom Kernel Team L: linux-serial@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml F: drivers/tty/serial/8250/8250_bcm7271.c BROADCOM BRCMSTB USB EHCI DRIVER M: Al Cooper +R: Broadcom Kernel Team L: linux-usb@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: Documentation/devicetree/bindings/usb/brcm,bcm7445-ehci.yaml F: drivers/usb/host/ehci-brcm.* BROADCOM BRCMSTB USB PIN MAP DRIVER M: Al Cooper +R: Broadcom Kernel Team L: linux-usb@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: Documentation/devicetree/bindings/usb/brcm,usb-pinmap.yaml F: drivers/usb/misc/brcmstb-usb-pinmap.c BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER M: Al Cooper +R: Broadcom Kernel Team L: linux-kernel@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: drivers/phy/broadcom/phy-brcm-usb* BROADCOM ETHERNET PHY DRIVERS M: Florian Fainelli -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: netdev@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/net/broadcom-bcm87xx.txt @@ -3985,7 +3985,7 @@ F: include/linux/brcmphy.h BROADCOM GENET ETHERNET DRIVER M: Doug Berger M: Florian Fainelli -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: netdev@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml @@ -3999,7 +3999,7 @@ F: include/linux/platform_data/mdio-bcm-unimac.h BROADCOM IPROC ARM ARCHITECTURE M: Ray Jui M: Scott Branden -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://github.com/broadcom/stblinux.git @@ -4027,7 +4027,7 @@ N: stingray BROADCOM IPROC GBIT ETHERNET DRIVER M: Rafał Miłecki -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/brcm,amac.yaml @@ -4036,7 +4036,7 @@ F: drivers/net/ethernet/broadcom/unimac.h BROADCOM KONA GPIO DRIVER M: Ray Jui -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team S: Supported F: Documentation/devicetree/bindings/gpio/brcm,kona-gpio.txt F: drivers/gpio/gpio-bcm-kona.c @@ -4069,7 +4069,7 @@ F: drivers/firmware/broadcom/* BROADCOM PMB (POWER MANAGEMENT BUS) DRIVER M: Rafał Miłecki M: Florian Fainelli -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-pm@vger.kernel.org S: Maintained T: git git://github.com/broadcom/stblinux.git @@ -4085,7 +4085,7 @@ F: include/linux/bcma/ BROADCOM SPI DRIVER M: Kamal Dasu -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team S: Maintained F: Documentation/devicetree/bindings/spi/brcm,spi-bcm-qspi.yaml F: drivers/spi/spi-bcm-qspi.* @@ -4094,7 +4094,7 @@ F: drivers/spi/spi-iproc-qspi.c BROADCOM STB AVS CPUFREQ DRIVER M: Markus Mayer -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-pm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt @@ -4102,7 +4102,7 @@ F: drivers/cpufreq/brcmstb* BROADCOM STB AVS TMON DRIVER M: Markus Mayer -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-pm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/thermal/brcm,avs-tmon.yaml @@ -4110,7 +4110,7 @@ F: drivers/thermal/broadcom/brcmstb* BROADCOM STB DPFE DRIVER M: Markus Mayer -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: Documentation/devicetree/bindings/memory-controllers/brcm,dpfe-cpu.yaml @@ -4119,8 +4119,8 @@ F: drivers/memory/brcmstb_dpfe.c BROADCOM STB NAND FLASH DRIVER M: Brian Norris M: Kamal Dasu +R: Broadcom Kernel Team L: linux-mtd@lists.infradead.org -L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: drivers/mtd/nand/raw/brcmnand/ F: include/linux/platform_data/brcmnand.h @@ -4129,7 +4129,7 @@ BROADCOM STB PCIE DRIVER M: Jim Quinlan M: Nicolas Saenz Julienne M: Florian Fainelli -M: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml @@ -4137,7 +4137,7 @@ F: drivers/pci/controller/pcie-brcmstb.c BROADCOM SYSTEMPORT ETHERNET DRIVER M: Florian Fainelli -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bcmsysport.* @@ -4154,7 +4154,7 @@ F: drivers/net/ethernet/broadcom/tg3.* BROADCOM VK DRIVER M: Scott Branden -L: bcm-kernel-feedback-list@broadcom.com +R: Broadcom Kernel Team S: Supported F: drivers/misc/bcm-vk/ F: include/uapi/linux/misc/bcm_vk.h @@ -17648,8 +17648,8 @@ K: \bTIF_SECCOMP\b SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) Broadcom BRCMSTB DRIVER M: Al Cooper +R: Broadcom Kernel Team L: linux-mmc@vger.kernel.org -L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: drivers/mmc/host/sdhci-brcmstb* -- cgit v1.2.3 From 1bdec44b1eee32e311b44b5b06144bb7d9b33938 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 14 Apr 2022 19:13:27 -0700 Subject: tmpfs: fix regressions from wider use of ZERO_PAGE Chuck Lever reported fsx-based xfstests generic 075 091 112 127 failing when 5.18-rc1 NFS server exports tmpfs: bisected to recent tmpfs change. Whilst nfsd_splice_action() does contain some questionable handling of repeated pages, and Chuck was able to work around there, history from Mark Hemment makes clear that there might be similar dangers elsewhere: it was not a good idea for me to pass ZERO_PAGE down to unknown actors. Revert shmem_file_read_iter() to using ZERO_PAGE for holes only when iter_is_iovec(); in other cases, use the more natural iov_iter_zero() instead of copy_page_to_iter(). We would use iov_iter_zero() throughout, but the x86 clear_user() is not nearly so well optimized as copy to user (dd of 1T sparse tmpfs file takes 57 seconds rather than 44 seconds). And now pagecache_init() does not need to SetPageUptodate(ZERO_PAGE(0)): which had caused boot failure on arm noMMU STM32F7 and STM32H7 boards Link: https://lkml.kernel.org/r/9a978571-8648-e830-5735-1f4748ce2e30@google.com Fixes: 56a8c8eb1eaf ("tmpfs: do not allocate pages on read") Signed-off-by: Hugh Dickins Reported-by: Patrice CHOTARD Reported-by: Chuck Lever III Tested-by: Chuck Lever III Cc: Mark Hemment Cc: Patrice CHOTARD Cc: Mikulas Patocka Cc: Lukas Czerner Cc: Christoph Hellwig Cc: "Darrick J. Wong" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ------ mm/shmem.c | 31 ++++++++++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3a5ffb5587cd..9a1eef6c5d35 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1063,12 +1063,6 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); - - /* - * tmpfs uses the ZERO_PAGE for reading holes: it is up-to-date, - * and splice's page_cache_pipe_buf_confirm() needs to see that. - */ - SetPageUptodate(ZERO_PAGE(0)); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 529c9ad3e926..4b2fea33158e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2513,7 +2513,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) pgoff_t end_index; unsigned long nr, ret; loff_t i_size = i_size_read(inode); - bool got_page; end_index = i_size >> PAGE_SHIFT; if (index > end_index) @@ -2570,24 +2569,34 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) */ if (!offset) mark_page_accessed(page); - got_page = true; + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + ret = copy_page_to_iter(page, offset, nr, to); + put_page(page); + + } else if (iter_is_iovec(to)) { + /* + * Copy to user tends to be so well optimized, but + * clear_user() not so much, that it is noticeably + * faster to copy the zero page instead of clearing. + */ + ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { - page = ZERO_PAGE(0); - got_page = false; + /* + * But submitting the same page twice in a row to + * splice() - or others? - can result in confusion: + * so don't attempt that optimization on pipes etc. + */ + ret = iov_iter_zero(nr, to); } - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... - */ - ret = copy_page_to_iter(page, offset, nr, to); retval += ret; offset += ret; index += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; - if (got_page) - put_page(page); if (!iov_iter_count(to)) break; if (ret < nr) { -- cgit v1.2.3 From f9b141f93659e09a52e28791ccbaf69c273b8e92 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Thu, 14 Apr 2022 19:13:31 -0700 Subject: mm/secretmem: fix panic when growing a memfd_secret When one tries to grow an existing memfd_secret with ftruncate, one gets a panic [1]. For example, doing the following reliably induces the panic: fd = memfd_secret(); ftruncate(fd, 10); ptr = mmap(NULL, 10, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); strcpy(ptr, "123456789"); munmap(ptr, 10); ftruncate(fd, 20); The basic reason for this is, when we grow with ftruncate, we call down into simple_setattr, and then truncate_inode_pages_range, and eventually we try to zero part of the memory. The normal truncation code does this via the direct map (i.e., it calls page_address() and hands that to memset()). For memfd_secret though, we specifically don't map our pages via the direct map (i.e. we call set_direct_map_invalid_noflush() on every fault). So the address returned by page_address() isn't useful, and when we try to memset() with it we panic. This patch avoids the panic by implementing a custom setattr for memfd_secret, which detects resizes specifically (setting the size for the first time works just fine, since there are no existing pages to try to zero), and rejects them with EINVAL. One could argue growing should be supported, but I think that will require a significantly more lengthy change. So, I propose a minimal fix for the benefit of stable kernels, and then perhaps to extend memfd_secret to support growing in a separate patch. [1]: BUG: unable to handle page fault for address: ffffa0a889277028 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD afa01067 P4D afa01067 PUD 83f909067 PMD 83f8bf067 PTE 800ffffef6d88060 Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI CPU: 0 PID: 281 Comm: repro Not tainted 5.17.0-dbg-DEV #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:memset_erms+0x9/0x10 Code: c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 f3 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 aa 4c 89 c8 c3 90 49 89 fa 40 0f b6 ce 48 b8 01 01 01 01 01 01 RSP: 0018:ffffb932c09afbf0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffda63c4249dc0 RCX: 0000000000000fd8 RDX: 0000000000000fd8 RSI: 0000000000000000 RDI: ffffa0a889277028 RBP: ffffb932c09afc00 R08: 0000000000001000 R09: ffffa0a889277028 R10: 0000000000020023 R11: 0000000000000000 R12: ffffda63c4249dc0 R13: ffffa0a890d70d98 R14: 0000000000000028 R15: 0000000000000fd8 FS: 00007f7294899580(0000) GS:ffffa0af9bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffa0a889277028 CR3: 0000000107ef6006 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? zero_user_segments+0x82/0x190 truncate_inode_partial_folio+0xd4/0x2a0 truncate_inode_pages_range+0x380/0x830 truncate_setsize+0x63/0x80 simple_setattr+0x37/0x60 notify_change+0x3d8/0x4d0 do_sys_ftruncate+0x162/0x1d0 __x64_sys_ftruncate+0x1c/0x20 do_syscall_64+0x44/0xa0 entry_SYSCALL_64_after_hwframe+0x44/0xae Modules linked in: xhci_pci xhci_hcd virtio_net net_failover failover virtio_blk virtio_balloon uhci_hcd ohci_pci ohci_hcd evdev ehci_pci ehci_hcd 9pnet_virtio 9p netfs 9pnet CR2: ffffa0a889277028 [lkp@intel.com: secretmem_iops can be static] Signed-off-by: kernel test robot [axelrasmussen@google.com: return EINVAL] Link: https://lkml.kernel.org/r/20220324210909.1843814-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20220412193023.279320-1-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Cc: Mike Rapoport Cc: Matthew Wilcox Cc: Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/secretmem.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/secretmem.c b/mm/secretmem.c index 098638d3b8a4..3b3cf2892b6a 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -158,6 +158,22 @@ const struct address_space_operations secretmem_aops = { .isolate_page = secretmem_isolate_page, }; +static int secretmem_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid = iattr->ia_valid; + + if ((ia_valid & ATTR_SIZE) && inode->i_size) + return -EINVAL; + + return simple_setattr(mnt_userns, dentry, iattr); +} + +static const struct inode_operations secretmem_iops = { + .setattr = secretmem_setattr, +}; + static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) @@ -177,6 +193,7 @@ static struct file *secretmem_file_create(unsigned long flags) mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); + inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ -- cgit v1.2.3 From 25934fcfb93c4687ad32fd3d062bcf03457129d4 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 14 Apr 2022 19:13:34 -0700 Subject: irq_work: use kasan_record_aux_stack_noalloc() record callstack On PREEMPT_RT kernel and KASAN is enabled. the kasan_record_aux_stack() may call alloc_pages(), and the rt-spinlock will be acquired, if currently in atomic context, will trigger warning: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 239, name: bootlogd Preemption disabled at: [] rt_mutex_slowunlock+0xa1/0x4e0 CPU: 3 PID: 239 Comm: bootlogd Tainted: G W 5.17.1-rt17-yocto-preempt-rt+ #105 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Call Trace: __might_resched.cold+0x13b/0x173 rt_spin_lock+0x5b/0xf0 get_page_from_freelist+0x20c/0x1610 __alloc_pages+0x25e/0x5e0 __stack_depot_save+0x3c0/0x4a0 kasan_save_stack+0x3a/0x50 __kasan_record_aux_stack+0xb6/0xc0 kasan_record_aux_stack+0xe/0x10 irq_work_queue_on+0x6a/0x1c0 pull_rt_task+0x631/0x6b0 do_balance_callbacks+0x56/0x80 __balance_callbacks+0x63/0x90 rt_mutex_setprio+0x349/0x880 rt_mutex_slowunlock+0x22a/0x4e0 rt_spin_unlock+0x49/0x80 uart_write+0x186/0x2b0 do_output_char+0x2e9/0x3a0 n_tty_write+0x306/0x800 file_tty_write.isra.0+0x2af/0x450 tty_write+0x22/0x30 new_sync_write+0x27c/0x3a0 vfs_write+0x3f7/0x5d0 ksys_write+0xd9/0x180 __x64_sys_write+0x43/0x50 do_syscall_64+0x44/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Fix it by using kasan_record_aux_stack_noalloc() to avoid the call to alloc_pages(). Link: https://lkml.kernel.org/r/20220402142555.2699582-1-qiang1.zhang@intel.com Signed-off-by: Zqiang Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f7df715ec28e..7afa40fe5cc4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); preempt_disable(); if (cpu != smp_processor_id()) { -- cgit v1.2.3 From b1add418d457bc30cdd6a89c2bba092cd8941fcf Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Thu, 14 Apr 2022 19:13:37 -0700 Subject: kasan: fix hw tags enablement when KUNIT tests are disabled Kasan enables hw tags via kasan_enable_tagging() which based on the mode passed via kernel command line selects the correct hw backend. kasan_enable_tagging() is meant to be invoked indirectly via the cpu features framework of the architectures that support these backends. Currently the invocation of this function is guarded by CONFIG_KASAN_KUNIT_TEST which allows the enablement of the correct backend only when KUNIT tests are enabled in the kernel. This inconsistency was introduced in commit: ed6d74446cbf ("kasan: test: support async (again) and asymm modes for HW_TAGS") ... and prevents to enable MTE on arm64 when KUNIT tests for kasan hw_tags are disabled. Fix the issue making sure that the CONFIG_KASAN_KUNIT_TEST guard does not prevent the correct invocation of kasan_enable_tagging(). Link: https://lkml.kernel.org/r/20220408124323.10028-1-vincenzo.frascino@arm.com Fixes: ed6d74446cbf ("kasan: test: support async (again) and asymm modes for HW_TAGS") Signed-off-by: Vincenzo Frascino Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/hw_tags.c | 5 +++-- mm/kasan/kasan.h | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 07a76c46daa5..9e1b6544bfa8 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -336,8 +336,6 @@ void __kasan_poison_vmalloc(const void *start, unsigned long size) #endif -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) - void kasan_enable_tagging(void) { if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) @@ -347,6 +345,9 @@ void kasan_enable_tagging(void) else hw_enable_tagging_sync(); } + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + EXPORT_SYMBOL_GPL(kasan_enable_tagging); void kasan_force_async_fault(void) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d79b83d673b1..b01b4bbe0409 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -355,25 +355,27 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_set_mem_tag_range(addr, size, tag, init) \ arch_set_mem_tag_range((addr), (size), (tag), (init)) +void kasan_enable_tagging(void); + #else /* CONFIG_KASAN_HW_TAGS */ #define hw_enable_tagging_sync() #define hw_enable_tagging_async() #define hw_enable_tagging_asymm() +static inline void kasan_enable_tagging(void) { } + #endif /* CONFIG_KASAN_HW_TAGS */ #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -void kasan_enable_tagging(void); void kasan_force_async_fault(void); -#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ +#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ -static inline void kasan_enable_tagging(void) { } static inline void kasan_force_async_fault(void) { } -#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ +#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ #ifdef CONFIG_KASAN_SW_TAGS u8 kasan_random_tag(void); -- cgit v1.2.3 From 2dfe63e61cc31ee59ce951672b0850b5229cd5b0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Apr 2022 19:13:40 -0700 Subject: mm, kfence: support kmem_dump_obj() for KFENCE objects Calling kmem_obj_info() via kmem_dump_obj() on KFENCE objects has been producing garbage data due to the object not actually being maintained by SLAB or SLUB. Fix this by implementing __kfence_obj_info() that copies relevant information to struct kmem_obj_info when the object was allocated by KFENCE; this is called by a common kmem_obj_info(), which also calls the slab/slub/slob specific variant now called __kmem_obj_info(). For completeness, kmem_dump_obj() now displays if the object was allocated by KFENCE. Link: https://lore.kernel.org/all/20220323090520.GG16885@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20220406131558.3558585-1-elver@google.com Fixes: b89fb5ef0ce6 ("mm, kfence: insert KFENCE hooks for SLUB") Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Marco Elver Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reported-by: kernel test robot Acked-by: Vlastimil Babka [slab] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 24 ++++++++++++++++++++++++ mm/kfence/core.c | 21 --------------------- mm/kfence/kfence.h | 21 +++++++++++++++++++++ mm/kfence/report.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ mm/slab.c | 2 +- mm/slab.h | 2 +- mm/slab_common.c | 9 +++++++++ mm/slob.c | 2 +- mm/slub.c | 2 +- 9 files changed, 105 insertions(+), 25 deletions(-) diff --git a/include/linux/kfence.h b/include/linux/kfence.h index f49e64222628..726857a4b680 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -204,6 +204,22 @@ static __always_inline __must_check bool kfence_free(void *addr) */ bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs); +#ifdef CONFIG_PRINTK +struct kmem_obj_info; +/** + * __kfence_obj_info() - fill kmem_obj_info struct + * @kpp: kmem_obj_info to be filled + * @object: the object + * + * Return: + * * false - not a KFENCE object + * * true - a KFENCE object, filled @kpp + * + * Copies information to @kpp for KFENCE objects. + */ +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); +#endif + #else /* CONFIG_KFENCE */ static inline bool is_kfence_address(const void *addr) { return false; } @@ -221,6 +237,14 @@ static inline bool __must_check kfence_handle_page_fault(unsigned long addr, boo return false; } +#ifdef CONFIG_PRINTK +struct kmem_obj_info; +static inline bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + return false; +} +#endif + #endif #endif /* _LINUX_KFENCE_H */ diff --git a/mm/kfence/core.c b/mm/kfence/core.c index a203747ad2c0..9b2b5f56f4ae 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -231,27 +231,6 @@ static bool kfence_unprotect(unsigned long addr) return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); } -static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) -{ - long index; - - /* The checks do not affect performance; only called from slow-paths. */ - - if (!is_kfence_address((void *)addr)) - return NULL; - - /* - * May be an invalid index if called with an address at the edge of - * __kfence_pool, in which case we would report an "invalid access" - * error. - */ - index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; - if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) - return NULL; - - return &kfence_metadata[index]; -} - static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) { unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 9a6c4b1b12a8..600f2e2431d6 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -96,6 +96,27 @@ struct kfence_metadata { extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. */ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + /* KFENCE error types for report generation. */ enum kfence_error_type { KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f93a7b2a338b..f5a6d8ba3e21 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -273,3 +273,50 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); } + +#ifdef CONFIG_PRINTK +static void kfence_to_kp_stack(const struct kfence_track *track, void **kp_stack) +{ + int i, j; + + i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + for (j = 0; i < track->num_stack_entries && j < KS_ADDRS_COUNT; ++i, ++j) + kp_stack[j] = (void *)track->stack_entries[i]; + if (j < KS_ADDRS_COUNT) + kp_stack[j] = NULL; +} + +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)object); + unsigned long flags; + + if (!meta) + return false; + + /* + * If state is UNUSED at least show the pointer requested; the rest + * would be garbage data. + */ + kpp->kp_ptr = object; + + /* Requesting info an a never-used object is almost certainly a bug. */ + if (WARN_ON(meta->state == KFENCE_OBJECT_UNUSED)) + return true; + + raw_spin_lock_irqsave(&meta->lock, flags); + + kpp->kp_slab = slab; + kpp->kp_slab_cache = meta->cache; + kpp->kp_objp = (void *)meta->addr; + kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack); + if (meta->state == KFENCE_OBJECT_FREED) + kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack); + /* get_stack_skipnr() ensures the first entry is outside allocator. */ + kpp->kp_ret = kpp->kp_stack[0]; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + return true; +} +#endif diff --git a/mm/slab.c b/mm/slab.c index b04e40078bdf..0edb474edef1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3665,7 +3665,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { struct kmem_cache *cachep; unsigned int objnr; diff --git a/mm/slab.h b/mm/slab.h index fd7ae2024897..95eb34174c1b 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -868,7 +868,7 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); #endif #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR diff --git a/mm/slab_common.c b/mm/slab_common.c index 6ee64d6208b3..2b3206a2c3b5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -555,6 +555,13 @@ bool kmem_valid_obj(void *object) } EXPORT_SYMBOL_GPL(kmem_valid_obj); +static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + if (__kfence_obj_info(kpp, object, slab)) + return; + __kmem_obj_info(kpp, object, slab); +} + /** * kmem_dump_obj - Print available slab provenance information * @object: slab object for which to find provenance information. @@ -590,6 +597,8 @@ void kmem_dump_obj(void *object) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else pr_cont(" slab%s", cp); + if (is_kfence_address(object)) + pr_cont(" (kfence)"); if (kp.kp_objp) pr_cont(" start %px", kp.kp_objp); if (kp.kp_data_offset) diff --git a/mm/slob.c b/mm/slob.c index dfa6808dff36..40ea6e2d4ccd 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -463,7 +463,7 @@ out: } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { kpp->kp_ptr = object; kpp->kp_slab = slab; diff --git a/mm/slub.c b/mm/slub.c index 74d92aa4a3a2..ed5c2c03a47a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4312,7 +4312,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { void *base; int __maybe_unused i; -- cgit v1.2.3 From e553f62f10d93551eb883eca227ac54d1a4fad84 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 14 Apr 2022 19:13:43 -0700 Subject: mm, page_alloc: fix build_zonerefs_node() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") only zones with free memory are included in a built zonelist. This is problematic when e.g. all memory of a zone has been ballooned out when zonelists are being rebuilt. The decision whether to rebuild the zonelists when onlining new memory is done based on populated_zone() returning 0 for the zone the memory will be added to. The new zone is added to the zonelists only, if it has free memory pages (managed_zone() returns a non-zero value) after the memory has been onlined. This implies, that onlining memory will always free the added pages to the allocator immediately, but this is not true in all cases: when e.g. running as a Xen guest the onlined new memory will be added only to the ballooned memory list, it will be freed only when the guest is being ballooned up afterwards. Another problem with using managed_zone() for the decision whether a zone is being added to the zonelists is, that a zone with all memory used will in fact be removed from all zonelists in case the zonelists happen to be rebuilt. Use populated_zone() when building a zonelist as it has been done before that commit. There was a report that QubesOS (based on Xen) is hitting this problem. Xen has switched to use the zone device functionality in kernel 5.9 and QubesOS wants to use memory hotplugging for guests in order to be able to start a guest with minimal memory and expand it as needed. This was the report leading to the patch. Link: https://lkml.kernel.org/r/20220407120637.9035-1-jgross@suse.com Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator") Signed-off-by: Juergen Gross Reported-by: Marek Marczykowski-Górecki Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Marek Marczykowski-Górecki Reviewed-by: Wei Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e5b4488a0c5..33ca8cab21e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6131,7 +6131,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) do { zone_type--; zone = pgdat->node_zones + zone_type; - if (managed_zone(zone)) { + if (populated_zone(zone)) { zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } -- cgit v1.2.3 From e914d8f00391520ecc4495dd0ca0124538ab7119 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 14 Apr 2022 19:13:46 -0700 Subject: mm: fix unexpected zeroed page mapping with zram swap Two processes under CLONE_VM cloning, user process can be corrupted by seeing zeroed page unexpectedly. CPU A CPU B do_swap_page do_swap_page SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path swap_readpage valid data swap_slot_free_notify delete zram entry swap_readpage zeroed(invalid) data pte_lock map the *zero data* to userspace pte_unlock pte_lock if (!pte_same) goto out_nomap; pte_unlock return and next refault will read zeroed data The swap_slot_free_notify is bogus for CLONE_VM case since it doesn't increase the refcount of swap slot at copy_mm so it couldn't catch up whether it's safe or not to discard data from backing device. In the case, only the lock it could rely on to synchronize swap slot freeing is page table lock. Thus, this patch gets rid of the swap_slot_free_notify function. With this patch, CPU A will see correct data. CPU A CPU B do_swap_page do_swap_page SWP_SYNCHRONOUS_IO path SWP_SYNCHRONOUS_IO path swap_readpage original data pte_lock map the original data swap_free swap_range_free bd_disk->fops->swap_slot_free_notify swap_readpage read zeroed data pte_unlock pte_lock if (!pte_same) goto out_nomap; pte_unlock return on next refault will see mapped data by CPU B The concern of the patch would increase memory consumption since it could keep wasted memory with compressed form in zram as well as uncompressed form in address space. However, most of cases of zram uses no readahead and do_swap_page is followed by swap_free so it will free the compressed form from in zram quickly. Link: https://lkml.kernel.org/r/YjTVVxIAsnKAXjTd@google.com Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") Reported-by: Ivan Babrou Tested-by: Ivan Babrou Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jens Axboe Cc: David Hildenbrand Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index b417f000b49e..89fbf3cae30f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -51,54 +51,6 @@ void end_swap_bio_write(struct bio *bio) bio_put(bio); } -static void swap_slot_free_notify(struct page *page) -{ - struct swap_info_struct *sis; - struct gendisk *disk; - swp_entry_t entry; - - /* - * There is no guarantee that the page is in swap cache - the software - * suspend code (at least) uses end_swap_bio_read() against a non- - * swapcache page. So we must check PG_swapcache before proceeding with - * this optimization. - */ - if (unlikely(!PageSwapCache(page))) - return; - - sis = page_swap_info(page); - if (data_race(!(sis->flags & SWP_BLKDEV))) - return; - - /* - * The swap subsystem performs lazy swap slot freeing, - * expecting that the page will be swapped out again. - * So we can avoid an unnecessary write if the page - * isn't redirtied. - * This is good for real swap storage because we can - * reduce unnecessary I/O and enhance wear-leveling - * if an SSD is used as the as swap device. - * But if in-memory swap device (eg zram) is used, - * this causes a duplicated copy between uncompressed - * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty*, - * so the page should be swapped out somewhere again if - * we again wish to reclaim it. - */ - disk = sis->bdev->bd_disk; - entry.val = page_private(page); - if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) { - unsigned long offset; - - offset = swp_offset(entry); - - SetPageDirty(page); - disk->fops->swap_slot_free_notify(sis->bdev, - offset); - } -} - static void end_swap_bio_read(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -114,7 +66,6 @@ static void end_swap_bio_read(struct bio *bio) } SetPageUptodate(page); - swap_slot_free_notify(page); out: unlock_page(page); WRITE_ONCE(bio->bi_private, NULL); @@ -394,11 +345,6 @@ int swap_readpage(struct page *page, bool synchronous) if (sis->flags & SWP_SYNCHRONOUS_IO) { ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { - if (trylock_page(page)) { - swap_slot_free_notify(page); - unlock_page(page); - } - count_vm_event(PSWPIN); goto out; } -- cgit v1.2.3 From 31ca72fa7540bb654b55c56adaf99305847376e0 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 14 Apr 2022 19:13:49 -0700 Subject: mm: compaction: fix compiler warning when CONFIG_COMPACTION=n The below warning is reported when CONFIG_COMPACTION=n: mm/compaction.c:56:27: warning: 'HPAGE_FRAG_CHECK_INTERVAL_MSEC' defined but not used [-Wunused-const-variable=] 56 | static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix it by moving 'HPAGE_FRAG_CHECK_INTERVAL_MSEC' under CONFIG_COMPACTION defconfig. Also since this is just a 'static const int' type, use #define for it. Link: https://lkml.kernel.org/r/1647608518-20924-1-git-send-email-quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Reported-by: kernel test robot Acked-by: Vlastimil Babka Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index c3e37aa9ff9e..fe915db6149b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -26,6 +26,11 @@ #include "internal.h" #ifdef CONFIG_COMPACTION +/* + * Fragmentation score check interval for proactive compaction purposes. + */ +#define HPAGE_FRAG_CHECK_INTERVAL_MSEC (500) + static inline void count_compact_event(enum vm_event_item item) { count_vm_event(item); @@ -50,11 +55,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) #define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) -/* - * Fragmentation score check interval for proactive compaction purposes. - */ -static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; - /* * Page order with-respect-to which proactive compaction * calculates external fragmentation, which is used as -- cgit v1.2.3 From 5a317412ef884763fdf7aa17f9f3636959d11d8f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 14 Apr 2022 19:13:52 -0700 Subject: hugetlb: do not demote poisoned hugetlb pages It is possible for poisoned hugetlb pages to reside on the free lists. The huge page allocation routines which dequeue entries from the free lists make a point of avoiding poisoned pages. There is no such check and avoidance in the demote code path. If a hugetlb page on the is on a free list, poison will only be set in the head page rather then the page with the actual error. If such a page is demoted, then the poison flag may follow the wrong page. A page without error could have poison set, and a page with poison could not have the flag set. Check for poison before attempting to demote a hugetlb page. Also, return -EBUSY to the caller if only poisoned pages are on the free list. Link: https://lkml.kernel.org/r/20220307215707.50916-1-mike.kravetz@oracle.com Fixes: 8531fc6f52f5 ("hugetlb: add hugetlb demote page support") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b34f50156f7e..f8ca7cca3c1a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3475,7 +3475,6 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { int nr_nodes, node; struct page *page; - int rc = 0; lockdep_assert_held(&hugetlb_lock); @@ -3486,15 +3485,19 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - if (!list_empty(&h->hugepage_freelists[node])) { - page = list_entry(h->hugepage_freelists[node].next, - struct page, lru); - rc = demote_free_huge_page(h, page); - break; + list_for_each_entry(page, &h->hugepage_freelists[node], lru) { + if (PageHWPoison(page)) + continue; + + return demote_free_huge_page(h, page); } } - return rc; + /* + * Only way to get here is if all pages on free lists are poisoned. + * Return -EBUSY so that caller will not retry. + */ + return -EBUSY; } #define HSTATE_ATTR_RO(_name) \ -- cgit v1.2.3 From 354e923df042a11d1ab8ca06b3ebfab3a018a4ec Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Apr 2022 19:13:55 -0700 Subject: revert "fs/binfmt_elf: fix PT_LOAD p_align values for loaders" Commit 925346c129da11 ("fs/binfmt_elf: fix PT_LOAD p_align values for loaders") was an attempt to fix regressions due to 9630f0d60fec5f ("fs/binfmt_elf: use PT_LOAD p_align values for static PIE"). But regressionss continue to be reported: https://lore.kernel.org/lkml/cb5b81bd-9882-e5dc-cd22-54bdbaaefbbc@leemhuis.info/ https://bugzilla.kernel.org/show_bug.cgi?id=215720 https://lkml.kernel.org/r/b685f3d0-da34-531d-1aa9-479accd3e21b@leemhuis.info This patch reverts the fix, so the original can also be reverted. Fixes: 925346c129da11 ("fs/binfmt_elf: fix PT_LOAD p_align values for loaders") Cc: H.J. Lu Cc: Chris Kennelly Cc: Al Viro Cc: Alexey Dobriyan Cc: Song Liu Cc: David Rientjes Cc: Ian Rogers Cc: Hugh Dickins Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Fangrui Song Cc: Nick Desaulniers Cc: Kirill A. Shutemov Cc: Mike Kravetz Cc: Shuah Khan Cc: Thorsten Leemhuis Cc: Mike Rapoport Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6556e13ed95f..37d9c455d535 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1118,7 +1118,7 @@ out_free_interp: * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (interpreter || alignment > ELF_MIN_ALIGN) { + if (alignment > ELF_MIN_ALIGN) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); -- cgit v1.2.3 From aeb7923733d100b86c6bc68e7ae32913b0cec9d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Apr 2022 19:13:58 -0700 Subject: revert "fs/binfmt_elf: use PT_LOAD p_align values for static PIE" Despite Mike's attempted fix (925346c129da117122), regressions reports continue: https://lore.kernel.org/lkml/cb5b81bd-9882-e5dc-cd22-54bdbaaefbbc@leemhuis.info/ https://bugzilla.kernel.org/show_bug.cgi?id=215720 https://lkml.kernel.org/r/b685f3d0-da34-531d-1aa9-479accd3e21b@leemhuis.info So revert this patch. Fixes: 9630f0d60fec ("fs/binfmt_elf: use PT_LOAD p_align values for static PIE") Cc: Alexey Dobriyan Cc: Al Viro Cc: Chris Kennelly Cc: David Rientjes Cc: Fangrui Song Cc: H.J. Lu Cc: Hugh Dickins Cc: Ian Rogers Cc: Kirill A. Shutemov Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nick Desaulniers Cc: Sandeep Patil Cc: Shuah Khan Cc: Song Liu Cc: Suren Baghdasaryan Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 37d9c455d535..63c7ebb0da89 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1117,11 +1117,11 @@ out_free_interp: * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (alignment > ELF_MIN_ALIGN) { + if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; -- cgit v1.2.3 From c12cd77cb028255663810e6c4528f0325facff66 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 14 Apr 2022 19:14:01 -0700 Subject: mm/vmalloc: fix spinning drain_vmap_work after reading from /proc/vmcore Commit 3ee48b6af49c ("mm, x86: Saving vmcore with non-lazy freeing of vmas") introduced set_iounmap_nonlazy(), which sets vmap_lazy_nr to lazy_max_pages() + 1, ensuring that any future vunmaps() immediately purge the vmap areas instead of doing it lazily. Commit 690467c81b1a ("mm/vmalloc: Move draining areas out of caller context") moved the purging from the vunmap() caller to a worker thread. Unfortunately, set_iounmap_nonlazy() can cause the worker thread to spin (possibly forever). For example, consider the following scenario: 1. Thread reads from /proc/vmcore. This eventually calls __copy_oldmem_page() -> set_iounmap_nonlazy(), which sets vmap_lazy_nr to lazy_max_pages() + 1. 2. Then it calls free_vmap_area_noflush() (via iounmap()), which adds 2 pages (one page plus the guard page) to the purge list and vmap_lazy_nr. vmap_lazy_nr is now lazy_max_pages() + 3, so the drain_vmap_work is scheduled. 3. Thread returns from the kernel and is scheduled out. 4. Worker thread is scheduled in and calls drain_vmap_area_work(). It frees the 2 pages on the purge list. vmap_lazy_nr is now lazy_max_pages() + 1. 5. This is still over the threshold, so it tries to purge areas again, but doesn't find anything. 6. Repeat 5. If the system is running with only one CPU (which is typicial for kdump) and preemption is disabled, then this will never make forward progress: there aren't any more pages to purge, so it hangs. If there is more than one CPU or preemption is enabled, then the worker thread will spin forever in the background. (Note that if there were already pages to be purged at the time that set_iounmap_nonlazy() was called, this bug is avoided.) This can be reproduced with anything that reads from /proc/vmcore multiple times. E.g., vmcore-dmesg /proc/vmcore. It turns out that improvements to vmap() over the years have obsoleted the need for this "optimization". I benchmarked `dd if=/proc/vmcore of=/dev/null` with 4k and 1M read sizes on a system with a 32GB vmcore. The test was run on 5.17, 5.18-rc1 with a fix that avoided the hang, and 5.18-rc1 with set_iounmap_nonlazy() removed entirely: |5.17 |5.18+fix|5.18+removal 4k|40.86s| 40.09s| 26.73s 1M|24.47s| 23.98s| 21.84s The removal was the fastest (by a wide margin with 4k reads). This patch removes set_iounmap_nonlazy(). Link: https://lkml.kernel.org/r/52f819991051f9b865e9ce25605509bfdbacadcd.1649277321.git.osandov@fb.com Fixes: 690467c81b1a ("mm/vmalloc: Move draining areas out of caller context") Signed-off-by: Omar Sandoval Acked-by: Chris Down Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Christoph Hellwig Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/io.h | 2 -- arch/x86/kernel/crash_dump_64.c | 1 - mm/vmalloc.c | 11 ----------- 3 files changed, 14 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index f6d91ecb8026..e9736af126b2 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -210,8 +210,6 @@ void __iomem *ioremap(resource_size_t offset, unsigned long size); extern void iounmap(volatile void __iomem *addr); #define iounmap iounmap -extern void set_iounmap_nonlazy(void); - #ifdef __KERNEL__ void memcpy_fromio(void *, const volatile void __iomem *, size_t); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index a7f617a3981d..97529552dd24 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -37,7 +37,6 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, } else memcpy(buf, vaddr + offset, csize); - set_iounmap_nonlazy(); iounmap((void __iomem *)vaddr); return csize; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e163372d3967..0b17498a34f1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1671,17 +1671,6 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); -#ifdef CONFIG_X86_64 -/* - * called before a call to iounmap() if the caller wants vm_area_struct's - * immediately freed. - */ -void set_iounmap_nonlazy(void) -{ - atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); -} -#endif /* CONFIG_X86_64 */ - /* * Purges all lazily-freed vmap areas. */ -- cgit v1.2.3 From 23c2d497de21f25898fbea70aeb292ab8acc8c94 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Thu, 14 Apr 2022 19:14:04 -0700 Subject: mm: kmemleak: take a full lowmem check in kmemleak_*_phys() The kmemleak_*_phys() apis do not check the address for lowmem's min boundary, while the caller may pass an address below lowmem, which will trigger an oops: # echo scan > /sys/kernel/debug/kmemleak Unable to handle kernel paging request at virtual address ff5fffffffe00000 Oops [#1] Modules linked in: CPU: 2 PID: 134 Comm: bash Not tainted 5.18.0-rc1-next-20220407 #33 Hardware name: riscv-virtio,qemu (DT) epc : scan_block+0x74/0x15c ra : scan_block+0x72/0x15c epc : ffffffff801e5806 ra : ffffffff801e5804 sp : ff200000104abc30 gp : ffffffff815cd4e8 tp : ff60000004cfa340 t0 : 0000000000000200 t1 : 00aaaaaac23954cc t2 : 00000000000003ff s0 : ff200000104abc90 s1 : ffffffff81b0ff28 a0 : 0000000000000000 a1 : ff5fffffffe01000 a2 : ffffffff81b0ff28 a3 : 0000000000000002 a4 : 0000000000000001 a5 : 0000000000000000 a6 : ff200000104abd7c a7 : 0000000000000005 s2 : ff5fffffffe00ff9 s3 : ffffffff815cd998 s4 : ffffffff815d0e90 s5 : ffffffff81b0ff28 s6 : 0000000000000020 s7 : ffffffff815d0eb0 s8 : ffffffffffffffff s9 : ff5fffffffe00000 s10: ff5fffffffe01000 s11: 0000000000000022 t3 : 00ffffffaa17db4c t4 : 000000000000000f t5 : 0000000000000001 t6 : 0000000000000000 status: 0000000000000100 badaddr: ff5fffffffe00000 cause: 000000000000000d scan_gray_list+0x12e/0x1a6 kmemleak_scan+0x2aa/0x57e kmemleak_write+0x32a/0x40c full_proxy_write+0x56/0x82 vfs_write+0xa6/0x2a6 ksys_write+0x6c/0xe2 sys_write+0x22/0x2a ret_from_syscall+0x0/0x2 The callers may not quite know the actual address they pass(e.g. from devicetree). So the kmemleak_*_phys() apis should guarantee the address they finally use is in lowmem range, so check the address for lowmem's min boundary. Link: https://lkml.kernel.org/r/20220413122925.33856-1-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Acked-by: Catalin Marinas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index acd7cbb82e16..a182f5ddaf68 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1132,7 +1132,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, gfp_t gfp) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_alloc(__va(phys), size, min_count, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1146,7 +1146,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_free_part(__va(phys), size); } EXPORT_SYMBOL(kmemleak_free_part_phys); @@ -1158,7 +1158,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); */ void __ref kmemleak_not_leak_phys(phys_addr_t phys) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_not_leak(__va(phys)); } EXPORT_SYMBOL(kmemleak_not_leak_phys); @@ -1170,7 +1170,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { - if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) kmemleak_ignore(__va(phys)); } EXPORT_SYMBOL(kmemleak_ignore_phys); -- cgit v1.2.3