From a9e73998f9d705c94a8dca9687633adc0f24a19a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 13 May 2019 17:15:40 -0700 Subject: kernel/sys.c: prctl: fix false positive in validate_prctl_map() While validating new map we require the @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set from executable loaders such as elf handers, still it is pretty valid to have a loadable data section with zero size in file, in such case the start_data is equal to end_data once kernel loader finishes. As a result when we're trying to restore such programs the procedure fails and the kernel returns -EINVAL. From the image dump of a program: | "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0", Thus we need to change validate_prctl_map from strictly less to less or equal operator use. Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov Cc: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); -- cgit v1.2.3 From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 13 May 2019 17:16:41 -0700 Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd Userfaultfd can be misued to make it easier to exploit existing use-after-free (and similar) bugs that might otherwise only make a short window or race condition available. By using userfaultfd to stall a kernel thread, a malicious program can keep some state that it wrote, stable for an extended period, which it can then access using an existing exploit. While it doesn't cause the exploit itself, and while it's not the only thing that can stall a kernel thread when accessing a memory location, it's one of the few that never needs privilege. We can add a flag, allowing userfaultfd to be restricted, so that in general it won't be useable by arbitrary user programs, but in environments that require userfaultfd it can be turned back on. Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users. When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Andrea said: : The only difference between the bpf sysctl and the userfaultfd sysctl : this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability : requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement, : because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE : already if it's doing other kind of tracking on processes runtime, in : addition of userfaultfd. In other words both syscalls works only for : root, when the two sysctl are opt-in set to 1. [dgilbert@redhat.com: changelog additions] [akpm@linux-foundation.org: documentation tweak, per Mike] Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Paolo Bonzini Cc: Hugh Dickins Cc: Luis Chamberlain Cc: Maxime Coquelin Cc: Maya Gokhale Cc: Jerome Glisse Cc: Pavel Emelyanov Cc: Johannes Weiner Cc: Martin Cracauer Cc: Denis Plotnikov Cc: Marty McFadden Cc: Mike Kravetz Cc: Kees Cook Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: "Dr . David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ fs/userfaultfd.c | 5 +++++ include/linux/userfaultfd_k.h | 2 ++ kernel/sysctl.c | 12 ++++++++++++ 4 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3f13d8599337..749322060f10 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - stat_refresh - numa_stat - swappiness +- unprivileged_userfaultfd - user_reserve_kbytes - vfs_cache_pressure - watermark_boost_factor @@ -818,6 +819,17 @@ The default value is 60. ============================================================== +unprivileged_userfaultfd + +This flag controls whether unprivileged users can use the userfaultfd +system calls. Set this to 1 to allow unprivileged users to use the +userfaultfd system calls, or set this to 0 to restrict userfaultfd to only +privileged users (with SYS_CAP_PTRACE capability). + +The default value is 1. + +============================================================== + - user_reserve_kbytes When overcommit_memory is set to 2, "never overcommit" mode, reserve diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f5de1e726356..3b30301c90ec 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,8 @@ #include #include +int sysctl_unprivileged_userfaultfd __read_mostly = 1; + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; enum userfaultfd_state { @@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) struct userfaultfd_ctx *ctx; int fd; + if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) + return -EPERM; + BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 37c9eba75c98..ac9d71e24b81 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -28,6 +28,8 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +extern int sysctl_unprivileged_userfaultfd; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 11 ++++++----- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/e500_mmu.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/sh/mm/gup.c | 11 ++++++----- arch/sparc/mm/gup.c | 9 +++++---- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 2 +- drivers/fpga/dfl-afu-dma-region.c | 2 +- drivers/gpu/drm/via/via_dmablit.c | 3 ++- drivers/infiniband/hw/hfi1/user_pages.c | 3 ++- drivers/misc/genwqe/card_utils.c | 2 +- drivers/misc/vmw_vmci/vmci_host.c | 2 +- drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 ++++-- drivers/platform/goldfish/goldfish_pipe.c | 3 ++- drivers/rapidio/devices/rio_mport_cdev.c | 4 +++- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/st.c | 3 ++- drivers/staging/gasket/gasket_page_table.c | 4 ++-- drivers/tee/tee_shm.c | 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++- drivers/vhost/vhost.c | 2 +- drivers/video/fbdev/pvr2fb.c | 2 +- drivers/virt/fsl_hypervisor.c | 2 +- drivers/xen/gntdev.c | 2 +- fs/orangefs/orangefs-bufmap.c | 2 +- include/linux/mm.h | 4 ++-- kernel/futex.c | 2 +- lib/iov_iter.c | 7 +++++-- mm/gup.c | 10 +++++----- mm/util.c | 8 ++++---- net/ceph/pagevec.c | 2 +- net/rds/info.c | 2 +- net/rds/rdma.c | 3 ++- 34 files changed, 73 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 0d14e0d8eacf..4c2b4483683c 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -289,7 +290,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, write ? FOLL_WRITE : 0); + pages, gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index be7bc070eae5..ab3d484c5e2e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* If writing != 0, then the HPTE must allow writing, if we get here */ write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, writing, pages); + npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto err; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); if (npages < 1) goto err; page = pages[0]; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 24296f4cadc6..e0af53fd78c5 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, if (!pages) return -ENOMEM; - ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages); if (ret < 0) goto free_pages; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 37503ae62486..1fd706f6206c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2376,7 +2376,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) ret = -EFAULT; goto out; } - ret = get_user_pages_fast(map->addr, 1, 1, &map->page); + ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); if (ret < 0) goto out; BUG_ON(ret != 1); diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 3e27f6d1f1ec..277c882f7489 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -261,7 +262,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index aee6dba83d0e..1e770a517d4a 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); @@ -324,7 +325,7 @@ slow: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bdca39829bc..08715034e315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t *table; struct page *page; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 406b558abfef..6b92eaf4a3b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index e18a786fc943..c438722bf4e1 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, goto unlock_vm; } - pinned = get_user_pages_fast(region->user_addr, npages, 1, + pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE, region->pages); if (pinned < 0) { ret = pinned; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 8bf3a7c23ed3..062067438f1d 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) if (NULL == vsg->pages) return -ENOMEM; ret = get_user_pages_fast((unsigned long)xfer->mem_addr, - vsg->num_pages, vsg->direction == DMA_FROM_DEVICE, + vsg->num_pages, + vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0, vsg->pages); if (ret != vsg->num_pages) { if (ret < 0) diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 24b592c6522e..78ccacaf97d0 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -105,7 +105,8 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np { int ret; - ret = get_user_pages_fast(vaddr, npages, writable, pages); + ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0, + pages); if (ret < 0) return ret; diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 25265fd0fd6e..89cff9d1012b 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr, /* pin user pages in memory */ rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */ m->nr_pages, - m->write, /* readable/writable */ + m->write ? FOLL_WRITE : 0, /* readable/writable */ m->page_list); /* ptrs to pages */ if (rc < 0) goto fail_get_user_pages; diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index 997f92543dd4..422d08da3244 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, /* * Lock physical page backing a given user VA. */ - retval = get_user_pages_fast(uva, 1, 1, &context->notify_page); + retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index f5f1aac9d163..1174735f003d 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva, int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, - produce_q->kernel_if->num_pages, 1, + produce_q->kernel_if->num_pages, + FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", @@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva, } retval = get_user_pages_fast((uintptr_t) consume_uva, - consume_q->kernel_if->num_pages, 1, + consume_q->kernel_if->num_pages, + FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 321bc673c417..cef0133aa47a 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page, *iter_last_page_size = last_page_size; } - ret = get_user_pages_fast(first_page, requested_pages, !is_write, + ret = get_user_pages_fast(first_page, requested_pages, + !is_write ? FOLL_WRITE : 0, pages); if (ret <= 0) return -EFAULT; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 1e1f42e210a0..4a4a75fa26d5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, pinned = get_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, page_list); + nr_pages, + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, + page_list); if (pinned != nr_pages) { if (pinned < 0) { diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index acd9ba40eabe..8090dc9a1514 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p) dax_dbg("uva %p", va); - ret = get_user_pages_fast((unsigned long)va, 1, 1, p); + ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); if (ret == 1) { dax_dbg("locked page %p, for VA %p", *p, va); return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 19c022e66d63..3c6a18ad9a87 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, /* Try to fault in all of the necessary pages */ /* rw==READ means read from drive, write into memory area */ - res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages); + res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0, + pages); /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c index 600928f63577..d35c4fb19e28 100644 --- a/drivers/staging/gasket/gasket_page_table.c +++ b/drivers/staging/gasket/gasket_page_table.c @@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl, ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr + off + i * PAGE_SIZE; } else { - ret = get_user_pages_fast(page_addr - offset, 1, 1, - &page); + ret = get_user_pages_fast(page_addr - offset, 1, + FOLL_WRITE, &page); if (ret <= 0) { dev_err(pg_tbl->device, diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 0b9ab1d0dd45..49fd7312e2aa 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - rc = get_user_pages_fast(start, num_pages, 1, shm->pages); + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); if (rc > 0) shm->num_pages = rc; if (rc != num_pages) { diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6b64e45a5269..40ddc0c5f677 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) enum dma_data_direction direction = iommu_tce_direction(tce); if (get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page) != 1) + direction != DMA_TO_DEVICE ? FOLL_WRITE : 0, + &page) != 1) return -EFAULT; *hpa = __pa((unsigned long) page_address(page)); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 351af88231ad..1e3ed41ae1f3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, 1, &page); + r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index dfed532ed606..4e4d6a0df978 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages); + ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages); if (ret < nr_pages) { nr_pages = ret; ret = -EINVAL; diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 8ba726e600e9..6446bcab4185 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, - num_pages, param.source != -1, pages); + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7cf9c51318aa..02bc815982d4 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable, &page); + ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); if (ret < 0) return ret; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index d4811f981608..2bb916d68576 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, /* map the pages */ ret = get_user_pages_fast((unsigned long)user_desc->ptr, - bufmap->page_count, 1, bufmap->page_array); + bufmap->page_count, FOLL_WRITE, bufmap->page_array); if (ret < 0) return ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bc677ce8f01..c3c73b3c9adc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,8 +1505,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* Container for pinned pfns / pages */ struct frame_vector { diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b396d328a764..f74fa832f3aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, + pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); if (unlikely(res < 0)) { kvfree(p); return res; diff --git a/mm/gup.c b/mm/gup.c index 113c18a98cf5..3dde6a8da670 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2062,7 +2062,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2074,8 +2074,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2093,7 +2093,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2104,7 +2104,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pages += nr; ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/util.c b/mm/util.c index 43a2984bccaa..05a464929b3e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + int nr_pages, unsigned int gup_flags, + struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? FOLL_WRITE : 0); + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d3736f5bffec..74cafc0142ea 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, pages + got); + num_pages - got, write_page ? FOLL_WRITE : 0, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/rds/info.c b/net/rds/info.c index e367a97a18c8..03f6fd56d237 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, 1, pages); + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 182ab8430594..b340ed4fc43a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, + pages); if (ret >= 0 && ret < nr_pages) { while (ret--) -- cgit v1.2.3 From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<---------------------------------------------------------------------- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } ---------------------------------------------------------------------->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 5 ++++- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 ++++++++---- mm/hugetlb.c | 12 ++++++++---- mm/khugepaged.c | 3 ++- mm/ksm.c | 6 ++++-- mm/madvise.c | 3 ++- mm/memory.c | 25 ++++++++++++++++--------- mm/migrate.c | 5 ++++- mm/mprotect.c | 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c | 3 ++- mm/rmap.c | 6 ++++-- 14 files changed, 62 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 95ca1fe7283c..ea464f2b9867 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(&range, mm, 0, -1UL); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2386e71ac1b8..62f94cd85455 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -491,7 +494,7 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range, mm, start, end) \ +#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ _mmu_notifier_range_init(range, start, end) static inline bool diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 50c665b12cf1..428b5794f4b8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1224,7 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, vma->vm_mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1388,7 +1389,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, vma->vm_mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2064,7 +2066,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); @@ -2282,7 +2285,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98a3c7c224cb..89d206d6ecf3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3294,7 +3294,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, src, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, src, + vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); } @@ -3406,7 +3407,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If sharing possible, alert mmu notifiers of worst case. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); address = start; @@ -3673,7 +3675,8 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, haddr, + haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* @@ -4408,7 +4411,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7ba7a1e4fa79..14581dbf62a5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* diff --git a/mm/ksm.c b/mm/ksm.c index fc64874dc6f4..01f5fe2c90cf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, mm, pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); diff --git a/mm/madvise.c b/mm/madvise.c index bb3a4554d5d5..1c52bdf1b696 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, mm, range.start, range.end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm, range.start, range.end); diff --git a/mm/memory.c b/mm/memory.c index f7d962d7de19..90672674c582 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, + src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb, { struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); @@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -2279,7 +2283,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -4104,8 +4109,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PMD_MASK, - (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, + NULL, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); @@ -4122,8 +4128,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PAGE_MASK, - (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/migrate.c b/mm/migrate.c index a1770403ff7f..855bdb3b3333 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm_walk.mm, + migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); @@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, + MMU_NOTIFY_UNMAP, 0, + NULL, migrate->vma->vm_mm, addr, migrate->end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mprotect.c b/mm/mprotect.c index 028c724dcb1a..b10984052ae9 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,7 +185,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index e3edef6b7a12..fc241d23cd97 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + old_addr, old_end); mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a2484884cfd..539c91d0b26a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, mm, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm, range.start, range.end); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/rmap.c b/mm/rmap.c index 76c8dfd3ae1c..288e636b7813 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { -- cgit v1.2.3 From 7269f999934b289da7972e975b781417b07ef836 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:53 -0700 Subject: mm/mmu_notifier: use correct mmu_notifier events for each invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Link: http://lkml.kernel.org/r/20190326164747.24405-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 14 ++++++-------- mm/hugetlb.c | 8 ++++---- mm/khugepaged.c | 2 +- mm/ksm.c | 4 ++-- mm/madvise.c | 2 +- mm/memory.c | 14 +++++++------- mm/migrate.c | 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ea464f2b9867..01d4eb0e6bd1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1169,8 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - NULL, mm, 0, -1UL); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e34b699f3865..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 428b5794f4b8..61b1e05e86ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1224,9 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1389,9 +1388,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); @@ -2066,7 +2064,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2285,7 +2283,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 89d206d6ecf3..cab38ef30238 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3294,7 +3294,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, src, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); @@ -3675,7 +3675,7 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -4411,8 +4411,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, - end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, + 0, vma, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 14581dbf62a5..a335f7c1fac4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ diff --git a/mm/ksm.c b/mm/ksm.c index 01f5fe2c90cf..81c20ed57bf6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1155,7 +1155,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/madvise.c b/mm/madvise.c index 1c52bdf1b696..628022e674a7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -472,7 +472,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, range.start, range.end); lru_add_drain(); diff --git a/mm/memory.c b/mm/memory.c index 90672674c582..9b68a72f8c17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1010,8 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, - src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1358,7 +1358,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); @@ -1385,7 +1385,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); @@ -2283,7 +2283,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -4109,7 +4109,7 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address & PMD_MASK, (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); @@ -4128,7 +4128,7 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address & PAGE_MASK, (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); diff --git a/mm/migrate.c b/mm/migrate.c index 855bdb3b3333..f2ecc2855a12 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2356,7 +2356,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm_walk.mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); @@ -2765,7 +2765,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, - MMU_NOTIFY_UNMAP, 0, + MMU_NOTIFY_CLEAR, 0, NULL, migrate->vma->vm_mm, addr, migrate->end); diff --git a/mm/mprotect.c b/mm/mprotect.c index b10984052ae9..65242f1e4457 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,8 +185,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - vma, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/rmap.c b/mm/rmap.c index 288e636b7813..0cbed70700ed 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -896,8 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - address, + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, vma->vm_mm, address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -1372,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); -- cgit v1.2.3 From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where do we allocate the memmap from by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which contains additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and altmap for alternative memmap allocator. This patch shouldn't introduce any functional change. [akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 6 +++--- arch/ia64/mm/init.c | 6 +++--- arch/powerpc/mm/mem.c | 6 +++--- arch/s390/mm/init.c | 6 +++--- arch/sh/mm/init.c | 6 +++--- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/init_64.c | 10 +++++----- include/linux/memory_hotplug.h | 31 ++++++++++++++++++++++++------- kernel/memremap.c | 12 +++++++++--- mm/memory_hotplug.c | 11 +++++++---- 10 files changed, 63 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef82312860ac..ef32d4839c3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { int flags = 0; @@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, - altmap, want_memblock); + restrictions); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index e49200e31750..379eb1f9adc9 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -666,14 +666,14 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 20266898f3a8..de5c591a550d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -127,7 +127,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm } flush_inval_dcache_range(start, start + size); - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5f48fc7e61d5..06bd05137a00 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -219,8 +219,8 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -230,7 +230,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); + rc = __add_pages(nid, start_pfn, size_pages, restrictions); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index aeb9f45c7a39..d3cd07bd2dc1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -404,15 +404,15 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85c94f9a87f8..755dbed85531 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -850,13 +850,13 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..db42c11b48fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock) + struct mhp_restrictions *restrictions) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return add_pages(nid, start_pfn, nr_pages, restrictions); } #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf347804c..b24aca54353e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -53,6 +53,16 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* + * Restrictions for the memory hotplug: + * flags: MHP_ flags + * altmap: alternative allocator for memmap array + */ +struct mhp_restrictions { + unsigned long flags; + struct vmem_altmap *altmap; +}; + /* * Zone resizing functions * @@ -101,6 +111,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); +extern int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions); extern u64 max_mem_size; extern bool memhp_auto_online; @@ -118,20 +130,27 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ +/* + * Do we want sysfs memblock files created. This will allow userspace to online + * and offline memory explicitly. Lack of this bit means that the caller has to + * call move_pfn_range_to_zone to finish the initialization. + */ + +#define MHP_MEMBLOCK_API (1<<0) + /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA @@ -332,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); -extern int arch_add_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 75f9f6590677..339d5a62d5d5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -273,12 +273,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. */ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap = restrictions->altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); @@ -299,7 +299,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + restrictions->flags & MHP_MEMBLOCK_API); /* * EEXIST is finally dealt with by ioresource collision @@ -1097,6 +1097,9 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { + struct mhp_restrictions restrictions = { + .flags = MHP_MEMBLOCK_API, + }; u64 start, size; bool new_node = false; int ret; @@ -1124,7 +1127,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, &restrictions); if (ret < 0) goto error; -- cgit v1.2.3 From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization and skips the necessity to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 1 + arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 - arch/x86/Kconfig | 1 - include/linux/memblock.h | 3 ++- kernel/kexec_file.c | 16 ++++++++-------- mm/Kconfig | 2 +- mm/memblock.c | 6 +++--- mm/page_alloc.c | 3 +-- 16 files changed, 19 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a11dfcc2a130..5fd344bd06b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,7 +4,6 @@ config ARM default y select ARCH_32BIT_OFF_T select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -22,6 +21,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7a1aa53d188d..69a59a5d1143 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -60,6 +60,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 3e54a53208d5..b7d404bbaa0f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -22,7 +22,6 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select ARCH_DISCARD_MEMBLOCK select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 73a26f04644e..7468d8e50467 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,6 @@ config IA64 select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select VIRT_TO_BUS - select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index fe5cc2da6d10..218e037ef901 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -26,7 +26,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ff8cff9fcf54..677e5bfeff47 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index ea37394ff3ea..26a9c760a98b 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -23,7 +23,6 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS - select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7996cfaceca..8c1c636308c8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d0c046af65fa..109243fdb6ec 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -100,6 +100,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_KEEP_MEMBLOCK select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a77033e1e7c..b77f512bb176 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -10,7 +10,6 @@ config SUPERH select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f21bc56e5d7b..818b361094ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,7 +47,6 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 47e3c0612592..676d3900e1bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -96,13 +96,14 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock +static inline void memblock_discard(void) {} #endif #define memblock_dbg(fmt, ...) \ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); diff --git a/mm/Kconfig b/mm/Kconfig index 71e697e693df..c5124c2cb0b2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -136,7 +136,7 @@ config HAVE_MEMBLOCK_PHYS_MAP config HAVE_GENERIC_GUP bool -config ARCH_DISCARD_MEMBLOCK +config ARCH_KEEP_MEMBLOCK bool config MEMORY_ISOLATION diff --git a/mm/memblock.c b/mm/memblock.c index f315eca9f4a1..6bbad46f4d2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ * :c:func:`mem_init` function frees all the memory to the buddy page * allocator. * - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system * initialization compltes. */ @@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ @@ -1987,7 +1987,7 @@ unsigned long __init memblock_free_all(void) return pages; } -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbda9aea0bf5..f2f3fb4921d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1896,10 +1896,9 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3 From 640be2d1ffbc1946f1547eb89b5005ed7542de99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:23:23 -0700 Subject: kernel/memremap.c: remove the unused device_private_entry_fault() export This export has been entirely unused since it was added more than 1 1/2 years ago. Link: http://lkml.kernel.org/r/20190429115535.12793-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4e59d29245f4..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) -- cgit v1.2.3