22 files changed, 279 insertions, 290 deletions
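The first hunk of the diff reworks pte_present() and pte_hw_valid() in book3s/64/pgtable.h so that both predicates now also require _PAGE_PTE to be set. Because that hunk is hard to read in this flattened view, here is a small standalone model of the new logic; the bit values are illustrative placeholders (the real definitions live in the kernel headers) and the pte_raw()/cpu_to_be64() handling of the actual code is dropped for clarity:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder bit values for this sketch only; see the kernel headers for the real ones. */
#define _PAGE_PRESENT 0x8000000000000000ULL
#define _PAGE_INVALID 0x2000000000000000ULL
#define _PAGE_PTE     0x4000000000000000ULL

/* Model of the new pte_hw_valid(): hardware valid means both PRESENT and PTE are set. */
static bool pte_hw_valid(uint64_t pte)
{
        return (pte & (_PAGE_PRESENT | _PAGE_PTE)) == (_PAGE_PRESENT | _PAGE_PTE);
}

/*
 * Model of the new pte_present(): a pte counts as present if it is hardware
 * valid, or if it was temporarily invalidated (PRESENT cleared, INVALID set)
 * during ptep_set_access_flags() while still being a leaf pte (_PAGE_PTE set).
 */
static bool pte_present(uint64_t pte)
{
        if (pte_hw_valid(pte))
                return true;
        return (pte & (_PAGE_INVALID | _PAGE_PTE)) == (_PAGE_INVALID | _PAGE_PTE);
}

int main(void)
{
        printf("%d\n", pte_present(_PAGE_PRESENT | _PAGE_PTE)); /* 1 */
        printf("%d\n", pte_present(_PAGE_INVALID | _PAGE_PTE)); /* 1 */
        printf("%d\n", pte_present(_PAGE_PRESENT));             /* 0; the old check returned true here */
        return 0;
}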
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 368b136517e0..e1f551159f7d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -553,6 +553,12 @@ static inline pte_t pte_clear_savedwrite(pte_t pte) } #endif /* CONFIG_NUMA_BALANCING */ +static inline bool pte_hw_valid(pte_t pte) +{ + return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE)) == + cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE); +} + static inline int pte_present(pte_t pte) { /* @@ -561,12 +567,11 @@ static inline int pte_present(pte_t pte) * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID * if we find _PAGE_PRESENT cleared. */ - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)); -} -static inline bool pte_hw_valid(pte_t pte) -{ - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT)); + if (pte_hw_valid(pte)) + return true; + return (pte_raw(pte) & cpu_to_be64(_PAGE_INVALID | _PAGE_PTE)) == + cpu_to_be64(_PAGE_INVALID | _PAGE_PTE); } #ifdef CONFIG_PPC_MEM_KEYS @@ -1260,6 +1265,11 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, } #define pmdp_collapse_flush pmdp_collapse_flush +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL +pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmdp, int full); + #define __HAVE_ARCH_PGTABLE_DEPOSIT static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 64d02a704bcb..3b95769739c7 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -113,8 +113,7 @@ static inline void hash__flush_tlb_kernel_range(unsigned long start, struct mmu_gather; extern void hash__tlb_flush(struct mmu_gather *tlb); /* Private function for use by PCI IO mapping code */ -extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, - unsigned long end); +extern void __flush_hash_table_range(unsigned long start, unsigned long end); extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr); #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 506e4df2d730..37c8b50cb505 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -198,7 +198,7 @@ extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, unsigned int shift, const struct kvm_memory_slot *memslot, unsigned int lpid); -extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, +extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing, unsigned long gpa, unsigned int lpid); extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 04b2b927bb5a..c58e64a0a74f 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -14,6 +14,7 @@ #include <asm/book3s/64/mmu-hash.h> #include <asm/cpu_has_feature.h> #include <asm/ppc-opcode.h> +#include <asm/pte-walk.h> #ifdef CONFIG_PPC_PSERIES static inline bool kvmhv_on_pseries(void) @@ -434,7 +435,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) continue; } /* If pte is 
not present return None */ - if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT))) + if (unlikely(!pte_present(old_pte))) return __pte(0); new_pte = pte_mkyoung(old_pte); @@ -634,6 +635,37 @@ extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, unsigned long gpa, unsigned long hpa, unsigned long nbytes); +static inline pte_t *find_kvm_secondary_pte(struct kvm *kvm, unsigned long ea, + unsigned *hshift) +{ + pte_t *pte; + + VM_WARN(!spin_is_locked(&kvm->mmu_lock), + "%s called with kvm mmu_lock not held \n", __func__); + pte = __find_linux_pte(kvm->arch.pgtable, ea, NULL, hshift); + + return pte; +} + +static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq, + unsigned long ea, unsigned *hshift) +{ + pte_t *pte; + + VM_WARN(!spin_is_locked(&kvm->mmu_lock), + "%s called with kvm mmu_lock not held \n", __func__); + + if (mmu_notifier_retry(kvm, mmu_seq)) + return NULL; + + pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift); + + return pte; +} + +extern pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid, + unsigned long ea, unsigned *hshift); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 0699cfeeb8c9..cf2a08bfd5cd 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -291,15 +291,6 @@ static inline bool early_radix_enabled(void) } #endif -#ifdef CONFIG_PPC_MEM_KEYS -extern u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address); -#else -static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - return 0; -} -#endif /* CONFIG_PPC_MEM_KEYS */ - #ifdef CONFIG_STRICT_KERNEL_RWX static inline bool strict_kernel_rwx_enabled(void) { diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 067b094bfeff..1d18991f3854 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,7 +27,7 @@ */ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { - pte_t *ptep; + pte_t *ptep, pte; unsigned int shift; unsigned long pfn, flags; struct mm_struct *mm; @@ -39,19 +39,23 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) local_irq_save(flags); ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); + if (!ptep) { + pfn = ULONG_MAX; + goto out; + } + pte = READ_ONCE(*ptep); - if (!ptep || pte_special(*ptep)) { + if (!pte_present(pte) || pte_special(pte)) { pfn = ULONG_MAX; goto out; } if (shift <= PAGE_SHIFT) - pfn = pte_pfn(*ptep); + pfn = pte_pfn(pte); else { unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; - pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); + pfn = pte_pfn(__pte(pte_val(pte) | (addr & rpnmask))); } - out: local_irq_restore(flags); return pfn; diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index f83d1f69b1dd..30d07fc79dd1 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -100,7 +100,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) pci_name(bus->self)); #ifdef CONFIG_PPC_BOOK3S_64 - __flush_hash_table_range(&init_mm, res->start + _IO_BASE, + __flush_hash_table_range(res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif return 0; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6404df613ea3..18aed9775a3c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -281,11 +281,10 @@ static long kvmppc_virtmode_do_h_enter(struct kvm 
*kvm, unsigned long flags, { long ret; - /* Protect linux PTE lookup from page table destruction */ - rcu_read_lock_sched(); /* this disables preemption too */ + preempt_disable(); ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, kvm->mm->pgd, false, pte_idx_ret); - rcu_read_unlock_sched(); + preempt_enable(); if (ret == H_TOO_HARD) { /* this can't happen */ pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); @@ -602,20 +601,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * Read the PTE from the process' radix tree and use that * so we get the shift and attribute bits. */ - local_irq_disable(); - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + spin_lock(&kvm->mmu_lock); + ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift); + pte = __pte(0); + if (ptep) + pte = READ_ONCE(*ptep); + spin_unlock(&kvm->mmu_lock); /* * If the PTE disappeared temporarily due to a THP * collapse, just return and let the guest try again. */ - if (!ptep) { - local_irq_enable(); + if (!pte_present(pte)) { if (page) put_page(page); return RESUME_GUEST; } - pte = *ptep; - local_irq_enable(); hpa = pte_pfn(pte) << PAGE_SHIFT; pte_size = PAGE_SIZE; if (shift) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9f050064d2a2..271f1c3d8443 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -735,7 +735,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, return ret; } -bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, +bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing, unsigned long gpa, unsigned int lpid) { unsigned long pgflags; @@ -750,12 +750,12 @@ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, pgflags = _PAGE_ACCESSED; if (writing) pgflags |= _PAGE_DIRTY; - /* - * We are walking the secondary (partition-scoped) page table here. - * We can do this without disabling irq because the Linux MM - * subsystem doesn't do THP splits and collapses on this tree. - */ - ptep = __find_linux_pte(pgtable, gpa, NULL, &shift); + + if (nested) + ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift); + else + ptep = find_kvm_secondary_pte(kvm, gpa, &shift); + if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) { kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); return true; @@ -813,20 +813,21 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * Read the PTE from the process' radix tree and use that * so we get the shift and attribute bits. */ - local_irq_disable(); - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + spin_lock(&kvm->mmu_lock); + ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift); + pte = __pte(0); + if (ptep) + pte = READ_ONCE(*ptep); + spin_unlock(&kvm->mmu_lock); /* * If the PTE disappeared temporarily due to a THP * collapse, just return and let the guest try again. 
*/ - if (!ptep) { - local_irq_enable(); + if (!pte_present(pte)) { if (page) put_page(page); return RESUME_GUEST; } - pte = *ptep; - local_irq_enable(); /* If we're logging dirty pages, always map single pages */ large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES); @@ -948,8 +949,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Failed to set the reference/change bits */ if (dsisr & DSISR_SET_RC) { spin_lock(&kvm->mmu_lock); - if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, - writing, gpa, kvm->arch.lpid)) + if (kvmppc_hv_handle_set_rc(kvm, false, writing, + gpa, kvm->arch.lpid)) dsisr &= ~DSISR_SET_RC; spin_unlock(&kvm->mmu_lock); @@ -980,11 +981,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, return 0; } - ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return 0; + return 0; } /* Called with kvm->mmu_lock held */ @@ -1000,7 +1001,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return ref; - ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) { old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, gpa, shift); @@ -1027,7 +1028,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return ref; - ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) ref = 1; return ref; @@ -1047,7 +1048,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return ret; - ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { ret = 1; if (shift) @@ -1108,7 +1109,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, gpa = memslot->base_gfn << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); for (n = memslot->npages; n; --n) { - ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 6fcaf1fa8e02..167029e57c8f 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -74,8 +74,8 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvmppc_find_table); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce, - unsigned long *ua, unsigned long **prmap) +static long kvmppc_rm_tce_to_ua(struct kvm *kvm, + unsigned long tce, unsigned long *ua) { unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; @@ -87,9 +87,6 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce, *ua = __gfn_to_hva_memslot(memslot, gfn) | (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); - if (prmap) - *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - return 0; } @@ -116,7 +113,7 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, if 
(iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; - if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL)) + if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua)) return H_TOO_HARD; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -411,7 +408,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return ret; dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) + if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) return H_PARAMETER; entry = ioba >> stt->page_shift; @@ -437,8 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return H_SUCCESS; } -static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, - unsigned long ua, unsigned long *phpa) +static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq, + unsigned long ua, unsigned long *phpa) { pte_t *ptep, pte; unsigned shift = 0; @@ -452,10 +449,17 @@ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, * to exit which will agains result in the below page table walk * to finish. */ - ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift); - if (!ptep || !pte_present(*ptep)) + /* an rmap lock won't make it safe. because that just ensure hash + * page table entries are removed with rmap lock held. After that + * mmu notifier returns and we go ahead and removing ptes from Qemu page table. + */ + ptep = find_kvm_host_pte(vcpu->kvm, mmu_seq, ua, &shift); + if (!ptep) + return -ENXIO; + + pte = READ_ONCE(*ptep); + if (!pte_present(pte)) return -ENXIO; - pte = *ptep; if (!shift) shift = PAGE_SHIFT; @@ -477,10 +481,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce_list, unsigned long npages) { + struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; long i, ret = H_SUCCESS; unsigned long tces, entry, ua = 0; - unsigned long *rmap = NULL; + unsigned long mmu_seq; bool prereg = false; struct kvmppc_spapr_tce_iommu_table *stit; @@ -488,6 +493,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (kvm_is_radix(vcpu->kvm)) return H_TOO_HARD; + /* + * used to check for invalidations in progress + */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -515,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ struct mm_iommu_table_group_mem_t *mem; - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua)) return H_TOO_HARD; mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); @@ -531,23 +542,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, * We do not require memory to be preregistered in this case * so lock rmap and do __find_linux_pte_or_hugepte(). */ - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) - return H_TOO_HARD; - - rmap = (void *) vmalloc_to_phys(rmap); - if (WARN_ON_ONCE_RM(!rmap)) + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua)) return H_TOO_HARD; - /* - * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.). - * While we have the rmap lock, code running on other CPUs - * cannot finish unmapping the host real page that backs - * this guest real page, so we are OK to access the host - * real page. 
- */ - lock_rmap(rmap); - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); + if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) { ret = H_TOO_HARD; goto unlock_exit; } @@ -565,7 +564,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; goto invalidate_exit; } @@ -590,9 +589,8 @@ invalidate_exit: iommu_tce_kill_rm(stit->tbl, entry, npages); unlock_exit: - if (rmap) - unlock_rmap(rmap); - + if (!prereg) + arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index dc97e5be76f6..99011f1b772a 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -750,6 +750,23 @@ static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) return kvm->arch.nested_guests[lpid]; } +pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid, + unsigned long ea, unsigned *hshift) +{ + struct kvm_nested_guest *gp; + pte_t *pte; + + gp = kvmhv_find_nested(kvm, lpid); + if (!gp) + return NULL; + + VM_WARN(!spin_is_locked(&kvm->mmu_lock), + "%s called with kvm mmu_lock not held \n", __func__); + pte = __find_linux_pte(gp->shadow_pgtable, ea, NULL, hshift); + + return pte; +} + static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2) { return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK | @@ -792,19 +809,15 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap, unsigned long clr, unsigned long set, unsigned long hpa, unsigned long mask) { - struct kvm_nested_guest *gp; unsigned long gpa; unsigned int shift, lpid; pte_t *ptep; gpa = n_rmap & RMAP_NESTED_GPA_MASK; lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; - gp = kvmhv_find_nested(kvm, lpid); - if (!gp) - return; /* Find the pte */ - ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift); /* * If the pte is present and the pfn is still the same, update the pte. 
* If the pfn has changed then this is a stale rmap entry, the nested @@ -854,7 +867,7 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, return; /* Find and invalidate the pte */ - ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift); /* Don't spuriously invalidate ptes if the pfn has changed */ if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); @@ -921,7 +934,7 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, int shift; spin_lock(&kvm->mmu_lock); - ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + ptep = find_kvm_nested_guest_pte(kvm, gp->l1_lpid, gpa, &shift); if (!shift) shift = PAGE_SHIFT; if (ptep && pte_present(*ptep)) { @@ -1212,16 +1225,16 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, spin_lock(&kvm->mmu_lock); /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ - ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, - gpte.raddr, kvm->arch.lpid); + ret = kvmppc_hv_handle_set_rc(kvm, false, writing, + gpte.raddr, kvm->arch.lpid); if (!ret) { ret = -EINVAL; goto out_unlock; } /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ - ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, - gp->shadow_lpid); + ret = kvmppc_hv_handle_set_rc(kvm, true, writing, + n_gpa, gp->shadow_lpid); if (!ret) ret = -EINVAL; else @@ -1362,7 +1375,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_run *run, /* See if can find translation in our partition scoped tables for L1 */ pte = __pte(0); spin_lock(&kvm->mmu_lock); - pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + pte_p = find_kvm_secondary_pte(kvm, gpa, &shift); if (!shift) shift = PAGE_SHIFT; if (pte_p) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 220305454c23..3b168c69d503 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -210,7 +210,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, pte_t *ptep; unsigned int writing; unsigned long mmu_seq; - unsigned long rcbits, irq_flags = 0; + unsigned long rcbits; if (kvm_is_radix(kvm)) return H_FUNCTION; @@ -248,17 +248,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); - /* - * If we had a page table table change after lookup, we would - * retry via mmu_notifier_retry. - */ - if (!realmode) - local_irq_save(irq_flags); - /* - * If called in real mode we have MSR_EE = 0. Otherwise - * we disable irq above. 
- */ - ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift); + + arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); + ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &hpage_shift); if (ptep) { pte_t pte; unsigned int host_pte_size; @@ -272,8 +264,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * to <= host page size, if host is using hugepage */ if (host_pte_size < psize) { - if (!realmode) - local_irq_restore(flags); + arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); return H_PARAMETER; } pte = kvmppc_read_update_linux_pte(ptep, writing); @@ -287,8 +278,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, pa |= gpa & ~PAGE_MASK; } } - if (!realmode) - local_irq_restore(irq_flags); + arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1); ptel |= pa; @@ -888,8 +878,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, return ret; } -static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa, - int writing, unsigned long *hpa, +static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq, + unsigned long gpa, int writing, unsigned long *hpa, struct kvm_memory_slot **memslot_p) { struct kvm *kvm = vcpu->kvm; @@ -908,7 +898,7 @@ static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa, hva = __gfn_to_hva_memslot(memslot, gfn); /* Try to find the host pte for that virtual address */ - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift); if (!ptep) return H_TOO_HARD; pte = kvmppc_read_update_linux_pte(ptep, writing); @@ -943,16 +933,11 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu, mmu_seq = kvm->mmu_notifier_seq; smp_rmb(); - ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot); - if (ret != H_SUCCESS) - return ret; + arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); - /* Check if we've been invalidated */ - raw_spin_lock(&kvm->mmu_lock.rlock); - if (mmu_notifier_retry(kvm, mmu_seq)) { - ret = H_TOO_HARD; + ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &pa, &memslot); + if (ret != H_SUCCESS) goto out_unlock; - } /* Zero the page */ for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES) @@ -960,7 +945,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu, kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE); out_unlock: - raw_spin_unlock(&kvm->mmu_lock.rlock); + arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); return ret; } @@ -976,19 +961,14 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu, mmu_seq = kvm->mmu_notifier_seq; smp_rmb(); - ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot); - if (ret != H_SUCCESS) - return ret; - ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL); + arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); + ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &dest_pa, &dest_memslot); if (ret != H_SUCCESS) - return ret; + goto out_unlock; - /* Check if we've been invalidated */ - raw_spin_lock(&kvm->mmu_lock.rlock); - if (mmu_notifier_retry(kvm, mmu_seq)) { - ret = H_TOO_HARD; + ret = kvmppc_get_hpa(vcpu, mmu_seq, src, 0, &src_pa, NULL); + if (ret != H_SUCCESS) goto out_unlock; - } /* Copy the page */ memcpy((void *)dest_pa, (void *)src_pa, SZ_4K); @@ -996,7 +976,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu, kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE); out_unlock: - raw_spin_unlock(&kvm->mmu_lock.rlock); + arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); return ret; } diff --git 
a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 64733b9cb20a..64ca375278dc 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -363,17 +363,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, * hash fault look at them. */ memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_current_mm_pte variants which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_curren_mm_pte to finish. - */ - serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 4a70d8dd39cd..1fa2173413b5 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -176,7 +176,6 @@ void hash__tlb_flush(struct mmu_gather *tlb) * from the hash table (and the TLB). But keeps * the linux PTEs intact. * - * @mm : mm_struct of the target address space (generally init_mm) * @start : starting address * @end : ending address (not included in the flush) * @@ -189,17 +188,14 @@ void hash__tlb_flush(struct mmu_gather *tlb) * Because of that usage pattern, it is implemented for small size rather * than speed. */ -void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, - unsigned long end) +void __flush_hash_table_range(unsigned long start, unsigned long end) { - bool is_thp; int hugepage_shift; unsigned long flags; start = _ALIGN_DOWN(start, PAGE_SIZE); end = _ALIGN_UP(end, PAGE_SIZE); - BUG_ON(!mm->pgd); /* * Note: Normally, we should only ever use a batch within a @@ -212,21 +208,15 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, - &hugepage_shift); + pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; if (ptep == NULL) continue; pte = pte_val(*ptep); - if (is_thp) - trace_hugepage_invalidate(start, pte); if (!(pte & H_PAGE_HASHPTE)) continue; - if (unlikely(is_thp)) - hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); - else - hpte_need_flush(mm, start, ptep, pte, hugepage_shift); + hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 8ed2411c3f39..3d727f73a8db 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1350,8 +1350,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, goto bail; } - /* Add _PAGE_PRESENT to the required access perm */ - access |= _PAGE_PRESENT; + /* + * Add _PAGE_PRESENT to the required access perm. If there are parallel + * updates to the pte that can possibly clear _PAGE_PTE, catch that too. + * + * We can safely use the return pte address in rest of the function + * because we do set H_PAGE_BUSY which prevents further updates to pte + * from generic code. 
+ */ + access |= _PAGE_PRESENT | _PAGE_PTE; /* * Pre-check access permissions (will be re-checked atomically @@ -1539,14 +1546,11 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) } #endif -static void hash_preload(struct mm_struct *mm, unsigned long ea, +static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea, bool is_exec, unsigned long trap) { - int hugepage_shift; unsigned long vsid; pgd_t *pgdir; - pte_t *ptep; - unsigned long flags; int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); @@ -1568,30 +1572,18 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; - /* - * Hash doesn't like irqs. Walking linux page table with irq disabled - * saves us from holding multiple locks. - */ - local_irq_save(flags); - - /* - * THP pages use update_mmu_cache_pmd. We don't do - * hash preload there. Hence can ignore THP here - */ - ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift); - if (!ptep) - goto out_exit; - WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here + * Called with PTL held, hence can be sure the value won't change in + * between. */ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) - goto out_exit; + return; #endif /* CONFIG_PPC_64K_PAGES */ /* Is that local to this CPU ? */ @@ -1616,8 +1608,6 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, mm_ctx_user_psize(&mm->context), mm_ctx_user_psize(&mm->context), pte_val(*ptep)); -out_exit: - local_irq_restore(flags); } /* @@ -1668,32 +1658,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, return; } - hash_preload(vma->vm_mm, address, is_exec, trap); -} - -#ifdef CONFIG_PPC_MEM_KEYS -/* - * Return the protection key associated with the given address and the - * mm_struct. 
- */ -u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) -{ - pte_t *ptep; - u16 pkey = 0; - unsigned long flags; - - if (!mm || !mm->pgd) - return 0; - - local_irq_save(flags); - ptep = find_linux_pte(mm->pgd, address, NULL, NULL); - if (ptep) - pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); - local_irq_restore(flags); - - return pkey; + hash_preload(vma->vm_mm, ptep, address, is_exec, trap); } -#endif /* CONFIG_PPC_MEM_KEYS */ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline void tm_flush_hash_page(int local) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index e0bb69c616e4..54b6d6d103ea 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -109,15 +109,25 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return __pmd(old_pmd); +} + +pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp, int full) +{ + pmd_t pmd; + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - * - * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires - * a special case check in pmd_access_permitted. + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. */ - serialize_against_pte_lookup(vma->vm_mm); - return __pmd(old_pmd); + if (!full) + flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + return pmd; } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 8f9edf07063a..dfb9fe92aea8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -962,7 +962,13 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); - /*FIXME!! Verify whether we need this kick below */ + /* + * pmdp collapse_flush need to ensure that there are no parallel gup + * walk after this call. This is needed so that we can have stable + * page ref count when collapsing a page. We don't allow a collapse page + * if we have gup taken on the page. We can ensure that by sending IPI + * because gup walk happens with IRQ disabled. + */ serialize_against_pte_lookup(vma->vm_mm); radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); @@ -1023,17 +1029,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); old_pmd = __pmd(old); - /* - * Serialize against find_current_mm_pte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. 
- */ - serialize_against_pte_lookup(mm); return old_pmd; } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 84af6c8eecf7..44457bae77a0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -118,9 +118,34 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address) return __bad_area(regs, address, SEGV_MAPERR); } -static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address, - int pkey) +#ifdef CONFIG_PPC_MEM_KEYS +static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, + struct vm_area_struct *vma) { + struct mm_struct *mm = current->mm; + int pkey; + + /* + * We don't try to fetch the pkey from page table because reading + * page table without locking doesn't guarantee stable pte value. + * Hence the pkey value that we return to userspace can be different + * from the pkey that actually caused access error. + * + * It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set AMR to deny access to pkey=4, touches, page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ + pkey = vma_pkey(vma); + + up_read(&mm->mmap_sem); + /* * If we are in kernel mode, bail out with a SEGV, this will * be caught by the assembly which will restore the non-volatile @@ -133,6 +158,7 @@ static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address, return 0; } +#endif static noinline int bad_access(struct pt_regs *regs, unsigned long address) { @@ -289,8 +315,23 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, return false; } -static bool access_error(bool is_write, bool is_exec, - struct vm_area_struct *vma) +#ifdef CONFIG_PPC_MEM_KEYS +static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey, + struct vm_area_struct *vma) +{ + /* + * Make sure to check the VMA so that we do not perform + * faults just to hit a pkey fault as soon as we fill in a + * page. Only called for current mm, hence foreign == 0 + */ + if (!arch_vma_access_permitted(vma, is_write, is_exec, 0)) + return true; + + return false; +} +#endif + +static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma) { /* * Allow execution from readable areas if the MMU does not @@ -483,10 +524,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & DSISR_KEYFAULT) - return bad_key_fault_exception(regs, address, - get_mm_addr_key(mm, address)); - /* * We want to do this outside mmap_sem, because reading code around nip * can result in fault, which will cause a deadlock when called with @@ -555,6 +592,13 @@ retry: return bad_area(regs, address); good_area: + +#ifdef CONFIG_PPC_MEM_KEYS + if (unlikely(access_pkey_error(is_write, is_exec, + (error_code & DSISR_KEYFAULT), vma))) + return bad_access_pkey(regs, address, vma); +#endif /* CONFIG_PPC_MEM_KEYS */ + if (unlikely(access_error(is_write, is_exec, vma))) return bad_access(regs, address); @@ -565,21 +609,6 @@ good_area: */ fault = handle_mm_fault(vma, address, flags); -#ifdef CONFIG_PPC_MEM_KEYS - /* - * we skipped checking for access error due to key earlier. - * Check that using handle_mm_fault error return. 
- */ - if (unlikely(fault & VM_FAULT_SIGSEGV) && - !arch_vma_access_permitted(vma, is_write, is_exec, 0)) { - - int pkey = vma_pkey(vma); - - up_read(&mm->mmap_sem); - return bad_key_fault_exception(regs, address, pkey); - } -#endif /* CONFIG_PPC_MEM_KEYS */ - major |= fault & VM_FAULT_MAJOR; if (fault_signal_pending(fault, regs)) diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index df1ffd8b20f2..b63086b663ef 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -26,43 +26,25 @@ */ int read_user_stack_slow(void __user *ptr, void *buf, int nb) { - int ret = -EFAULT; - pgd_t *pgdir; - pte_t *ptep, pte; - unsigned int shift; + unsigned long addr = (unsigned long) ptr; unsigned long offset; - unsigned long pfn, flags; + struct page *page; + int nrpages; void *kaddr; - pgdir = current->mm->pgd; - if (!pgdir) - return -EFAULT; + nrpages = __get_user_pages_fast(addr, 1, 1, &page); + if (nrpages == 1) { + kaddr = page_address(page); + + /* align address to page boundary */ + offset = addr & ~PAGE_MASK; - local_irq_save(flags); - ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); - if (!ptep) - goto err_out; - if (!shift) - shift = PAGE_SHIFT; - - /* align address to page boundary */ - offset = addr & ((1UL << shift) - 1); - - pte = READ_ONCE(*ptep); - if (!pte_present(pte) || !pte_user(pte)) - goto err_out; - pfn = pte_pfn(pte); - if (!page_is_ram(pfn)) - goto err_out; - - /* no highmem to worry about here */ - kaddr = pfn_to_kaddr(pfn); - memcpy(buf, kaddr + offset, nb); - ret = 0; -err_out: - local_irq_restore(flags); - return ret; + memcpy(buf, kaddr + offset, nb); + put_page(page); + return 0; + } + return -EFAULT; } static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6076c8c912d2..e2528e057980 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1560,7 +1560,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL -static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full) { @@ -1569,7 +1569,7 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); return pmd; } - return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 329b8c8ca703..d10be362eafa 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -159,11 +159,11 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL -static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { - return pmdp_huge_get_and_clear(mm, address, pmdp); + return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ecd1045113b..16f2bd6f1549 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1852,8 +1852,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct 
*vma, * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) |
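Several of the hunks above convert callers from a local_irq_disable() plus __find_linux_pte() walk to a lookup done under kvm->mmu_lock and validated against mmu_notifier_seq. The sketch below condenses that caller pattern, taken from the converted kvmppc_book3s_hv_page_fault() and kvmppc_book3s_instantiate_page() paths, into one helper. It is a reading aid meant against the in-tree headers rather than a standalone program, and the helper name read_host_pte() is invented here.

/* Sketch of the new lookup pattern; error handling and the rest of the fault path omitted. */
#include <linux/kvm_host.h>
#include <asm/kvm_book3s_64.h>

static pte_t read_host_pte(struct kvm *kvm, unsigned long mmu_seq,
                           unsigned long hva, unsigned int *shift)
{
        pte_t *ptep, pte = __pte(0);

        /* mmu_seq was sampled from kvm->mmu_notifier_seq (followed by smp_rmb()) earlier. */
        spin_lock(&kvm->mmu_lock);
        /* Returns NULL when mmu_notifier_retry() reports an invalidation raced with us. */
        ptep = find_kvm_host_pte(kvm, mmu_seq, hva, shift);
        if (ptep)
                pte = READ_ONCE(*ptep);
        spin_unlock(&kvm->mmu_lock);

        /* Callers treat a non-present pte exactly like the old !ptep case: retry the guest. */
        return pte;
}

The real-mode paths in book3s_64_vio_hv.c and book3s_hv_rm_mmu.c above follow the same pattern but take the lock as arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock), since the regular spinlock wrappers are not usable in real mode.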
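The last three files change the generic pmdp_huge_get_and_clear_full() hook to take a struct vm_area_struct * instead of a struct mm_struct *, with zap_huge_pmd() and the s390 override adjusted to match. The point of the new prototype is that an architecture override may need to flush the TLB when this is not a full-mm teardown, and flush_pmd_tlb_range() requires a vma. Below is a minimal sketch of such an override, using a hypothetical my_arch_ prefix and mirroring the powerpc version above without its VM_BUG_ON() checks.

/* Sketch only; reads against asm-generic/pgtable.h rather than compiling standalone. */
static inline pmd_t my_arch_pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                                         unsigned long addr,
                                                         pmd_t *pmdp, int full)
{
        pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);

        /*
         * If this is not a full-mm teardown, a parallel fault can repopulate
         * the range with regular ptes, so flush the stale huge-page TLB
         * entries now; flush_pmd_tlb_range() is why the vma is needed.
         */
        if (!full)
                flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
        return pmd;
}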