-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h        |  20
-rw-r--r--  arch/powerpc/include/asm/book3s/64/tlbflush-hash.h  |   3
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h               |   2
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h            |  34
-rw-r--r--  arch/powerpc/include/asm/mmu.h                      |   9
-rw-r--r--  arch/powerpc/kernel/mce_power.c                     |  14
-rw-r--r--  arch/powerpc/kernel/pci_64.c                        |   2
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c                 |  18
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_radix.c              |  43
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c                 |  64
-rw-r--r--  arch/powerpc/kvm/book3s_hv_nested.c                 |  37
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c                 |  58
-rw-r--r--  arch/powerpc/mm/book3s64/hash_pgtable.c             |  11
-rw-r--r--  arch/powerpc/mm/book3s64/hash_tlb.c                 |  16
-rw-r--r--  arch/powerpc/mm/book3s64/hash_utils.c               |  62
-rw-r--r--  arch/powerpc/mm/book3s64/pgtable.c                  |  24
-rw-r--r--  arch/powerpc/mm/book3s64/radix_pgtable.c            |  19
-rw-r--r--  arch/powerpc/mm/fault.c                             |  75
-rw-r--r--  arch/powerpc/perf/callchain_64.c                    |  46
-rw-r--r--  arch/s390/include/asm/pgtable.h                     |   4
-rw-r--r--  include/asm-generic/pgtable.h                       |   4
-rw-r--r--  mm/huge_memory.c                                    |   4
22 files changed, 279 insertions, 290 deletions
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 368b136517e0..e1f551159f7d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -553,6 +553,12 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
}
#endif /* CONFIG_NUMA_BALANCING */
+static inline bool pte_hw_valid(pte_t pte)
+{
+ return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE)) ==
+ cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE);
+}
+
static inline int pte_present(pte_t pte)
{
/*
@@ -561,12 +567,11 @@ static inline int pte_present(pte_t pte)
* invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
* if we find _PAGE_PRESENT cleared.
*/
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
-}
-static inline bool pte_hw_valid(pte_t pte)
-{
- return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+ if (pte_hw_valid(pte))
+ return true;
+ return (pte_raw(pte) & cpu_to_be64(_PAGE_INVALID | _PAGE_PTE)) ==
+ cpu_to_be64(_PAGE_INVALID | _PAGE_PTE);
}
#ifdef CONFIG_PPC_MEM_KEYS
@@ -1260,6 +1265,11 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
}
#define pmdp_collapse_flush pmdp_collapse_flush
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmdp, int full);
+
#define __HAVE_ARCH_PGTABLE_DEPOSIT
static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
pmd_t *pmdp, pgtable_t pgtable)
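
For readers skimming the hunk above: a minimal userspace sketch of the new pte_present()/pte_hw_valid() split, using placeholder bit values (the real _PAGE_* masks come from the book3s/64 headers). A pte now counts as present either when it is hardware valid (_PAGE_PRESENT | _PAGE_PTE) or when it has been temporarily invalidated (_PAGE_INVALID) but is still marked as a pte.

/* Sketch only: the _PAGE_* values below are illustrative placeholders. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT 0x1ULL
#define _PAGE_INVALID 0x2ULL
#define _PAGE_PTE     0x4ULL

static bool pte_hw_valid(uint64_t pte)
{
	return (pte & (_PAGE_PRESENT | _PAGE_PTE)) == (_PAGE_PRESENT | _PAGE_PTE);
}

static bool pte_present(uint64_t pte)
{
	if (pte_hw_valid(pte))
		return true;
	/* temporarily invalidated by ptep_set_access_flags, but still a pte */
	return (pte & (_PAGE_INVALID | _PAGE_PTE)) == (_PAGE_INVALID | _PAGE_PTE);
}

int main(void)
{
	printf("%d %d %d\n",
	       pte_present(_PAGE_PRESENT | _PAGE_PTE),	/* 1: hw valid */
	       pte_present(_PAGE_INVALID | _PAGE_PTE),	/* 1: invalidated pte */
	       pte_present(_PAGE_PRESENT));		/* 0: not marked _PAGE_PTE */
	return 0;
}
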
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 64d02a704bcb..3b95769739c7 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -113,8 +113,7 @@ static inline void hash__flush_tlb_kernel_range(unsigned long start,
struct mmu_gather;
extern void hash__tlb_flush(struct mmu_gather *tlb);
/* Private function for use by PCI IO mapping code */
-extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end);
+extern void __flush_hash_table_range(unsigned long start, unsigned long end);
extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr);
#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 506e4df2d730..37c8b50cb505 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -198,7 +198,7 @@ extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
unsigned int shift,
const struct kvm_memory_slot *memslot,
unsigned int lpid);
-extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
+extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested,
bool writing, unsigned long gpa,
unsigned int lpid);
extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 04b2b927bb5a..c58e64a0a74f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -14,6 +14,7 @@
#include <asm/book3s/64/mmu-hash.h>
#include <asm/cpu_has_feature.h>
#include <asm/ppc-opcode.h>
+#include <asm/pte-walk.h>
#ifdef CONFIG_PPC_PSERIES
static inline bool kvmhv_on_pseries(void)
@@ -434,7 +435,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
continue;
}
/* If pte is not present return None */
- if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
+ if (unlikely(!pte_present(old_pte)))
return __pte(0);
new_pte = pte_mkyoung(old_pte);
@@ -634,6 +635,37 @@ extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
unsigned long gpa, unsigned long hpa,
unsigned long nbytes);
+static inline pte_t *find_kvm_secondary_pte(struct kvm *kvm, unsigned long ea,
+ unsigned *hshift)
+{
+ pte_t *pte;
+
+ VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+ "%s called with kvm mmu_lock not held \n", __func__);
+ pte = __find_linux_pte(kvm->arch.pgtable, ea, NULL, hshift);
+
+ return pte;
+}
+
+static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq,
+ unsigned long ea, unsigned *hshift)
+{
+ pte_t *pte;
+
+ VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+ "%s called with kvm mmu_lock not held \n", __func__);
+
+ if (mmu_notifier_retry(kvm, mmu_seq))
+ return NULL;
+
+ pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift);
+
+ return pte;
+}
+
+extern pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
+ unsigned long ea, unsigned *hshift);
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
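
The intended calling convention for the new helpers, condensed (not standalone) from the call sites changed later in this diff: snapshot kvm->mmu_notifier_seq, take kvm->mmu_lock, and let find_kvm_host_pte() return NULL via mmu_notifier_retry() if an invalidation raced with the lookup.

	unsigned long mmu_seq;
	unsigned int shift;
	pte_t *ptep, pte = __pte(0);

	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();
	/* ... resolve hva for the guest address ... */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	if (!pte_present(pte))
		return RESUME_GUEST;	/* pte vanished or invalidation in progress: retry */
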
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 0699cfeeb8c9..cf2a08bfd5cd 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -291,15 +291,6 @@ static inline bool early_radix_enabled(void)
}
#endif
-#ifdef CONFIG_PPC_MEM_KEYS
-extern u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address);
-#else
-static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
- return 0;
-}
-#endif /* CONFIG_PPC_MEM_KEYS */
-
#ifdef CONFIG_STRICT_KERNEL_RWX
static inline bool strict_kernel_rwx_enabled(void)
{
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 067b094bfeff..1d18991f3854 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -27,7 +27,7 @@
*/
unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
{
- pte_t *ptep;
+ pte_t *ptep, pte;
unsigned int shift;
unsigned long pfn, flags;
struct mm_struct *mm;
@@ -39,19 +39,23 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
local_irq_save(flags);
ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
+ if (!ptep) {
+ pfn = ULONG_MAX;
+ goto out;
+ }
+ pte = READ_ONCE(*ptep);
- if (!ptep || pte_special(*ptep)) {
+ if (!pte_present(pte) || pte_special(pte)) {
pfn = ULONG_MAX;
goto out;
}
if (shift <= PAGE_SHIFT)
- pfn = pte_pfn(*ptep);
+ pfn = pte_pfn(pte);
else {
unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
- pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
+ pfn = pte_pfn(__pte(pte_val(pte) | (addr & rpnmask)));
}
-
out:
local_irq_restore(flags);
return pfn;
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index f83d1f69b1dd..30d07fc79dd1 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -100,7 +100,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
pci_name(bus->self));
#ifdef CONFIG_PPC_BOOK3S_64
- __flush_hash_table_range(&init_mm, res->start + _IO_BASE,
+ __flush_hash_table_range(res->start + _IO_BASE,
res->end + _IO_BASE + 1);
#endif
return 0;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 6404df613ea3..18aed9775a3c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -281,11 +281,10 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
{
long ret;
- /* Protect linux PTE lookup from page table destruction */
- rcu_read_lock_sched(); /* this disables preemption too */
+ preempt_disable();
ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
kvm->mm->pgd, false, pte_idx_ret);
- rcu_read_unlock_sched();
+ preempt_enable();
if (ret == H_TOO_HARD) {
/* this can't happen */
pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
@@ -602,20 +601,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
* Read the PTE from the process' radix tree and use that
* so we get the shift and attribute bits.
*/
- local_irq_disable();
- ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ spin_lock(&kvm->mmu_lock);
+ ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+ pte = __pte(0);
+ if (ptep)
+ pte = READ_ONCE(*ptep);
+ spin_unlock(&kvm->mmu_lock);
/*
* If the PTE disappeared temporarily due to a THP
* collapse, just return and let the guest try again.
*/
- if (!ptep) {
- local_irq_enable();
+ if (!pte_present(pte)) {
if (page)
put_page(page);
return RESUME_GUEST;
}
- pte = *ptep;
- local_irq_enable();
hpa = pte_pfn(pte) << PAGE_SHIFT;
pte_size = PAGE_SIZE;
if (shift)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 9f050064d2a2..271f1c3d8443 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -735,7 +735,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
return ret;
}
-bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
unsigned long gpa, unsigned int lpid)
{
unsigned long pgflags;
@@ -750,12 +750,12 @@ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
- /*
- * We are walking the secondary (partition-scoped) page table here.
- * We can do this without disabling irq because the Linux MM
- * subsystem doesn't do THP splits and collapses on this tree.
- */
- ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+
+ if (nested)
+ ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
+ else
+ ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+
if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
return true;
@@ -813,20 +813,21 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
* Read the PTE from the process' radix tree and use that
* so we get the shift and attribute bits.
*/
- local_irq_disable();
- ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ spin_lock(&kvm->mmu_lock);
+ ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+ pte = __pte(0);
+ if (ptep)
+ pte = READ_ONCE(*ptep);
+ spin_unlock(&kvm->mmu_lock);
/*
* If the PTE disappeared temporarily due to a THP
* collapse, just return and let the guest try again.
*/
- if (!ptep) {
- local_irq_enable();
+ if (!pte_present(pte)) {
if (page)
put_page(page);
return RESUME_GUEST;
}
- pte = *ptep;
- local_irq_enable();
/* If we're logging dirty pages, always map single pages */
large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
@@ -948,8 +949,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Failed to set the reference/change bits */
if (dsisr & DSISR_SET_RC) {
spin_lock(&kvm->mmu_lock);
- if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
- writing, gpa, kvm->arch.lpid))
+ if (kvmppc_hv_handle_set_rc(kvm, false, writing,
+ gpa, kvm->arch.lpid))
dsisr &= ~DSISR_SET_RC;
spin_unlock(&kvm->mmu_lock);
@@ -980,11 +981,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
return 0;
}
- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
- return 0;
+ return 0;
}
/* Called with kvm->mmu_lock held */
@@ -1000,7 +1001,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
gpa, shift);
@@ -1027,7 +1028,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep))
ref = 1;
return ref;
@@ -1047,7 +1048,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ret;
- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
ret = 1;
if (shift)
@@ -1108,7 +1109,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
gpa = memslot->base_gfn << PAGE_SHIFT;
spin_lock(&kvm->mmu_lock);
for (n = memslot->npages; n; --n) {
- ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 6fcaf1fa8e02..167029e57c8f 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -74,8 +74,8 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
EXPORT_SYMBOL_GPL(kvmppc_find_table);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce,
- unsigned long *ua, unsigned long **prmap)
+static long kvmppc_rm_tce_to_ua(struct kvm *kvm,
+ unsigned long tce, unsigned long *ua)
{
unsigned long gfn = tce >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
@@ -87,9 +87,6 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce,
*ua = __gfn_to_hva_memslot(memslot, gfn) |
(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
- if (prmap)
- *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-
return 0;
}
@@ -116,7 +113,7 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_PARAMETER;
- if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL))
+ if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua))
return H_TOO_HARD;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -411,7 +408,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
return ret;
dir = iommu_tce_direction(tce);
- if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
+ if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua))
return H_PARAMETER;
entry = ioba >> stt->page_shift;
@@ -437,8 +434,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
return H_SUCCESS;
}
-static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
- unsigned long ua, unsigned long *phpa)
+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
+ unsigned long ua, unsigned long *phpa)
{
pte_t *ptep, pte;
unsigned shift = 0;
@@ -452,10 +449,17 @@ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
* to exit which will agains result in the below page table walk
* to finish.
*/
- ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift);
- if (!ptep || !pte_present(*ptep))
+ /* An rmap lock won't make this safe, because it only ensures hash page
+ * table entries are removed with the rmap lock held. After the mmu
+ * notifier returns, we go ahead and remove ptes from the Qemu page table.
+ */
+ ptep = find_kvm_host_pte(vcpu->kvm, mmu_seq, ua, &shift);
+ if (!ptep)
+ return -ENXIO;
+
+ pte = READ_ONCE(*ptep);
+ if (!pte_present(pte))
return -ENXIO;
- pte = *ptep;
if (!shift)
shift = PAGE_SHIFT;
@@ -477,10 +481,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
unsigned long liobn, unsigned long ioba,
unsigned long tce_list, unsigned long npages)
{
+ struct kvm *kvm = vcpu->kvm;
struct kvmppc_spapr_tce_table *stt;
long i, ret = H_SUCCESS;
unsigned long tces, entry, ua = 0;
- unsigned long *rmap = NULL;
+ unsigned long mmu_seq;
bool prereg = false;
struct kvmppc_spapr_tce_iommu_table *stit;
@@ -488,6 +493,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
if (kvm_is_radix(vcpu->kvm))
return H_TOO_HARD;
+ /*
+ * used to check for invalidations in progress
+ */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
stt = kvmppc_find_table(vcpu->kvm, liobn);
if (!stt)
return H_TOO_HARD;
@@ -515,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
*/
struct mm_iommu_table_group_mem_t *mem;
- if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+ if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua))
return H_TOO_HARD;
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -531,23 +542,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
* We do not require memory to be preregistered in this case
* so lock rmap and do __find_linux_pte_or_hugepte().
*/
- if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
- return H_TOO_HARD;
-
- rmap = (void *) vmalloc_to_phys(rmap);
- if (WARN_ON_ONCE_RM(!rmap))
+ if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua))
return H_TOO_HARD;
- /*
- * Synchronize with the MMU notifier callbacks in
- * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.).
- * While we have the rmap lock, code running on other CPUs
- * cannot finish unmapping the host real page that backs
- * this guest real page, so we are OK to access the host
- * real page.
- */
- lock_rmap(rmap);
- if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+ arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+ if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) {
ret = H_TOO_HARD;
goto unlock_exit;
}
@@ -565,7 +564,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
ua = 0;
- if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
+ if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) {
ret = H_PARAMETER;
goto invalidate_exit;
}
@@ -590,9 +589,8 @@ invalidate_exit:
iommu_tce_kill_rm(stit->tbl, entry, npages);
unlock_exit:
- if (rmap)
- unlock_rmap(rmap);
-
+ if (!prereg)
+ arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
}
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index dc97e5be76f6..99011f1b772a 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -750,6 +750,23 @@ static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
return kvm->arch.nested_guests[lpid];
}
+pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
+ unsigned long ea, unsigned *hshift)
+{
+ struct kvm_nested_guest *gp;
+ pte_t *pte;
+
+ gp = kvmhv_find_nested(kvm, lpid);
+ if (!gp)
+ return NULL;
+
+ VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+ "%s called with kvm mmu_lock not held \n", __func__);
+ pte = __find_linux_pte(gp->shadow_pgtable, ea, NULL, hshift);
+
+ return pte;
+}
+
static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
{
return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
@@ -792,19 +809,15 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap,
unsigned long clr, unsigned long set,
unsigned long hpa, unsigned long mask)
{
- struct kvm_nested_guest *gp;
unsigned long gpa;
unsigned int shift, lpid;
pte_t *ptep;
gpa = n_rmap & RMAP_NESTED_GPA_MASK;
lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
- gp = kvmhv_find_nested(kvm, lpid);
- if (!gp)
- return;
/* Find the pte */
- ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
/*
* If the pte is present and the pfn is still the same, update the pte.
* If the pfn has changed then this is a stale rmap entry, the nested
@@ -854,7 +867,7 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
return;
/* Find and invalidate the pte */
- ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
/* Don't spuriously invalidate ptes if the pfn has changed */
if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
@@ -921,7 +934,7 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
int shift;
spin_lock(&kvm->mmu_lock);
- ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ ptep = find_kvm_nested_guest_pte(kvm, gp->l1_lpid, gpa, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (ptep && pte_present(*ptep)) {
@@ -1212,16 +1225,16 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
spin_lock(&kvm->mmu_lock);
/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
- ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
- gpte.raddr, kvm->arch.lpid);
+ ret = kvmppc_hv_handle_set_rc(kvm, false, writing,
+ gpte.raddr, kvm->arch.lpid);
if (!ret) {
ret = -EINVAL;
goto out_unlock;
}
/* Set the rc bit in the pte of the shadow_pgtable for the nest guest */
- ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
- gp->shadow_lpid);
+ ret = kvmppc_hv_handle_set_rc(kvm, true, writing,
+ n_gpa, gp->shadow_lpid);
if (!ret)
ret = -EINVAL;
else
@@ -1362,7 +1375,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_run *run,
/* See if can find translation in our partition scoped tables for L1 */
pte = __pte(0);
spin_lock(&kvm->mmu_lock);
- pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ pte_p = find_kvm_secondary_pte(kvm, gpa, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (pte_p)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 220305454c23..3b168c69d503 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -210,7 +210,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pte_t *ptep;
unsigned int writing;
unsigned long mmu_seq;
- unsigned long rcbits, irq_flags = 0;
+ unsigned long rcbits;
if (kvm_is_radix(kvm))
return H_FUNCTION;
@@ -248,17 +248,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Translate to host virtual address */
hva = __gfn_to_hva_memslot(memslot, gfn);
- /*
- * If we had a page table table change after lookup, we would
- * retry via mmu_notifier_retry.
- */
- if (!realmode)
- local_irq_save(irq_flags);
- /*
- * If called in real mode we have MSR_EE = 0. Otherwise
- * we disable irq above.
- */
- ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift);
+
+ arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+ ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &hpage_shift);
if (ptep) {
pte_t pte;
unsigned int host_pte_size;
@@ -272,8 +264,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
* to <= host page size, if host is using hugepage
*/
if (host_pte_size < psize) {
- if (!realmode)
- local_irq_restore(flags);
+ arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return H_PARAMETER;
}
pte = kvmppc_read_update_linux_pte(ptep, writing);
@@ -287,8 +278,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pa |= gpa & ~PAGE_MASK;
}
}
- if (!realmode)
- local_irq_restore(irq_flags);
+ arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
ptel |= pa;
@@ -888,8 +878,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
return ret;
}
-static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa,
- int writing, unsigned long *hpa,
+static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
+ unsigned long gpa, int writing, unsigned long *hpa,
struct kvm_memory_slot **memslot_p)
{
struct kvm *kvm = vcpu->kvm;
@@ -908,7 +898,7 @@ static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa,
hva = __gfn_to_hva_memslot(memslot, gfn);
/* Try to find the host pte for that virtual address */
- ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
if (!ptep)
return H_TOO_HARD;
pte = kvmppc_read_update_linux_pte(ptep, writing);
@@ -943,16 +933,11 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
- ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot);
- if (ret != H_SUCCESS)
- return ret;
+ arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
- /* Check if we've been invalidated */
- raw_spin_lock(&kvm->mmu_lock.rlock);
- if (mmu_notifier_retry(kvm, mmu_seq)) {
- ret = H_TOO_HARD;
+ ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &pa, &memslot);
+ if (ret != H_SUCCESS)
goto out_unlock;
- }
/* Zero the page */
for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES)
@@ -960,7 +945,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
out_unlock:
- raw_spin_unlock(&kvm->mmu_lock.rlock);
+ arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
}
@@ -976,19 +961,14 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
- ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot);
- if (ret != H_SUCCESS)
- return ret;
- ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL);
+ arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+ ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &dest_pa, &dest_memslot);
if (ret != H_SUCCESS)
- return ret;
+ goto out_unlock;
- /* Check if we've been invalidated */
- raw_spin_lock(&kvm->mmu_lock.rlock);
- if (mmu_notifier_retry(kvm, mmu_seq)) {
- ret = H_TOO_HARD;
+ ret = kvmppc_get_hpa(vcpu, mmu_seq, src, 0, &src_pa, NULL);
+ if (ret != H_SUCCESS)
goto out_unlock;
- }
/* Copy the page */
memcpy((void *)dest_pa, (void *)src_pa, SZ_4K);
@@ -996,7 +976,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
out_unlock:
- raw_spin_unlock(&kvm->mmu_lock.rlock);
+ arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
}
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 64733b9cb20a..64ca375278dc 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -363,17 +363,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
* hash fault look at them.
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
- /*
- * Serialize against find_current_mm_pte variants which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_curren_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
return old_pmd;
}
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index 4a70d8dd39cd..1fa2173413b5 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -176,7 +176,6 @@ void hash__tlb_flush(struct mmu_gather *tlb)
* from the hash table (and the TLB). But keeps
* the linux PTEs intact.
*
- * @mm : mm_struct of the target address space (generally init_mm)
* @start : starting address
* @end : ending address (not included in the flush)
*
@@ -189,17 +188,14 @@ void hash__tlb_flush(struct mmu_gather *tlb)
* Because of that usage pattern, it is implemented for small size rather
* than speed.
*/
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
+void __flush_hash_table_range(unsigned long start, unsigned long end)
{
- bool is_thp;
int hugepage_shift;
unsigned long flags;
start = _ALIGN_DOWN(start, PAGE_SIZE);
end = _ALIGN_UP(end, PAGE_SIZE);
- BUG_ON(!mm->pgd);
/*
* Note: Normally, we should only ever use a batch within a
@@ -212,21 +208,15 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
- &hugepage_shift);
+ pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
- if (is_thp)
- trace_hugepage_invalidate(start, pte);
if (!(pte & H_PAGE_HASHPTE))
continue;
- if (unlikely(is_thp))
- hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
- else
- hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+ hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 8ed2411c3f39..3d727f73a8db 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1350,8 +1350,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
goto bail;
}
- /* Add _PAGE_PRESENT to the required access perm */
- access |= _PAGE_PRESENT;
+ /*
+ * Add _PAGE_PRESENT to the required access perm. If there are parallel
+ * updates to the pte that can possibly clear _PAGE_PTE, catch that too.
+ *
+ * We can safely use the returned pte in the rest of the function
+ * because we set H_PAGE_BUSY, which prevents further updates to the pte
+ * from generic code.
+ */
+ access |= _PAGE_PRESENT | _PAGE_PTE;
/*
* Pre-check access permissions (will be re-checked atomically
@@ -1539,14 +1546,11 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
}
#endif
-static void hash_preload(struct mm_struct *mm, unsigned long ea,
+static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
bool is_exec, unsigned long trap)
{
- int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
- pte_t *ptep;
- unsigned long flags;
int rc, ssize, update_flags = 0;
unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
@@ -1568,30 +1572,18 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea,
vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
- /*
- * Hash doesn't like irqs. Walking linux page table with irq disabled
- * saves us from holding multiple locks.
- */
- local_irq_save(flags);
-
- /*
- * THP pages use update_mmu_cache_pmd. We don't do
- * hash preload there. Hence can ignore THP here
- */
- ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
- if (!ptep)
- goto out_exit;
- WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
* care of it once we actually try to access the page.
* That way we don't have to duplicate all of the logic for segment
* page size demotion here
+ * We are called with the PTL held, so the pte value cannot change
+ * underneath us.
*/
if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
- goto out_exit;
+ return;
#endif /* CONFIG_PPC_64K_PAGES */
/* Is that local to this CPU ? */
@@ -1616,8 +1608,6 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea,
mm_ctx_user_psize(&mm->context),
mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
-out_exit:
- local_irq_restore(flags);
}
/*
@@ -1668,32 +1658,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
return;
}
- hash_preload(vma->vm_mm, address, is_exec, trap);
-}
-
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
- pte_t *ptep;
- u16 pkey = 0;
- unsigned long flags;
-
- if (!mm || !mm->pgd)
- return 0;
-
- local_irq_save(flags);
- ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
- if (ptep)
- pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
- local_irq_restore(flags);
-
- return pkey;
+ hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
}
-#endif /* CONFIG_PPC_MEM_KEYS */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void tm_flush_hash_page(int local)
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index e0bb69c616e4..54b6d6d103ea 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -109,15 +109,25 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return __pmd(old_pmd);
+}
+
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp, int full)
+{
+ pmd_t pmd;
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
/*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- *
- * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
- * a special case check in pmd_access_permitted.
+ * If this is not a fullmm flush, a parallel page fault can convert
+ * this PMD entry into a regular level-0 PTE underneath us.
+ * Make sure we flush the TLB in that case.
*/
- serialize_against_pte_lookup(vma->vm_mm);
- return __pmd(old_pmd);
+ if (!full)
+ flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ return pmd;
}
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
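
For reference, a condensed sketch of the caller side of the new interface, taken from the asm-generic and mm/huge_memory.c hunks further down: callers now pass the vma (so this powerpc implementation can issue the extra vma-based TLB flush when the unmap is not a fullmm teardown), and the generic fallback simply forwards vma->vm_mm.

	/* generic fallback (include/asm-generic/pgtable.h hunk below) */
	static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
							 unsigned long address,
							 pmd_t *pmdp, int full)
	{
		return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
	}

	/* caller in zap_huge_pmd() (mm/huge_memory.c hunk below): full == tlb->fullmm */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm);
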
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 8f9edf07063a..dfb9fe92aea8 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -962,7 +962,13 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
pmd = *pmdp;
pmd_clear(pmdp);
- /*FIXME!! Verify whether we need this kick below */
+ /*
+ * pmdp_collapse_flush needs to ensure that there are no parallel gup
+ * walks after this call, so that the page refcount is stable while
+ * collapsing the page. We don't allow a collapse if gup has taken a
+ * reference on the page. We can ensure that by sending an IPI, because
+ * gup walks run with IRQs disabled.
+ */
serialize_against_pte_lookup(vma->vm_mm);
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
@@ -1023,17 +1029,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old);
- /*
- * Serialize against find_current_mm_pte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
return old_pmd;
}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 84af6c8eecf7..44457bae77a0 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -118,9 +118,34 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address)
return __bad_area(regs, address, SEGV_MAPERR);
}
-static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
- int pkey)
+#ifdef CONFIG_PPC_MEM_KEYS
+static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma)
{
+ struct mm_struct *mm = current->mm;
+ int pkey;
+
+ /*
+ * We don't try to fetch the pkey from the page table because reading
+ * the page table without locking doesn't guarantee a stable pte value.
+ * Hence the pkey value that we return to userspace can be different
+ * from the pkey that actually caused the access error.
+ *
+ * It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set AMR to deny access to pkey=4, touches the page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+ pkey = vma_pkey(vma);
+
+ up_read(&mm->mmap_sem);
+
/*
* If we are in kernel mode, bail out with a SEGV, this will
* be caught by the assembly which will restore the non-volatile
@@ -133,6 +158,7 @@ static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
return 0;
}
+#endif
static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
@@ -289,8 +315,23 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
return false;
}
-static bool access_error(bool is_write, bool is_exec,
- struct vm_area_struct *vma)
+#ifdef CONFIG_PPC_MEM_KEYS
+static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
+ struct vm_area_struct *vma)
+{
+ /*
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a pkey fault as soon as we fill in a
+ * page. Only called for current mm, hence foreign == 0
+ */
+ if (!arch_vma_access_permitted(vma, is_write, is_exec, 0))
+ return true;
+
+ return false;
+}
+#endif
+
+static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma)
{
/*
* Allow execution from readable areas if the MMU does not
@@ -483,10 +524,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & DSISR_KEYFAULT)
- return bad_key_fault_exception(regs, address,
- get_mm_addr_key(mm, address));
-
/*
* We want to do this outside mmap_sem, because reading code around nip
* can result in fault, which will cause a deadlock when called with
@@ -555,6 +592,13 @@ retry:
return bad_area(regs, address);
good_area:
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma)))
+ return bad_access_pkey(regs, address, vma);
+#endif /* CONFIG_PPC_MEM_KEYS */
+
if (unlikely(access_error(is_write, is_exec, vma)))
return bad_access(regs, address);
@@ -565,21 +609,6 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags);
-#ifdef CONFIG_PPC_MEM_KEYS
- /*
- * we skipped checking for access error due to key earlier.
- * Check that using handle_mm_fault error return.
- */
- if (unlikely(fault & VM_FAULT_SIGSEGV) &&
- !arch_vma_access_permitted(vma, is_write, is_exec, 0)) {
-
- int pkey = vma_pkey(vma);
-
- up_read(&mm->mmap_sem);
- return bad_key_fault_exception(regs, address, pkey);
- }
-#endif /* CONFIG_PPC_MEM_KEYS */
-
major |= fault & VM_FAULT_MAJOR;
if (fault_signal_pending(fault, regs))
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index df1ffd8b20f2..b63086b663ef 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -26,43 +26,25 @@
*/
int read_user_stack_slow(void __user *ptr, void *buf, int nb)
{
- int ret = -EFAULT;
- pgd_t *pgdir;
- pte_t *ptep, pte;
- unsigned int shift;
+
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
- unsigned long pfn, flags;
+ struct page *page;
+ int nrpages;
void *kaddr;
- pgdir = current->mm->pgd;
- if (!pgdir)
- return -EFAULT;
+ nrpages = __get_user_pages_fast(addr, 1, 1, &page);
+ if (nrpages == 1) {
+ kaddr = page_address(page);
+
+ /* align address to page boundary */
+ offset = addr & ~PAGE_MASK;
- local_irq_save(flags);
- ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
- if (!ptep)
- goto err_out;
- if (!shift)
- shift = PAGE_SHIFT;
-
- /* align address to page boundary */
- offset = addr & ((1UL << shift) - 1);
-
- pte = READ_ONCE(*ptep);
- if (!pte_present(pte) || !pte_user(pte))
- goto err_out;
- pfn = pte_pfn(pte);
- if (!page_is_ram(pfn))
- goto err_out;
-
- /* no highmem to worry about here */
- kaddr = pfn_to_kaddr(pfn);
- memcpy(buf, kaddr + offset, nb);
- ret = 0;
-err_out:
- local_irq_restore(flags);
- return ret;
+ memcpy(buf, kaddr + offset, nb);
+ put_page(page);
+ return 0;
+ }
+ return -EFAULT;
}
static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
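
A short, hedged note on the __get_user_pages_fast() contract the rewrite above relies on: it returns the number of pages it managed to pin (0 or 1 for a single-page request), and every pinned page must be dropped with put_page() once the data has been copied out.

	struct page *page;
	int nrpages = __get_user_pages_fast(addr, 1, 1, &page);	/* start, nr_pages, write, pages */

	if (nrpages == 1) {
		void *kaddr = page_address(page);	/* no highmem to worry about on 64-bit */
		memcpy(buf, kaddr + (addr & ~PAGE_MASK), nb);
		put_page(page);
	}
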
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 6076c8c912d2..e2528e057980 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1560,7 +1560,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
}
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmdp, int full)
{
@@ -1569,7 +1569,7 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
return pmd;
}
- return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+ return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 329b8c8ca703..d10be362eafa 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -159,11 +159,11 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
int full)
{
- return pmdp_huge_get_and_clear(mm, address, pmdp);
+ return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ecd1045113b..16f2bd6f1549 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1852,8 +1852,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())