summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJanosch Frank <frankja@linux.ibm.com>2018-07-30 23:20:00 +0200
committerJanosch Frank <frankja@linux.ibm.com>2018-07-30 23:20:48 +0200
commit2375846193663a1282c0ef7093640ed3210dc09f (patch)
tree8f8a434b5475eed580308753adba5cd999274e7c
parent57cb198cfdd2e666a8dc2b7e5df6d2708191eddb (diff)
parenta449938297e55e7e8958f8b48583f7d342da1930 (diff)
downloadlinux-2375846193663a1282c0ef7093640ed3210dc09f.tar.bz2
Merge tag 'hlp_stage1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into kvms390/next
KVM: s390: initial host large page support - must be enabled via module parameter hpage=1 - cannot be used together with nested - does support migration - does support hugetlbfs - no THP yet
-rw-r--r--Documentation/virtual/kvm/api.txt16
-rw-r--r--arch/s390/include/asm/gmap.h10
-rw-r--r--arch/s390/include/asm/hugetlb.h5
-rw-r--r--arch/s390/include/asm/mmu.h2
-rw-r--r--arch/s390/include/asm/mmu_context.h1
-rw-r--r--arch/s390/include/asm/pgtable.h13
-rw-r--r--arch/s390/kvm/kvm-s390.c82
-rw-r--r--arch/s390/kvm/priv.c103
-rw-r--r--arch/s390/mm/gmap.c454
-rw-r--r--arch/s390/mm/hugetlbpage.c24
-rw-r--r--arch/s390/mm/pageattr.c6
-rw-r--r--arch/s390/mm/pgtable.c159
-rw-r--r--include/uapi/linux/kvm.h1
13 files changed, 756 insertions, 120 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d10944e619d3..cb8db4f9d097 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4391,6 +4391,22 @@ all such vmexits.
Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
+7.14 KVM_CAP_S390_HPAGE_1M
+
+Architectures: s390
+Parameters: none
+Returns: 0 on success, -EINVAL if hpage module parameter was not set
+ or cmma is enabled
+
+With this capability the KVM support for memory backing with 1m pages
+through hugetlbfs can be enabled for a VM. After the capability is
+enabled, cmma can't be enabled anymore and pfmfi and the storage key
+interpretation are disabled. If cmma has already been enabled or the
+hpage module parameter is not set to 1, -EINVAL is returned.
+
+While it is generally possible to create a huge page backed VM without
+this capability, the VM will not be able to run.
+
8. Other capabilities.
----------------------
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index e07cce88dfb0..fcbd638fb9f4 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -9,6 +9,14 @@
#ifndef _ASM_S390_GMAP_H
#define _ASM_S390_GMAP_H
+/* Generic bits for GMAP notification on DAT table entry changes. */
+#define GMAP_NOTIFY_SHADOW 0x2
+#define GMAP_NOTIFY_MPROT 0x1
+
+/* Status bits only for huge segment entries */
+#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */
+#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */
+
/**
* struct gmap_struct - guest address space
* @list: list head for the mm->context gmap list
@@ -132,4 +140,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
int gmap_mprotect_notify(struct gmap *, unsigned long start,
unsigned long len, int prot);
+void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
+ unsigned long gaddr, unsigned long vmaddr);
#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 9c5fc50204dd..2d1afa58a4b6 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -37,7 +37,10 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-#define arch_clear_hugepage_flags(page) do { } while (0)
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+ clear_bit(PG_arch_1, &page->flags);
+}
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index f5ff9dbad8ac..f31a15044c24 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -24,6 +24,8 @@ typedef struct {
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
+ /* The gmaps associated with this context are allowed to use huge pages. */
+ unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;
#define INIT_MM_CONTEXT(name) \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index d16bc79c30bb..0717ee76885d 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -32,6 +32,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
+ mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
case _REGION2_SIZE:
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5ab636089c60..0e7cb0dc9c33 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -268,8 +268,10 @@ static inline int is_module_addr(void *addr)
#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL
/* Bits in the segment table entry */
-#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
-#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
+#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
+#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
+#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL
+#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL
#define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */
#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */
#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */
@@ -1101,7 +1103,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
pte_t *sptep, pte_t *tptep, pte_t pte);
void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
-bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
+bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address,
+ pte_t *ptep);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
@@ -1116,6 +1119,10 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr,
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste);
+void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr);
+void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
+void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
+void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
/*
* Certain architectures need to do special things when PTEs
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0cc8a48fc732..91ad4a9425c0 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -172,6 +172,10 @@ static int nested;
module_param(nested, int, S_IRUGO);
MODULE_PARM_DESC(nested, "Nested virtualization support");
+/* allow 1m huge page guest backing, if !nested */
+static int hpage;
+module_param(hpage, int, 0444);
+MODULE_PARM_DESC(hpage, "1m huge page backing support");
/*
* For now we handle at most 16 double words as this is what the s390 base
@@ -475,6 +479,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_AIS_MIGRATION:
r = 1;
break;
+ case KVM_CAP_S390_HPAGE_1M:
+ r = 0;
+ if (hpage)
+ r = 1;
+ break;
case KVM_CAP_S390_MEM_OP:
r = MEM_OP_MAX_SIZE;
break;
@@ -511,19 +520,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
}
static void kvm_s390_sync_dirty_log(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ struct kvm_memory_slot *memslot)
{
+ int i;
gfn_t cur_gfn, last_gfn;
- unsigned long address;
+ unsigned long gaddr, vmaddr;
struct gmap *gmap = kvm->arch.gmap;
+ DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
- /* Loop over all guest pages */
+ /* Loop over all guest segments */
+ cur_gfn = memslot->base_gfn;
last_gfn = memslot->base_gfn + memslot->npages;
- for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
- address = gfn_to_hva_memslot(memslot, cur_gfn);
+ for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
+ gaddr = gfn_to_gpa(cur_gfn);
+ vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
+ if (kvm_is_error_hva(vmaddr))
+ continue;
+
+ bitmap_zero(bitmap, _PAGE_ENTRIES);
+ gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
+ for (i = 0; i < _PAGE_ENTRIES; i++) {
+ if (test_bit(i, bitmap))
+ mark_page_dirty(kvm, cur_gfn + i);
+ }
- if (test_and_clear_guest_dirty(gmap->mm, address))
- mark_page_dirty(kvm, cur_gfn);
if (fatal_signal_pending(current))
return;
cond_resched();
@@ -667,6 +687,27 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s",
r ? "(not available)" : "(success)");
break;
+ case KVM_CAP_S390_HPAGE_1M:
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ r = -EBUSY;
+ else if (!hpage || kvm->arch.use_cmma)
+ r = -EINVAL;
+ else {
+ r = 0;
+ kvm->mm->context.allow_gmap_hpage_1m = 1;
+ /*
+ * We might have to create fake 4k page
+ * tables. To avoid that the hardware works on
+ * stale PGSTEs, we emulate these instructions.
+ */
+ kvm->arch.use_skf = 0;
+ kvm->arch.use_pfmfi = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ VM_EVENT(kvm, 3, "ENABLE: CAP_S390_HPAGE %s",
+ r ? "(not available)" : "(success)");
+ break;
case KVM_CAP_S390_USER_STSI:
VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI");
kvm->arch.user_stsi = 1;
@@ -714,10 +755,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!sclp.has_cmma)
break;
- ret = -EBUSY;
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
mutex_lock(&kvm->lock);
- if (!kvm->created_vcpus) {
+ if (kvm->created_vcpus)
+ ret = -EBUSY;
+ else if (kvm->mm->context.allow_gmap_hpage_1m)
+ ret = -EINVAL;
+ else {
kvm->arch.use_cmma = 1;
/* Not compatible with cmma. */
kvm->arch.use_pfmfi = 0;
@@ -1514,6 +1558,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
uint8_t *keys;
uint64_t hva;
int srcu_idx, i, r = 0;
+ bool unlocked;
if (args->flags != 0)
return -EINVAL;
@@ -1538,9 +1583,11 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (r)
goto out;
+ i = 0;
down_read(&current->mm->mmap_sem);
srcu_idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < args->count; i++) {
+ while (i < args->count) {
+ unlocked = false;
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
@@ -1554,8 +1601,14 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
}
r = set_guest_storage_key(current->mm, hva, keys[i], 0);
- if (r)
- break;
+ if (r) {
+ r = fixup_user_fault(current, current->mm, hva,
+ FAULT_FLAG_WRITE, &unlocked);
+ if (r)
+ break;
+ }
+ if (!r)
+ i++;
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
up_read(&current->mm->mmap_sem);
@@ -4141,6 +4194,11 @@ static int __init kvm_s390_init(void)
return -ENODEV;
}
+ if (nested && hpage) {
+ pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently");
+ return -EINVAL;
+ }
+
for (i = 0; i < 16; i++)
kvm_s390_fac_base[i] |=
S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 63285b14ff31..d68f10441a16 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -244,9 +244,10 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
static int handle_iske(struct kvm_vcpu *vcpu)
{
- unsigned long addr;
+ unsigned long gaddr, vmaddr;
unsigned char key;
int reg1, reg2;
+ bool unlocked;
int rc;
vcpu->stat.instruction_iske++;
@@ -260,18 +261,28 @@ static int handle_iske(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
- addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
- addr = kvm_s390_logical_to_effective(vcpu, addr);
- addr = kvm_s390_real_to_abs(vcpu, addr);
- addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
- if (kvm_is_error_hva(addr))
+ gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
+ gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
+ vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
+ if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
+retry:
+ unlocked = false;
down_read(&current->mm->mmap_sem);
- rc = get_guest_storage_key(current->mm, addr, &key);
- up_read(&current->mm->mmap_sem);
+ rc = get_guest_storage_key(current->mm, vmaddr, &key);
+
+ if (rc) {
+ rc = fixup_user_fault(current, current->mm, vmaddr,
+ FAULT_FLAG_WRITE, &unlocked);
+ if (!rc) {
+ up_read(&current->mm->mmap_sem);
+ goto retry;
+ }
+ }
if (rc)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ up_read(&current->mm->mmap_sem);
vcpu->run->s.regs.gprs[reg1] &= ~0xff;
vcpu->run->s.regs.gprs[reg1] |= key;
return 0;
@@ -279,8 +290,9 @@ static int handle_iske(struct kvm_vcpu *vcpu)
static int handle_rrbe(struct kvm_vcpu *vcpu)
{
- unsigned long addr;
+ unsigned long vmaddr, gaddr;
int reg1, reg2;
+ bool unlocked;
int rc;
vcpu->stat.instruction_rrbe++;
@@ -294,19 +306,27 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
- addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
- addr = kvm_s390_logical_to_effective(vcpu, addr);
- addr = kvm_s390_real_to_abs(vcpu, addr);
- addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
- if (kvm_is_error_hva(addr))
+ gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
+ gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
+ vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
+ if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
+retry:
+ unlocked = false;
down_read(&current->mm->mmap_sem);
- rc = reset_guest_reference_bit(current->mm, addr);
- up_read(&current->mm->mmap_sem);
+ rc = reset_guest_reference_bit(current->mm, vmaddr);
+ if (rc < 0) {
+ rc = fixup_user_fault(current, current->mm, vmaddr,
+ FAULT_FLAG_WRITE, &unlocked);
+ if (!rc) {
+ up_read(&current->mm->mmap_sem);
+ goto retry;
+ }
+ }
if (rc < 0)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
+ up_read(&current->mm->mmap_sem);
kvm_s390_set_psw_cc(vcpu, rc);
return 0;
}
@@ -321,6 +341,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
unsigned long start, end;
unsigned char key, oldkey;
int reg1, reg2;
+ bool unlocked;
int rc;
vcpu->stat.instruction_sske++;
@@ -353,19 +374,28 @@ static int handle_sske(struct kvm_vcpu *vcpu)
}
while (start != end) {
- unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+ unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+ unlocked = false;
- if (kvm_is_error_hva(addr))
+ if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
down_read(&current->mm->mmap_sem);
- rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+ rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
m3 & SSKE_NQ, m3 & SSKE_MR,
m3 & SSKE_MC);
- up_read(&current->mm->mmap_sem);
- if (rc < 0)
+
+ if (rc < 0) {
+ rc = fixup_user_fault(current, current->mm, vmaddr,
+ FAULT_FLAG_WRITE, &unlocked);
+ rc = !rc ? -EAGAIN : rc;
+ }
+ if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- start += PAGE_SIZE;
+
+ up_read(&current->mm->mmap_sem);
+ if (rc >= 0)
+ start += PAGE_SIZE;
}
if (m3 & (SSKE_MC | SSKE_MR)) {
@@ -946,11 +976,12 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
while (start != end) {
- unsigned long useraddr;
+ unsigned long vmaddr;
+ bool unlocked = false;
/* Translate guest address to host address */
- useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
- if (kvm_is_error_hva(useraddr))
+ vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+ if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
@@ -964,14 +995,20 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
down_read(&current->mm->mmap_sem);
- rc = cond_set_guest_storage_key(current->mm, useraddr,
+ rc = cond_set_guest_storage_key(current->mm, vmaddr,
key, NULL, nq, mr, mc);
- up_read(&current->mm->mmap_sem);
- if (rc < 0)
+ if (rc < 0) {
+ rc = fixup_user_fault(current, current->mm, vmaddr,
+ FAULT_FLAG_WRITE, &unlocked);
+ rc = !rc ? -EAGAIN : rc;
+ }
+ if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- }
- start += PAGE_SIZE;
+ up_read(&current->mm->mmap_sem);
+ if (rc >= 0)
+ start += PAGE_SIZE;
+ }
}
if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) {
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index bc56ec8abcf7..bb44990c8212 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2,8 +2,10 @@
/*
* KVM guest address space mapping code
*
- * Copyright IBM Corp. 2007, 2016
+ * Copyright IBM Corp. 2007, 2016, 2018
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * David Hildenbrand <david@redhat.com>
+ * Janosch Frank <frankja@linux.vnet.ibm.com>
*/
#include <linux/kernel.h>
@@ -521,6 +523,9 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
rcu_read_unlock();
}
+static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
+ unsigned long gaddr);
+
/**
* gmap_link - set up shadow page tables to connect a host to a guest address
* @gmap: pointer to guest mapping meta data structure
@@ -541,6 +546,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ u64 unprot;
int rc;
BUG_ON(gmap_is_shadow(gmap));
@@ -584,8 +590,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
- /* large pmds cannot yet be handled */
- if (pmd_large(*pmd))
+ /* Are we allowed to use huge pages? */
+ if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
rc = radix_tree_preload(GFP_KERNEL);
@@ -596,10 +602,22 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (*table == _SEGMENT_ENTRY_EMPTY) {
rc = radix_tree_insert(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT, table);
- if (!rc)
- *table = pmd_val(*pmd);
- } else
- rc = 0;
+ if (!rc) {
+ if (pmd_large(*pmd)) {
+ *table = (pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
+ | _SEGMENT_ENTRY_GMAP_UC;
+ } else
+ *table = pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS;
+ }
+ } else if (*table & _SEGMENT_ENTRY_PROTECT &&
+ !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
+ unprot = (u64)*table;
+ unprot &= ~_SEGMENT_ENTRY_PROTECT;
+ unprot |= _SEGMENT_ENTRY_GMAP_UC;
+ gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
+ }
spin_unlock(&gmap->guest_table_lock);
spin_unlock(ptl);
radix_tree_preload_end();
@@ -690,6 +708,12 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
vmaddr |= gaddr & ~PMD_MASK;
/* Find vma in the parent mm */
vma = find_vma(gmap->mm, vmaddr);
+ /*
+ * We do not discard pages that are backed by
+ * hugetlbfs, so we don't have to refault them.
+ */
+ if (vma && is_vm_hugetlb_page(vma))
+ continue;
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
zap_page_range(vma, vmaddr, size);
}
@@ -864,7 +888,128 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
*/
static void gmap_pte_op_end(spinlock_t *ptl)
{
- spin_unlock(ptl);
+ if (ptl)
+ spin_unlock(ptl);
+}
+
+/**
+ * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
+ * and return the pmd pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ *
+ * Returns a pointer to the pmd for a guest address, or NULL
+ */
+static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
+{
+ pmd_t *pmdp;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ spin_lock(&gmap->guest_table_lock);
+ pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
+
+ if (!pmdp || pmd_none(*pmdp)) {
+ spin_unlock(&gmap->guest_table_lock);
+ return NULL;
+ }
+
+ /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
+ if (!pmd_large(*pmdp))
+ spin_unlock(&gmap->guest_table_lock);
+ return pmdp;
+}
+
+/**
+ * gmap_pmd_op_end - release the guest_table_lock if needed
+ * @gmap: pointer to the guest mapping meta data structure
+ * @pmdp: pointer to the pmd
+ */
+static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
+{
+ if (pmd_large(*pmdp))
+ spin_unlock(&gmap->guest_table_lock);
+}
+
+/*
+ * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
+ * @pmdp: pointer to the pmd to be protected
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: notification bits to set
+ *
+ * Returns:
+ * 0 if successfully protected
+ * -EAGAIN if a fixup is needed
+ * -EINVAL if unsupported notifier bits have been specified
+ *
+ * Expected to be called with sg->mm->mmap_sem in read and
+ * guest_table_lock held.
+ */
+static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
+ pmd_t *pmdp, int prot, unsigned long bits)
+{
+ int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
+ int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
+ pmd_t new = *pmdp;
+
+ /* Fixup needed */
+ if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
+ return -EAGAIN;
+
+ if (prot == PROT_NONE && !pmd_i) {
+ pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
+ }
+
+ if (prot == PROT_READ && !pmd_p) {
+ pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
+ pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
+ }
+
+ if (bits & GMAP_NOTIFY_MPROT)
+ pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
+
+ /* Shadow GMAP protection needs split PMDs */
+ if (bits & GMAP_NOTIFY_SHADOW)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * gmap_protect_pte - remove access rights to memory and set pgste bits
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @pmdp: pointer to the pmd associated with the pte
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: notification bits to set
+ *
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EAGAIN if a fixup is needed.
+ *
+ * Expected to be called with sg->mm->mmap_sem in read
+ */
+static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
+ pmd_t *pmdp, int prot, unsigned long bits)
+{
+ int rc;
+ pte_t *ptep;
+ spinlock_t *ptl = NULL;
+ unsigned long pbits = 0;
+
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+ return -EAGAIN;
+
+ ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+
+ pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
+ pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
+ /* Protect and unlock. */
+ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
+ gmap_pte_op_end(ptl);
+ return rc;
}
/*
@@ -883,30 +1028,45 @@ static void gmap_pte_op_end(spinlock_t *ptl)
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
unsigned long len, int prot, unsigned long bits)
{
- unsigned long vmaddr;
- spinlock_t *ptl;
- pte_t *ptep;
+ unsigned long vmaddr, dist;
+ pmd_t *pmdp;
int rc;
BUG_ON(gmap_is_shadow(gmap));
while (len) {
rc = -EAGAIN;
- ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
- if (ptep) {
- rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
- gmap_pte_op_end(ptl);
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (pmdp) {
+ if (!pmd_large(*pmdp)) {
+ rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
+ bits);
+ if (!rc) {
+ len -= PAGE_SIZE;
+ gaddr += PAGE_SIZE;
+ }
+ } else {
+ rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
+ bits);
+ if (!rc) {
+ dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
+ len = len < dist ? 0 : len - dist;
+ gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
+ }
+ }
+ gmap_pmd_op_end(gmap, pmdp);
}
if (rc) {
+ if (rc == -EINVAL)
+ return rc;
+
+ /* -EAGAIN, fixup of userspace mm and gmap */
vmaddr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(vmaddr))
return vmaddr;
rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
if (rc)
return rc;
- continue;
}
- gaddr += PAGE_SIZE;
- len -= PAGE_SIZE;
}
return 0;
}
@@ -935,7 +1095,7 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
- rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+ rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
up_read(&gmap->mm->mmap_sem);
return rc;
}
@@ -1474,6 +1634,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
unsigned long limit;
int rc;
+ BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
BUG_ON(gmap_is_shadow(parent));
spin_lock(&parent->shadow_lock);
sg = gmap_find_shadow(parent, asce, edat_level);
@@ -1526,7 +1687,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
down_read(&parent->mm->mmap_sem);
rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
- PROT_READ, PGSTE_VSIE_BIT);
+ PROT_READ, GMAP_NOTIFY_SHADOW);
up_read(&parent->mm->mmap_sem);
spin_lock(&parent->shadow_lock);
new->initialized = true;
@@ -2092,6 +2253,225 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
}
EXPORT_SYMBOL_GPL(ptep_notify);
+static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
+ unsigned long gaddr)
+{
+ pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
+}
+
+/**
+ * gmap_pmdp_xchg - exchange a gmap pmd with another
+ * @gmap: pointer to the guest address space structure
+ * @pmdp: pointer to the pmd entry
+ * @new: replacement entry
+ * @gaddr: the affected guest address
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
+ unsigned long gaddr)
+{
+ gaddr &= HPAGE_MASK;
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
+ IDTE_GLOBAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
+ else
+ __pmdp_csp(pmdp);
+ *pmdp = new;
+}
+
+static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
+ int purge)
+{
+ pmd_t *pmdp;
+ struct gmap *gmap;
+ unsigned long gaddr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (pmdp) {
+ gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC));
+ if (purge)
+ __pmdp_csp(pmdp);
+ pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
+ * flushing
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
+{
+ gmap_pmdp_clear(mm, vmaddr, 0);
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
+
+/**
+ * gmap_pmdp_csp - csp all affected guest pmd entries
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
+{
+ gmap_pmdp_clear(mm, vmaddr, 1);
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
+
+/**
+ * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
+{
+ unsigned long *entry, gaddr;
+ struct gmap *gmap;
+ pmd_t *pmdp;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ entry = radix_tree_delete(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (entry) {
+ pmdp = (pmd_t *)entry;
+ gaddr = __gmap_segment_gaddr(entry);
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC));
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
+ gmap->asce, IDTE_LOCAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
+ *entry = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
+
+/**
+ * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
+{
+ unsigned long *entry, gaddr;
+ struct gmap *gmap;
+ pmd_t *pmdp;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ entry = radix_tree_delete(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (entry) {
+ pmdp = (pmd_t *)entry;
+ gaddr = __gmap_segment_gaddr(entry);
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC));
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
+ gmap->asce, IDTE_GLOBAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
+ else
+ __pmdp_csp(pmdp);
+ *entry = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
+
+/**
+ * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
+ * @gmap: pointer to guest address space
+ * @pmdp: pointer to the pmd to be tested
+ * @gaddr: virtual address in the guest address space
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
+ unsigned long gaddr)
+{
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+ return false;
+
+ /* Already protected memory, which did not change is clean */
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
+ !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
+ return false;
+
+ /* Clear UC indication and reset protection */
+ pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
+ return true;
+}
+
+/**
+ * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
+ * @gmap: pointer to guest address space
+ * @bitmap: dirty bitmap for this pmd
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: virtual address in the host address space
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
+ unsigned long gaddr, unsigned long vmaddr)
+{
+ int i;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ spinlock_t *ptl;
+
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (!pmdp)
+ return;
+
+ if (pmd_large(*pmdp)) {
+ if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
+ bitmap_fill(bitmap, _PAGE_ENTRIES);
+ } else {
+ for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
+ ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
+ if (!ptep)
+ continue;
+ if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
+ set_bit(i, bitmap);
+ spin_unlock(ptl);
+ }
+ }
+ gmap_pmd_op_end(gmap, pmdp);
+}
+EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
+
static inline void thp_split_mm(struct mm_struct *mm)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2168,17 +2548,45 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
*/
-static int __s390_enable_skey(pte_t *pte, unsigned long addr,
- unsigned long next, struct mm_walk *walk)
+static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
{
/* Clear storage key */
ptep_zap_key(walk->mm, addr, pte);
return 0;
}
+static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
+ unsigned long hmask, unsigned long next,
+ struct mm_walk *walk)
+{
+ pmd_t *pmd = (pmd_t *)pte;
+ unsigned long start, end;
+ struct page *page = pmd_page(*pmd);
+
+ /*
+ * The write check makes sure we do not set a key on shared
+ * memory. This is needed as the walker does not differentiate
+ * between actual guest memory and the process executable or
+ * shared libraries.
+ */
+ if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
+ !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
+ return 0;
+
+ start = pmd_val(*pmd) & HPAGE_MASK;
+ end = start + HPAGE_SIZE - 1;
+ __storage_key_init_range(start, end);
+ set_bit(PG_arch_1, &page->flags);
+ return 0;
+}
+
int s390_enable_skey(void)
{
- struct mm_walk walk = { .pte_entry = __s390_enable_skey };
+ struct mm_walk walk = {
+ .hugetlb_entry = __s390_enable_skey_hugetlb,
+ .pte_entry = __s390_enable_skey_pte,
+ };
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int rc = 0;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index e804090f4470..b0246c705a19 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -123,6 +123,29 @@ static inline pte_t __rste_to_pte(unsigned long rste)
return pte;
}
+static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
+{
+ struct page *page;
+ unsigned long size, paddr;
+
+ if (!mm_uses_skeys(mm) ||
+ rste & _SEGMENT_ENTRY_INVALID)
+ return;
+
+ if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
+ page = pud_page(__pud(rste));
+ size = PUD_SIZE;
+ paddr = rste & PUD_MASK;
+ } else {
+ page = pmd_page(__pmd(rste));
+ size = PMD_SIZE;
+ paddr = rste & PMD_MASK;
+ }
+
+ if (!test_and_set_bit(PG_arch_1, &page->flags))
+ __storage_key_init_range(paddr, paddr + size - 1);
+}
+
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -137,6 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
else
rste |= _SEGMENT_ENTRY_LARGE;
+ clear_huge_pte_skeys(mm, rste);
pte_val(*ptep) = rste;
}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index c44171588d08..f8c6faab41f4 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -14,7 +14,7 @@
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
{
- asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0"
+ asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0"
: [addr] "+a" (addr) : [skey] "d" (skey));
return addr;
}
@@ -23,8 +23,6 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
{
unsigned long boundary, size;
- if (!PAGE_DEFAULT_KEY)
- return;
while (start < end) {
if (MACHINE_HAS_EDAT1) {
/* set storage keys for a 1MB frame */
@@ -37,7 +35,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
continue;
}
}
- page_set_storage_key(start, PAGE_DEFAULT_KEY, 0);
+ page_set_storage_key(start, PAGE_DEFAULT_KEY, 1);
start += PAGE_SIZE;
}
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 301e466e4263..f2cc7da473e4 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -347,18 +347,27 @@ static inline void pmdp_idte_local(struct mm_struct *mm,
mm->context.asce, IDTE_LOCAL);
else
__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_idte_local(mm, addr);
}
static inline void pmdp_idte_global(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- if (MACHINE_HAS_TLB_GUEST)
+ if (MACHINE_HAS_TLB_GUEST) {
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_GLOBAL);
- else if (MACHINE_HAS_IDTE)
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_idte_global(mm, addr);
+ } else if (MACHINE_HAS_IDTE) {
__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
- else
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_idte_global(mm, addr);
+ } else {
__pmdp_csp(pmdp);
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_csp(mm, addr);
+ }
}
static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
@@ -392,6 +401,8 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
cpumask_of(smp_processor_id()))) {
pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
mm->context.flush_mm = 1;
+ if (mm_has_pgste(mm))
+ gmap_pmdp_invalidate(mm, addr);
} else {
pmdp_idte_global(mm, addr, pmdp);
}
@@ -399,6 +410,24 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
return old;
}
+static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, addr);
+ return pmd;
+}
+
pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t new)
{
@@ -693,40 +722,14 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
/*
* Test and reset if a guest page is dirty
*/
-bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
+bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
{
- spinlock_t *ptl;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
pgste_t pgste;
- pte_t *ptep;
pte_t pte;
bool dirty;
int nodat;
- pgd = pgd_offset(mm, addr);
- p4d = p4d_alloc(mm, pgd, addr);
- if (!p4d)
- return false;
- pud = pud_alloc(mm, p4d, addr);
- if (!pud)
- return false;
- pmd = pmd_alloc(mm, pud, addr);
- if (!pmd)
- return false;
- /* We can't run guests backed by huge pages, but userspace can
- * still set them up and then try to migrate them without any
- * migration support.
- */
- if (pmd_large(*pmd))
- return true;
-
- ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl);
- if (unlikely(!ptep))
- return false;
-
pgste = pgste_get_lock(ptep);
dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
pgste_val(pgste) &= ~PGSTE_UC_BIT;
@@ -742,21 +745,43 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
*ptep = pte;
}
pgste_set_unlock(ptep, pgste);
-
- spin_unlock(ptl);
return dirty;
}
-EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty);
+EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq)
{
- unsigned long keyul;
+ unsigned long keyul, paddr;
spinlock_t *ptl;
pgste_t old, new;
+ pmd_t *pmdp;
pte_t *ptep;
- ptep = get_locked_pte(mm, addr, &ptl);
+ pmdp = pmd_alloc_map(mm, addr);
+ if (unlikely(!pmdp))
+ return -EFAULT;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (!pmd_present(*pmdp)) {
+ spin_unlock(ptl);
+ return -EFAULT;
+ }
+
+ if (pmd_large(*pmdp)) {
+ paddr = pmd_val(*pmdp) & HPAGE_MASK;
+ paddr |= addr & ~HPAGE_MASK;
+ /*
+ * Huge pmds need quiescing operations, they are
+ * always mapped.
+ */
+ page_set_storage_key(paddr, key, 1);
+ spin_unlock(ptl);
+ return 0;
+ }
+ spin_unlock(ptl);
+
+ ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -767,14 +792,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
- unsigned long address, bits, skey;
+ unsigned long bits, skey;
- address = pte_val(*ptep) & PAGE_MASK;
- skey = (unsigned long) page_get_storage_key(address);
+ paddr = pte_val(*ptep) & PAGE_MASK;
+ skey = (unsigned long) page_get_storage_key(paddr);
bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
/* Set storage key ACC and FP */
- page_set_storage_key(address, skey, !nq);
+ page_set_storage_key(paddr, skey, !nq);
/* Merge host changed & referenced into pgste */
pgste_val(new) |= bits << 52;
}
@@ -830,11 +855,32 @@ EXPORT_SYMBOL(cond_set_guest_storage_key);
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
spinlock_t *ptl;
+ unsigned long paddr;
pgste_t old, new;
+ pmd_t *pmdp;
pte_t *ptep;
int cc = 0;
- ptep = get_locked_pte(mm, addr, &ptl);
+ pmdp = pmd_alloc_map(mm, addr);
+ if (unlikely(!pmdp))
+ return -EFAULT;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (!pmd_present(*pmdp)) {
+ spin_unlock(ptl);
+ return -EFAULT;
+ }
+
+ if (pmd_large(*pmdp)) {
+ paddr = pmd_val(*pmdp) & HPAGE_MASK;
+ paddr |= addr & ~HPAGE_MASK;
+ cc = page_reset_referenced(paddr);
+ spin_unlock(ptl);
+ return cc;
+ }
+ spin_unlock(ptl);
+
+ ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -843,7 +889,8 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
pgste_val(new) &= ~PGSTE_GR_BIT;
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
- cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+ paddr = pte_val(*ptep) & PAGE_MASK;
+ cc = page_reset_referenced(paddr);
/* Merge real referenced bit into host-set */
pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
}
@@ -862,18 +909,42 @@ EXPORT_SYMBOL(reset_guest_reference_bit);
int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char *key)
{
+ unsigned long paddr;
spinlock_t *ptl;
pgste_t pgste;
+ pmd_t *pmdp;
pte_t *ptep;
- ptep = get_locked_pte(mm, addr, &ptl);
+ pmdp = pmd_alloc_map(mm, addr);
+ if (unlikely(!pmdp))
+ return -EFAULT;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (!pmd_present(*pmdp)) {
+ /* Not yet mapped memory has a zero key */
+ spin_unlock(ptl);
+ *key = 0;
+ return 0;
+ }
+
+ if (pmd_large(*pmdp)) {
+ paddr = pmd_val(*pmdp) & HPAGE_MASK;
+ paddr |= addr & ~HPAGE_MASK;
+ *key = page_get_storage_key(paddr);
+ spin_unlock(ptl);
+ return 0;
+ }
+ spin_unlock(ptl);
+
+ ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
if (unlikely(!ptep))
return -EFAULT;
pgste = pgste_get_lock(ptep);
*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ paddr = pte_val(*ptep) & PAGE_MASK;
if (!(pte_val(*ptep) & _PAGE_INVALID))
- *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ *key = page_get_storage_key(paddr);
/* Reflect guest's logical view, not physical */
*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
pgste_set_unlock(ptep, pgste);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b6270a3b38e9..b955b986b341 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_GET_MSR_FEATURES 153
#define KVM_CAP_HYPERV_EVENTFD 154
#define KVM_CAP_HYPERV_TLBFLUSH 155
+#define KVM_CAP_S390_HPAGE_1M 156
#ifdef KVM_CAP_IRQ_ROUTING