Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu/mmu.c  176
1 file changed, 125 insertions(+), 51 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b6f96d47e596..835426254e76 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -22,6 +22,7 @@
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "smm.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"
@@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
}
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- if (sp->lpage_disallowed)
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
return;
++kvm->stat.nx_lpage_splits;
- list_add_tail(&sp->lpage_disallowed_link,
- &kvm->arch.lpage_disallowed_mmu_pages);
- sp->lpage_disallowed = true;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
--kvm->stat.nx_lpage_splits;
- sp->lpage_disallowed = false;
- list_del(&sp->lpage_disallowed_link);
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp);
}
static struct kvm_memory_slot *
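The track/untrack pair above relies on a standard kernel list idiom: the per-page link is initialized to point at itself and removed with list_del_init(), so list_empty() on the element's own possible_nx_huge_page_link doubles as the "is this page already tracked?" test and no separate boolean is needed. Below is a minimal user-space sketch of that idiom, assuming only the C standard library; the helpers and types are reimplemented here purely for illustration and are not the kernel's <linux/list.h>.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Tiny stand-ins for the kernel's circular doubly-linked list helpers. */
struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del_init(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	INIT_LIST_HEAD(entry);	/* self-linked again, so list_empty() holds */
}

struct fake_sp {
	struct list_head link;	/* analogue of possible_nx_huge_page_link */
};

int main(void)
{
	struct list_head possible_pages;
	struct fake_sp sp;

	INIT_LIST_HEAD(&possible_pages);
	INIT_LIST_HEAD(&sp.link);		/* done once at allocation */

	assert(list_empty(&sp.link));		/* not tracked yet */
	list_add_tail(&sp.link, &possible_pages);
	assert(!list_empty(&sp.link));		/* tracked, no extra flag */

	list_del_init(&sp.link);		/* untrack */
	assert(list_empty(&sp.link));		/* safe to re-check or re-add */

	printf("list_empty() on the element doubles as the tracked flag\n");
	return 0;
}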
@@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt)
u64 *pos;
u64 *end;
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
if (is_shadow_present_pte(*pos)) {
printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
@@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(ent);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (sp->role.invalid)
return true;
- /* TDP MMU pages due not use the MMU generation. */
+ /* TDP MMU pages do not use the MMU generation. */
return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
/*
* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
* depends on valid pages being added to the head of the list. See
@@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(*sptep);
if (child->role.access == direct_access)
return;
@@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
} else {
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, spte);
/*
@@ -2487,8 +2515,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
zapped_root = !is_obsolete_sp(kvm, sp);
}
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
sp->role.invalid = 1;
@@ -2811,7 +2839,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -3085,7 +3113,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
if (cur_level > PG_LEVEL_4K &&
cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
- !is_large_pte(spte)) {
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
/*
* A small SPTE exists for this pfn, but FNAME(fetch)
* and __direct_map would like to create a large PTE
@@ -3127,9 +3156,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->is_tdp && fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -3149,8 +3178,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
+ if (is_sigpending_pfn(pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
/*
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
@@ -3172,7 +3206,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau
{
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(fault->pfn)))
- return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
+ return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
if (unlikely(!fault->slot)) {
gva_t gva = fault->is_tdp ? 0 : fault->addr;
@@ -3423,7 +3457,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ sp = spte_to_child_sp(*root_hpa);
if (WARN_ON(!sp))
return;
@@ -3908,8 +3946,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->pae_root[i];
if (IS_VALID_PAE_ROOT(root)) {
- root &= SPTE_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
+ sp = spte_to_child_sp(root);
mmu_sync_children(vcpu, sp, true);
}
}
@@ -4170,7 +4207,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
&fault->hva);
if (!async)
@@ -4187,7 +4224,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
}
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
fault->write, &fault->map_writable,
&fault->hva);
return RET_PF_CONTINUE;
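The two __gfn_to_pfn_memslot() calls in this function now differ in one flag: the first, async-friendly attempt does not allow the fault to be interrupted, while the slow retry above does, so gup may bail on a pending non-fatal signal once it is committed to waiting for IO. A hedged user-space sketch of the resulting control flow, where the slow path returns -EINTR and the caller exits to handle the signal before retrying; every name below is invented for illustration and none of this is KVM code.

#include <errno.h>
#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t pending;	/* stand-in for signal_pending() */

static void on_signal(int sig) { (void)sig; pending = 1; }

/* Slow, IO-bound path: allowed to bail if a signal arrived meanwhile. */
static int faultin_pfn_interruptible(void)
{
	if (pending)
		return -EINTR;	/* let the caller exit and handle the signal */
	/* ... waiting for the backing page would happen here ... */
	return 0;
}

int main(void)
{
	signal(SIGINT, on_signal);

	int r = faultin_pfn_interruptible();
	if (r == -EINTR) {
		printf("fault interrupted; handle the signal, then retry\n");
		return 1;
	}
	printf("fault completed\n");
	return 0;
}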
@@ -5972,7 +6014,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
- INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
r = kvm_mmu_init_tdp_mmu(kvm);
@@ -6657,7 +6699,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
}
mutex_unlock(&kvm_lock);
}
@@ -6789,7 +6831,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
mutex_unlock(&kvm_lock);
}
@@ -6797,9 +6839,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
return err;
}
-static void kvm_recover_nx_lpages(struct kvm *kvm)
+static void kvm_recover_nx_huge_pages(struct kvm *kvm)
{
unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+ struct kvm_memory_slot *slot;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -6820,24 +6863,55 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
- if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ if (list_empty(&kvm->arch.possible_nx_huge_pages))
break;
/*
* We use a separate list instead of just using active_mmu_pages
- * because the number of lpage_disallowed pages is expected to
- * be relatively small compared to the total.
+ * because the number of shadow pages that can be replaced with an
+ * NX huge page is expected to be relatively small compared to
+ * the total number of shadow pages. And because the TDP MMU
+ * doesn't use active_mmu_pages.
*/
- sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
struct kvm_mmu_page,
- lpage_disallowed_link);
- WARN_ON_ONCE(!sp->lpage_disallowed);
- if (is_tdp_mmu_page(sp)) {
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ /*
+ * Unaccount and do not attempt to recover any NX Huge Pages
+ * that are being dirty tracked, as they would just be faulted
+ * back in as 4KiB pages. The NX Huge Pages in this slot will be
+ * recovered, along with all the other huge pages in the slot,
+ * when dirty logging is disabled.
+ *
+ * Since gfn_to_memslot() is relatively expensive, it helps to
+ * skip it if the test cannot possibly return true.  On the
+ * other hand, if any memslot has logging enabled, chances are
+ * good that all of them do, in which case unaccount_nx_huge_page()
+ * is much cheaper than zapping the page.
+ *
+ * If a memslot update is in progress, reading an incorrect value
+ * of kvm->nr_memslots_dirty_logging is not a problem: if it is
+ * becoming zero, gfn_to_memslot() will be done unnecessarily; if
+ * it is becoming nonzero, the page will be zapped unnecessarily.
+ * Either way, this only affects efficiency in racy situations,
+ * and not correctness.
+ */
+ slot = NULL;
+ if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ WARN_ON_ONCE(!slot);
+ }
+
+ if (slot && kvm_slot_dirty_track_enabled(slot))
+ unaccount_nx_huge_page(kvm, sp);
+ else if (is_tdp_mmu_page(sp))
flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
- } else {
+ else
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- WARN_ON_ONCE(sp->lpage_disallowed);
- }
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
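The work done per wakeup is bounded by the to_zap computation near the top of this loop: with recovery ratio R, at most DIV_ROUND_UP(tracked, R) pages are reclaimed per pass before the worker sleeps again. A small standalone sketch of that arithmetic with made-up inputs (the counts are not taken from the patch):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long nx_lpage_splits = 1000;	/* hypothetical tracked count */
	unsigned int ratio = 60;		/* hypothetical recovery ratio */
	unsigned long to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;

	/* (1000 + 59) / 60 == 17 pages reclaimed on this pass */
	printf("to_zap = %lu\n", to_zap);
	return 0;
}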
@@ -6857,7 +6931,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
-static long get_nx_lpage_recovery_timeout(u64 start_time)
+static long get_nx_huge_page_recovery_timeout(u64 start_time)
{
bool enabled;
uint period;
@@ -6868,19 +6942,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time)
: MAX_SCHEDULE_TIMEOUT;
}
-static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
{
u64 start_time;
long remaining_time;
while (true) {
start_time = get_jiffies_64();
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && remaining_time > 0) {
schedule_timeout(remaining_time);
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -6889,7 +6963,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
if (kthread_should_stop())
return 0;
- kvm_recover_nx_lpages(kvm);
+ kvm_recover_nx_huge_pages(kvm);
}
}
@@ -6897,17 +6971,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
{
int err;
- err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
"kvm-nx-lpage-recovery",
- &kvm->arch.nx_lpage_recovery_thread);
+ &kvm->arch.nx_huge_page_recovery_thread);
if (!err)
- kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+ kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
return err;
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
- if (kvm->arch.nx_lpage_recovery_thread)
- kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}