From 19ada5c4b6170bbc7ac4f2f38dba0068fdc7755a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 27 Jul 2010 11:21:18 +0800
Subject: KVM: MMU: remove valueless output message

After commit 53383eaad08d, the '*spte' has updated before call
rmap_remove()(in most case it's 'shadow_trap_nonpresent_pte'), so
remove this information from error message

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..82f7622c17d3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -645,18 +645,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
-		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
 		BUG();
 	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p 1->0\n", spte);
 		if ((u64 *)*rmapp != spte) {
-			printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-			       spte, *spte);
+			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
 			BUG();
 		}
 		*rmapp = 0;
 	} else {
-		rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+		rmap_printk("rmap_remove:  %p many->many\n", spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
 		prev_desc = NULL;
 		while (desc) {
@@ -670,7 +669,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 			prev_desc = desc;
 			desc = desc->more;
 		}
-		pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
+		pr_err("rmap_remove: %p many->many\n", spte);
 		BUG();
 	}
 }
-- 
cgit v1.2.3


From 251464c464cf7df7d6d548f1065f49a3ecd08118 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 2 Aug 2010 16:12:08 +0800
Subject: KVM: MMU: using kvm_set_pfn_accessed() instead of
 mark_page_accessed()

It's a small cleanup that using using kvm_set_pfn_accessed() instead
of mark_page_accessed()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 82f7622c17d3..e430a383ad15 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -309,7 +309,7 @@ static void update_spte(u64 *sptep, u64 new_spte)
 	else {
 		old_spte = __xchg_spte(sptep, new_spte);
 		if (old_spte & shadow_accessed_mask)
-			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+			kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 	}
 }
 
-- 
cgit v1.2.3


From 8672b7217a234c41d425a63b171af809e1169842 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 2 Aug 2010 16:14:04 +0800
Subject: KVM: MMU: move bits lost judgement into a separate function

Introduce spte_has_volatile_bits() function to judge whether spte
bits will miss, it's more readable and can help us to cleanup code
later

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e430a383ad15..c07b9a200bc8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -299,6 +299,20 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 #endif
 }
 
+static bool spte_has_volatile_bits(u64 spte)
+{
+	if (!shadow_accessed_mask)
+		return false;
+
+	if (!is_shadow_present_pte(spte))
+		return false;
+
+	if (spte & shadow_accessed_mask)
+		return false;
+
+	return true;
+}
+
 static void update_spte(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte;
@@ -679,14 +693,14 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 	pfn_t pfn;
 	u64 old_spte = *sptep;
 
-	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
-	      old_spte & shadow_accessed_mask) {
+	if (!spte_has_volatile_bits(old_spte))
 		__set_spte(sptep, new_spte);
-	} else
+	else
 		old_spte = __xchg_spte(sptep, new_spte);
 
 	if (!is_rmap_spte(old_spte))
 		return;
+
 	pfn = spte_to_pfn(old_spte);
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-- 
cgit v1.2.3


From 4132779b1718f066ec2d06a71c8958039865cd49 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 2 Aug 2010 16:15:08 +0800
Subject: KVM: MMU: mark page dirty only when page is really written

Mark page dirty only when this page is really written, it's more exacter,
and also can fix dirty page marking in speculation path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c07b9a200bc8..ff95d418750d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -307,24 +307,42 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (!is_shadow_present_pte(spte))
 		return false;
 
-	if (spte & shadow_accessed_mask)
+	if ((spte & shadow_accessed_mask) &&
+	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
 		return false;
 
 	return true;
 }
 
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+	return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
 static void update_spte(u64 *sptep, u64 new_spte)
 {
-	u64 old_spte;
+	u64 mask, old_spte = *sptep;
+
+	WARN_ON(!is_rmap_spte(new_spte));
 
-	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
-	      !is_rmap_spte(*sptep))
+	new_spte |= old_spte & shadow_dirty_mask;
+
+	mask = shadow_accessed_mask;
+	if (is_writable_pte(old_spte))
+		mask |= shadow_dirty_mask;
+
+	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
 		__set_spte(sptep, new_spte);
-	else {
+	else
 		old_spte = __xchg_spte(sptep, new_spte);
-		if (old_spte & shadow_accessed_mask)
-			kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-	}
+
+	if (!shadow_accessed_mask)
+		return;
+
+	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -704,7 +722,7 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 	pfn = spte_to_pfn(old_spte);
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(old_spte))
+	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
 		kvm_set_pfn_dirty(pfn);
 }
 
@@ -759,13 +777,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
-	if (write_protected) {
-		pfn_t pfn;
-
-		spte = rmap_next(kvm, rmapp, NULL);
-		pfn = spte_to_pfn(*spte);
-		kvm_set_pfn_dirty(pfn);
-	}
 
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
@@ -1938,7 +1949,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	 * whether the guest actually used the pte (in order to detect
 	 * demand paging).
 	 */
-	spte = shadow_base_present_pte | shadow_dirty_mask;
+	spte = shadow_base_present_pte;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
 	if (!dirty)
@@ -1999,8 +2010,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
-		kvm_set_pfn_dirty(pfn);
 	update_spte(sptep, spte);
 done:
 	return ret;
-- 
cgit v1.2.3


From e0df7b9f6cee43c01d6f4a8491bccfd410cb86e1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:05 -0700
Subject: KVM: abstract kvm x86 mmu->n_free_mmu_pages

"free" is a poor name for this value.  In this context, it means,
"the number of mmu pages which this kvm instance should be able to
allocate."  But "free" implies much more that the objects are there
and ready for use.  "available" is a much better description, especially
when you see how it is calculated.

In this patch, we abstract its use into a function.  We'll soon
replace the function's contents by calculating the value in a
different way.

All of the reads of n_free_mmu_pages are taken care of in this
patch.  The modification sites will be handled in a patch
later in the series.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 11 ++++-------
 arch/x86/kvm/mmu.h |  7 ++++++-
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff95d418750d..625b17894661 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	int used_pages;
 	LIST_HEAD(invalid_list);
 
-	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
+	used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm);
 	used_pages = max(0, used_pages);
 
 	/*
@@ -2959,18 +2959,15 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	int free_pages;
 	LIST_HEAD(invalid_list);
 
-	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
-	while (free_pages < KVM_REFILL_PAGES &&
+	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
-		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-						       &invalid_list);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3145,7 +3142,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 		npages = kvm->arch.n_alloc_mmu_pages -
-			 kvm->arch.n_free_mmu_pages;
+			 kvm_mmu_available_pages(kvm);
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..c3a689ae7df0 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -50,9 +50,14 @@
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 
+static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+{
+	return kvm->arch.n_free_mmu_pages;
+}
+
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+	if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
 		__kvm_mmu_free_some_pages(vcpu);
 }
 
-- 
cgit v1.2.3


From 39de71ec5397f374aed95e99509372d605e1407c Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:14 -0700
Subject: KVM: rename x86 kvm->arch.n_alloc_mmu_pages

arch.n_alloc_mmu_pages is a poor choice of name. This value truly
means, "the number of pages which _may_ be allocated".  But,
reading the name, "n_alloc_mmu_pages" implies "the number of allocated
mmu pages", which is dead wrong.

It's really the high watermark, so let's give it a name to match:
nr_max_mmu_pages.  This change will make the next few patches
much more obvious and easy to read.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/mmu.c              | 8 ++++----
 arch/x86/kvm/x86.c              | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c52e2eb40a1e..02963684cd28 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -369,7 +369,7 @@ struct kvm_vcpu_arch {
 struct kvm_arch {
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
-	unsigned int n_alloc_mmu_pages;
+	unsigned int n_max_mmu_pages;
 	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 625b17894661..6979e7d1464e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	int used_pages;
 	LIST_HEAD(invalid_list);
 
-	used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm);
+	used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm);
 	used_pages = max(0, used_pages);
 
 	/*
@@ -1721,9 +1721,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	}
 	else
 		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
-					 - kvm->arch.n_alloc_mmu_pages;
+					 - kvm->arch.n_max_mmu_pages;
 
-	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+	kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages;
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -3141,7 +3141,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		npages = kvm->arch.n_alloc_mmu_pages -
+		npages = kvm->arch.n_max_mmu_pages -
 			 kvm_mmu_available_pages(kvm);
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0004eb354d3..4b4d2836240f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2759,7 +2759,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 
 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_alloc_mmu_pages;
+	return kvm->arch.n_max_mmu_pages;
 }
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-- 
cgit v1.2.3


From 49d5ca26636cb8feb05aff92fc4dba3e494ec683 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:28 -0700
Subject: KVM: replace x86 kvm n_free_mmu_pages with n_used_mmu_pages

Doing this makes the code much more readable.  That's
borne out by the fact that this patch removes code.  "used"
also happens to be the number that we need to return back to
the slab code when our shrinker gets called.  Keeping this
value as opposed to free makes the next patch simpler.

So, 'struct kvm' is kzalloc()'d.  'struct kvm_arch' is a
structure member (and not a pointer) of 'struct kvm'.  That
means they start out zeroed.  I _think_ they get initialized
properly by kvm_mmu_change_mmu_pages().  But, that only happens
via kvm ioctls.

Another benefit of storing 'used' intead of 'free' is
that the values are consistent from the moment the structure is
allocated: no negative "used" value.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              | 27 +++++++++------------------
 arch/x86/kvm/mmu.h              |  3 ++-
 3 files changed, 12 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 02963684cd28..e01b72825564 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -367,7 +367,7 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_arch {
-	unsigned int n_free_mmu_pages;
+	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	atomic_t invlpg_counter;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6979e7d1464e..ff39b85d7a4d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -980,7 +980,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
-	++kvm->arch.n_free_mmu_pages;
+	--kvm->arch.n_used_mmu_pages;
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1003,7 +1003,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-	--vcpu->kvm->arch.n_free_mmu_pages;
+	++vcpu->kvm->arch.n_used_mmu_pages;
 	return sp;
 }
 
@@ -1689,41 +1689,32 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 /*
  * Changing the number of mmu pages allocated to the vm
- * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
  */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 {
-	int used_pages;
 	LIST_HEAD(invalid_list);
-
-	used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm);
-	used_pages = max(0, used_pages);
-
 	/*
 	 * If we set the number of mmu pages to be smaller be than the
 	 * number of actived pages , we must to free some mmu pages before we
 	 * change the value
 	 */
 
-	if (used_pages > kvm_nr_mmu_pages) {
-		while (used_pages > kvm_nr_mmu_pages &&
+	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
 			!list_empty(&kvm->arch.active_mmu_pages)) {
 			struct kvm_mmu_page *page;
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+			kvm_mmu_prepare_zap_page(kvm, page,
 							       &invalid_list);
 		}
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
-		kvm_nr_mmu_pages = used_pages;
-		kvm->arch.n_free_mmu_pages = 0;
+		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
-	else
-		kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
-					 - kvm->arch.n_max_mmu_pages;
 
-	kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages;
+	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c3a689ae7df0..f05a03dfba4e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,7 +52,8 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_free_mmu_pages;
+	return kvm->arch.n_max_mmu_pages -
+		kvm->arch.n_used_mmu_pages;
 }
 
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 45221ab6684a82a5b60208b76d6f8bfb1bbcb969 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Thu, 19 Aug 2010 18:11:37 -0700
Subject: KVM: create aggregate kvm_total_used_mmu_pages value

Of slab shrinkers, the VM code says:

 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
 * querying the cache size, so a fastpath for that case is appropriate.

and it *means* it.  Look at how it calls the shrinkers:

    nr_before = (*shrinker->shrink)(0, gfp_mask);
    shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);

So, if you do anything stupid in your shrinker, the VM will doubly
punish you.

The mmu_shrink() function takes the global kvm_lock, then acquires
every VM's kvm->mmu_lock in sequence.  If we have 100 VMs, then
we're going to take 101 locks.  We do it twice, so each call takes
202 locks.  If we're under memory pressure, we can have each cpu
trying to do this.  It can get really hairy, and we've seen lock
spinning in mmu_shrink() be the dominant entry in profiles.

This is guaranteed to optimize at least half of those lock
aquisitions away.  It removes the need to take any of the locks
when simply trying to count objects.

A 'percpu_counter' can be a large object, but we only have one
of these for the entire system.  There are not any better
alternatives at the moment, especially ones that handle CPU
hotplug.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff39b85d7a4d..33d7af50cf8e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,6 +178,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_trap_nonpresent_pte;
 static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -971,6 +972,18 @@ static int is_empty_shadow_page(u64 *spt)
 }
 #endif
 
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values.  We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+	kvm->arch.n_used_mmu_pages += nr;
+	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
@@ -980,7 +993,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->role.direct)
 		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
-	--kvm->arch.n_used_mmu_pages;
+	kvm_mod_used_mmu_pages(kvm, -1);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1003,7 +1016,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-	++vcpu->kvm->arch.n_used_mmu_pages;
+	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
 }
 
@@ -3122,23 +3135,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	struct kvm *kvm;
 	struct kvm *kvm_freed = NULL;
-	int cache_count = 0;
+
+	if (nr_to_scan == 0)
+		goto out;
 
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx, freed_pages;
+		int idx, freed_pages;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		npages = kvm->arch.n_max_mmu_pages -
-			 kvm_mmu_available_pages(kvm);
-		cache_count += npages;
-		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+		if (!kvm_freed && nr_to_scan > 0 &&
+		    kvm->arch.n_used_mmu_pages > 0) {
 			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
 							  &invalid_list);
-			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
@@ -3152,7 +3164,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 	spin_unlock(&kvm_lock);
 
-	return cache_count;
+out:
+	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
@@ -3195,6 +3208,7 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
+	percpu_counter_init(&kvm_total_used_mmu_pages, 0);
 	register_shrinker(&mmu_shrinker);
 
 	return 0;
-- 
cgit v1.2.3


From 80b63faf028fba79e630d3643b0e615bddf4067b Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Tue, 24 Aug 2010 10:31:07 +0800
Subject: KVM: MMU: fix regression from rework mmu_shrink() code

Latest kvm mmu_shrink code rework makes kernel changes kvm->arch.n_used_mmu_pages/
kvm->arch.n_max_mmu_pages at kvm_mmu_free_page/kvm_mmu_alloc_page, which is called
by kvm_mmu_commit_zap_page. So the kvm->arch.n_used_mmu_pages or
kvm_mmu_available_pages(vcpu->kvm) is unchanged after kvm_mmu_prepare_zap_page(),
This caused kvm_mmu_change_mmu_pages/__kvm_mmu_free_some_pages loops forever.
Moving kvm_mmu_commit_zap_page would make the while loop performs as normal.

Reported-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Tested-by: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 33d7af50cf8e..c2ac7004441a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1720,10 +1720,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			kvm_mmu_prepare_zap_page(kvm, page,
-							       &invalid_list);
+			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		}
-		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
 
@@ -2972,9 +2971,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
-	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
-- 
cgit v1.2.3


From 45bf21a8ce7a2884f067a702a5c7683684846ce1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 23 Aug 2010 16:13:15 +0800
Subject: KVM: MMU: fix missing percpu counter destroy

commit ad05c88266b4cce1c820928ce8a0fb7690912ba1
(KVM: create aggregate kvm_total_used_mmu_pages value)
introduce percpu counter kvm_total_used_mmu_pages but never
destroy it, this may cause oops when rmmod & modprobe.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Tim Pepper <lnxninja@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c2ac7004441a..54a50268cebf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3185,6 +3185,7 @@ static void mmu_destroy_caches(void)
 void kvm_mmu_module_exit(void)
 {
 	mmu_destroy_caches();
+	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 	unregister_shrinker(&mmu_shrinker);
 }
 
@@ -3207,7 +3208,9 @@ int kvm_mmu_module_init(void)
 	if (!mmu_page_header_cache)
 		goto nomem;
 
-	percpu_counter_init(&kvm_total_used_mmu_pages, 0);
+	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+		goto nomem;
+
 	register_shrinker(&mmu_shrinker);
 
 	return 0;
-- 
cgit v1.2.3


From 957ed9effd80b04482cbdce8c95bdf803a656b94 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sun, 22 Aug 2010 19:12:48 +0800
Subject: KVM: MMU: prefetch ptes when intercepted guest #PF

Support prefetch ptes when intercept guest #PF, avoid to #PF by later
access

If we meet any failure in the prefetch path, we will exit it and
not try other ptes to avoid become heavy path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c         | 104 ++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/paging_tmpl.h |  72 +++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 54a50268cebf..b0037a77e56b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
 	}
 #endif
 
+#define PTE_PREFETCH_NUM		8
+
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
@@ -400,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-				   rmap_desc_cache, 4);
+				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -2089,6 +2091,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+	      (no_dirty_log && slot->dirty_bitmap))
+		slot = NULL;
+
+	return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+				     bool no_dirty_log)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long hva;
+
+	slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+	if (!slot) {
+		get_page(bad_page);
+		return page_to_pfn(bad_page);
+	}
+
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp,
+				    u64 *start, u64 *end)
+{
+	struct page *pages[PTE_PREFETCH_NUM];
+	unsigned access = sp->role.access;
+	int i, ret;
+	gfn_t gfn;
+
+	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+	if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+		return -1;
+
+	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+	if (ret <= 0)
+		return -1;
+
+	for (i = 0; i < ret; i++, gfn++, start++)
+		mmu_set_spte(vcpu, start, ACC_ALL,
+			     access, 0, 0, 1, NULL,
+			     sp->role.level, gfn,
+			     page_to_pfn(pages[i]), true, true);
+
+	return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *sptep)
+{
+	u64 *spte, *start = NULL;
+	int i;
+
+	WARN_ON(!sp->role.direct);
+
+	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+	spte = sp->spt + i;
+
+	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+			if (!start)
+				continue;
+			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+				break;
+			start = NULL;
+		} else if (!start)
+			start = spte;
+	}
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+
+	/*
+	 * Since it's no accessed bit on EPT, it's no way to
+	 * distinguish between actually accessed translations
+	 * and prefetched, so disable pte prefetch if EPT is
+	 * enabled.
+	 */
+	if (!shadow_accessed_mask)
+		return;
+
+	sp = page_header(__pa(sptep));
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return;
+
+	__direct_pte_prefetch(vcpu, sp, sptep);
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			int level, gfn_t gfn, pfn_t pfn)
 {
@@ -2102,6 +2203,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
 				     level, gfn, pfn, false, true);
+			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..872ff265c91e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -310,6 +310,77 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
 	return r || curr_pte != gw->ptes[level - 1];
 }
 
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	struct kvm_mmu_page *sp;
+	pt_element_t gptep[PTE_PREFETCH_NUM];
+	gpa_t first_pte_gpa;
+	int offset = 0, i;
+	u64 *spte;
+
+	sp = page_header(__pa(sptep));
+
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return;
+
+	if (sp->role.direct)
+		return __direct_pte_prefetch(vcpu, sp, sptep);
+
+	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	first_pte_gpa = gfn_to_gpa(sp->gfn) +
+				(offset + i) * sizeof(pt_element_t);
+
+	if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
+					sizeof(gptep)) < 0)
+		return;
+
+	spte = sp->spt + i;
+
+	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+		pt_element_t gpte;
+		unsigned pte_access;
+		gfn_t gfn;
+		pfn_t pfn;
+		bool dirty;
+
+		if (spte == sptep)
+			continue;
+
+		if (*spte != shadow_trap_nonpresent_pte)
+			continue;
+
+		gpte = gptep[i];
+
+		if (!is_present_gpte(gpte) ||
+		      is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+			if (!sp->unsync)
+				__set_spte(spte, shadow_notrap_nonpresent_pte);
+			continue;
+		}
+
+		if (!(gpte & PT_ACCESSED_MASK))
+			continue;
+
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		gfn = gpte_to_gfn(gpte);
+		dirty = is_dirty_gpte(gpte);
+		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+				      (pte_access & ACC_WRITE_MASK) && dirty);
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			break;
+		}
+
+		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+			     dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+			     pfn, true, true);
+	}
+}
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
@@ -391,6 +462,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
 		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, false, true);
+	FNAME(pte_prefetch)(vcpu, it.sptep);
 
 	return it.sptep;
 
-- 
cgit v1.2.3


From 9ad17b10011702cb56c5e32e41ecd5fe281c3574 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:19:42 +0800
Subject: KVM: MMU: fix compile warning in audit code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

arch/x86/kvm/mmu.c: In function ‘kvm_mmu_unprotect_page’:
arch/x86/kvm/mmu.c:1741: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c:1745: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c: In function ‘mmu_unshadow’:
arch/x86/kvm/mmu.c:1761: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c: In function ‘set_spte’:
arch/x86/kvm/mmu.c:2005: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’
arch/x86/kvm/mmu.c: In function ‘mmu_set_spte’:
arch/x86/kvm/mmu.c:2033: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 7 has type ‘gfn_t’

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b0037a77e56b..59bf1d9553a7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1738,11 +1738,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	LIST_HEAD(invalid_list);
 	int r;
 
-	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
 
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
 		r = 1;
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1758,7 +1758,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 	LIST_HEAD(invalid_list);
 
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %lx %x\n",
+		pgprintk("%s: zap %llx %x\n",
 			 __func__, gfn, sp->role.word);
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
@@ -2002,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto set_pte;
 
 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
-			pgprintk("%s: found shadow page for %lx, marking ro\n",
+			pgprintk("%s: found shadow page for %llx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
@@ -2031,7 +2031,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	int rmap_count;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
+		 " user_fault %d gfn %llx\n",
 		 __func__, *sptep, pt_access,
 		 write_fault, user_fault, gfn);
 
@@ -2050,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			__set_spte(sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
-			pgprintk("hfn old %lx new %lx\n",
+			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2067,7 +2067,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
 		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
 		 *sptep, sptep);
@@ -3651,9 +3651,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 		if (!gfn_to_memslot(kvm, gfn)) {
 			if (!printk_ratelimit())
 				return;
-			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+			printk(KERN_ERR "%s: no memslot for gfn %llx\n",
 					 audit_msg, gfn);
-			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+			printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
 			       audit_msg, (long int)(sptep - rev_sp->spt),
 					rev_sp->gfn);
 			dump_stack();
@@ -3728,7 +3728,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		while (spte) {
 			if (is_writable_pte(*spte))
 				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %lx role %x\n",
+				"writable mappings: gfn %llx role %x\n",
 			       __func__, audit_msg, sp->gfn,
 			       sp->role.word);
 			spte = rmap_next(vcpu->kvm, rmapp, spte);
-- 
cgit v1.2.3


From 0beb8d660425aab339ff68e6f4d4528739e8fc4f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:20:47 +0800
Subject: KVM: MMU: check rmap for every spte

The read-only spte also has reverse mapping, so fix the code to check them,
also modify the function name to fit its doing

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 57 +++++++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 59bf1d9553a7..1c784b96dac3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3644,40 +3644,38 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 
-	if (is_writable_pte(*sptep)) {
-		rev_sp = page_header(__pa(sptep));
-		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
-		if (!gfn_to_memslot(kvm, gfn)) {
-			if (!printk_ratelimit())
-				return;
-			printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-					 audit_msg, gfn);
-			printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-			       audit_msg, (long int)(sptep - rev_sp->spt),
-					rev_sp->gfn);
-			dump_stack();
-			return;
-		}
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
-		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-		if (!*rmapp) {
-			if (!printk_ratelimit())
-				return;
-			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-					 audit_msg, *sptep);
-			dump_stack();
-		}
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
 	}
 
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
 }
 
-void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
 {
 	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
 }
 
-static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
 	int i;
@@ -3689,12 +3687,9 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 			continue;
 
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			u64 ent = pt[i];
-
-			if (!(ent & PT_PRESENT_MASK))
-				continue;
-			if (!is_writable_pte(ent))
+			if (!is_rmap_spte(pt[i]))
 				continue;
+
 			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
 	}
@@ -3703,7 +3698,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
-	check_writable_mappings_rmap(vcpu);
+	check_mappings_rmap(vcpu);
 	count_rmaps(vcpu);
 }
 
@@ -3746,7 +3741,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
 	audit_write_protection(vcpu);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
-	audit_writable_sptes_have_rmaps(vcpu);
+	audit_sptes_have_rmaps(vcpu);
 	dbg = olddbg;
 }
 
-- 
cgit v1.2.3


From bc32ce2152406431acf4daf4a81dc1664bb7b91b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:22:46 +0800
Subject: KVM: MMU: fix wrong not write protected sp report

The audit code reports some sp not write protected in current code, it's just the
bug in audit_write_protection(), since:

- the invalid sp not need write protected
- using uninitialize local variable('gfn')
- call kvm_mmu_audit() out of mmu_lock's protection

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 5 +++--
 arch/x86/kvm/paging_tmpl.h | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1c784b96dac3..68575dc32ec7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3708,16 +3708,17 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
-	gfn_t gfn;
 
 	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
 		if (sp->role.direct)
 			continue;
 		if (sp->unsync)
 			continue;
+		if (sp->role.invalid)
+			continue;
 
 		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[gfn - slot->base_gfn];
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
 		while (spte) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a4e8389df2ad..a0f2febf5692 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -504,7 +504,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	unsigned long mmu_seq;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
-	kvm_mmu_audit(vcpu, "pre page fault");
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -542,6 +541,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
+
+	kvm_mmu_audit(vcpu, "pre page fault");
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
-- 
cgit v1.2.3


From 365fb3fdf6769d3553999d8eb6cc2a8c56c747c1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:24:13 +0800
Subject: KVM: MMU: rewrite audit_mappings_page() function

There is a bugs in this function, we call gfn_to_pfn() and kvm_mmu_gva_to_gpa_read() in
atomic context(kvm_mmu_audit() is called under the spinlock(mmu_lock)'s protection).

This patch fix it by:
- introduce gfn_to_pfn_atomic instead of gfn_to_pfn
- get the mapping gfn from kvm_mmu_page_get_gfn()

And it adds 'notrap' ptes check in unsync/direct sps

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 75 ++++++++++++++++++++++++++----------------------
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 15 ++++++++--
 3 files changed, 54 insertions(+), 37 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 68575dc32ec7..0d91f60af1a8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3487,15 +3487,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 static const char *audit_msg;
 
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
-	gva = (long long)(gva << 16) >> 16;
-#endif
-	return gva;
-}
-
-
 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
 
 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -3550,39 +3541,53 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 ent = pt[i];
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
 
-		if (ent == shadow_trap_nonpresent_pte)
-			continue;
+		sp = page_header(__pa(sptep));
 
-		va = canonicalize(va);
-		if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
-			audit_mappings_page(vcpu, ent, va, level - 1);
-		else {
-			gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
-			gfn_t gfn = gpa >> PAGE_SHIFT;
-			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
-			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
 
-			if (is_error_pfn(pfn)) {
-				kvm_release_pfn_clean(pfn);
-				continue;
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
 			}
+		}
 
-			if (is_shadow_present_pte(ent)
-			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-				printk(KERN_ERR "xx audit error: (%s) levels %d"
-				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
-				       audit_msg, vcpu->arch.mmu.root_level,
-				       va, gpa, hpa, ent,
-				       is_shadow_present_pte(ent));
-			else if (ent == shadow_notrap_nonpresent_pte
-				 && !is_error_hpa(hpa))
-				printk(KERN_ERR "audit: (%s) notrap shadow,"
-				       " valid guest gva %lx\n", audit_msg, va);
-			kvm_release_pfn_clean(pfn);
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
 
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
 		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
 	}
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b837ec80885d..f2ecdd52032b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -300,6 +300,7 @@ void kvm_set_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
 pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
 			 struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2eb0b7500a2a..c7a57b4feb39 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -999,7 +999,7 @@ pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
 {
 	unsigned long addr;
 
@@ -1009,7 +1009,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr, false);
+	return hva_to_pfn(kvm, addr, atomic);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_pfn(kvm, gfn, true);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+	return __gfn_to_pfn(kvm, gfn, false);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
-- 
cgit v1.2.3


From 8e0e8afa82018a3c751ea474eb47dfc65f00f4c3 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 28 Aug 2010 19:25:09 +0800
Subject: KVM: MMU: remove count_rmaps()

Nothing is checked in count_rmaps(), so remove it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 38 --------------------------------------
 1 file changed, 38 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0d91f60af1a8..0bff4d54817e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3606,43 +3606,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 						    2);
 }
 
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_memslots *slots;
-	int nmaps = 0;
-	int i, j, k, idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-		struct kvm_memory_slot *m = &slots->memslots[i];
-		struct kvm_rmap_desc *d;
-
-		for (j = 0; j < m->npages; ++j) {
-			unsigned long *rmapp = &m->rmap[j];
-
-			if (!*rmapp)
-				continue;
-			if (!(*rmapp & 1)) {
-				++nmaps;
-				continue;
-			}
-			d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-			while (d) {
-				for (k = 0; k < RMAP_EXT; ++k)
-					if (d->sptes[k])
-						++nmaps;
-					else
-						break;
-				d = d->more;
-			}
-		}
-	}
-	srcu_read_unlock(&kvm->srcu, idx);
-	return nmaps;
-}
-
 void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
@@ -3704,7 +3667,6 @@ static void check_mappings_rmap(struct kvm_vcpu *vcpu)
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
 	check_mappings_rmap(vcpu);
-	count_rmaps(vcpu);
 }
 
 static void audit_write_protection(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 8b1fe17cc7a8b2c62b400dcbfaebd96da6b4f58e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:22:53 +0800
Subject: KVM: MMU: support disable/enable mmu audit dynamicly

Add a r/w module parameter named 'mmu_audit', it can control audit
enable/disable:

enable:
  echo 1 > /sys/module/kvm/parameters/mmu_audit

disable:
  echo 0 > /sys/module/kvm/parameters/mmu_audit

This patch not change the logic

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/Kconfig       |  7 ++++
 arch/x86/kvm/mmu.c         | 91 +++++++++++++++++++++++++++++++++++++---------
 arch/x86/kvm/mmutrace.h    | 19 ++++++++++
 arch/x86/kvm/paging_tmpl.h |  4 +-
 4 files changed, 101 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..ddc131ff438f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_AUDIT
+	bool "Audit KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
+	 audit  KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d54817e..8b750ff6911a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,15 +49,21 @@
  */
 bool tdp_enabled = false;
 
-#undef MMU_DEBUG
+enum {
+	AUDIT_PRE_PAGE_FAULT,
+	AUDIT_POST_PAGE_FAULT,
+	AUDIT_PRE_PTE_WRITE,
+	AUDIT_POST_PTE_WRITE
+};
 
-#undef AUDIT
+char *audit_point_name[] = {
+	"pre page fault",
+	"post page fault",
+	"pre pte write",
+	"post pte write"
+};
 
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
 
@@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
 
 #endif
 
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
 static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
@@ -2964,7 +2970,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
-	kvm_mmu_audit(vcpu, "pre pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
@@ -3037,7 +3043,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	kvm_mmu_audit(vcpu, "post pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
 		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3483,8 +3489,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef AUDIT
-
+#ifdef CONFIG_KVM_MMU_AUDIT
 static const char *audit_msg;
 
 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
@@ -3699,18 +3704,68 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
-	int olddbg = dbg;
-
-	dbg = 0;
-	audit_msg = msg;
+	audit_msg = audit_point_name[audit_point];
 	audit_rmap(vcpu);
 	audit_write_protection(vcpu);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
 	audit_sptes_have_rmaps(vcpu);
-	dbg = olddbg;
 }
 
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
 #endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 	TP_ARGS(sp)
 );
+
+TRACE_EVENT(
+	kvm_mmu_audit,
+	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+	TP_ARGS(vcpu, audit_point),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(int, audit_point)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->audit_point = audit_point;
+	),
+
+	TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+		  audit_point_name[__entry->audit_point])
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a0f2febf5692..debe77035366 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 
-	kvm_mmu_audit(vcpu, "pre page fault");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
@@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
-- 
cgit v1.2.3


From 2f4f337248cd5660040b7e09b7287a7a0a861f3f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 30 Aug 2010 18:24:10 +0800
Subject: KVM: MMU: move audit to a separate file

Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 279 +-------------------------------------------
 arch/x86/kvm/mmu_audit.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 298 insertions(+), 278 deletions(-)
 create mode 100644 arch/x86/kvm/mmu_audit.c

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b750ff6911a..d2dad65a45f8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3490,282 +3490,5 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 #ifdef CONFIG_KVM_MMU_AUDIT
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
-		}
-	}
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
-	int i;
-	struct kvm_mmu_page *sp;
-
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
-		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
-		return;
-	}
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root && VALID_PAGE(root)) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
-		}
-	}
-	return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
-{
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
-
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
-			return;
-		}
-
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
-			return;
-
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
-
-		hpa =  pfn << PAGE_SHIFT;
-
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
-	}
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
-	unsigned long *rmapp;
-	struct kvm_mmu_page *rev_sp;
-	gfn_t gfn;
-
-
-	rev_sp = page_header(__pa(sptep));
-	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
-	if (!gfn_to_memslot(kvm, gfn)) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-				 audit_msg, gfn);
-		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-		       audit_msg, (long int)(sptep - rev_sp->spt),
-				rev_sp->gfn);
-		dump_stack();
-		return;
-	}
-
-	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-	if (!*rmapp) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-				 audit_msg, *sptep);
-		dump_stack();
-	}
-}
-
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	int i;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
-
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
-
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
-	}
-	return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	struct kvm_memory_slot *slot;
-	unsigned long *rmapp;
-	u64 *spte;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %llx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
-	}
-}
-
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
-{
-	audit_msg = audit_point_name[audit_point];
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
-}
-
-static bool mmu_audit;
-
-static void mmu_audit_enable(void)
-{
-	int ret;
-
-	if (mmu_audit)
-		return;
-
-	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	WARN_ON(ret);
-
-	mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
-	if (!mmu_audit)
-		return;
-
-	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	tracepoint_synchronize_unregister();
-	mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
-	int ret;
-	unsigned long enable;
-
-	ret = strict_strtoul(val, 10, &enable);
-	if (ret < 0)
-		return -EINVAL;
-
-	switch (enable) {
-	case 0:
-		mmu_audit_disable();
-		break;
-	case 1:
-		mmu_audit_enable();
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
-	.set = mmu_audit_set,
-	.get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
+#include "mmu_audit.c"
 #endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..fb8a461333c5
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,297 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			if (!is_last_spte(ent, sp->role.level)) {
+				struct kvm_mmu_page *child;
+				child = page_header(ent & PT64_BASE_ADDR_MASK);
+				__mmu_spte_walk(kvm, child, fn);
+			} else
+				fn(kvm, &sp->spt[i]);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu->kvm, sp, fn);
+		}
+	}
+	return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
+
+		sp = page_header(__pa(sptep));
+
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
+
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
+			}
+		}
+
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
+		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
+	}
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned i;
+
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+	else
+		for (i = 0; i < 4; ++i)
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->arch.mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
+	}
+
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
+
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (!is_rmap_spte(pt[i]))
+				continue;
+
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+		}
+	}
+	return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	u64 *spte;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.direct)
+			continue;
+		if (sp->unsync)
+			continue;
+		if (sp->role.invalid)
+			continue;
+
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+		spte = rmap_next(vcpu->kvm, rmapp, NULL);
+		while (spte) {
+			if (is_writable_pte(*spte))
+				printk(KERN_ERR "%s: (%s) shadow page has "
+				"writable mappings: gfn %llx role %x\n",
+			       __func__, audit_msg, sp->gfn,
+			       sp->role.word);
+			spte = rmap_next(vcpu->kvm, rmapp, spte);
+		}
+	}
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
+{
+	audit_msg = audit_point_name[audit_point];
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	if (strcmp("pre pte write", audit_msg) != 0)
+		audit_mappings(vcpu);
+	audit_sptes_have_rmaps(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
-- 
cgit v1.2.3


From f87f928882d080eaec8b0d76aecff003d664697d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 2 Sep 2010 17:29:45 +0200
Subject: KVM: MMU: Fix 32 bit legacy paging with NPT

This patch fixes 32 bit legacy paging with NPT enabled. The
mmu_check_root call on the top-level of the loop causes
root_gfn to take values (in the tdp_enabled path) which are
outside of guest memory. So the mmu_check_root call fails at
some point in the loop interation causing the guest to
tiple-fault.
This patch changes the mmu_check_root calls to the places
where they are really necessary. As a side-effect it
introduces a check for the root of a pae page table too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d2dad65a45f8..b2136f921d7e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2387,6 +2387,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 	direct = !is_paging(vcpu);
+
+	if (mmu_check_root(vcpu, root_gfn))
+		return 1;
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2398,10 +2402,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				continue;
 			}
 			root_gfn = pdptr >> PAGE_SHIFT;
+			if (mmu_check_root(vcpu, root_gfn))
+				return 1;
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
-		if (mmu_check_root(vcpu, root_gfn))
-			return 1;
 		if (tdp_enabled) {
 			direct = 1;
 			root_gfn = i << 30;
-- 
cgit v1.2.3


From c5a78f2b649ae75ae788e7622ca5a586af2cb35a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:39 +0200
Subject: KVM: MMU: Make tdp_enabled a mmu-context parameter

This patch changes the tdp_enabled flag from its global
meaning to the mmu-context and renames it to direct_map
there. This is necessary for Nested SVM with emulation of
Nested Paging where we need an extra MMU context to shadow
the Nested Nested Page Table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 22 ++++++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 789e9462668f..80ef28bddcc3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -249,6 +249,7 @@ struct kvm_mmu {
 	int root_level;
 	int shadow_root_level;
 	union kvm_mmu_page_role base_role;
+	bool direct_map;
 
 	u64 *pae_root;
 	u64 rsvd_bits_mask[2][4];
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b2136f921d7e..5c28e979d730 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1448,7 +1448,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (role.direct)
 		role.cr4_pae = 0;
 	role.access = access;
-	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!vcpu->arch.mmu.direct_map
+	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
@@ -1973,7 +1974,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_user_mask;
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (tdp_enabled)
+	if (vcpu->arch.mmu.direct_map)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
 			kvm_is_mmio_pfn(pfn));
 
@@ -1983,8 +1984,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
-	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
-		&& !user_fault)) {
+	    || (!vcpu->arch.mmu.direct_map && write_fault
+		&& !is_write_protection(vcpu) && !user_fault)) {
 
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1995,7 +1996,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK;
 
-		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+		if (!vcpu->arch.mmu.direct_map
+		    && !(pte_access & ACC_WRITE_MASK))
 			spte &= ~PT_USER_MASK;
 
 		/*
@@ -2371,7 +2373,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		ASSERT(!VALID_PAGE(root));
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
-		if (tdp_enabled) {
+		if (vcpu->arch.mmu.direct_map) {
 			direct = 1;
 			root_gfn = 0;
 		}
@@ -2406,7 +2408,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				return 1;
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
-		if (tdp_enabled) {
+		if (vcpu->arch.mmu.direct_map) {
 			direct = 1;
 			root_gfn = i << 30;
 		}
@@ -2544,6 +2546,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = true;
 	return 0;
 }
 
@@ -2663,6 +2666,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->root_level = level;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
 	return 0;
 }
 
@@ -2687,6 +2691,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = false;
 	return 0;
 }
 
@@ -2708,6 +2713,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
+	context->direct_map = true;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -3060,7 +3066,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t gpa;
 	int r;
 
-	if (tdp_enabled)
+	if (vcpu->arch.mmu.direct_map)
 		return 0;
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-- 
cgit v1.2.3


From f43addd46168110d572dcf69100cb215a4e9fd08 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:40 +0200
Subject: KVM: MMU: Make set_cr3 a function pointer in kvm_mmu

This is necessary to implement Nested Nested Paging. As a
side effect this allows some cleanups in the SVM nested
paging code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/mmu.c              | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 80ef28bddcc3..53cedede88fa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,6 +236,7 @@ struct kvm_pio_request {
  */
 struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
+	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c28e979d730..c8acb9609ca4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2714,6 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
+	context->set_cr3 = kvm_x86_ops->set_cr3;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2752,7 +2753,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 		r = paging32_init_context(vcpu);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
-	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
 
 	return r;
 }
@@ -2796,7 +2798,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
-	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 out:
 	return r;
 }
-- 
cgit v1.2.3


From 1c97f0a04c74196880f22a563134c8f6d0b9d752 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:41 +0200
Subject: KVM: X86: Introduce a tdp_set_cr3 function

This patch introduces a special set_tdp_cr3 function pointer
in kvm_x86_ops which is only used for tpd enabled mmu
contexts. This allows to remove some hacks from svm code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              |  2 +-
 arch/x86/kvm/svm.c              | 23 ++++++++++++++---------
 arch/x86/kvm/vmx.c              |  2 ++
 4 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 53cedede88fa..81a51473f745 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -526,6 +526,8 @@ struct kvm_x86_ops {
 	bool (*rdtscp_supported)(void);
 	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
 
+	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
 	bool (*has_wbinvd_exit)(void);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c8acb9609ca4..a55f8d5a7985 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2714,7 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
-	context->set_cr3 = kvm_x86_ops->set_cr3;
+	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index fcbc491e1f87..53c9039583fd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3216,9 +3216,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	savesegment(gs, gs_selector);
 	ldt_selector = kvm_read_ldt();
 	svm->vmcb->save.cr2 = vcpu->arch.cr2;
-	/* required for live migration with NPT */
-	if (npt_enabled)
-		svm->vmcb->save.cr3 = vcpu->arch.cr3;
 
 	clgi();
 
@@ -3340,16 +3337,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (npt_enabled) {
-		svm->vmcb->control.nested_cr3 = root;
-		force_new_asid(vcpu);
-		return;
-	}
-
 	svm->vmcb->save.cr3 = root;
 	force_new_asid(vcpu);
 }
 
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.nested_cr3 = root;
+
+	/* Also sync guest cr3 here in case we live migrate */
+	svm->vmcb->save.cr3 = vcpu->arch.cr3;
+
+	force_new_asid(vcpu);
+}
+
 static int is_disabled(void)
 {
 	u64 vm_cr;
@@ -3576,6 +3579,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset = svm_adjust_tsc_offset,
+
+	.set_tdp_cr3 = set_tdp_cr3,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 275a81d571cf..ff7a8d48fd24 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4341,6 +4341,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
+
+	.set_tdp_cr3 = vmx_set_cr3,
 };
 
 static int __init vmx_init(void)
-- 
cgit v1.2.3


From 5777ed340d89cdc6c76a5c552337a3861b40a806 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:42 +0200
Subject: KVM: MMU: Introduce get_cr3 function pointer

This function pointer in the MMU context is required to
implement Nested Nested Paging.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/mmu.c              | 9 ++++++++-
 arch/x86/kvm/paging_tmpl.h      | 4 ++--
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81a51473f745..6c97b8debfa8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -237,6 +237,7 @@ struct kvm_pio_request {
 struct kvm_mmu {
 	void (*new_cr3)(struct kvm_vcpu *vcpu);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a55f8d5a7985..e4a7de4c8c77 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2365,7 +2365,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	int direct = 0;
 	u64 pdptr;
 
-	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2562,6 +2562,11 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr3;
+}
+
 static void inject_page_fault(struct kvm_vcpu *vcpu,
 			      u64 addr,
 			      u32 err_code)
@@ -2715,6 +2720,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+	context->get_cr3 = get_cr3;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2755,6 +2761,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
+	vcpu->arch.mmu.get_cr3           = get_cr3;
 
 	return r;
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e4ad3dc84df3..13d0c06b1bc8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ walk:
 	present = true;
 	eperm = rsvd_fault = false;
 	walker->level = vcpu->arch.mmu.root_level;
-	pte = vcpu->arch.cr3;
+	pte = vcpu->arch.mmu.get_cr3(vcpu);
 #if PTTYPE == 64
 	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
@@ -143,7 +143,7 @@ walk:
 	}
 #endif
 	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-	       (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+	       (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
 
 	pt_access = ACC_ALL;
 
-- 
cgit v1.2.3


From cb659db8a7d1ed558898f533a957dfc342f9499d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:43 +0200
Subject: KVM: MMU: Introduce inject_page_fault function pointer

This patch introduces an inject_page_fault function pointer
into struct kvm_mmu which will be used to inject a page
fault. This will be used later when Nested Nested Paging is
implemented.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 3 +++
 arch/x86/kvm/mmu.c              | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6c97b8debfa8..009a4a1b370e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,6 +239,9 @@ struct kvm_mmu {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  unsigned long addr,
+				  u32 error_code);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    u32 *error);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e4a7de4c8c77..a751dfc8526d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2571,7 +2571,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
 			      u64 addr,
 			      u32 err_code)
 {
-	kvm_inject_page_fault(vcpu, addr, err_code);
+	vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
@@ -2721,6 +2721,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
+	context->inject_page_fault = kvm_inject_page_fault;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2762,6 +2763,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
 	vcpu->arch.mmu.get_cr3           = get_cr3;
+	vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault;
 
 	return r;
 }
-- 
cgit v1.2.3


From 52fde8df7dd13d90f5f8dc43157418bff968d90a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:44 +0200
Subject: KVM: MMU: Introduce kvm_init_shadow_mmu helper function

Some logic of the init_kvm_softmmu function is required to
build the Nested Nested Paging context. So factor the
required logic into a seperate function and export it.
Also make the whole init path suitable for more than one mmu
context.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 60 +++++++++++++++++++++++++++++++-----------------------
 arch/x86/kvm/mmu.h |  1 +
 2 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a751dfc8526d..9e48a774fceb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2532,10 +2532,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+static int nonpaging_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
-
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2595,9 +2594,10 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context,
+				  int level)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
@@ -2656,9 +2656,11 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 	}
 }
 
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu,
+					struct kvm_mmu *context,
+					int level)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	reset_rsvds_bits_mask(vcpu, context, level);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -2675,17 +2677,17 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	return 0;
 }
 
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *context)
 {
-	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
-	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
 }
 
-static int paging32_init_context(struct kvm_vcpu *vcpu)
+static int paging32_init_context(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu *context)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 
-	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2700,10 +2702,10 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
+static int paging32E_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
-	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
@@ -2727,15 +2729,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
-		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
-		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
-		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
 	}
@@ -2743,24 +2745,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 	int r;
-
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (!is_paging(vcpu))
-		r = nonpaging_init_context(vcpu);
+		r = nonpaging_init_context(vcpu, context);
 	else if (is_long_mode(vcpu))
-		r = paging64_init_context(vcpu);
+		r = paging64_init_context(vcpu, context);
 	else if (is_pae(vcpu))
-		r = paging32E_init_context(vcpu);
+		r = paging32E_init_context(vcpu, context);
 	else
-		r = paging32_init_context(vcpu);
+		r = paging32_init_context(vcpu, context);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+	int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
 	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
 	vcpu->arch.mmu.get_cr3           = get_cr3;
 	vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f05a03dfba4e..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,7 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
-- 
cgit v1.2.3


From 3241f22da85d26505b39f525a88f52ebd1235975 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:45 +0200
Subject: KVM: MMU: Let is_rsvd_bits_set take mmu context instead of vcpu

This patch changes is_rsvd_bits_set() function prototype to
take only a kvm_mmu context instead of a full vcpu.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 6 +++---
 arch/x86/kvm/paging_tmpl.h | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e48a774fceb..86f7557cf3fb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2578,12 +2578,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 {
 	int bit7;
 
 	bit7 = (gpte >> 7) & 1;
-	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
 #define PTTYPE 64
@@ -2859,7 +2859,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
         }
 
-	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
 		return;
 
 	++vcpu->kvm->stat.mmu_pte_updated;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 13d0c06b1bc8..68ee1b7fa89f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -168,7 +168,7 @@ walk:
 			break;
 		}
 
-		if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+		if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
 			rsvd_fault = true;
 			break;
 		}
@@ -327,6 +327,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 				u64 *sptep)
 {
 	struct kvm_mmu_page *sp;
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
 	pt_element_t *gptep = gw->prefetch_ptes;
 	u64 *spte;
 	int i;
@@ -358,7 +359,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		gpte = gptep[i];
 
 		if (!is_present_gpte(gpte) ||
-		      is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+		      is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
 			if (!sp->unsync)
 				__set_spte(spte, shadow_notrap_nonpresent_pte);
 			continue;
@@ -713,7 +714,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+		if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
 		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
 		      || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
-- 
cgit v1.2.3


From 8df25a328a6ca3bd0f048278f4d5ae0a1f6fadc1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:46 +0200
Subject: KVM: MMU: Track page fault data in struct vcpu

This patch introduces a struct with two new fields in
vcpu_arch for x86:

	* fault.address
	* fault.error_code

This will be used to correctly propagate page faults back
into the guest when we could have either an ordinary page
fault or a nested page fault. In the case of a nested page
fault the fault-address is different from the original
address that should be walked. So we need to keep track
about the real fault-address.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 -
 arch/x86/include/asm/kvm_host.h    | 17 ++++++++++++-----
 arch/x86/kvm/emulate.c             | 30 ++++++++++++++----------------
 arch/x86/kvm/mmu.c                 |  6 ++----
 arch/x86/kvm/paging_tmpl.h         |  6 +++++-
 arch/x86/kvm/x86.c                 |  9 +++++----
 6 files changed, 38 insertions(+), 31 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1bf11400ae99..5187dd88019b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,6 @@ struct x86_emulate_ctxt {
 	int exception; /* exception that happens during emulation or -1 */
 	u32 error_code; /* error code for exception */
 	bool error_code_valid;
-	unsigned long cr2; /* faulted address in case of #PF */
 
 	/* decode cache */
 	struct decode_cache decode;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 009a4a1b370e..3fde5b322534 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,9 +239,7 @@ struct kvm_mmu {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
-	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
-				  unsigned long addr,
-				  u32 error_code);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    u32 *error);
@@ -288,6 +286,16 @@ struct kvm_vcpu_arch {
 	bool tpr_access_reporting;
 
 	struct kvm_mmu mmu;
+
+	/*
+	 * This struct is filled with the necessary information to propagate a
+	 * page fault into the guest
+	 */
+	struct {
+		u64      address;
+		unsigned error_code;
+	} fault;
+
 	/* only needed in kvm_pv_mmu_op() path, but it's hot so
 	 * put it here to avoid allocation */
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -624,8 +632,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
-			   u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 27d2c22b114e..2b08b78b6cab 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -487,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
 	emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 
-static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
-		       int err)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->cr2 = addr;
-	emulate_exception(ctxt, PF_VECTOR, err, true);
+	emulate_exception(ctxt, PF_VECTOR, 0, true);
 }
 
 static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -834,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt, addr, err);
+			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -921,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	addr = dt.address + index * 8;
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
+		emulate_pf(ctxt);
 
        return ret;
 }
@@ -947,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	addr = dt.address + index * 8;
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		emulate_pf(ctxt, addr, err);
+		emulate_pf(ctxt);
 
 	return ret;
 }
@@ -1117,7 +1115,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					&err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt, c->dst.addr.mem, err);
+			emulate_pf(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
@@ -1939,7 +1937,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -1949,7 +1947,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -1957,7 +1955,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, new_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -1970,7 +1968,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt, new_tss_base, err);
+			emulate_pf(ctxt);
 			return ret;
 		}
 	}
@@ -2081,7 +2079,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2091,7 +2089,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, old_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2099,7 +2097,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		emulate_pf(ctxt, new_tss_base, err);
+		emulate_pf(ctxt);
 		return ret;
 	}
 
@@ -2112,7 +2110,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			emulate_pf(ctxt, new_tss_base, err);
+			emulate_pf(ctxt);
 			return ret;
 		}
 	}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 86f7557cf3fb..99367274b97c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2566,11 +2566,9 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr3;
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-			      u64 addr,
-			      u32 err_code)
+static void inject_page_fault(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code);
+	vcpu->arch.mmu.inject_page_fault(vcpu);
 }
 
 static void paging_free(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 68ee1b7fa89f..d07f48a06f09 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -258,6 +258,10 @@ error:
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
+
+	vcpu->arch.fault.address    = addr;
+	vcpu->arch.fault.error_code = walker->error_code;
+
 	trace_kvm_mmu_walker_error(walker->error_code);
 	return 0;
 }
@@ -521,7 +525,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		inject_page_fault(vcpu, addr, walker.error_code);
+		inject_page_fault(vcpu);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 		return 0;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d434777154d..48b74d2fbfb7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -329,11 +329,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
-			   u32 error_code)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
 {
+	unsigned error_code = vcpu->arch.fault.error_code;
+
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = addr;
+	vcpu->arch.cr2 = vcpu->arch.fault.address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
@@ -4080,7 +4081,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	if (ctxt->exception == PF_VECTOR)
-		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+		kvm_inject_page_fault(vcpu);
 	else if (ctxt->error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
 	else
-- 
cgit v1.2.3


From 14dfe855f978181cd611ec018e5ceba860a98545 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:49 +0200
Subject: KVM: X86: Introduce pointer to mmu context used for gva_to_gpa

This patch introduces the walk_mmu pointer which points to
the mmu-context currently used for gva_to_gpa translations.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 13 +++++++++++++
 arch/x86/kvm/mmu.c              | 10 +++++-----
 arch/x86/kvm/x86.c              | 17 ++++++++++-------
 3 files changed, 28 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4915b7c8f2ec..1b3eb8a0a1bc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,8 +286,21 @@ struct kvm_vcpu_arch {
 	u64 ia32_misc_enable_msr;
 	bool tpr_access_reporting;
 
+	/*
+	 * Paging state of the vcpu
+	 *
+	 * If the vcpu runs in guest mode with two level paging this still saves
+	 * the paging mode of the l1 guest. This context is always used to
+	 * handle faults.
+	 */
 	struct kvm_mmu mmu;
 
+	/*
+	 * Pointer to the mmu context currently used for
+	 * gva_to_gpa translations.
+	 */
+	struct kvm_mmu *walk_mmu;
+
 	/*
 	 * This struct is filled with the necessary information to propagate a
 	 * page fault into the guest
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 99367274b97c..cb06adac92b1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2708,7 +2708,7 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu,
 
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu *context = &vcpu->arch.mmu;
+	struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
@@ -2767,11 +2767,11 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
-	int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
 
-	vcpu->arch.mmu.set_cr3           = kvm_x86_ops->set_cr3;
-	vcpu->arch.mmu.get_cr3           = get_cr3;
-	vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault;
+	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
+	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
+	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
 	return r;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2364c2cad891..4196fc719142 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3456,27 +3456,27 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_FETCH_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
 }
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
-	return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3487,7 +3487,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+							    error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3542,8 +3543,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
-						       PFERR_WRITE_MASK, error);
+		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+							     PFERR_WRITE_MASK,
+							     error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -5663,6 +5665,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm = vcpu->kvm;
 
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
-- 
cgit v1.2.3


From 6539e738f65a8f1fc7806295d5d701fba4008343 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:50 +0200
Subject: KVM: MMU: Implement nested gva_to_gpa functions

This patch adds the functions to do a nested l2_gva to
l1_gpa page table walk.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 10 ++++++++++
 arch/x86/kvm/mmu.c              |  8 ++++++++
 arch/x86/kvm/paging_tmpl.h      | 31 +++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.h              |  5 +++++
 4 files changed, 54 insertions(+)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1b3eb8a0a1bc..8ec3547c433d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -295,6 +295,16 @@ struct kvm_vcpu_arch {
 	 */
 	struct kvm_mmu mmu;
 
+	/*
+	 * Paging state of an L2 guest (used for nested npt)
+	 *
+	 * This context will save all necessary information to walk page tables
+	 * of the an L2 guest. This context is only initialized for page table
+	 * walking and not for faulting since we never handle l2 page faults on
+	 * the host.
+	 */
+	struct kvm_mmu nested_mmu;
+
 	/*
 	 * Pointer to the mmu context currently used for
 	 * gva_to_gpa translations.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb06adac92b1..1e215e8b9377 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2466,6 +2466,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vaddr;
 }
 
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+					 u32 access, u32 *error)
+{
+	if (error)
+		*error = 0;
+	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a704a8130e44..eefe363156b9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -276,6 +276,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 					write_fault, user_fault, fetch_fault);
 }
 
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+				   struct kvm_vcpu *vcpu, gva_t addr,
+				   int write_fault, int user_fault,
+				   int fetch_fault)
+{
+	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+					addr, write_fault, user_fault,
+					fetch_fault);
+}
+
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
@@ -660,6 +670,27 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+				      u32 access, u32 *error)
+{
+	struct guest_walker walker;
+	gpa_t gpa = UNMAPPED_GVA;
+	int r;
+
+	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr,
+				    access & PFERR_WRITE_MASK,
+				    access & PFERR_USER_MASK,
+				    access & PFERR_FETCH_MASK);
+
+	if (r) {
+		gpa = gfn_to_gpa(walker.gfn);
+		gpa |= vaddr & ~PAGE_MASK;
+	} else if (error)
+		*error = walker.error_code;
+
+	return gpa;
+}
+
 static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu_page *sp)
 {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d6385e44ccf..bf4dc2f40d7f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
-- 
cgit v1.2.3


From 02f59dc9f1f51d2148d87d48f84adb455a4fd697 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:54 +0200
Subject: KVM: MMU: Introduce init_kvm_nested_mmu()

This patch introduces the init_kvm_nested_mmu() function
which is used to re-initialize the nested mmu when the l2
guest changes its paging mode.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 37 ++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/x86.c | 17 +++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1e215e8b9377..a26f13bd34e0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2784,11 +2784,46 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+	g_context->get_cr3           = get_cr3;
+	g_context->inject_page_fault = kvm_inject_page_fault;
+
+	/*
+	 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
+	 * translation of l2_gpa to l1_gpa addresses is done using the
+	 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
+	 * functions between mmu and nested_mmu are swapped.
+	 */
+	if (!is_paging(vcpu)) {
+		g_context->root_level = 0;
+		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+	} else if (is_long_mode(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
+		g_context->root_level = PT64_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+	} else if (is_pae(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
+		g_context->root_level = PT32E_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+	} else {
+		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
+		g_context->root_level = PT32_ROOT_LEVEL;
+		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+	}
+
+	return 0;
+}
+
 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.update_pte.pfn = bad_pfn;
 
-	if (tdp_enabled)
+	if (mmu_is_nested(vcpu))
+		return init_kvm_nested_mmu(vcpu);
+	else if (tdp_enabled)
 		return init_kvm_tdp_mmu(vcpu);
 	else
 		return init_kvm_softmmu(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e7..513abbb5ff46 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -47,6 +47,7 @@
 #define PFERR_USER_MASK (1U << 2)
 #define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
+#define PFERR_NESTED_MASK (1U << 31)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46843ed36dc1..e4c76bf86081 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3489,6 +3489,22 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 	return gpa;
 }
 
+static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	gpa_t t_gpa;
+	u32 error;
+
+	BUG_ON(!mmu_is_nested(vcpu));
+
+	/* NPT walks are always user-walks */
+	access |= PFERR_USER_MASK;
+	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
+	if (t_gpa == UNMAPPED_GVA)
+		vcpu->arch.fault.error_code |= PFERR_NESTED_MASK;
+
+	return t_gpa;
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -5704,6 +5720,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
-- 
cgit v1.2.3


From d41d1895eb856b5d1c82f3be106b7a3e75e4216b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:58 +0200
Subject: KVM: MMU: Introduce kvm_pdptr_read_mmu

This function is implemented to load the pdptr pointers of
the currently running guest (l1 or l2 guest). Therefore it
takes care about the current paging mode and can read pdptrs
out of l2 guest physical memory.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/kvm_cache_regs.h | 7 +++++++
 arch/x86/kvm/mmu.c            | 2 +-
 arch/x86/kvm/paging_tmpl.h    | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a37abe2ec39a..975bb45329a1 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -45,6 +45,13 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 	return vcpu->arch.walk_mmu->pdptrs[index];
 }
 
+static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
+{
+	load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
+
+	return mmu->pdptrs[index];
+}
+
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
 	ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a26f13bd34e0..a25173a0d8b9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2398,7 +2398,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-			pdptr = kvm_pdptr_read(vcpu, i);
+			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
 			if (!is_present_gpte(pdptr)) {
 				vcpu->arch.mmu.pae_root[i] = 0;
 				continue;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f4e09d341e28..a28f09bb76c6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -137,7 +137,7 @@ walk:
 
 #if PTTYPE == 64
 	if (walker->level == PT32E_ROOT_LEVEL) {
-		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+		pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		if (!is_present_gpte(pte)) {
 			present = false;
-- 
cgit v1.2.3


From 651dd37a9ce6fdacdcd75da86619c62111efcbc2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:30:59 +0200
Subject: KVM: MMU: Refactor mmu_alloc_roots function

This patch factors out the direct-mapping paths of the
mmu_alloc_roots function into a seperate function. This
makes it a lot easier to avoid all the unnecessary checks
done in the shadow path which may break when running direct.
In fact, this patch already fixes a problem when running PAE
guests on a PAE shadow page table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a25173a0d8b9..9cd5a717ede5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2357,42 +2357,77 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	return ret;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
+		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
+				      1, ACC_ALL, NULL);
+		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+		for (i = 0; i < 4; ++i) {
+			hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+			ASSERT(!VALID_PAGE(root));
+			spin_lock(&vcpu->kvm->mmu_lock);
+			kvm_mmu_free_some_pages(vcpu);
+			sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+					      PT32_ROOT_LEVEL, 1, ACC_ALL,
+					      NULL);
+			root = __pa(sp->spt);
+			++sp->root_count;
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+			vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+		}
+	} else
+		BUG();
+
+	return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
-	int direct = 0;
 	u64 pdptr;
 
 	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
 
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (mmu_check_root(vcpu, root_gfn))
+		return 1;
+
+	/*
+	 * Do we shadow a long mode page table? If so we need to
+	 * write-protect the guests page table root.
+	 */
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
-		if (mmu_check_root(vcpu, root_gfn))
-			return 1;
-		if (vcpu->arch.mmu.direct_map) {
-			direct = 1;
-			root_gfn = 0;
-		}
+
 		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_free_some_pages(vcpu);
-		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				      PT64_ROOT_LEVEL, direct,
-				      ACC_ALL, NULL);
+		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+				      0, ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu.root_hpa = root;
 		return 0;
 	}
-	direct = !is_paging(vcpu);
-
-	if (mmu_check_root(vcpu, root_gfn))
-		return 1;
 
+	/*
+	 * We shadow a 32 bit page table. This may be a legacy 2-level
+	 * or a PAE 3-level page table.
+	 */
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2406,16 +2441,11 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = pdptr >> PAGE_SHIFT;
 			if (mmu_check_root(vcpu, root_gfn))
 				return 1;
-		} else if (vcpu->arch.mmu.root_level == 0)
-			root_gfn = 0;
-		if (vcpu->arch.mmu.direct_map) {
-			direct = 1;
-			root_gfn = i << 30;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-				      PT32_ROOT_LEVEL, direct,
+				      PT32_ROOT_LEVEL, 0,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
@@ -2427,6 +2457,14 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.mmu.direct_map)
+		return mmu_alloc_direct_roots(vcpu);
+	else
+		return mmu_alloc_shadow_roots(vcpu);
+}
+
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
-- 
cgit v1.2.3


From 81407ca553c0c852b8cd3f38f3ec362d307f829b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:00 +0200
Subject: KVM: MMU: Allow long mode shadows for legacy page tables

Currently the KVM softmmu implementation can not shadow a 32
bit legacy or PAE page table with a long mode page table.
This is a required feature for nested paging emulation
because the nested page table must alway be in host format.
So this patch implements the missing pieces to allow long
mode page tables for page table types.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 60 +++++++++++++++++++++++++++++++++++------
 2 files changed, 53 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e70de376544..bd59b482f1a8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -256,6 +256,7 @@ struct kvm_mmu {
 	bool direct_map;
 
 	u64 *pae_root;
+	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
 
 	u64 pdptrs[4]; /* pae */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cd5a717ede5..dd76765310ce 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1504,6 +1504,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 	iterator->addr = addr;
 	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
 	iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+	if (iterator->level == PT64_ROOT_LEVEL &&
+	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+	    !vcpu->arch.mmu.direct_map)
+		--iterator->level;
+
 	if (iterator->level == PT32E_ROOT_LEVEL) {
 		iterator->shadow_addr
 			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -2314,7 +2320,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
+	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+	     vcpu->arch.mmu.direct_map)) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		sp = page_header(root);
@@ -2394,10 +2402,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
-	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
-	u64 pdptr;
+	u64 pdptr, pm_mask;
+	gfn_t root_gfn;
+	int i;
 
 	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
 
@@ -2426,8 +2434,13 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 	/*
 	 * We shadow a 32 bit page table. This may be a legacy 2-level
-	 * or a PAE 3-level page table.
+	 * or a PAE 3-level page table. In either case we need to be aware that
+	 * the shadow page table may be a PAE or a long mode page table.
 	 */
+	pm_mask = PT_PRESENT_MASK;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2451,9 +2464,35 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 
-		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 	}
-	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+	/*
+	 * If we shadow a 32 bit page table with a long mode page
+	 * table we enter this path.
+	 */
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		if (vcpu->arch.mmu.lm_root == NULL) {
+			/*
+			 * The additional page necessary for this is only
+			 * allocated on demand.
+			 */
+
+			u64 *lm_root;
+
+			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+			if (lm_root == NULL)
+				return 1;
+
+			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+			vcpu->arch.mmu.lm_root = lm_root;
+		}
+
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+	}
+
 	return 0;
 }
 
@@ -2470,9 +2509,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	int i;
 	struct kvm_mmu_page *sp;
 
+	if (vcpu->arch.mmu.direct_map)
+		return;
+
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
@@ -3253,6 +3295,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+	if (vcpu->arch.mmu.lm_root != NULL)
+		free_page((unsigned long)vcpu->arch.mmu.lm_root);
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 2d48a985c7bbcd72b4e92e301ea96bf1252ffc61 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:01 +0200
Subject: KVM: MMU: Track NX state in struct kvm_mmu

With Nested Paging emulation the NX state between the two
MMU contexts may differ. To make sure that always the right
fault error code is recorded this patch moves the NX state
into struct kvm_mmu so that the code can distinguish between
L1 and L2 NX state.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              | 16 +++++++++++++++-
 arch/x86/kvm/paging_tmpl.h      |  4 ++--
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bd59b482f1a8..b43686a44877 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -259,6 +259,8 @@ struct kvm_mmu {
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
 
+	bool nx;
+
 	u64 pdptrs[4]; /* pae */
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd76765310ce..95cbeed74cf9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2634,6 +2634,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
+	context->nx = false;
 	return 0;
 }
 
@@ -2687,7 +2688,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
-	if (!is_nx(vcpu))
+	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
 	switch (level) {
 	case PT32_ROOT_LEVEL:
@@ -2746,6 +2747,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					struct kvm_mmu *context,
 					int level)
 {
+	context->nx = is_nx(vcpu);
+
 	reset_rsvds_bits_mask(vcpu, context, level);
 
 	ASSERT(is_pae(vcpu));
@@ -2772,6 +2775,8 @@ static int paging64_init_context(struct kvm_vcpu *vcpu,
 static int paging32_init_context(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu *context)
 {
+	context->nx = false;
+
 	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 
 	context->new_cr3 = paging_new_cr3;
@@ -2810,19 +2815,24 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
 	context->inject_page_fault = kvm_inject_page_fault;
+	context->nx = is_nx(vcpu);
 
 	if (!is_paging(vcpu)) {
+		context->nx = false;
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
+		context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
+		context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
+		context->nx = false;
 		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
@@ -2878,17 +2888,21 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 	 * functions between mmu and nested_mmu are swapped.
 	 */
 	if (!is_paging(vcpu)) {
+		g_context->nx = false;
 		g_context->root_level = 0;
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 	} else if (is_long_mode(vcpu)) {
+		g_context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 		g_context->root_level = PT64_ROOT_LEVEL;
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else if (is_pae(vcpu)) {
+		g_context->nx = is_nx(vcpu);
 		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 		g_context->root_level = PT32E_ROOT_LEVEL;
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else {
+		g_context->nx = false;
 		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 		g_context->root_level = PT32_ROOT_LEVEL;
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a28f09bb76c6..2bdd843ad63f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -105,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 #if PTTYPE == 64
-	if (is_nx(vcpu))
+	if (vcpu->arch.mmu.nx)
 		access &= ~(gpte >> PT64_NX_SHIFT);
 #endif
 	return access;
@@ -272,7 +272,7 @@ error:
 		walker->error_code |= PFERR_WRITE_MASK;
 	if (user_fault)
 		walker->error_code |= PFERR_USER_MASK;
-	if (fetch_fault && is_nx(vcpu))
+	if (fetch_fault && mmu->nx)
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
-- 
cgit v1.2.3


From 4b16184c1ccafa4b0c188c622ea532fb90e6f5b0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 10 Sep 2010 17:31:03 +0200
Subject: KVM: SVM: Initialize Nested Nested MMU context on VMRUN

This patch adds code to initialize the Nested Nested Paging
MMU context when the L1 guest executes a VMRUN instruction
and has nested paging enabled in its VMCB.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c |  1 +
 arch/x86/kvm/svm.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 42 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95cbeed74cf9..6e248d80e350 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2962,6 +2962,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
 	mmu_free_roots(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9a9a4405b571..3184772dedfe 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -294,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 	force_new_asid(vcpu);
 }
 
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+	return PT64_ROOT_LEVEL;
+#else
+	return PT32E_ROOT_LEVEL;
+#endif
+}
+
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	vcpu->arch.efer = efer;
@@ -1630,6 +1639,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
 	nested_svm_vmexit(svm);
 }
 
+static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
+	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
+	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
+	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+	vcpu->arch.mmu.shadow_root_level = get_npt_level();
+	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+	return r;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 static int nested_svm_check_permissions(struct vcpu_svm *svm)
 {
 	if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1998,6 +2027,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
+	svm->nested.nested_cr3 = 0;
+
 	/* Restore selected save entries */
 	svm->vmcb->save.es = hsave->save.es;
 	svm->vmcb->save.cs = hsave->save.cs;
@@ -2024,6 +2055,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
+	nested_svm_uninit_mmu_context(&svm->vcpu);
 	kvm_mmu_reset_context(&svm->vcpu);
 	kvm_mmu_load(&svm->vcpu);
 
@@ -2071,6 +2103,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 	if (vmcb->control.asid == 0)
 		return false;
 
+	if (vmcb->control.nested_ctl && !npt_enabled)
+		return false;
+
 	return true;
 }
 
@@ -2143,6 +2178,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
 
+	if (nested_vmcb->control.nested_ctl) {
+		kvm_mmu_unload(&svm->vcpu);
+		svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+		nested_svm_init_mmu_context(&svm->vcpu);
+	}
+
 	/* Load the nested guest state */
 	svm->vmcb->save.es = nested_vmcb->save.es;
 	svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -3415,15 +3456,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
-	return PT64_ROOT_LEVEL;
-#else
-	return PT32E_ROOT_LEVEL;
-#endif
-}
-
 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
-- 
cgit v1.2.3


From b0bc3ee2b54fcea0df42cc9aa05103b1ccd89db0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 13 Sep 2010 16:45:28 +0200
Subject: KVM: MMU: Fix regression with ept memory types merged into non-ept
 page tables

Commit "KVM: MMU: Make tdp_enabled a mmu-context parameter" made real-mode
set ->direct_map, and changed the code that merges in the memory type depend
on direct_map instead of tdp_enabled.  However, in this case what really
matters is tdp, not direct_map, since tdp changes the pte format regardless
of whether the mapping is direct or not.

As a result, real-mode shadow mappings got corrupted with ept memory types.
The result was a huge slowdown, likely due to the cache being disabled.

Change it back as the simplest fix for the regression (real fix is to move
all that to vmx code, and not use tdp_enabled as a synonym for ept).

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6e248d80e350..3ce56bfe056e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1980,7 +1980,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_user_mask;
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (vcpu->arch.mmu.direct_map)
+	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
 			kvm_is_mmio_pfn(pfn));
 
-- 
cgit v1.2.3


From cb16a7b3872e9a806f16b1f09b59103fafc7b796 Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Sat, 18 Sep 2010 08:41:02 +0800
Subject: KVM: MMU: fix counting of rmap entries in rmap_add()

It seems that rmap entries are under counted.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ce56bfe056e..c94c43289f53 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -632,6 +632,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 		desc->sptes[0] = (u64 *)*rmapp;
 		desc->sptes[1] = spte;
 		*rmapp = (unsigned long)desc | 1;
+		++count;
 	} else {
 		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -644,7 +645,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 			desc = desc->more;
 		}
 		for (i = 0; desc->sptes[i]; ++i)
-			;
+			++count;
 		desc->sptes[i] = spte;
 	}
 	return count;
-- 
cgit v1.2.3


From 6292757fb0e758748fdb441861f8c50d397de9f0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:02:12 +0800
Subject: KVM: MMU: update 'root_hpa' out of loop in PAE shadow path

The value of 'vcpu->arch.mmu.pae_root' is not modified, so we can update
'root_hpa' out of the loop.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c94c43289f53..363004699012 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2393,8 +2393,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 			++sp->root_count;
 			spin_unlock(&vcpu->kvm->mmu_lock);
 			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
-			vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 		}
+		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 	} else
 		BUG();
 
@@ -2466,8 +2466,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		spin_unlock(&vcpu->kvm->mmu_lock);
 
 		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
-		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 	}
+	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 
 	/*
 	 * If we shadow a 32 bit page table with a long mode page
-- 
cgit v1.2.3


From 33f91edb9211f5c0392071f9eb01958ec69f2193 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:05:00 +0800
Subject: KVM: MMU: set access bit for direct mapping

Set access bit while setup up direct page table if it's nonpaing or npt enabled,
it's good for CPU's speculate access

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 363004699012..88203fa4ef05 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2240,7 +2240,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			__set_spte(iterator.sptep,
 				   __pa(sp->spt)
 				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				   | shadow_user_mask | shadow_x_mask);
+				   | shadow_user_mask | shadow_x_mask
+				   | shadow_accessed_mask);
 		}
 	}
 	return pt_write;
-- 
cgit v1.2.3


From c42fffe3a3aa8c62b8028fff32d18156f5325c3b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:07:07 +0800
Subject: KVM: MMU: audit: unregister audit tracepoints before module unloaded

fix:

Call Trace:
 [<ffffffffa01e46ba>] ? kvm_mmu_pte_write+0x229/0x911 [kvm]
 [<ffffffffa01c6ba9>] ? gfn_to_memslot+0x39/0xa0 [kvm]
 [<ffffffffa01c6c26>] ? mark_page_dirty+0x16/0x2e [kvm]
 [<ffffffffa01c6d6f>] ? kvm_write_guest_page+0x67/0x7f [kvm]
 [<ffffffff81066fbd>] ? local_clock+0x2a/0x3b
 [<ffffffffa01d52ce>] emulator_write_phys+0x46/0x54 [kvm]
 ......
Code:  Bad RIP value.
RIP  [<ffffffffa0172056>] 0xffffffffa0172056
 RSP <ffff880134f69a70>
CR2: ffffffffa0172056

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 88203fa4ef05..afde64ba118d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3355,15 +3355,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 	return init_kvm_mmu(vcpu);
 }
 
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-	ASSERT(vcpu);
-
-	destroy_kvm_mmu(vcpu);
-	free_mmu_pages(vcpu);
-	mmu_free_memory_caches(vcpu);
-}
-
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
@@ -3662,4 +3653,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 #ifdef CONFIG_KVM_MMU_AUDIT
 #include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
 #endif
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+	ASSERT(vcpu);
+
+	destroy_kvm_mmu(vcpu);
+	free_mmu_pages(vcpu);
+	mmu_free_memory_caches(vcpu);
+	mmu_audit_disable();
+}
-- 
cgit v1.2.3


From 6903074c367cfb13166c2974d6a886fdc7a00d21 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 27 Sep 2010 18:09:29 +0800
Subject: KVM: MMU: audit: check whether have unsync sps after root sync

After root synced, all unsync sps are synced, this patch add a check to make
sure it's no unsync sps in VCPU's page table

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 11 +++++++++--
 arch/x86/kvm/mmu_audit.c | 11 ++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index afde64ba118d..ba7e7646fb78 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -53,14 +53,18 @@ enum {
 	AUDIT_PRE_PAGE_FAULT,
 	AUDIT_POST_PAGE_FAULT,
 	AUDIT_PRE_PTE_WRITE,
-	AUDIT_POST_PTE_WRITE
+	AUDIT_POST_PTE_WRITE,
+	AUDIT_PRE_SYNC,
+	AUDIT_POST_SYNC
 };
 
 char *audit_point_name[] = {
 	"pre page fault",
 	"post page fault",
 	"pre pte write",
-	"post pte write"
+	"post pte write",
+	"pre sync",
+	"post sync"
 };
 
 #undef MMU_DEBUG
@@ -2516,6 +2520,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
@@ -2531,6 +2537,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 66219afcc91e..4aee32c3cf92 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -164,6 +164,14 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 		inspect_spte_has_rmap(vcpu->kvm, sptep);
 }
 
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+	if (audit_point == AUDIT_POST_SYNC && sp->unsync)
+		audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+}
+
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	int i;
@@ -179,7 +187,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
-void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
@@ -215,6 +223,7 @@ static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
 	audit_sptes_have_rmaps(vcpu, sptep, level);
 	audit_mappings(vcpu, sptep, level);
+	audit_spte_after_sync(vcpu, sptep, level);
 }
 
 static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 7ebaf15eefe7b019def72bd9d4420c7bc51ed69e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 3 Oct 2010 18:51:39 +0200
Subject: KVM: MMU: Avoid sign extension in mmu_alloc_direct_roots() pae root
 address

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ba7e7646fb78..dc1b4fb299b7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2374,7 +2374,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
-	int i;
+	unsigned i;
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		spin_lock(&vcpu->kvm->mmu_lock);
-- 
cgit v1.2.3


From 9611c187774f0e20c258c23ced2599c44bd2fef4 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Wed, 6 Oct 2010 14:23:22 +0200
Subject: KVM: fix typo in copyright notice

Fix typo in copyright notice.

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c     | 2 +-
 arch/x86/kvm/i8254.c       | 2 +-
 arch/x86/kvm/i8259.c       | 2 +-
 arch/x86/kvm/irq.c         | 2 +-
 arch/x86/kvm/lapic.c       | 2 +-
 arch/x86/kvm/mmu.c         | 2 +-
 arch/x86/kvm/mmu_audit.c   | 2 +-
 arch/x86/kvm/paging_tmpl.h | 2 +-
 arch/x86/kvm/svm.c         | 2 +-
 arch/x86/kvm/timer.c       | 2 +-
 arch/x86/kvm/vmx.c         | 2 +-
 arch/x86/kvm/x86.c         | 2 +-
 virt/kvm/irq_comm.c        | 2 +-
 virt/kvm/kvm_main.c        | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d0df25d84acd..38b6e8dafaff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *   Avi Kivity <avi@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 2ad40a4ddc34..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2008 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index dd54c5bb2e5e..f628234fbeca 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2007 Intel Corporation
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f994da40ad94..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
 /*
  * irq.c: API for in kernel interrupt controller
  * Copyright (c) 2007, Intel Corporation.
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c6f2f159384a..82118087d9e9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
- * Copyright 2009 Red Hat, Inc. and/or its affilates.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Dor Laor <dor.laor@qumranet.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc1b4fb299b7..eb65b9c5ea40 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 4aee32c3cf92..ba2bcdde6221 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -4,7 +4,7 @@
  * Audit code for KVM MMU
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9a5f7bb5f840..cd7a833a3b52 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c929d007696d..82e144a4e514 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
  * AMD SVM support
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..fc7a101c4a35 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
  *
  * timer support
  *
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 007be8402efb..8da0e45ff7c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e96038e1bc3a..dcee64e4434f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 369e38010ad5..8edca9141b78 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -17,7 +17,7 @@
  * Authors:
  *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
  *
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  */
 
 #include <linux/kvm_host.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b8499f544e1d..1aeeb7fbe2ef 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5,7 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
-- 
cgit v1.2.3


From 77db5cbd29b7cb0e0fb4fd146e7f7ac2831a025a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 8 Oct 2010 16:24:15 +0800
Subject: KVM: MCE: Send SRAR SIGBUS directly

Originally, SRAR SIGBUS is sent to QEMU-KVM via touching the poisoned
page. But commit 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the
signal from being sent. So now the signal is sent via
force_sig_info_fault directly.

[marcelo: use send_sig_info instead]

Reported-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eb65b9c5ea40..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2251,22 +2251,24 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	return pt_write;
 }
 
-static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
 {
-	char buf[1];
-	void __user *hva;
-	int r;
+	siginfo_t info;
+
+	info.si_signo	= SIGBUS;
+	info.si_errno	= 0;
+	info.si_code	= BUS_MCEERR_AR;
+	info.si_addr	= (void __user *)address;
+	info.si_addr_lsb = PAGE_SHIFT;
 
-	/* Touch the page, so send SIGBUS */
-	hva = (void __user *)gfn_to_hva(kvm, gfn);
-	r = copy_from_user(buf, hva, 1);
+	send_sig_info(SIGBUS, &info, tsk);
 }
 
 static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 {
 	kvm_release_pfn_clean(pfn);
 	if (is_hwpoison_pfn(pfn)) {
-		kvm_send_hwpoison_signal(kvm, gfn);
+		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
 		return 0;
 	} else if (is_fault_pfn(pfn))
 		return -EFAULT;
-- 
cgit v1.2.3