diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-08 05:27:39 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-08 05:27:39 -0400 |
commit | e4e65676f272adb63655a2ca95207e8212d282f1 (patch) | |
tree | 3679a3e6897d698ee949642660281e7f74e2852b /arch/x86/kvm/mmu.c | |
parent | f89f4a06a59f30dec64b2afc4111426fc01e9e12 (diff) | |
parent | f439ed27f8b8b90d243ae15acb193d37f96eebe0 (diff) | |
download | linux-e4e65676f272adb63655a2ca95207e8212d282f1.tar.bz2 |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
"Fixes and features for 3.18.
Apart from the usual cleanups, here is the summary of new features:
- s390 moves closer towards host large page support
- PowerPC has improved support for debugging (both inside the guest
and via gdbstub) and support for e6500 processors
- ARM/ARM64 support read-only memory (which is necessary to put
firmware in emulated NOR flash)
- x86 has the usual emulator fixes and nested virtualization
improvements (including improved Windows support on Intel and
Jailhouse hypervisor support on AMD), adaptive PLE which helps
overcommitting of huge guests. Also included are some patches that
make KVM more friendly to memory hot-unplug, and fixes for rare
caching bugs.
Two patches have trivial mm/ parts that were acked by Rik and Andrew.
Note: I will soon switch to a subkey for signing purposes"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (157 commits)
kvm: do not handle APIC access page if in-kernel irqchip is not in use
KVM: s390: count vcpu wakeups in stat.halt_wakeup
KVM: s390/facilities: allow TOD-CLOCK steering facility bit
KVM: PPC: BOOK3S: HV: CMA: Reserve cma region only in hypervisor mode
arm/arm64: KVM: Report correct FSC for unsupported fault types
arm/arm64: KVM: Fix VTTBR_BADDR_MASK and pgd alloc
kvm: Fix kvm_get_page_retry_io __gup retval check
arm/arm64: KVM: Fix set_clear_sgi_pend_reg offset
kvm: x86: Unpin and remove kvm_arch->apic_access_page
kvm: vmx: Implement set_apic_access_page_addr
kvm: x86: Add request bit to reload APIC access page address
kvm: Add arch specific mmu notifier for page invalidation
kvm: Rename make_all_cpus_request() to kvm_make_all_cpus_request() and make it non-static
kvm: Fix page ageing bugs
kvm/x86/mmu: Pass gfn and level to rmapp callback.
x86: kvm: use alternatives for VMCALL vs. VMMCALL if kernel text is read-only
kvm: x86: use macros to compute bank MSRs
KVM: x86: Remove debug assertion of non-PAE reserved bits
kvm: don't take vcpu mutex for obviously invalid vcpu ioctls
kvm: Faults which trigger IO release the mmap_sem
...
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 139 |
1 files changed, 77 insertions, 62 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 931467881da7..3201e93ebd07 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); /* - * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, - * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation - * number. + * the low bit of the generation number is always presumed to be zero. + * This disables mmio caching during memslot updates. The concept is + * similar to a seqcount but instead of retrying the access we just punt + * and ignore the cache. + * + * spte bits 3-11 are used as bits 1-9 of the generation number, + * the bits 52-61 are used as bits 10-19 of the generation number. */ -#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_LOW_SHIFT 2 #define MMIO_SPTE_GEN_HIGH_SHIFT 52 -#define MMIO_GEN_SHIFT 19 -#define MMIO_GEN_LOW_SHIFT 9 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_SHIFT 20 +#define MMIO_GEN_LOW_SHIFT 10 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) #define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) @@ -236,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte) static unsigned int kvm_current_mmio_generation(struct kvm *kvm) { - /* - * Init kvm generation close to MMIO_MAX_GEN to easily test the - * code of handling generation number wrap-around. - */ - return (kvm_memslots(kvm)->generation + - MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; + return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; } static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, @@ -296,11 +295,6 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte) return likely(kvm_gen == spte_gen); } -static inline u64 rsvd_bits(int s, int e) -{ - return ((1ULL << (e - s + 1)) - 1) << s; -} - void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { @@ -1180,7 +1174,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * - * Note: write protection is difference between drity logging and spte + * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. @@ -1268,7 +1262,8 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1276,7 +1271,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", + sptep, *sptep, gfn, level); drop_spte(kvm, sptep); need_tlb_flush = 1; @@ -1286,7 +1282,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1300,7 +1297,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!is_shadow_present_pte(*sptep)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", + sptep, *sptep, gfn, level); need_flush = 1; @@ -1334,6 +1332,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, int (*handler)(struct kvm *kvm, unsigned long *rmapp, struct kvm_memory_slot *slot, + gfn_t gfn, + int level, unsigned long data)) { int j; @@ -1363,6 +1363,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { unsigned long idx, idx_end; unsigned long *rmapp; + gfn_t gfn = gfn_start; /* * {idx(page_j) | page_j intersects with @@ -1373,8 +1374,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, rmapp = __gfn_to_rmap(gfn_start, j, memslot); - for (; idx <= idx_end; ++idx) - ret |= handler(kvm, rmapp++, memslot, data); + for (; idx <= idx_end; + ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) + ret |= handler(kvm, rmapp++, memslot, + gfn, j, data); } } @@ -1385,6 +1388,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); @@ -1406,24 +1410,14 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) } static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator uninitialized_var(iter); int young = 0; - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask) { - young = kvm_unmap_rmapp(kvm, rmapp, slot, data); - goto out; - } + BUG_ON(!shadow_accessed_mask); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1435,14 +1429,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, (unsigned long *)sptep); } } -out: - /* @data has hva passed to kvm_age_hva(). */ - trace_kvm_age_page(data, slot, young); + trace_kvm_age_page(gfn, level, slot, young); return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, + int level, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1480,13 +1473,33 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); + kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); + /* + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ + if (!shadow_accessed_mask) { + /* + * We are holding the kvm->mmu_lock, and we are blowing up + * shadow PTEs. MMU notifier consumers need to be kept at bay. + * This is correct as long as we don't decouple the mmu_lock + * protected regions (like invalidate_range_start|end does). + */ + kvm->mmu_notifier_seq++; + return kvm_handle_hva_range(kvm, start, end, 0, + kvm_unmap_rmapp); + } + + return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) @@ -1749,7 +1762,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -1802,7 +1815,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } struct mmu_page_path { @@ -2536,7 +2549,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, true, host_writable)) { if (write_fault) *emulate = 1; - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep) && emulate)) @@ -3163,7 +3176,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - vcpu_clear_mmio_info(vcpu, ~0ul); + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -3206,7 +3219,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, { if (exception) exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -3450,13 +3463,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); - void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); @@ -3518,6 +3524,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; + u64 nonleaf_bit8_rsvd = 0; context->bad_mt_xwr = 0; @@ -3525,6 +3532,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, exb_bit_rsvd = rsvd_bits(63, 63); if (!guest_cpuid_has_gbpages(vcpu)) gbpages_bit_rsvd = rsvd_bits(7, 7); + + /* + * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for + * leaf entries) on AMD CPUs only. + */ + if (guest_cpuid_is_amd(vcpu)) + nonleaf_bit8_rsvd = rsvd_bits(8, 8); + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3559,9 +3574,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3962,7 +3977,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); else if (local_flush) - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -4223,7 +4238,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -4433,7 +4448,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { + if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } |