summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kvm/book3s_hv.c
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@ozlabs.org>2018-10-08 16:31:04 +1100
committerMichael Ellerman <mpe@ellerman.id.au>2018-10-09 16:04:27 +1100
commit360cae313702cdd0b90f82c261a8302fecef030a (patch)
treef5964bcafc1ccff72298a64d9fb8e1acea1830b0 /arch/powerpc/kvm/book3s_hv.c
parent8e3f5fc1045dc49fd175b978c5457f5f51e7a2ce (diff)
downloadlinux-360cae313702cdd0b90f82c261a8302fecef030a.tar.bz2
KVM: PPC: Book3S HV: Nested guest entry via hypercall
This adds a new hypercall, H_ENTER_NESTED, which is used by a nested hypervisor to enter one of its nested guests. The hypercall supplies register values in two structs. Those values are copied by the level 0 (L0) hypervisor (the one which is running in hypervisor mode) into the vcpu struct of the L1 guest, and then the guest is run until an interrupt or error occurs which needs to be reported to L1 via the hypercall return value. Currently this assumes that the L0 and L1 hypervisors are the same endianness, and the structs passed as arguments are in native endianness. If they are of different endianness, the version number check will fail and the hcall will be rejected. Nested hypervisors do not support indep_threads_mode=N, so this adds code to print a warning message if the administrator has set indep_threads_mode=N, and treat it as Y. Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Paul Mackerras <paulus@ozlabs.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/kvm/book3s_hv.c')
-rw-r--r--arch/powerpc/kvm/book3s_hv.c214
1 files changed, 184 insertions, 30 deletions
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d8fc49effab0..7a2f344af4f8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -942,6 +942,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break;
case H_ENTER_NESTED:
ret = H_FUNCTION;
+ if (!vcpu->kvm->arch.nested_enable)
+ break;
+ ret = kvmhv_enter_nested_guest(vcpu);
+ if (ret == H_INTERRUPT) {
+ kvmppc_set_gpr(vcpu, 3, 0);
+ return -EINTR;
+ }
break;
case H_TLB_INVALIDATE:
ret = H_FUNCTION;
@@ -1272,6 +1279,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
return r;
}
+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+{
+ int r;
+ int srcu_idx;
+
+ vcpu->stat.sum_exits++;
+
+ /*
+ * This can happen if an interrupt occurs in the last stages
+ * of guest entry or the first stages of guest exit (i.e. after
+ * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+ * and before setting it to KVM_GUEST_MODE_HOST_HV).
+ * That can happen due to a bug, or due to a machine check
+ * occurring at just the wrong time.
+ */
+ if (vcpu->arch.shregs.msr & MSR_HV) {
+ pr_emerg("KVM trap in HV mode while nested!\n");
+ pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+ vcpu->arch.trap, kvmppc_get_pc(vcpu),
+ vcpu->arch.shregs.msr);
+ kvmppc_dump_regs(vcpu);
+ return RESUME_HOST;
+ }
+ switch (vcpu->arch.trap) {
+ /* We're good on these - the host merely wanted to get our attention */
+ case BOOK3S_INTERRUPT_HV_DECREMENTER:
+ vcpu->stat.dec_exits++;
+ r = RESUME_GUEST;
+ break;
+ case BOOK3S_INTERRUPT_EXTERNAL:
+ vcpu->stat.ext_intr_exits++;
+ r = RESUME_HOST;
+ break;
+ case BOOK3S_INTERRUPT_H_DOORBELL:
+ case BOOK3S_INTERRUPT_H_VIRT:
+ vcpu->stat.ext_intr_exits++;
+ r = RESUME_GUEST;
+ break;
+ /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
+ case BOOK3S_INTERRUPT_HMI:
+ case BOOK3S_INTERRUPT_PERFMON:
+ case BOOK3S_INTERRUPT_SYSTEM_RESET:
+ r = RESUME_GUEST;
+ break;
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /* Pass the machine check to the L1 guest */
+ r = RESUME_HOST;
+ /* Print the MCE event to host console. */
+ machine_check_print_event_info(&vcpu->arch.mce_evt, false);
+ break;
+ /*
+ * We get these next two if the guest accesses a page which it thinks
+ * it has mapped but which is not actually present, either because
+ * it is for an emulated I/O device or because the corresonding
+ * host page has been paged out.
+ */
+ case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvmhv_nested_page_fault(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ break;
+ case BOOK3S_INTERRUPT_H_INST_STORAGE:
+ vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+ vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
+ DSISR_SRR1_MATCH_64S;
+ if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+ vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvmhv_nested_page_fault(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+ /*
+ * This occurs for various TM-related instructions that
+ * we need to emulate on POWER9 DD2.2. We have already
+ * handled the cases where the guest was in real-suspend
+ * mode and was transitioning to transactional state.
+ */
+ r = kvmhv_p9_tm_emulation(vcpu);
+ break;
+#endif
+
+ case BOOK3S_INTERRUPT_HV_RM_HARD:
+ vcpu->arch.trap = 0;
+ r = RESUME_GUEST;
+ if (!xive_enabled())
+ kvmppc_xics_rm_complete(vcpu, 0);
+ break;
+ default:
+ r = RESUME_HOST;
+ break;
+ }
+
+ return r;
+}
+
static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -3098,7 +3203,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/*
* Load up hypervisor-mode registers on P9.
*/
-static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit)
+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
+ unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
s64 hdec;
@@ -3155,7 +3261,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit)
mtspr(SPRN_AMOR, ~0UL);
- mtspr(SPRN_LPCR, vc->lpcr);
+ mtspr(SPRN_LPCR, lpcr);
isync();
kvmppc_xive_push_vcpu(vcpu);
@@ -3225,7 +3331,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit)
* Virtual-mode guest entry for POWER9 and later when the host and
* guest are both using the radix MMU. The LPIDR has already been set.
*/
-int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit)
+int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+ unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long host_dscr = mfspr(SPRN_DSCR);
@@ -3291,14 +3398,32 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit)
mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
- if (vcpu->arch.doorbell_request) {
- vc->dpdes = 1;
- smp_wmb();
- vcpu->arch.doorbell_request = 0;
+ if (kvmhv_on_pseries()) {
+ /* call our hypervisor to load up HV regs and go */
+ struct hv_guest_state hvregs;
+
+ kvmhv_save_hv_regs(vcpu, &hvregs);
+ hvregs.lpcr = lpcr;
+ vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+ hvregs.version = HV_GUEST_STATE_VERSION;
+ if (vcpu->arch.nested) {
+ hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+ hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+ } else {
+ hvregs.lpid = vcpu->kvm->arch.lpid;
+ hvregs.vcpu_token = vcpu->vcpu_id;
+ }
+ hvregs.hdec_expiry = time_limit;
+ trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+ __pa(&vcpu->arch.regs));
+ kvmhv_restore_hv_return_state(vcpu, &hvregs);
+ vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+ vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+ vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+ } else {
+ trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
}
- trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit);
-
vcpu->arch.slb_max = 0;
dec = mfspr(SPRN_DEC);
tb = mftb();
@@ -3539,6 +3664,11 @@ out:
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
+/*
+ * This never fails for a radix guest, as none of the operations it does
+ * for a radix guest can fail or have a way to report failure.
+ * kvmhv_run_single_vcpu() relies on this fact.
+ */
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
int r = 0;
@@ -3688,13 +3818,16 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return vcpu->arch.ret;
}
-static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
- struct kvm_vcpu *vcpu, u64 time_limit)
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
+ struct kvm_vcpu *vcpu, u64 time_limit,
+ unsigned long lpcr)
{
int trap, r, pcpu, pcpu0;
int srcu_idx;
struct kvmppc_vcore *vc;
struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
+ unsigned long lpid;
trace_kvmppc_run_vcpu_enter(vcpu);
@@ -3715,16 +3848,8 @@ static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
vc->runner = vcpu;
/* See if the MMU is ready to go */
- if (!kvm->arch.mmu_ready) {
- r = kvmhv_setup_mmu(vcpu);
- if (r) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.
- hardware_entry_failure_reason = 0;
- vcpu->arch.ret = r;
- goto out;
- }
- }
+ if (!kvm->arch.mmu_ready)
+ kvmhv_setup_mmu(vcpu);
if (need_resched())
cond_resched();
@@ -3746,7 +3871,22 @@ static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
goto out;
- kvmppc_core_prepare_to_enter(vcpu);
+ if (!nested) {
+ kvmppc_core_prepare_to_enter(vcpu);
+ if (vcpu->arch.doorbell_request) {
+ vc->dpdes = 1;
+ smp_wmb();
+ vcpu->arch.doorbell_request = 0;
+ }
+ if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+ &vcpu->arch.pending_exceptions))
+ lpcr |= LPCR_MER;
+ } else if (vcpu->arch.pending_exceptions ||
+ vcpu->arch.doorbell_request ||
+ xive_interrupt_pending(vcpu)) {
+ vcpu->arch.ret = RESUME_HOST;
+ goto out;
+ }
kvmppc_clear_host_core(pcpu);
@@ -3760,7 +3900,10 @@ static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
vc->vcore_state = VCORE_RUNNING;
trace_kvmppc_run_core(vc, 0);
- mtspr(SPRN_LPID, vc->kvm->arch.lpid);
+ lpid = vc->kvm->arch.lpid;
+ if (nested)
+ lpid = nested->shadow_lpid;
+ mtspr(SPRN_LPID, lpid);
isync();
/* See comment above in kvmppc_run_core() about this */
@@ -3769,7 +3912,7 @@ static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
pcpu0 &= ~0x3UL;
if (cpumask_test_cpu(pcpu0, &kvm->arch.need_tlb_flush)) {
- radix__local_flush_tlb_lpid_guest(kvm->arch.lpid);
+ radix__local_flush_tlb_lpid_guest(lpid);
/* Clear the bit after the TLB flush */
cpumask_clear_cpu(pcpu0, &kvm->arch.need_tlb_flush);
}
@@ -3781,7 +3924,7 @@ static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
this_cpu_disable_ftrace();
- trap = kvmhv_p9_guest_entry(vcpu, time_limit);
+ trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
vcpu->arch.trap = trap;
this_cpu_enable_ftrace();
@@ -3809,8 +3952,12 @@ static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
trace_kvm_guest_exit(vcpu);
r = RESUME_GUEST;
- if (trap)
- r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
+ if (trap) {
+ if (!nested)
+ r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
+ else
+ r = kvmppc_handle_nested_exit(vcpu);
+ }
vcpu->arch.ret = r;
if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
@@ -3925,7 +4072,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
do {
if (kvm->arch.threads_indep && kvm_is_radix(kvm))
- r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0);
+ r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
+ vcpu->arch.vcore->lpcr);
else
r = kvmppc_run_vcpu(run, vcpu);
@@ -4479,8 +4627,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* On POWER9, we only need to do this if the "indep_threads_mode"
* module parameter has been set to N.
*/
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- kvm->arch.threads_indep = indep_threads_mode;
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
+ pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
+ kvm->arch.threads_indep = true;
+ } else {
+ kvm->arch.threads_indep = indep_threads_mode;
+ }
+ }
if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();