summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt37
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h1
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_host.h9
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h2
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h2
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h6
-rw-r--r--arch/powerpc/kernel/asm-offsets.c3
-rw-r--r--arch/powerpc/kernel/mce.c1
-rw-r--r--arch/powerpc/kvm/book3s_hv.c562
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv_interrupts.S20
-rw-r--r--arch/powerpc/kvm/book3s_hv_ras.c18
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S236
-rw-r--r--arch/powerpc/kvm/emulate.c4
-rw-r--r--arch/powerpc/kvm/powerpc.c40
-rw-r--r--include/uapi/linux/kvm.h2
17 files changed, 736 insertions, 211 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 912b7df8215a..3a9831b72945 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4131,6 +4131,34 @@ Parameters: none
Allow use of adapter-interruption suppression.
Returns: 0 on success; -EBUSY if a VCPU has already been created.
+7.11 KVM_CAP_PPC_SMT
+
+Architectures: ppc
+Parameters: vsmt_mode, flags
+
+Enabling this capability on a VM provides userspace with a way to set
+the desired virtual SMT mode (i.e. the number of virtual CPUs per
+virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2
+between 1 and 8. On POWER8, vsmt_mode must also be no greater than
+the number of threads per subcore for the host. Currently flags must
+be 0. A successful call to enable this capability will result in
+vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is
+subsequently queried for the VM. This capability is only supported by
+HV KVM, and can only be set before any VCPUs have been created.
+The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT
+modes are available.
+
+7.12 KVM_CAP_PPC_FWNMI
+
+Architectures: ppc
+Parameters: none
+
+With this capability a machine check exception in the guest address
+space will cause KVM to exit the guest with NMI exit reason. This
+enables QEMU to build error log and branch to guest kernel registered
+machine check handling routine. Without this capability KVM will
+branch to guests' 0x200 interrupt vector.
+
8. Other capabilities.
----------------------
@@ -4292,3 +4320,12 @@ Currently the following bits are defined for the device_irq_level bitmap:
Future versions of kvm may implement additional events. These will get
indicated by returning a higher number from KVM_CHECK_EXTENSION and will be
listed above.
+
+8.10 KVM_CAP_PPC_SMT_POSSIBLE
+
+Architectures: ppc
+
+Querying this capability returns a bitmap indicating the possible
+virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N
+(counting from the right) is set, then a virtual SMT mode of 2^N is
+available.
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 2bf35017ffc0..b8d5b8e35244 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -86,7 +86,6 @@ struct kvmppc_vcore {
u16 last_cpu;
u8 vcore_state;
u8 in_guest;
- struct kvmppc_vcore *master_vcore;
struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
struct list_head preempt_list;
spinlock_t lock;
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index b148496ffe36..7cea76f11c26 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -81,7 +81,7 @@ struct kvm_split_mode {
u8 subcore_size;
u8 do_nap;
u8 napped[MAX_SMT_THREADS];
- struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
+ struct kvmppc_vcore *vc[MAX_SUBCORES];
};
/*
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 50e0bc9723cc..8b3f1238d07f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/hvcall.h>
+#include <asm/mce.h>
#define KVM_MAX_VCPUS NR_CPUS
#define KVM_MAX_VCORES NR_CPUS
@@ -267,6 +268,8 @@ struct kvm_resize_hpt;
struct kvm_arch {
unsigned int lpid;
+ unsigned int smt_mode; /* # vcpus per virtual core */
+ unsigned int emul_smt_mode; /* emualted SMT mode, on P9 */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
unsigned int tlb_sets;
struct kvm_hpt_info hpt;
@@ -285,6 +288,7 @@ struct kvm_arch {
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
u8 radix;
+ u8 fwnmi_enabled;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
@@ -566,6 +570,7 @@ struct kvm_vcpu_arch {
ulong wort;
ulong tid;
ulong psscr;
+ ulong hfscr;
ulong shadow_srr1;
#endif
u32 vrsave; /* also USPRG0 */
@@ -579,7 +584,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
- u32 dec;
+ ulong dec;
#ifdef CONFIG_BOOKE
u32 decar;
#endif
@@ -710,6 +715,7 @@ struct kvm_vcpu_arch {
unsigned long pending_exceptions;
u8 ceded;
u8 prodded;
+ u8 doorbell_request;
u32 last_inst;
struct swait_queue_head *wqp;
@@ -722,6 +728,7 @@ struct kvm_vcpu_arch {
int prev_cpu;
bool timer_running;
wait_queue_head_t cpu_run;
+ struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
struct kvm_vcpu_arch_shared *shared;
#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e0d88c38602b..ba5fadd6f3c9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -315,6 +315,8 @@ struct kvmppc_ops {
struct irq_bypass_producer *);
int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg);
int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
+ int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
+ unsigned long flags);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 3a8d278e7421..1a9b45198c06 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -103,6 +103,8 @@
#define OP_31_XOP_STBUX 247
#define OP_31_XOP_LHZX 279
#define OP_31_XOP_LHZUX 311
+#define OP_31_XOP_MSGSNDP 142
+#define OP_31_XOP_MSGCLRP 174
#define OP_31_XOP_MFSPR 339
#define OP_31_XOP_LWAX 341
#define OP_31_XOP_LHAX 343
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 07fbeb927834..8cf8f0c96906 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -60,6 +60,12 @@ struct kvm_regs {
#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */
+/* flags for kvm_run.flags */
+#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0)
+#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0)
+#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0)
+#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0)
+
/*
* Feature bits indicate which sections of the sregs struct are valid,
* both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 709e23425317..ae8e89e0d083 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -485,6 +485,7 @@ int main(void)
OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls);
OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
OFFSET(KVM_RADIX, kvm, arch.radix);
+ OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
@@ -513,6 +514,7 @@ int main(void)
OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
+ OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc);
@@ -542,6 +544,7 @@ int main(void)
OFFSET(VCPU_WORT, kvm_vcpu, arch.wort);
OFFSET(VCPU_TID, kvm_vcpu, arch.tid);
OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr);
+ OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr);
OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map);
OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest);
OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 5f9eada3519b..a9bfa49f3698 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -405,6 +405,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
break;
}
}
+EXPORT_SYMBOL_GPL(machine_check_print_event_info);
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 42b7a4fd57d9..c6313c5d331c 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -46,6 +46,8 @@
#include <linux/of.h>
#include <asm/reg.h>
+#include <asm/ppc-opcode.h>
+#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -645,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
unsigned long stolen;
unsigned long core_stolen;
u64 now;
+ unsigned long flags;
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
@@ -652,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
core_stolen = vcore_stolen_time(vc, now);
stolen = core_stolen - vcpu->arch.stolen_logged;
vcpu->arch.stolen_logged = core_stolen;
- spin_lock_irq(&vcpu->arch.tbacct_lock);
+ spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
stolen += vcpu->arch.busy_stolen;
vcpu->arch.busy_stolen = 0;
- spin_unlock_irq(&vcpu->arch.tbacct_lock);
+ spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
if (!dt || !vpa)
return;
memset(dt, 0, sizeof(struct dtl_entry));
@@ -675,6 +678,26 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
vcpu->arch.dtl.dirty = true;
}
+/* See if there is a doorbell interrupt pending for a vcpu */
+static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
+{
+ int thr;
+ struct kvmppc_vcore *vc;
+
+ if (vcpu->arch.doorbell_request)
+ return true;
+ /*
+ * Ensure that the read of vcore->dpdes comes after the read
+ * of vcpu->doorbell_request. This barrier matches the
+ * lwsync in book3s_hv_rmhandlers.S just before the
+ * fast_guest_return label.
+ */
+ smp_rmb();
+ vc = vcpu->arch.vcore;
+ thr = vcpu->vcpu_id - vc->first_vcpuid;
+ return !!(vc->dpdes & (1 << thr));
+}
+
static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
@@ -926,6 +949,101 @@ static int kvmppc_emulate_debug_inst(struct kvm_run *run,
}
}
+static void do_nothing(void *x)
+{
+}
+
+static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
+{
+ int thr, cpu, pcpu, nthreads;
+ struct kvm_vcpu *v;
+ unsigned long dpdes;
+
+ nthreads = vcpu->kvm->arch.emul_smt_mode;
+ dpdes = 0;
+ cpu = vcpu->vcpu_id & ~(nthreads - 1);
+ for (thr = 0; thr < nthreads; ++thr, ++cpu) {
+ v = kvmppc_find_vcpu(vcpu->kvm, cpu);
+ if (!v)
+ continue;
+ /*
+ * If the vcpu is currently running on a physical cpu thread,
+ * interrupt it in order to pull it out of the guest briefly,
+ * which will update its vcore->dpdes value.
+ */
+ pcpu = READ_ONCE(v->cpu);
+ if (pcpu >= 0)
+ smp_call_function_single(pcpu, do_nothing, NULL, 1);
+ if (kvmppc_doorbell_pending(v))
+ dpdes |= 1 << thr;
+ }
+ return dpdes;
+}
+
+/*
+ * On POWER9, emulate doorbell-related instructions in order to
+ * give the guest the illusion of running on a multi-threaded core.
+ * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
+ * and mfspr DPDES.
+ */
+static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
+{
+ u32 inst, rb, thr;
+ unsigned long arg;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *tvcpu;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ return EMULATE_FAIL;
+ if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
+ return RESUME_GUEST;
+ if (get_op(inst) != 31)
+ return EMULATE_FAIL;
+ rb = get_rb(inst);
+ thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
+ switch (get_xop(inst)) {
+ case OP_31_XOP_MSGSNDP:
+ arg = kvmppc_get_gpr(vcpu, rb);
+ if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+ break;
+ arg &= 0x3f;
+ if (arg >= kvm->arch.emul_smt_mode)
+ break;
+ tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
+ if (!tvcpu)
+ break;
+ if (!tvcpu->arch.doorbell_request) {
+ tvcpu->arch.doorbell_request = 1;
+ kvmppc_fast_vcpu_kick_hv(tvcpu);
+ }
+ break;
+ case OP_31_XOP_MSGCLRP:
+ arg = kvmppc_get_gpr(vcpu, rb);
+ if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+ break;
+ vcpu->arch.vcore->dpdes = 0;
+ vcpu->arch.doorbell_request = 0;
+ break;
+ case OP_31_XOP_MFSPR:
+ switch (get_sprn(inst)) {
+ case SPRN_TIR:
+ arg = thr;
+ break;
+ case SPRN_DPDES:
+ arg = kvmppc_read_dpdes(vcpu);
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+ kvmppc_set_gpr(vcpu, get_rt(inst), arg);
+ break;
+ default:
+ return EMULATE_FAIL;
+ }
+ kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+ return RESUME_GUEST;
+}
+
static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct task_struct *tsk)
{
@@ -971,15 +1089,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
- /*
- * Deliver a machine check interrupt to the guest.
- * We have to do this, even if the host has handled the
- * machine check, because machine checks use SRR0/1 and
- * the interrupt might have trashed guest state in them.
- */
- kvmppc_book3s_queue_irqprio(vcpu,
- BOOK3S_INTERRUPT_MACHINE_CHECK);
- r = RESUME_GUEST;
+ /* Exit to guest with KVM_EXIT_NMI as exit reason */
+ run->exit_reason = KVM_EXIT_NMI;
+ run->hw.hardware_exit_reason = vcpu->arch.trap;
+ /* Clear out the old NMI status from run->flags */
+ run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
+ /* Now set the NMI status */
+ if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
+ run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
+ else
+ run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
+
+ r = RESUME_HOST;
+ /* Print the MCE event to host console. */
+ machine_check_print_event_info(&vcpu->arch.mce_evt, false);
break;
case BOOK3S_INTERRUPT_PROGRAM:
{
@@ -1048,12 +1171,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
/*
* This occurs if the guest (kernel or userspace), does something that
- * is prohibited by HFSCR. We just generate a program interrupt to
- * the guest.
+ * is prohibited by HFSCR.
+ * On POWER9, this could be a doorbell instruction that we need
+ * to emulate.
+ * Otherwise, we just generate a program interrupt to the guest.
*/
case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
- kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
- r = RESUME_GUEST;
+ r = EMULATE_FAIL;
+ if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG)
+ r = kvmppc_emulate_doorbell_instr(vcpu);
+ if (r == EMULATE_FAIL) {
+ kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+ r = RESUME_GUEST;
+ }
break;
case BOOK3S_INTERRUPT_HV_RM_HARD:
r = RESUME_PASSTHROUGH;
@@ -1143,6 +1273,12 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
mask |= LPCR_AIL;
+ /*
+ * On POWER9, allow userspace to enable large decrementer for the
+ * guest, whether or not the host has it enabled.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ mask |= LPCR_LD;
/* Broken 32-bit version of LPCR must not clear top bits */
if (preserve_top32)
@@ -1486,6 +1622,14 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
break;
case KVM_REG_PPC_TB_OFFSET:
+ /*
+ * POWER9 DD1 has an erratum where writing TBU40 causes
+ * the timebase to lose ticks. So we don't let the
+ * timebase offset be changed on P9 DD1. (It is
+ * initialized to zero.)
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+ break;
/* round up to multiple of 2^24 */
vcpu->arch.vcore->tb_offset =
ALIGN(set_reg_val(id, *val), 1UL << 24);
@@ -1603,7 +1747,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
init_swait_queue_head(&vcore->wq);
vcore->preempt_tb = TB_NIL;
vcore->lpcr = kvm->arch.lpcr;
- vcore->first_vcpuid = core * threads_per_vcore();
+ vcore->first_vcpuid = core * kvm->arch.smt_mode;
vcore->kvm = kvm;
INIT_LIST_HEAD(&vcore->preempt_list);
@@ -1762,14 +1906,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
unsigned int id)
{
struct kvm_vcpu *vcpu;
- int err = -EINVAL;
+ int err;
int core;
struct kvmppc_vcore *vcore;
- core = id / threads_per_vcore();
- if (core >= KVM_MAX_VCORES)
- goto out;
-
err = -ENOMEM;
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
if (!vcpu)
@@ -1800,6 +1940,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
vcpu->arch.busy_preempt = TB_NIL;
vcpu->arch.intr_msr = MSR_SF | MSR_ME;
+ /*
+ * Set the default HFSCR for the guest from the host value.
+ * This value is only used on POWER9.
+ * On POWER9 DD1, TM doesn't work, so we make sure to
+ * prevent the guest from using it.
+ * On POWER9, we want to virtualize the doorbell facility, so we
+ * turn off the HFSCR bit, which causes those instructions to trap.
+ */
+ vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
+ if (!cpu_has_feature(CPU_FTR_TM))
+ vcpu->arch.hfscr &= ~HFSCR_TM;
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ vcpu->arch.hfscr &= ~HFSCR_MSGP;
+
kvmppc_mmu_book3s_hv_init(vcpu);
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -1807,11 +1961,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
init_waitqueue_head(&vcpu->arch.cpu_run);
mutex_lock(&kvm->lock);
- vcore = kvm->arch.vcores[core];
- if (!vcore) {
- vcore = kvmppc_vcore_create(kvm, core);
- kvm->arch.vcores[core] = vcore;
- kvm->arch.online_vcores++;
+ vcore = NULL;
+ err = -EINVAL;
+ core = id / kvm->arch.smt_mode;
+ if (core < KVM_MAX_VCORES) {
+ vcore = kvm->arch.vcores[core];
+ if (!vcore) {
+ err = -ENOMEM;
+ vcore = kvmppc_vcore_create(kvm, core);
+ kvm->arch.vcores[core] = vcore;
+ kvm->arch.online_vcores++;
+ }
}
mutex_unlock(&kvm->lock);
@@ -1839,6 +1999,43 @@ out:
return ERR_PTR(err);
}
+static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
+ unsigned long flags)
+{
+ int err;
+ int esmt = 0;
+
+ if (flags)
+ return -EINVAL;
+ if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
+ return -EINVAL;
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * On POWER8 (or POWER7), the threading mode is "strict",
+ * so we pack smt_mode vcpus per vcore.
+ */
+ if (smt_mode > threads_per_subcore)
+ return -EINVAL;
+ } else {
+ /*
+ * On POWER9, the threading mode is "loose",
+ * so each vcpu gets its own vcore.
+ */
+ esmt = smt_mode;
+ smt_mode = 1;
+ }
+ mutex_lock(&kvm->lock);
+ err = -EBUSY;
+ if (!kvm->arch.online_vcores) {
+ kvm->arch.smt_mode = smt_mode;
+ kvm->arch.emul_smt_mode = esmt;
+ err = 0;
+ }
+ mutex_unlock(&kvm->lock);
+
+ return err;
+}
+
static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
if (vpa->pinned_addr)
@@ -1889,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
}
}
-extern void __kvmppc_vcore_entry(void);
+extern int __kvmppc_vcore_entry(void);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
struct kvm_vcpu *vcpu)
@@ -1954,10 +2151,6 @@ static void kvmppc_release_hwthread(int cpu)
tpaca->kvm_hstate.kvm_split_mode = NULL;
}
-static void do_nothing(void *x)
-{
-}
-
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
int i;
@@ -1975,11 +2168,35 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
smp_call_function_single(cpu + i, do_nothing, NULL, 1);
}
+static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ /*
+ * With radix, the guest can do TLB invalidations itself,
+ * and it could choose to use the local form (tlbiel) if
+ * it is invalidating a translation that has only ever been
+ * used on one vcpu. However, that doesn't mean it has
+ * only ever been used on one physical cpu, since vcpus
+ * can move around between pcpus. To cope with this, when
+ * a vcpu moves from one pcpu to another, we need to tell
+ * any vcpus running on the same core as this vcpu previously
+ * ran to flush the TLB. The TLB is shared between threads,
+ * so we use a single bit in .need_tlb_flush for all 4 threads.
+ */
+ if (vcpu->arch.prev_cpu != pcpu) {
+ if (vcpu->arch.prev_cpu >= 0 &&
+ cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+ cpu_first_thread_sibling(pcpu))
+ radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
+ vcpu->arch.prev_cpu = pcpu;
+ }
+}
+
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
int cpu;
struct paca_struct *tpaca;
- struct kvmppc_vcore *mvc = vc->master_vcore;
struct kvm *kvm = vc->kvm;
cpu = vc->pcpu;
@@ -1989,36 +2206,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
vcpu->arch.timer_running = 0;
}
cpu += vcpu->arch.ptid;
- vcpu->cpu = mvc->pcpu;
+ vcpu->cpu = vc->pcpu;
vcpu->arch.thread_cpu = cpu;
-
- /*
- * With radix, the guest can do TLB invalidations itself,
- * and it could choose to use the local form (tlbiel) if
- * it is invalidating a translation that has only ever been
- * used on one vcpu. However, that doesn't mean it has
- * only ever been used on one physical cpu, since vcpus
- * can move around between pcpus. To cope with this, when
- * a vcpu moves from one pcpu to another, we need to tell
- * any vcpus running on the same core as this vcpu previously
- * ran to flush the TLB. The TLB is shared between threads,
- * so we use a single bit in .need_tlb_flush for all 4 threads.
- */
- if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
- if (vcpu->arch.prev_cpu >= 0 &&
- cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
- cpu_first_thread_sibling(cpu))
- radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
- vcpu->arch.prev_cpu = cpu;
- }
cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
}
tpaca = &paca[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
- tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
+ tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
smp_wmb();
- tpaca->kvm_hstate.kvm_vcore = mvc;
+ tpaca->kvm_hstate.kvm_vcore = vc;
if (cpu != smp_processor_id())
kvmppc_ipi_thread(cpu);
}
@@ -2147,8 +2344,7 @@ struct core_info {
int max_subcore_threads;
int total_threads;
int subcore_threads[MAX_SUBCORES];
- struct kvm *subcore_vm[MAX_SUBCORES];
- struct list_head vcs[MAX_SUBCORES];
+ struct kvmppc_vcore *vc[MAX_SUBCORES];
};
/*
@@ -2159,17 +2355,12 @@ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
{
- int sub;
-
memset(cip, 0, sizeof(*cip));
cip->n_subcores = 1;
cip->max_subcore_threads = vc->num_threads;
cip->total_threads = vc->num_threads;
cip->subcore_threads[0] = vc->num_threads;
- cip->subcore_vm[0] = vc->kvm;
- for (sub = 0; sub < MAX_SUBCORES; ++sub)
- INIT_LIST_HEAD(&cip->vcs[sub]);
- list_add_tail(&vc->preempt_list, &cip->vcs[0]);
+ cip->vc[0] = vc;
}
static bool subcore_config_ok(int n_subcores, int n_threads)
@@ -2189,9 +2380,8 @@ static bool subcore_config_ok(int n_subcores, int n_threads)
return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
}
-static void init_master_vcore(struct kvmppc_vcore *vc)
+static void init_vcore_to_run(struct kvmppc_vcore *vc)
{
- vc->master_vcore = vc;
vc->entry_exit_map = 0;
vc->in_guest = 0;
vc->napping_threads = 0;
@@ -2216,9 +2406,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
++cip->n_subcores;
cip->total_threads += vc->num_threads;
cip->subcore_threads[sub] = vc->num_threads;
- cip->subcore_vm[sub] = vc->kvm;
- init_master_vcore(vc);
- list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
+ cip->vc[sub] = vc;
+ init_vcore_to_run(vc);
+ list_del_init(&vc->preempt_list);
return true;
}
@@ -2286,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
spin_unlock(&lp->lock);
}
+static bool recheck_signals(struct core_info *cip)
+{
+ int sub, i;
+ struct kvm_vcpu *vcpu;
+
+ for (sub = 0; sub < cip->n_subcores; ++sub)
+ for_each_runnable_thread(i, vcpu, cip->vc[sub])
+ if (signal_pending(vcpu->arch.run_task))
+ return true;
+ return false;
+}
+
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
int still_running = 0, i;
@@ -2323,7 +2525,6 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
wake_up(&vcpu->arch.cpu_run);
}
}
- list_del_init(&vc->preempt_list);
if (!is_master) {
if (still_running > 0) {
kvmppc_vcore_preempt(vc);
@@ -2385,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu)
return 0;
}
+static void set_irq_happened(int trap)
+{
+ switch (trap) {
+ case BOOK3S_INTERRUPT_EXTERNAL:
+ local_paca->irq_happened |= PACA_IRQ_EE;
+ break;
+ case BOOK3S_INTERRUPT_H_DOORBELL:
+ local_paca->irq_happened |= PACA_IRQ_DBELL;
+ break;
+ case BOOK3S_INTERRUPT_HMI:
+ local_paca->irq_happened |= PACA_IRQ_HMI;
+ break;
+ }
+}
+
/*
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
@@ -2395,7 +2611,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int i;
int srcu_idx;
struct core_info core_info;
- struct kvmppc_vcore *pvc, *vcnext;
+ struct kvmppc_vcore *pvc;
struct kvm_split_mode split_info, *sip;
int split, subcore_size, active;
int sub;
@@ -2404,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int pcpu, thr;
int target_threads;
int controlled_threads;
+ int trap;
/*
* Remove from the list any threads that have a signal pending
@@ -2418,7 +2635,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/*
* Initialize *vc.
*/
- init_master_vcore(vc);
+ init_vcore_to_run(vc);
vc->preempt_tb = TB_NIL;
/*
@@ -2455,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
if (vc->num_threads < target_threads)
collect_piggybacks(&core_info, target_threads);
+ /*
+ * On radix, arrange for TLB flushing if necessary.
+ * This has to be done before disabling interrupts since
+ * it uses smp_call_function().
+ */
+ pcpu = smp_processor_id();
+ if (kvm_is_radix(vc->kvm)) {
+ for (sub = 0; sub < core_info.n_subcores; ++sub)
+ for_each_runnable_thread(i, vcpu, core_info.vc[sub])
+ kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+ }
+
+ /*
+ * Hard-disable interrupts, and check resched flag and signals.
+ * If we need to reschedule or deliver a signal, clean up
+ * and return without going into the guest(s).
+ */
+ local_irq_disable();
+ hard_irq_disable();
+ if (lazy_irq_pending() || need_resched() ||
+ recheck_signals(&core_info)) {
+ local_irq_enable();
+ vc->vcore_state = VCORE_INACTIVE;
+ /* Unlock all except the primary vcore */
+ for (sub = 1; sub < core_info.n_subcores; ++sub) {
+ pvc = core_info.vc[sub];
+ /* Put back on to the preempted vcores list */
+ kvmppc_vcore_preempt(pvc);
+ spin_unlock(&pvc->lock);
+ }
+ for (i = 0; i < controlled_threads; ++i)
+ kvmppc_release_hwthread(pcpu + i);
+ return;
+ }
+
+ kvmppc_clear_host_core(pcpu);
+
/* Decide on micro-threading (split-core) mode */
subcore_size = threads_per_subcore;
cmd_bit = stat_bit = 0;
@@ -2478,13 +2732,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
split_info.ldbar = mfspr(SPRN_LDBAR);
split_info.subcore_size = subcore_size;
for (sub = 0; sub < core_info.n_subcores; ++sub)
- split_info.master_vcs[sub] =
- list_first_entry(&core_info.vcs[sub],
- struct kvmppc_vcore, preempt_list);
+ split_info.vc[sub] = core_info.vc[sub];
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
- pcpu = smp_processor_id();
for (thr = 0; thr < controlled_threads; ++thr)
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
@@ -2504,32 +2755,29 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
}
}
- kvmppc_clear_host_core(pcpu);
-
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
thr = subcore_thread_map[sub];
thr0_done = false;
active |= 1 << thr;
- list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
- pvc->pcpu = pcpu + thr;
- for_each_runnable_thread(i, vcpu, pvc) {
- kvmppc_start_thread(vcpu, pvc);
- kvmppc_create_dtl_entry(vcpu, pvc);
- trace_kvm_guest_enter(vcpu);
- if (!vcpu->arch.ptid)
- thr0_done = true;
- active |= 1 << (thr + vcpu->arch.ptid);
- }
- /*
- * We need to start the first thread of each subcore
- * even if it doesn't have a vcpu.
- */
- if (pvc->master_vcore == pvc && !thr0_done)
- kvmppc_start_thread(NULL, pvc);
- thr += pvc->num_threads;
+ pvc = core_info.vc[sub];
+ pvc->pcpu = pcpu + thr;
+ for_each_runnable_thread(i, vcpu, pvc) {
+ kvmppc_start_thread(vcpu, pvc);
+ kvmppc_create_dtl_entry(vcpu, pvc);
+ trace_kvm_guest_enter(vcpu);
+ if (!vcpu->arch.ptid)
+ thr0_done = true;
+ active |= 1 << (thr + vcpu->arch.ptid);
}
+ /*
+ * We need to start the first thread of each subcore
+ * even if it doesn't have a vcpu.
+ */
+ if (!thr0_done)
+ kvmppc_start_thread(NULL, pvc);
+ thr += pvc->num_threads;
}
/*
@@ -2556,17 +2804,27 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 0);
for (sub = 0; sub < core_info.n_subcores; ++sub)
- list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
- spin_unlock(&pvc->lock);
+ spin_unlock(&core_info.vc[sub]->lock);
+
+ /*
+ * Interrupts will be enabled once we get into the guest,
+ * so tell lockdep that we're about to enable interrupts.
+ */
+ trace_hardirqs_on();
guest_enter();
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
- __kvmppc_vcore_entry();
+ trap = __kvmppc_vcore_entry();
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+ guest_exit();
+
+ trace_hardirqs_off();
+ set_irq_happened(trap);
+
spin_lock(&vc->lock);
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
vc->vcore_state = VCORE_EXITING;
@@ -2594,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
split_info.do_nap = 0;
}
+ kvmppc_set_host_core(pcpu);
+
+ local_irq_enable();
+
/* Let secondaries go back to the offline loop */
for (i = 0; i < controlled_threads; ++i) {
kvmppc_release_hwthread(pcpu + i);
@@ -2602,18 +2864,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
}
- kvmppc_set_host_core(pcpu);
-
spin_unlock(&vc->lock);
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
- guest_exit();
- for (sub = 0; sub < core_info.n_subcores; ++sub)
- list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
- preempt_list)
- post_guest_process(pvc, pvc == vc);
+ for (sub = 0; sub < core_info.n_subcores; ++sub) {
+ pvc = core_info.vc[sub];
+ post_guest_process(pvc, pvc == vc);
+ }
spin_lock(&vc->lock);
preempt_enable();
@@ -2658,6 +2917,30 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
vc->halt_poll_ns /= halt_poll_ns_shrink;
}
+#ifdef CONFIG_KVM_XICS
+static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
+{
+ if (!xive_enabled())
+ return false;
+ return vcpu->arch.xive_saved_state.pipr <
+ vcpu->arch.xive_saved_state.cppr;
+}
+#else
+static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+#endif /* CONFIG_KVM_XICS */
+
+static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
+ kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
+ return true;
+
+ return false;
+}
+
/*
* Check to see if any of the runnable vcpus on the vcore have pending
* exceptions or are no longer ceded
@@ -2668,8 +2951,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
int i;
for_each_runnable_thread(i, vcpu, vc) {
- if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
- vcpu->arch.prodded)
+ if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
return 1;
}
@@ -2811,15 +3093,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
*/
if (!signal_pending(current)) {
if (vc->vcore_state == VCORE_PIGGYBACK) {
- struct kvmppc_vcore *mvc = vc->master_vcore;
- if (spin_trylock(&mvc->lock)) {
- if (mvc->vcore_state == VCORE_RUNNING &&
- !VCORE_IS_EXITING(mvc)) {
+ if (spin_trylock(&vc->lock)) {
+ if (vc->vcore_state == VCORE_RUNNING &&
+ !VCORE_IS_EXITING(vc)) {
kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
}
- spin_unlock(&mvc->lock);
+ spin_unlock(&vc->lock);
}
} else if (vc->vcore_state == VCORE_RUNNING &&
!VCORE_IS_EXITING(vc)) {
@@ -2855,7 +3136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
break;
n_ceded = 0;
for_each_runnable_thread(i, v, vc) {
- if (!v->arch.pending_exceptions && !v->arch.prodded)
+ if (!kvmppc_vcpu_woken(v))
n_ceded += v->arch.ceded;
else
v->arch.ceded = 0;
@@ -2907,12 +3188,36 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
int srcu_idx;
+ unsigned long ebb_regs[3] = {}; /* shut up GCC */
+ unsigned long user_tar = 0;
+ unsigned int user_vrsave;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return -EINVAL;
}
+ /*
+ * Don't allow entry with a suspended transaction, because
+ * the guest entry/exit code will lose it.
+ * If the guest has TM enabled, save away their TM-related SPRs
+ * (they will get restored by the TM unavailable interrupt).
+ */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+ (current->thread.regs->msr & MSR_TM)) {
+ if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ run->fail_entry.hardware_entry_failure_reason = 0;
+ return -EINVAL;
+ }
+ current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
+ current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
+ current->thread.tm_texasr = mfspr(SPRN_TEXASR);
+ current->thread.regs->msr &= ~MSR_TM;
+ }
+#endif
+
kvmppc_core_prepare_to_enter(vcpu);
/* No need to go into the guest when all we'll do is come back out */
@@ -2934,6 +3239,15 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
flush_all_to_thread(current);
+ /* Save userspace EBB and other register values */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ ebb_regs[0] = mfspr(SPRN_EBBHR);
+ ebb_regs[1] = mfspr(SPRN_EBBRR);
+ ebb_regs[2] = mfspr(SPRN_BESCR);
+ user_tar = mfspr(SPRN_TAR);
+ }
+ user_vrsave = mfspr(SPRN_VRSAVE);
+
vcpu->arch.wqp = &vcpu->arch.vcore->wq;
vcpu->arch.pgdir = current->mm->pgd;
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
@@ -2960,6 +3274,16 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
} while (is_kvmppc_resume_guest(r));
+ /* Restore userspace EBB and other register values */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ mtspr(SPRN_EBBHR, ebb_regs[0]);
+ mtspr(SPRN_EBBRR, ebb_regs[1]);
+ mtspr(SPRN_BESCR, ebb_regs[2]);
+ mtspr(SPRN_TAR, user_tar);
+ mtspr(SPRN_FSCR, current->thread.fscr);
+ }
+ mtspr(SPRN_VRSAVE, user_vrsave);
+
out:
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
atomic_dec(&vcpu->kvm->arch.vcpus_running);
@@ -3468,6 +3792,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvm_hv_vm_activated();
/*
+ * Initialize smt_mode depending on processor.
+ * POWER8 and earlier have to use "strict" threading, where
+ * all vCPUs in a vcore have to run on the same (sub)core,
+ * whereas on POWER9 the threads can each run a different
+ * guest.
+ */
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ kvm->arch.smt_mode = threads_per_subcore;
+ else
+ kvm->arch.smt_mode = 1;
+ kvm->arch.emul_smt_mode = 1;
+
+ /*
* Create a debugfs directory for the VM
*/
snprintf(buf, sizeof(buf), "vm%d", current->pid);
@@ -3896,6 +4233,7 @@ static struct kvmppc_ops kvm_ops_hv = {
#endif
.configure_mmu = kvmhv_configure_mmu,
.get_rmmu_info = kvmhv_get_rmmu_info,
+ .set_smt_mode = kvmhv_set_smt_mode,
};
static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index ee4c2558c305..90644db9d38e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -307,7 +307,7 @@ void kvmhv_commence_exit(int trap)
return;
for (i = 0; i < MAX_SUBCORES; ++i) {
- vc = sip->master_vcs[i];
+ vc = sip->vc[i];
if (!vc)
break;
do {
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 0fdc4a28970b..dc54373c8780 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -61,13 +61,6 @@ BEGIN_FTR_SECTION
std r3, HSTATE_DABR(r13)
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
- /* Hard-disable interrupts */
- mfmsr r10
- std r10, HSTATE_HOST_MSR(r13)
- rldicl r10,r10,48,1
- rotldi r10,r10,16
- mtmsrd r10,1
-
/* Save host PMU registers */
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
@@ -121,10 +114,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* Put whatever is in the decrementer into the
* hypervisor decrementer.
*/
+BEGIN_FTR_SECTION
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r6, VCORE_KVM(r5)
+ ld r9, KVM_HOST_LPCR(r6)
+ andis. r9, r9, LPCR_LD@h
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mfspr r8,SPRN_DEC
mftb r7
- mtspr SPRN_HDEC,r8
+BEGIN_FTR_SECTION
+ /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */
+ bne 32f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r8,r8
+32: mtspr SPRN_HDEC,r8
add r8,r8,r7
std r8,HSTATE_DECEXP(r13)
@@ -143,6 +146,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
*
* R1 = host R1
* R2 = host R2
+ * R3 = trap number on this thread
* R12 = exit handler id
* R13 = PACA
*/
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 7ef0993214f3..c356f9a40b24 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
out:
/*
+ * For guest that supports FWNMI capability, hook the MCE event into
+ * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI
+ * exit reason. On our way to exit we will pull this event from vcpu
+ * structure and print it from thread 0 of the core/subcore.
+ *
+ * For guest that does not support FWNMI capability (old QEMU):
* We are now going enter guest either through machine check
* interrupt (for unhandled errors) or will continue from
* current HSRR0 (for handled errors) in guest. Hence
* queue up the event so that we can log it from host console later.
*/
- machine_check_queue_event();
+ if (vcpu->kvm->arch.fwnmi_enabled) {
+ /*
+ * Hook up the mce event on to vcpu structure.
+ * First clear the old event.
+ */
+ memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
+ if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
+ vcpu->arch.mce_evt = mce_evt;
+ }
+ } else
+ machine_check_queue_event();
return handled;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index bdb3f76ceb6b..6ea4b53f4b16 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -32,12 +32,30 @@
#include <asm/opal.h>
#include <asm/xive-regs.h>
+/* Sign-extend HDEC if not on POWER9 */
+#define EXTEND_HDEC(reg) \
+BEGIN_FTR_SECTION; \
+ extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
/* Values in HSTATE_NAPPING(r13) */
#define NAPPING_CEDE 1
#define NAPPING_NOVCPU 2
+/* Stack frame offsets for kvmppc_hv_entry */
+#define SFS 160
+#define STACK_SLOT_TRAP (SFS-4)
+#define STACK_SLOT_TID (SFS-16)
+#define STACK_SLOT_PSSCR (SFS-24)
+#define STACK_SLOT_PID (SFS-32)
+#define STACK_SLOT_IAMR (SFS-40)
+#define STACK_SLOT_CIABR (SFS-48)
+#define STACK_SLOT_DAWR (SFS-56)
+#define STACK_SLOT_DAWRX (SFS-64)
+#define STACK_SLOT_HFSCR (SFS-72)
+
/*
* Call kvmppc_hv_entry in real mode.
* Must be called with interrupts hard-disabled.
@@ -51,6 +69,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
std r0, PPC_LR_STKOFF(r1)
stdu r1, -112(r1)
mfmsr r10
+ std r10, HSTATE_HOST_MSR(r13)
LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
li r0,MSR_RI
andc r0,r10,r0
@@ -135,20 +154,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
stb r0, HSTATE_HWTHREAD_REQ(r13)
/*
- * For external and machine check interrupts, we need
- * to call the Linux handler to process the interrupt.
- * We do that by jumping to absolute address 0x500 for
- * external interrupts, or the machine_check_fwnmi label
- * for machine checks (since firmware might have patched
- * the vector area at 0x200). The [h]rfid at the end of the
- * handler will return to the book3s_hv_interrupts.S code.
- * For other interrupts we do the rfid to get back
- * to the book3s_hv_interrupts.S code here.
+ * For external interrupts we need to call the Linux
+ * handler to process the interrupt. We do that by jumping
+ * to absolute address 0x500 for external interrupts.
+ * The [h]rfid at the end of the handler will return to
+ * the book3s_hv_interrupts.S code. For other interrupts
+ * we do the rfid to get back to the book3s_hv_interrupts.S
+ * code here.
*/
ld r8, 112+PPC_LR_STKOFF(r1)
addi r1, r1, 112
ld r7, HSTATE_HOST_MSR(r13)
+ /* Return the trap number on this thread as the return value */
+ mr r3, r12
+
/*
* If we came back from the guest via a relocation-on interrupt,
* we will be in virtual mode at this point, which makes it a
@@ -158,62 +178,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
andi. r0, r0, MSR_IR /* in real mode? */
bne .Lvirt_return
- cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
- cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
- beq 11f
- cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
- beq 15f /* Invoke the H_DOORBELL handler */
- cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
- beq cr2, 14f /* HMI check */
-
- /* RFI into the highmem handler, or branch to interrupt handler */
+ /* RFI into the highmem handler */
mfmsr r6
li r0, MSR_RI
andc r6, r6, r0
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
- beq cr1, 13f /* machine check */
RFI
- /* On POWER7, we have external interrupts set to use HSRR0/1 */
-11: mtspr SPRN_HSRR0, r8
- mtspr SPRN_HSRR1, r7
- ba 0x500
-
-13: b machine_check_fwnmi
-
-14: mtspr SPRN_HSRR0, r8
- mtspr SPRN_HSRR1, r7
- b hmi_exception_after_realmode
-
-15: mtspr SPRN_HSRR0, r8
- mtspr SPRN_HSRR1, r7
- ba 0xe80
-
- /* Virtual-mode return - can't get here for HMI or machine check */
+ /* Virtual-mode return */
.Lvirt_return:
- cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
- beq 16f
- cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
- beq 17f
- andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */
- beq 18f
- mtmsrd r7, 1 /* if so then re-enable them */
-18: mtlr r8
+ mtlr r8
blr
-16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */
- mtspr SPRN_HSRR1, r7
- b exc_virt_0x4500_hardware_interrupt
-
-17: mtspr SPRN_HSRR0, r8
- mtspr SPRN_HSRR1, r7
- b exc_virt_0x4e80_h_doorbell
-
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+ /* HDEC may be larger than DEC for arch >= v3.00, but since the */
+ /* HDEC value came from DEC in the first place, it will fit */
mfspr r3, SPRN_HDEC
mtspr SPRN_DEC, r3
/*
@@ -295,8 +278,9 @@ kvm_novcpu_wakeup:
/* See if our timeslice has expired (HDEC is negative) */
mfspr r0, SPRN_HDEC
+ EXTEND_HDEC(r0)
li r12, BOOK3S_INTERRUPT_HV_DECREMENTER
- cmpwi r0, 0
+ cmpdi r0, 0
blt kvm_novcpu_exit
/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
@@ -319,10 +303,10 @@ kvm_novcpu_exit:
bl kvmhv_accumulate_time
#endif
13: mr r3, r12
- stw r12, 112-4(r1)
+ stw r12, STACK_SLOT_TRAP(r1)
bl kvmhv_commence_exit
nop
- lwz r12, 112-4(r1)
+ lwz r12, STACK_SLOT_TRAP(r1)
b kvmhv_switch_to_host
/*
@@ -390,8 +374,8 @@ kvm_secondary_got_guest:
lbz r4, HSTATE_PTID(r13)
cmpwi r4, 0
bne 63f
- lis r6, 0x7fff
- ori r6, r6, 0xffff
+ LOAD_REG_ADDR(r6, decrementer_max)
+ ld r6, 0(r6)
mtspr SPRN_HDEC, r6
/* and set per-LPAR registers, if doing dynamic micro-threading */
ld r6, HSTATE_SPLIT_MODE(r13)
@@ -545,11 +529,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* *
*****************************************************************************/
-/* Stack frame offsets */
-#define STACK_SLOT_TID (112-16)
-#define STACK_SLOT_PSSCR (112-24)
-#define STACK_SLOT_PID (112-32)
-
.global kvmppc_hv_entry
kvmppc_hv_entry:
@@ -565,7 +544,7 @@ kvmppc_hv_entry:
*/
mflr r0
std r0, PPC_LR_STKOFF(r1)
- stdu r1, -112(r1)
+ stdu r1, -SFS(r1)
/* Save R1 in the PACA */
std r1, HSTATE_HOST_R1(r13)
@@ -749,10 +728,22 @@ BEGIN_FTR_SECTION
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
mfspr r7, SPRN_PID
+ mfspr r8, SPRN_IAMR
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
std r7, STACK_SLOT_PID(r1)
+ std r8, STACK_SLOT_IAMR(r1)
+ mfspr r5, SPRN_HFSCR
+ std r5, STACK_SLOT_HFSCR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+ mfspr r5, SPRN_CIABR
+ mfspr r6, SPRN_DAWR
+ mfspr r7, SPRN_DAWRX
+ std r5, STACK_SLOT_CIABR(r1)
+ std r6, STACK_SLOT_DAWR(r1)
+ std r7, STACK_SLOT_DAWRX(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
/* Set partition DABR */
@@ -895,8 +886,10 @@ FTR_SECTION_ELSE
ld r5, VCPU_TID(r4)
ld r6, VCPU_PSSCR(r4)
oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */
+ ld r7, VCPU_HFSCR(r4)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
+ mtspr SPRN_HFSCR, r7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
8:
@@ -911,7 +904,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
- stw r3,VCPU_DEC(r4)
+ std r3,VCPU_DEC(r4)
ld r5, VCPU_SPRG0(r4)
ld r6, VCPU_SPRG1(r4)
@@ -968,7 +961,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/* Check if HDEC expires soon */
mfspr r3, SPRN_HDEC
- cmpwi r3, 512 /* 1 microsecond */
+ EXTEND_HDEC(r3)
+ cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon
#ifdef CONFIG_KVM_XICS
@@ -1022,7 +1016,13 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
li r0, BOOK3S_INTERRUPT_EXTERNAL
bne cr1, 12f
mfspr r0, SPRN_DEC
- cmpwi r0, 0
+BEGIN_FTR_SECTION
+ /* On POWER9 check whether the guest has large decrementer enabled */
+ andis. r8, r8, LPCR_LD@h
+ bne 15f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ extsw r0, r0
+15: cmpdi r0, 0
li r0, BOOK3S_INTERRUPT_DECREMENTER
bge 5f
@@ -1032,6 +1032,23 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
mr r9, r4
bl kvmppc_msr_interrupt
5:
+BEGIN_FTR_SECTION
+ b fast_guest_return
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+ /* On POWER9, check for pending doorbell requests */
+ lbz r0, VCPU_DBELL_REQ(r4)
+ cmpwi r0, 0
+ beq fast_guest_return
+ ld r5, HSTATE_KVM_VCORE(r13)
+ /* Set DPDES register so the CPU will take a doorbell interrupt */
+ li r0, 1
+ mtspr SPRN_DPDES, r0
+ std r0, VCORE_DPDES(r5)
+ /* Make sure other cpus see vcore->dpdes set before dbell req clear */
+ lwsync
+ /* Clear the pending doorbell request */
+ li r0, 0
+ stb r0, VCPU_DBELL_REQ(r4)
/*
* Required state:
@@ -1206,6 +1223,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
stw r12,VCPU_TRAP(r9)
+ /*
+ * Now that we have saved away SRR0/1 and HSRR0/1,
+ * interrupts are recoverable in principle, so set MSR_RI.
+ * This becomes important for relocation-on interrupts from
+ * the guest, which we can get in radix mode on POWER9.
+ */
+ li r0, MSR_RI
+ mtmsrd r0, 1
+
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMINTR
mr r4, r9
@@ -1262,6 +1288,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
beq 4f
b guest_exit_cont
3:
+ /* If it's a hypervisor facility unavailable interrupt, save HFSCR */
+ cmpwi r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL
+ bne 14f
+ mfspr r3, SPRN_HFSCR
+ std r3, VCPU_HFSCR(r9)
+ b guest_exit_cont
+14:
/* External interrupt ? */
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
bne+ guest_exit_cont
@@ -1449,12 +1482,18 @@ mc_cont:
mtspr SPRN_SPURR,r4
/* Save DEC */
+ ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
+ /* On P9, if the guest has large decr enabled, don't sign extend */
+BEGIN_FTR_SECTION
+ ld r4, VCORE_LPCR(r3)
+ andis. r4, r4, LPCR_LD@h
+ bne 16f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
- add r5,r5,r6
+16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
- ld r3,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_TB_OFFSET(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
@@ -1499,17 +1538,19 @@ FTR_SECTION_ELSE
rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */
rotldi r6, r6, 60
std r6, VCPU_PSSCR(r9)
+ /* Restore host HFSCR value */
+ ld r7, STACK_SLOT_HFSCR(r1)
+ mtspr SPRN_HFSCR, r7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/*
* Restore various registers to 0, where non-zero values
* set by the guest could disrupt the host.
*/
li r0, 0
- mtspr SPRN_IAMR, r0
- mtspr SPRN_CIABR, r0
- mtspr SPRN_DAWRX, r0
+ mtspr SPRN_PSPB, r0
mtspr SPRN_WORT, r0
BEGIN_FTR_SECTION
+ mtspr SPRN_IAMR, r0
mtspr SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li r0, 1
@@ -1525,6 +1566,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
std r6,VCPU_UAMOR(r9)
li r6,0
mtspr SPRN_AMR,r6
+ mtspr SPRN_UAMOR, r6
/* Switch DSCR back to host value */
mfspr r8, SPRN_DSCR
@@ -1670,12 +1712,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Restore host values of some registers */
BEGIN_FTR_SECTION
+ ld r5, STACK_SLOT_CIABR(r1)
+ ld r6, STACK_SLOT_DAWR(r1)
+ ld r7, STACK_SLOT_DAWRX(r1)
+ mtspr SPRN_CIABR, r5
+ mtspr SPRN_DAWR, r6
+ mtspr SPRN_DAWRX, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
ld r7, STACK_SLOT_PID(r1)
+ ld r8, STACK_SLOT_IAMR(r1)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
mtspr SPRN_PID, r7
+ mtspr SPRN_IAMR, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT
@@ -1819,8 +1871,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
- ld r0, 112+PPC_LR_STKOFF(r1)
- addi r1, r1, 112
+ ld r0, SFS+PPC_LR_STKOFF(r1)
+ addi r1, r1, SFS
mtlr r0
blr
@@ -2366,12 +2418,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
mfspr r3, SPRN_DEC
mfspr r4, SPRN_HDEC
mftb r5
- cmpw r3, r4
+BEGIN_FTR_SECTION
+ /* On P9 check whether the guest has large decrementer mode enabled */
+ ld r6, HSTATE_KVM_VCORE(r13)
+ ld r6, VCORE_LPCR(r6)
+ andis. r6, r6, LPCR_LD@h
+ bne 68f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ extsw r3, r3
+68: EXTEND_HDEC(r4)
+ cmpd r3, r4
ble 67f
mtspr SPRN_DEC, r4
67:
/* save expiry time of guest decrementer */
- extsw r3, r3
add r3, r3, r5
ld r4, HSTATE_KVM_VCPU(r13)
ld r5, HSTATE_KVM_VCORE(r13)
@@ -2552,22 +2612,32 @@ machine_check_realmode:
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
/*
- * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through
- * machine check interrupt (set HSRR0 to 0x200). And for handled
- * errors (no-fatal), just go back to guest execution with current
- * HSRR0 instead of exiting guest. This new approach will inject
- * machine check to guest for fatal error causing guest to crash.
- *
- * The old code used to return to host for unhandled errors which
- * was causing guest to hang with soft lockups inside guest and
- * makes it difficult to recover guest instance.
+ * For the guest that is FWNMI capable, deliver all the MCE errors
+ * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit
+ * reason. This new approach injects machine check errors in guest
+ * address space to guest with additional information in the form
+ * of RTAS event, thus enabling guest kernel to suitably handle
+ * such errors.
*
+ * For the guest that is not FWNMI capable (old QEMU) fallback
+ * to old behaviour for backward compatibility:
+ * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either
+ * through machine check interrupt (set HSRR0 to 0x200).
+ * For handled errors (no-fatal), just go back to guest execution
+ * with current HSRR0.
* if we receive machine check with MSR(RI=0) then deliver it to
* guest as machine check causing guest to crash.
*/
ld r11, VCPU_MSR(r9)
rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
bne mc_cont /* if so, exit to host */
+ /* Check if guest is capable of handling NMI exit */
+ ld r10, VCPU_KVM(r9)
+ lbz r10, KVM_FWNMI(r10)
+ cmpdi r10, 1 /* FWNMI capable? */
+ beq mc_cont /* if so, exit with KVM_EXIT_NMI. */
+
+ /* if not, fall through for backward compatibility. */
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
beq 1f /* Deliver a machine check to guest */
ld r10, VCPU_PC(r9)
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index c873ffe55362..4d8b4d6cebff 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -39,7 +39,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
unsigned long dec_nsec;
unsigned long long dec_time;
- pr_debug("mtDEC: %x\n", vcpu->arch.dec);
+ pr_debug("mtDEC: %lx\n", vcpu->arch.dec);
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
#ifdef CONFIG_PPC_BOOK3S
@@ -109,7 +109,7 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
case SPRN_TBWU: break;
case SPRN_DEC:
- vcpu->arch.dec = spr_val;
+ vcpu->arch.dec = (u32) spr_val;
kvmppc_emulate_dec(vcpu);
break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c5aa3a42883a..1a75c0b5f4ca 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -553,13 +553,28 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_SMT:
r = 0;
- if (hv_enabled) {
+ if (kvm) {
+ if (kvm->arch.emul_smt_mode > 1)
+ r = kvm->arch.emul_smt_mode;
+ else
+ r = kvm->arch.smt_mode;
+ } else if (hv_enabled) {
if (cpu_has_feature(CPU_FTR_ARCH_300))
r = 1;
else
r = threads_per_subcore;
}
break;
+ case KVM_CAP_PPC_SMT_POSSIBLE:
+ r = 1;
+ if (hv_enabled) {
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ r = ((threads_per_subcore << 1) - 1);
+ else
+ /* P9 can emulate dbells, so allow any mode */
+ r = 8 | 4 | 2 | 1;
+ }
+ break;
case KVM_CAP_PPC_RMA:
r = 0;
break;
@@ -618,6 +633,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
break;
#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ case KVM_CAP_PPC_FWNMI:
+ r = hv_enabled;
+ break;
+#endif
case KVM_CAP_PPC_HTM:
r = cpu_has_feature(CPU_FTR_TM_COMP) &&
is_kvmppc_hv_enabled(kvm);
@@ -1537,6 +1557,15 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
break;
}
#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ case KVM_CAP_PPC_FWNMI:
+ r = -EINVAL;
+ if (!is_kvmppc_hv_enabled(vcpu->kvm))
+ break;
+ r = 0;
+ vcpu->kvm->arch.fwnmi_enabled = true;
+ break;
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
default:
r = -EINVAL;
break;
@@ -1711,6 +1740,15 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
break;
}
+ case KVM_CAP_PPC_SMT: {
+ unsigned long mode = cap->args[0];
+ unsigned long flags = cap->args[1];
+
+ r = -EINVAL;
+ if (kvm->arch.kvm_ops->set_smt_mode)
+ r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
+ break;
+ }
#endif
default:
r = -EINVAL;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2b8dc1ca18d4..c0b6dfec5f87 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -925,6 +925,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_X86_GUEST_MWAIT 143
#define KVM_CAP_ARM_USER_IRQ 144
#define KVM_CAP_S390_CMMA_MIGRATION 145
+#define KVM_CAP_PPC_FWNMI 146
+#define KVM_CAP_PPC_SMT_POSSIBLE 147
#ifdef KVM_CAP_IRQ_ROUTING