summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kvm/book3s_xive.c
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@ozlabs.org>2019-08-13 20:06:48 +1000
committerMichael Ellerman <mpe@ellerman.id.au>2019-08-16 14:16:59 +1000
commitda15c03b047dca891d37b9f4ef9ca14d84a6484f (patch)
tree2ea7be961ab577536dfded9c88a117386c316646 /arch/powerpc/kvm/book3s_xive.c
parent8d4ba9c931bc384bcc6889a43915aaaf19d3e499 (diff)
downloadlinux-da15c03b047dca891d37b9f4ef9ca14d84a6484f.tar.bz2
powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race
Testing has revealed the existence of a race condition where a XIVE interrupt being shut down can be in one of the XIVE interrupt queues (of which there are up to 8 per CPU, one for each priority) at the point where free_irq() is called. If this happens, can return an interrupt number which has been shut down. This can lead to various symptoms: - irq_to_desc(irq) can be NULL. In this case, no end-of-interrupt function gets called, resulting in the CPU's elevated interrupt priority (numerically lowered CPPR) never gets reset. That then means that the CPU stops processing interrupts, causing device timeouts and other errors in various device drivers. - The irq descriptor or related data structures can be in the process of being freed as the interrupt code is using them. This typically leads to crashes due to bad pointer dereferences. This race is basically what commit 62e0468650c3 ("genirq: Add optional hardware synchronization for shutdown", 2019-06-28) is intended to fix, given a get_irqchip_state() method for the interrupt controller being used. It works by polling the interrupt controller when an interrupt is being freed until the controller says it is not pending. With XIVE, the PQ bits of the interrupt source indicate the state of the interrupt source, and in particular the P bit goes from 0 to 1 at the point where the hardware writes an entry into the interrupt queue that this interrupt is directed towards. Normally, the code will then process the interrupt and do an end-of-interrupt (EOI) operation which will reset PQ to 00 (assuming another interrupt hasn't been generated in the meantime). However, there are situations where the code resets P even though a queue entry exists (for example, by setting PQ to 01, which disables the interrupt source), and also situations where the code leaves P at 1 after removing the queue entry (for example, this is done for escalation interrupts so they cannot fire again until they are explicitly re-enabled). The code already has a 'saved_p' flag for the interrupt source which indicates that a queue entry exists, although it isn't maintained consistently. This patch adds a 'stale_p' flag to indicate that P has been left at 1 after processing a queue entry, and adds code to set and clear saved_p and stale_p as necessary to maintain a consistent indication of whether a queue entry may or may not exist. With this, we can implement xive_get_irqchip_state() by looking at stale_p, saved_p and the ESB PQ bits for the interrupt. There is some additional code to handle escalation interrupts properly; because they are enabled and disabled in KVM assembly code, which does not have access to the xive_irq_data struct for the escalation interrupt. Hence, stale_p may be incorrect when the escalation interrupt is freed in kvmppc_xive_{,native_}cleanup_vcpu(). Fortunately, we can fix it up by looking at vcpu->arch.xive_esc_on, with some careful attention to barriers in order to ensure the correct result if xive_esc_irq() races with kvmppc_xive_cleanup_vcpu(). Finally, this adds code to make noise on the console (pr_crit and WARN_ON(1)) if we find an interrupt queue entry for an interrupt which does not have a descriptor. While this won't catch the race reliably, if it does get triggered it will be an indication that the race is occurring and needs to be debugged. Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Paul Mackerras <paulus@ozlabs.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20190813100648.GE9567@blackberry
Diffstat (limited to 'arch/powerpc/kvm/book3s_xive.c')
-rw-r--r--arch/powerpc/kvm/book3s_xive.c31
1 files changed, 31 insertions, 0 deletions
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 586867e46e51..591bfb4bfd0f 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
*/
vcpu->arch.xive_esc_on = false;
+ /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+ smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
+
return IRQ_HANDLED;
}
@@ -1119,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
vcpu->arch.xive_esc_raddr = 0;
}
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with. However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it). Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * This slightly odd sequence gives the right result
+ * (i.e. stale_p set if xive_esc_on is false) even if
+ * we race with xive_esc_irq() and xive_irq_eoi().
+ */
+ xd->stale_p = false;
+ smp_mb(); /* paired with smb_wmb in xive_esc_irq */
+ if (!vcpu->arch.xive_esc_on)
+ xd->stale_p = true;
+}
+
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -1143,6 +1171,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);