summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-11-25 12:32:42 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-11-25 12:32:42 -0800
commit081f359ef5334b0e034979e4e930c2ce80f3001b (patch)
tree9255c89731446ccc5735b2f6c9074bdc20a3440e
parent0b1dcc2cf55ae6523c6fbd0d741b3ac28c9f4536 (diff)
parent25c94b051592c010abe92c85b0485f1faedc83f3 (diff)
downloadlinux-081f359ef5334b0e034979e4e930c2ce80f3001b.tar.bz2
Merge tag 'hyperv-fixes-signed-20221125' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull hyperv fixes from Wei Liu: - Fix IRTE allocation in Hyper-V PCI controller (Dexuan Cui) - Fix handling of SCSI srb_status and capacity change events (Michael Kelley) - Restore VP assist page after CPU offlining and onlining (Vitaly Kuznetsov) - Fix some memory leak issues in VMBus (Yang Yingliang) * tag 'hyperv-fixes-signed-20221125' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register() Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work() PCI: hv: Only reuse existing IRTE allocation for Multi-MSI scsi: storvsc: Fix handling of srb_status and capacity change events x86/hyperv: Restore VP assist page after cpu offlining/onlining
-rw-r--r--arch/x86/hyperv/hv_init.c54
-rw-r--r--drivers/hv/channel_mgmt.c6
-rw-r--r--drivers/hv/vmbus_drv.c1
-rw-r--r--drivers/pci/controller/pci-hyperv.c90
-rw-r--r--drivers/scsi/storvsc_drv.c69
5 files changed, 141 insertions, 79 deletions
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index f49bc3ec76e6..a269049a43ce 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -77,7 +77,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
- struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
+ struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
int ret;
ret = hv_common_cpu_init(cpu);
@@ -87,34 +87,32 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
- if (!*hvp) {
- if (hv_root_partition) {
- /*
- * For root partition we get the hypervisor provided VP assist
- * page, instead of allocating a new page.
- */
- rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
- *hvp = memremap(msr.pfn <<
- HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
- PAGE_SIZE, MEMREMAP_WB);
- } else {
- /*
- * The VP assist page is an "overlay" page (see Hyper-V TLFS's
- * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
- * out to make sure we always write the EOI MSR in
- * hv_apic_eoi_write() *after* the EOI optimization is disabled
- * in hv_cpu_die(), otherwise a CPU may not be stopped in the
- * case of CPU offlining and the VM will hang.
- */
+ if (hv_root_partition) {
+ /*
+ * For root partition we get the hypervisor provided VP assist
+ * page, instead of allocating a new page.
+ */
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ /*
+ * The VP assist page is an "overlay" page (see Hyper-V TLFS's
+ * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
+ * out to make sure we always write the EOI MSR in
+ * hv_apic_eoi_write() *after* the EOI optimization is disabled
+ * in hv_cpu_die(), otherwise a CPU may not be stopped in the
+ * case of CPU offlining and the VM will hang.
+ */
+ if (!*hvp)
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
- if (*hvp)
- msr.pfn = vmalloc_to_pfn(*hvp);
- }
- WARN_ON(!(*hvp));
- if (*hvp) {
- msr.enable = 1;
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
- }
+ if (*hvp)
+ msr.pfn = vmalloc_to_pfn(*hvp);
+
+ }
+ if (!WARN_ON(!(*hvp))) {
+ msr.enable = 1;
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
return hyperv_init_ghcb();
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 5b120402d405..cc23b90cae02 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -533,13 +533,17 @@ static void vmbus_add_channel_work(struct work_struct *work)
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
+ *
+ * If vmbus_device_register() fails, the 'device_obj' is freed in
+ * vmbus_device_release() as called by device_unregister() in the
+ * error path of vmbus_device_register(). In the outside error
+ * path, there's no need to free it.
*/
ret = vmbus_device_register(newchannel->device_obj);
if (ret != 0) {
pr_err("unable to add child device object (relid %d)\n",
newchannel->offermsg.child_relid);
- kfree(newchannel->device_obj);
goto err_deq_chan;
}
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 8b2e413bf19c..e592c481f7ae 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2082,6 +2082,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
ret = device_register(&child_device_obj->device);
if (ret) {
pr_err("Unable to register child device\n");
+ put_device(&child_device_obj->device);
return ret;
}
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index ba64284eaf9f..f1ec8931dfbc 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1613,7 +1613,7 @@ out:
}
static u32 hv_compose_msi_req_v1(
- struct pci_create_interrupt *int_pkt, const struct cpumask *affinity,
+ struct pci_create_interrupt *int_pkt,
u32 slot, u8 vector, u16 vector_count)
{
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
@@ -1632,6 +1632,35 @@ static u32 hv_compose_msi_req_v1(
}
/*
+ * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
+ * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
+ * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V
+ * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is
+ * not irrelevant because Hyper-V chooses the physical CPU to handle the
+ * interrupts based on the vCPU specified in message sent to the vPCI VSP in
+ * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest,
+ * but assigning too many vPCI device interrupts to the same pCPU can cause a
+ * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V
+ * to spread out the pCPUs that it selects.
+ *
+ * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
+ * to always return the same dummy vCPU, because a second call to
+ * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
+ * new pCPU for the interrupt. But for the multi-MSI case, the second call to
+ * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
+ * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
+ * the pCPUs are spread out. All interrupts for a multi-MSI device end up using
+ * the same pCPU, even though the vCPUs will be spread out by later calls
+ * to hv_irq_unmask(), but that is the best we can do now.
+ *
+ * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not*
+ * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an
+ * enhancement is planned for a future version. With that enhancement, the
+ * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
+ * device will be spread across multiple pCPUs.
+ */
+
+/*
* Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
* by subsequent retarget in hv_irq_unmask().
*/
@@ -1640,18 +1669,39 @@ static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
return cpumask_first_and(affinity, cpu_online_mask);
}
-static u32 hv_compose_msi_req_v2(
- struct pci_create_interrupt2 *int_pkt, const struct cpumask *affinity,
- u32 slot, u8 vector, u16 vector_count)
+/*
+ * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
+ */
+static int hv_compose_multi_msi_req_get_cpu(void)
{
+ static DEFINE_SPINLOCK(multi_msi_cpu_lock);
+
+ /* -1 means starting with CPU 0 */
+ static int cpu_next = -1;
+
+ unsigned long flags;
int cpu;
+ spin_lock_irqsave(&multi_msi_cpu_lock, flags);
+
+ cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids,
+ false);
+ cpu = cpu_next;
+
+ spin_unlock_irqrestore(&multi_msi_cpu_lock, flags);
+
+ return cpu;
+}
+
+static u32 hv_compose_msi_req_v2(
+ struct pci_create_interrupt2 *int_pkt, int cpu,
+ u32 slot, u8 vector, u16 vector_count)
+{
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
- cpu = hv_compose_msi_req_get_cpu(affinity);
int_pkt->int_desc.processor_array[0] =
hv_cpu_number_to_vp_number(cpu);
int_pkt->int_desc.processor_count = 1;
@@ -1660,18 +1710,15 @@ static u32 hv_compose_msi_req_v2(
}
static u32 hv_compose_msi_req_v3(
- struct pci_create_interrupt3 *int_pkt, const struct cpumask *affinity,
+ struct pci_create_interrupt3 *int_pkt, int cpu,
u32 slot, u32 vector, u16 vector_count)
{
- int cpu;
-
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
int_pkt->wslot.slot = slot;
int_pkt->int_desc.vector = vector;
int_pkt->int_desc.reserved = 0;
int_pkt->int_desc.vector_count = vector_count;
int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
- cpu = hv_compose_msi_req_get_cpu(affinity);
int_pkt->int_desc.processor_array[0] =
hv_cpu_number_to_vp_number(cpu);
int_pkt->int_desc.processor_count = 1;
@@ -1715,12 +1762,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
struct pci_create_interrupt3 v3;
} int_pkts;
} __packed ctxt;
+ bool multi_msi;
u64 trans_id;
u32 size;
int ret;
+ int cpu;
+
+ msi_desc = irq_data_get_msi_desc(data);
+ multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
+ msi_desc->nvec_used > 1;
/* Reuse the previous allocation */
- if (data->chip_data) {
+ if (data->chip_data && multi_msi) {
int_desc = data->chip_data;
msg->address_hi = int_desc->address >> 32;
msg->address_lo = int_desc->address & 0xffffffff;
@@ -1728,7 +1781,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
- msi_desc = irq_data_get_msi_desc(data);
pdev = msi_desc_to_pci_dev(msi_desc);
dest = irq_data_get_effective_affinity_mask(data);
pbus = pdev->bus;
@@ -1738,11 +1790,18 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
if (!hpdev)
goto return_null_message;
+ /* Free any previous message that might have already been composed. */
+ if (data->chip_data && !multi_msi) {
+ int_desc = data->chip_data;
+ data->chip_data = NULL;
+ hv_int_desc_free(hpdev, int_desc);
+ }
+
int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
if (!int_desc)
goto drop_reference;
- if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) {
+ if (multi_msi) {
/*
* If this is not the first MSI of Multi MSI, we already have
* a mapping. Can exit early.
@@ -1767,9 +1826,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
vector = 32;
vector_count = msi_desc->nvec_used;
+ cpu = hv_compose_multi_msi_req_get_cpu();
} else {
vector = hv_msi_get_int_vector(data);
vector_count = 1;
+ cpu = hv_compose_msi_req_get_cpu(dest);
}
/*
@@ -1785,7 +1846,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
switch (hbus->protocol_version) {
case PCI_PROTOCOL_VERSION_1_1:
size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
- dest,
hpdev->desc.win_slot.slot,
(u8)vector,
vector_count);
@@ -1794,7 +1854,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
case PCI_PROTOCOL_VERSION_1_2:
case PCI_PROTOCOL_VERSION_1_3:
size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
- dest,
+ cpu,
hpdev->desc.win_slot.slot,
(u8)vector,
vector_count);
@@ -1802,7 +1862,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
case PCI_PROTOCOL_VERSION_1_4:
size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
- dest,
+ cpu,
hpdev->desc.win_slot.slot,
vector,
vector_count);
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index bc46721aa01c..3c5b7e4227b2 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -303,16 +303,21 @@ enum storvsc_request_type {
};
/*
- * SRB status codes and masks; a subset of the codes used here.
+ * SRB status codes and masks. In the 8-bit field, the two high order bits
+ * are flags, while the remaining 6 bits are an integer status code. The
+ * definitions here include only the subset of the integer status codes that
+ * are tested for in this driver.
*/
-
#define SRB_STATUS_AUTOSENSE_VALID 0x80
#define SRB_STATUS_QUEUE_FROZEN 0x40
-#define SRB_STATUS_INVALID_LUN 0x20
-#define SRB_STATUS_SUCCESS 0x01
-#define SRB_STATUS_ABORTED 0x02
-#define SRB_STATUS_ERROR 0x04
-#define SRB_STATUS_DATA_OVERRUN 0x12
+
+/* SRB status integer codes */
+#define SRB_STATUS_SUCCESS 0x01
+#define SRB_STATUS_ABORTED 0x02
+#define SRB_STATUS_ERROR 0x04
+#define SRB_STATUS_INVALID_REQUEST 0x06
+#define SRB_STATUS_DATA_OVERRUN 0x12
+#define SRB_STATUS_INVALID_LUN 0x20
#define SRB_STATUS(status) \
(status & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN))
@@ -969,38 +974,25 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
void (*process_err_fn)(struct work_struct *work);
struct hv_host_device *host_dev = shost_priv(host);
- /*
- * In some situations, Hyper-V sets multiple bits in the
- * srb_status, such as ABORTED and ERROR. So process them
- * individually, with the most specific bits first.
- */
-
- if (vm_srb->srb_status & SRB_STATUS_INVALID_LUN) {
- set_host_byte(scmnd, DID_NO_CONNECT);
- process_err_fn = storvsc_remove_lun;
- goto do_work;
- }
+ switch (SRB_STATUS(vm_srb->srb_status)) {
+ case SRB_STATUS_ERROR:
+ case SRB_STATUS_ABORTED:
+ case SRB_STATUS_INVALID_REQUEST:
+ if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) {
+ /* Check for capacity change */
+ if ((asc == 0x2a) && (ascq == 0x9)) {
+ process_err_fn = storvsc_device_scan;
+ /* Retry the I/O that triggered this. */
+ set_host_byte(scmnd, DID_REQUEUE);
+ goto do_work;
+ }
- if (vm_srb->srb_status & SRB_STATUS_ABORTED) {
- if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID &&
- /* Capacity data has changed */
- (asc == 0x2a) && (ascq == 0x9)) {
- process_err_fn = storvsc_device_scan;
/*
- * Retry the I/O that triggered this.
+ * Otherwise, let upper layer deal with the
+ * error when sense message is present
*/
- set_host_byte(scmnd, DID_REQUEUE);
- goto do_work;
- }
- }
-
- if (vm_srb->srb_status & SRB_STATUS_ERROR) {
- /*
- * Let upper layer deal with error when
- * sense message is present.
- */
- if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)
return;
+ }
/*
* If there is an error; offline the device since all
@@ -1023,6 +1015,13 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
default:
set_host_byte(scmnd, DID_ERROR);
}
+ return;
+
+ case SRB_STATUS_INVALID_LUN:
+ set_host_byte(scmnd, DID_NO_CONNECT);
+ process_err_fn = storvsc_remove_lun;
+ goto do_work;
+
}
return;