summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 13:24:39 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2021-02-21 13:24:39 -0800
commit9c5b80b795e9c847a7b7f5e63c6bcf07873fbcdf (patch)
tree2258006102e47fd885daf3e6777f243a4378050c /arch
parent08179b47e1fdf288e5d59f90e5ce31513bb019c3 (diff)
parent3019270282a175defc02c8331786c73e082cd2a8 (diff)
downloadlinux-9c5b80b795e9c847a7b7f5e63c6bcf07873fbcdf.tar.bz2
Merge tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull Hyper-V updates from Wei Liu: - VMBus hardening patches from Andrea Parri and Andres Beltran. - Patches to make Linux boot as the root partition on Microsoft Hypervisor from Wei Liu. - One patch to add a new sysfs interface to support hibernation on Hyper-V from Dexuan Cui. - Two miscellaneous clean-up patches from Colin and Gustavo. * tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (31 commits) Revert "Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer" iommu/hyperv: setup an IO-APIC IRQ remapping domain for root partition x86/hyperv: implement an MSI domain for root partition asm-generic/hyperv: import data structures for mapping device interrupts asm-generic/hyperv: introduce hv_device_id and auxiliary structures asm-generic/hyperv: update hv_interrupt_entry asm-generic/hyperv: update hv_msi_entry x86/hyperv: implement and use hv_smp_prepare_cpus x86/hyperv: provide a bunch of helper functions ACPI / NUMA: add a stub function for node_to_pxm() x86/hyperv: handling hypercall page setup for root x86/hyperv: extract partition ID from Microsoft Hypervisor if necessary x86/hyperv: allocate output arg pages if required clocksource/hyperv: use MSR-based access if running as root Drivers: hv: vmbus: skip VMBus initialization if Linux is root x86/hyperv: detect if Linux is the root partition asm-generic/hyperv: change HV_CPU_POWER_MANAGEMENT to HV_CPU_MANAGEMENT hv: hyperv.h: Replace one-element array with flexible-array in struct icmsg_negotiate hv_netvsc: Restrict configurations on isolated guests Drivers: hv: vmbus: Enforce 'VMBus version >= 5.2' on isolated guests ...
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/hyperv/Makefile4
-rw-r--r--arch/x86/hyperv/hv_init.c122
-rw-r--r--arch/x86/hyperv/hv_proc.c219
-rw-r--r--arch/x86/hyperv/irqdomain.c385
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h38
-rw-r--r--arch/x86/include/asm/mshyperv.h19
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c58
7 files changed, 833 insertions, 12 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 89b1f74d3225..48e2c51464e8 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y := hv_init.o mmu.o nested.o
-obj-$(CONFIG_X86_64) += hv_apic.o
+obj-y := hv_init.o mmu.o nested.o irqdomain.o
+obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 6375967a8244..b81047dec1da 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -10,6 +10,7 @@
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
+#include <linux/bitfield.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
@@ -26,8 +27,11 @@
#include <linux/cpuhotplug.h>
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
+#include <linux/highmem.h>
int hyperv_init_cpuhp;
+u64 hv_current_partition_id = ~0ull;
+EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -44,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page);
void __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
+void __percpu **hyperv_pcpu_output_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
+
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);
@@ -76,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu)
void **input_arg;
struct page *pg;
- input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
- pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
+ pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
if (unlikely(!pg))
return -ENOMEM;
+
+ input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
*input_arg = page_address(pg);
+ if (hv_root_partition) {
+ void **output_arg;
+
+ output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+ *output_arg = page_address(pg + 1);
+ }
hv_get_vp_index(msr_vp_index);
@@ -208,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu)
unsigned int new_cpu;
unsigned long flags;
void **input_arg;
- void *input_pg = NULL;
+ void *pg;
local_irq_save(flags);
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- input_pg = *input_arg;
+ pg = *input_arg;
*input_arg = NULL;
+
+ if (hv_root_partition) {
+ void **output_arg;
+
+ output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+ *output_arg = NULL;
+ }
+
local_irq_restore(flags);
- free_page((unsigned long)input_pg);
+
+ free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
if (hv_vp_assist_page && hv_vp_assist_page[cpu])
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -264,6 +287,9 @@ static int hv_suspend(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
int ret;
+ if (hv_root_partition)
+ return -EPERM;
+
/*
* Reset the hypercall page as it is going to be invalidated
* accross hibernation. Setting hv_hypercall_pg to NULL ensures
@@ -334,6 +360,24 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
+static void __init hv_get_partition_id(void)
+{
+ struct hv_get_partition_id *output_page;
+ u64 status;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ /* No point in proceeding if this failed */
+ pr_err("Failed to get partition ID: %lld\n", status);
+ BUG();
+ }
+ hv_current_partition_id = output_page->partition_id;
+ local_irq_restore(flags);
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -368,6 +412,12 @@ void __init hyperv_init(void)
BUG_ON(hyperv_pcpu_input_arg == NULL);
+ /* Allocate the per-CPU state for output arg for root */
+ if (hv_root_partition) {
+ hyperv_pcpu_output_arg = alloc_percpu(void *);
+ BUG_ON(hyperv_pcpu_output_arg == NULL);
+ }
+
/* Allocate percpu VP index */
hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
GFP_KERNEL);
@@ -408,8 +458,35 @@ void __init hyperv_init(void)
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
- hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
- wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ if (hv_root_partition) {
+ struct page *pg;
+ void *src, *dst;
+
+ /*
+ * For the root partition, the hypervisor will set up its
+ * hypercall page. The hypervisor guarantees it will not show
+ * up in the root's address space. The root can't change the
+ * location of the hypercall page.
+ *
+ * Order is important here. We must enable the hypercall page
+ * so it is populated with code, then copy the code to an
+ * executable page.
+ */
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ pg = vmalloc_to_page(hv_hypercall_pg);
+ dst = kmap(pg);
+ src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
+ MEMREMAP_WB);
+ BUG_ON(!(src && dst));
+ memcpy(dst, src, HV_HYP_PAGE_SIZE);
+ memunmap(src);
+ kunmap(pg);
+ } else {
+ hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ }
/*
* hyperv_init() is called before LAPIC is initialized: see
@@ -428,6 +505,21 @@ void __init hyperv_init(void)
register_syscore_ops(&hv_syscore_ops);
hyperv_init_cpuhp = cpuhp;
+
+ if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
+ hv_get_partition_id();
+
+ BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
+
+#ifdef CONFIG_PCI_MSI
+ /*
+ * If we're running as root, we want to create our own PCI MSI domain.
+ * We can't set this in hv_pci_init because that would be too late.
+ */
+ if (hv_root_partition)
+ x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
+#endif
+
return;
remove_cpuhp_state:
@@ -552,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
bool hv_is_hibernation_supported(void)
{
- return acpi_sleep_state_supported(ACPI_STATE_S4);
+ return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.features_b & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+bool hv_is_isolation_supported(void)
+{
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
new file mode 100644
index 000000000000..60461e598239
--- /dev/null
+++ b/arch/x86/hyperv/hv_proc.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/acpi.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <linux/minmax.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+
+#include <asm/trace/hyperv.h>
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled. */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+ struct page **pages, *page;
+ int *counts;
+ int num_allocations;
+ int i, j, page_count;
+ int order;
+ u64 status;
+ int ret;
+ u64 base_pfn;
+ struct hv_deposit_memory *input_page;
+ unsigned long flags;
+
+ if (num_pages > HV_DEPOSIT_MAX)
+ return -E2BIG;
+ if (!num_pages)
+ return 0;
+
+ /* One buffer for page pointers and counts */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ pages = page_address(page);
+
+ counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
+ if (!counts) {
+ free_page((unsigned long)pages);
+ return -ENOMEM;
+ }
+
+ /* Allocate all the pages before disabling interrupts */
+ i = 0;
+
+ while (num_pages) {
+ /* Find highest order we can actually allocate */
+ order = 31 - __builtin_clz(num_pages);
+
+ while (1) {
+ pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+ if (pages[i])
+ break;
+ if (!order) {
+ ret = -ENOMEM;
+ num_allocations = i;
+ goto err_free_allocations;
+ }
+ --order;
+ }
+
+ split_page(pages[i], order);
+ counts[i] = 1 << order;
+ num_pages -= counts[i];
+ i++;
+ }
+ num_allocations = i;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+
+ /* Populate gpa_page_list - these will fit on the input page */
+ for (i = 0, page_count = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j, ++page_count)
+ input_page->gpa_page_list[page_count] = base_pfn + j;
+ }
+ status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+ page_count, 0, input_page, NULL);
+ local_irq_restore(flags);
+
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ pr_err("Failed to deposit pages: %lld\n", status);
+ ret = status;
+ goto err_free_allocations;
+ }
+
+ ret = 0;
+ goto free_buf;
+
+err_free_allocations:
+ for (i = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j)
+ __free_page(pfn_to_page(base_pfn + j));
+ }
+
+free_buf:
+ free_page((unsigned long)pages);
+ kfree(counts);
+ return ret;
+}
+
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
+{
+ struct hv_add_logical_processor_in *input;
+ struct hv_add_logical_processor_out *output;
+ u64 status;
+ unsigned long flags;
+ int ret = 0;
+ int pxm = node_to_pxm(node);
+
+ /*
+ * When adding a logical processor, the hypervisor may return
+ * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
+ * pages and retry.
+ */
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ /* We don't do anything with the output right now */
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->lp_index = lp_index;
+ input->apic_id = apic_id;
+ input->flags = 0;
+ input->proximity_domain_info.domain_id = pxm;
+ input->proximity_domain_info.flags.reserved = 0;
+ input->proximity_domain_info.flags.proximity_info_valid = 1;
+ input->proximity_domain_info.flags.proximity_preferred = 1;
+ status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
+ input, output);
+ local_irq_restore(flags);
+
+ status &= HV_HYPERCALL_RESULT_MASK;
+
+ if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
+ lp_index, apic_id, status);
+ ret = status;
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+ struct hv_create_vp *input;
+ u64 status;
+ unsigned long irq_flags;
+ int ret = 0;
+ int pxm = node_to_pxm(node);
+
+ /* Root VPs don't seem to need pages deposited */
+ if (partition_id != hv_current_partition_id) {
+ /* The value 90 is empirically determined. It may change. */
+ ret = hv_call_deposit_pages(node, partition_id, 90);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->flags = flags;
+ input->subnode_type = HvSubnodeAny;
+ if (node != NUMA_NO_NODE) {
+ input->proximity_domain_info.domain_id = pxm;
+ input->proximity_domain_info.flags.reserved = 0;
+ input->proximity_domain_info.flags.proximity_info_valid = 1;
+ input->proximity_domain_info.flags.proximity_preferred = 1;
+ } else {
+ input->proximity_domain_info.as_uint64 = 0;
+ }
+ status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+ local_irq_restore(irq_flags);
+
+ status &= HV_HYPERCALL_RESULT_MASK;
+
+ if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
+ vp_index, flags, status);
+ ret = status;
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, partition_id, 1);
+
+ } while (!ret);
+
+ return ret;
+}
+
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
new file mode 100644
index 000000000000..4421a8d92e23
--- /dev/null
+++ b/arch/x86/hyperv/irqdomain.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
+ *
+ * Authors:
+ * Sunil Muthuswamy <sunilmut@microsoft.com>
+ * Wei Liu <wei.liu@kernel.org>
+ */
+
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/mshyperv.h>
+
+static int hv_map_interrupt(union hv_device_id device_id, bool level,
+ int cpu, int vector, struct hv_interrupt_entry *entry)
+{
+ struct hv_input_map_device_interrupt *input;
+ struct hv_output_map_device_interrupt *output;
+ struct hv_device_interrupt_descriptor *intr_desc;
+ unsigned long flags;
+ u64 status;
+ int nr_bank, var_size;
+
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ intr_desc = &input->interrupt_descriptor;
+ memset(input, 0, sizeof(*input));
+ input->partition_id = hv_current_partition_id;
+ input->device_id = device_id.as_uint64;
+ intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+ intr_desc->vector_count = 1;
+ intr_desc->target.vector = vector;
+
+ if (level)
+ intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
+ else
+ intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+ intr_desc->target.vp_set.valid_bank_mask = 0;
+ intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
+ if (nr_bank < 0) {
+ local_irq_restore(flags);
+ pr_err("%s: unable to generate VP set\n", __func__);
+ return EINVAL;
+ }
+ intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+ /*
+ * var-sized hypercall, var-size starts after vp_mask (thus
+ * vp_set.format does not count, but vp_set.valid_bank_mask
+ * does).
+ */
+ var_size = nr_bank + 1;
+
+ status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+ input, output);
+ *entry = output->interrupt_entry;
+
+ local_irq_restore(flags);
+
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+ pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+
+ return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+{
+ unsigned long flags;
+ struct hv_input_unmap_device_interrupt *input;
+ struct hv_interrupt_entry *intr_entry;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ intr_entry = &input->interrupt_entry;
+ input->partition_id = hv_current_partition_id;
+ input->device_id = id;
+ *intr_entry = *old_entry;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+ local_irq_restore(flags);
+
+ return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+#ifdef CONFIG_PCI_MSI
+struct rid_data {
+ struct pci_dev *bridge;
+ u32 rid;
+};
+
+static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
+{
+ struct rid_data *rd = data;
+ u8 bus = PCI_BUS_NUM(rd->rid);
+
+ if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) {
+ rd->bridge = pdev;
+ rd->rid = alias;
+ }
+
+ return 0;
+}
+
+static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+{
+ union hv_device_id dev_id;
+ struct rid_data data = {
+ .bridge = NULL,
+ .rid = PCI_DEVID(dev->bus->number, dev->devfn)
+ };
+
+ pci_for_each_dma_alias(dev, get_rid_cb, &data);
+
+ dev_id.as_uint64 = 0;
+ dev_id.device_type = HV_DEVICE_TYPE_PCI;
+ dev_id.pci.segment = pci_domain_nr(dev->bus);
+
+ dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+ dev_id.pci.bdf.device = PCI_SLOT(data.rid);
+ dev_id.pci.bdf.function = PCI_FUNC(data.rid);
+ dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+
+ if (data.bridge) {
+ int pos;
+
+ /*
+ * Microsoft Hypervisor requires a bus range when the bridge is
+ * running in PCI-X mode.
+ *
+ * To distinguish conventional vs PCI-X bridge, we can check
+ * the bridge's PCI-X Secondary Status Register, Secondary Bus
+ * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+ * Specification Revision 1.0 5.2.2.1.3.
+ *
+ * Value zero means it is in conventional mode, otherwise it is
+ * in PCI-X mode.
+ */
+
+ pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+ if (pos) {
+ u16 status;
+
+ pci_read_config_word(data.bridge, pos +
+ PCI_X_BRIDGE_SSTATUS, &status);
+
+ if (status & PCI_X_SSTATUS_FREQ) {
+ /* Non-zero, PCI-X mode */
+ u8 sec_bus, sub_bus;
+
+ dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+
+ pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
+ dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
+ pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
+ dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
+ }
+ }
+ }
+
+ return dev_id;
+}
+
+static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
+ struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id = hv_build_pci_dev_id(dev);
+
+ return hv_map_interrupt(device_id, false, cpu, vector, entry);
+}
+
+static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+{
+ /* High address is always 0 */
+ msg->address_hi = 0;
+ msg->address_lo = entry->msi_entry.address.as_uint32;
+ msg->data = entry->msi_entry.data.as_uint32;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct msi_desc *msidesc;
+ struct pci_dev *dev;
+ struct hv_interrupt_entry out_entry, *stored_entry;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ cpumask_t *affinity;
+ int cpu;
+ u64 status;
+
+ msidesc = irq_data_get_msi_desc(data);
+ dev = msi_desc_to_pci_dev(msidesc);
+
+ if (!cfg) {
+ pr_debug("%s: cfg is NULL", __func__);
+ return;
+ }
+
+ affinity = irq_data_get_effective_affinity_mask(data);
+ cpu = cpumask_first_and(affinity, cpu_online_mask);
+
+ if (data->chip_data) {
+ /*
+ * This interrupt is already mapped. Let's unmap first.
+ *
+ * We don't use retarget interrupt hypercalls here because
+ * Microsoft Hypervisor doens't allow root to change the vector
+ * or specify VPs outside of the set that is initially used
+ * during mapping.
+ */
+ stored_entry = data->chip_data;
+ data->chip_data = NULL;
+
+ status = hv_unmap_msi_interrupt(dev, stored_entry);
+
+ kfree(stored_entry);
+
+ if (status != HV_STATUS_SUCCESS) {
+ pr_debug("%s: failed to unmap, status %lld", __func__, status);
+ return;
+ }
+ }
+
+ stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
+ if (!stored_entry) {
+ pr_debug("%s: failed to allocate chip data\n", __func__);
+ return;
+ }
+
+ status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
+ if (status != HV_STATUS_SUCCESS) {
+ kfree(stored_entry);
+ return;
+ }
+
+ *stored_entry = out_entry;
+ data->chip_data = stored_entry;
+ entry_to_msi_msg(&out_entry, msg);
+
+ return;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+{
+ return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+}
+
+static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+{
+ u64 status;
+ struct hv_interrupt_entry old_entry;
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct msi_msg msg;
+
+ desc = irq_to_desc(irq);
+ if (!desc) {
+ pr_debug("%s: no irq desc\n", __func__);
+ return;
+ }
+
+ data = &desc->irq_data;
+ if (!data) {
+ pr_debug("%s: no irq data\n", __func__);
+ return;
+ }
+
+ if (!data->chip_data) {
+ pr_debug("%s: no chip data\n!", __func__);
+ return;
+ }
+
+ old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+ entry_to_msi_msg(&old_entry, &msg);
+
+ kfree(data->chip_data);
+ data->chip_data = NULL;
+
+ status = hv_unmap_msi_interrupt(dev, &old_entry);
+
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+ return;
+ }
+}
+
+static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ int i;
+ struct msi_desc *entry;
+ struct pci_dev *pdev;
+
+ if (WARN_ON_ONCE(!dev_is_pci(dev)))
+ return;
+
+ pdev = to_pci_dev(dev);
+
+ for_each_pci_msi_entry(entry, pdev) {
+ if (entry->irq) {
+ for (i = 0; i < entry->nvec_used; i++) {
+ hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
+ irq_domain_free_irqs(entry->irq + i, 1);
+ }
+ }
+ }
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip hv_pci_msi_controller = {
+ .name = "HV-PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = hv_irq_compose_msi_msg,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+ .domain_free_irqs = hv_msi_domain_free_irqs,
+ .msi_prepare = pci_msi_prepare,
+};
+
+static struct msi_domain_info hv_pci_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &hv_pci_msi_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+struct irq_domain * __init hv_create_pci_msi_domain(void)
+{
+ struct irq_domain *d = NULL;
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
+ if (fn)
+ d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain);
+
+ /* No point in going further if we can't get an irq domain */
+ BUG_ON(!d);
+
+ return d;
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id;
+
+ device_id.as_uint64 = 0;
+ device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+ device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+ return hv_unmap_interrupt(device_id.as_uint64, entry);
+}
+EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
+ struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id;
+
+ device_id.as_uint64 = 0;
+ device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+ device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+ return hv_map_interrupt(device_id, level, cpu, vector, entry);
+}
+EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 6bf42aed387e..e6cd3fee562b 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -21,7 +21,9 @@
#define HYPERV_CPUID_FEATURES 0x40000003
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007
#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
+#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C
#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081
#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */
@@ -111,6 +113,15 @@
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
/*
+ * CPU management features identification.
+ * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
+ */
+#define HV_X64_START_LOGICAL_PROCESSOR BIT(0)
+#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1)
+#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2)
+#define HV_X64_RESERVED_IDENTITY_BIT BIT(31)
+
+/*
* Virtual processor will never share a physical core with another virtual
* processor, except for virtual processors that are reported as sibling SMT
* threads.
@@ -122,6 +133,20 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
+#define HV_PARAVISOR_PRESENT BIT(0)
+
+/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */
+#define HV_ISOLATION_TYPE GENMASK(3, 0)
+#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5)
+#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6)
+
+enum hv_isolation_type {
+ HV_ISOLATION_TYPE_NONE = 0,
+ HV_ISOLATION_TYPE_VBS = 1,
+ HV_ISOLATION_TYPE_SNP = 2
+};
+
/* Hyper-V specific model specific registers (MSRs) */
/* MSR used to identify the guest OS. */
@@ -523,6 +548,19 @@ struct hv_partition_assist_pg {
u32 tlb_lock_count;
};
+enum hv_interrupt_type {
+ HV_X64_INTERRUPT_TYPE_FIXED = 0x0000,
+ HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001,
+ HV_X64_INTERRUPT_TYPE_SMI = 0x0002,
+ HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003,
+ HV_X64_INTERRUPT_TYPE_NMI = 0x0004,
+ HV_X64_INTERRUPT_TYPE_INIT = 0x0005,
+ HV_X64_INTERRUPT_TYPE_SIPI = 0x0006,
+ HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007,
+ HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008,
+ HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009,
+ HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A,
+};
#include <asm-generic/hyperv-tlfs.h>
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 30f76b966857..ccf60a809a17 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -78,6 +78,13 @@ extern int hyperv_init_cpuhp;
extern void *hv_hypercall_pg;
extern void __percpu **hyperv_pcpu_input_arg;
+extern void __percpu **hyperv_pcpu_output_arg;
+
+extern u64 hv_current_partition_id;
+
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
@@ -239,6 +246,8 @@ int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
+extern bool hv_root_partition;
+
#ifdef CONFIG_X86_64
void hv_apic_init(void);
void __init hv_init_spinlocks(void);
@@ -250,10 +259,16 @@ static inline void hv_apic_init(void) {}
static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
struct msi_desc *msi_desc)
{
- msi_entry->address = msi_desc->msg.address_lo;
- msi_entry->data = msi_desc->msg.data;
+ msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
+ msi_entry->data.as_uint32 = msi_desc->msg.data;
}
+struct irq_domain *hv_create_pci_msi_domain(void);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
+ struct hv_interrupt_entry *entry);
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 43b54bef5448..e88bc296afca 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -31,6 +31,11 @@
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
+#include <asm/numa.h>
+
+/* Is Linux running as the root partition? */
+bool hv_root_partition;
+EXPORT_SYMBOL_GPL(hv_root_partition);
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -226,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void)
hv_init_spinlocks();
#endif
}
+
+static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ int ret;
+#endif
+
+ native_smp_prepare_cpus(max_cpus);
+
+#ifdef CONFIG_X86_64
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ BUG_ON(ret);
+ }
+
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
+ BUG_ON(ret);
+ }
+#endif
+}
#endif
static void __init ms_hyperv_init_platform(void)
@@ -243,6 +274,7 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
@@ -256,6 +288,22 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
/*
+ * Check CPU management privilege.
+ *
+ * To mirror what Windows does we should extract CPU management
+ * features and use the ReservedIdentityBit to detect if Linux is the
+ * root partition. But that requires negotiating CPU management
+ * interface (a process to be finalized).
+ *
+ * For now, use the privilege flag as the indicator for running as
+ * root.
+ */
+ if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
+ hv_root_partition = true;
+ pr_info("Hyper-V: running as root partition\n");
+ }
+
+ /*
* Extract host information.
*/
if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
@@ -277,6 +325,14 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
+ if (ms_hyperv.features_b & HV_ISOLATION) {
+ ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+
+ pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+ ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+ }
+
if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
@@ -366,6 +422,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+ if (hv_root_partition)
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
/*