From 437cfd714db9c1d28878a6e2555e9a730f3490c8 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:28 +0000 Subject: Documentation/virt/kvm/api.rst: Add protvirt dump/info api descriptions Time to add the dump API changes to the api documentation file. Also some minor cleanup. Signed-off-by: Janosch Frank Acked-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220517163629.3443-11-frankja@linux.ibm.com Message-Id: <20220517163629.3443-11-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- Documentation/virt/kvm/api.rst | 154 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 28b547a9d96a..47d3064f5b79 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5127,7 +5127,7 @@ into ESA mode. This reset is a superset of the initial reset. __u32 reserved[3]; }; -cmd values: +**cmd values:** KVM_PV_ENABLE Allocate memory and register the VM with the Ultravisor, thereby @@ -5143,7 +5143,6 @@ KVM_PV_ENABLE ===== ============================= KVM_PV_DISABLE - Deregister the VM from the Ultravisor and reclaim the memory that had been donated to the Ultravisor, making it usable by the kernel again. All registered VCPUs are converted back to non-protected @@ -5160,6 +5159,117 @@ KVM_PV_VM_VERIFY Verify the integrity of the unpacked image. Only if this succeeds, KVM is allowed to start protected VCPUs. +KVM_PV_INFO + :Capability: KVM_CAP_S390_PROTECTED_DUMP + + Presents an API that provides Ultravisor related data to userspace + via subcommands. len_max is the size of the user space buffer, + len_written is KVM's indication of how much bytes of that buffer + were actually written to. len_written can be used to determine the + valid fields if more response fields are added in the future. + + :: + + enum pv_cmd_info_id { + KVM_PV_INFO_VM, + KVM_PV_INFO_DUMP, + }; + + struct kvm_s390_pv_info_header { + __u32 id; + __u32 len_max; + __u32 len_written; + __u32 reserved; + }; + + struct kvm_s390_pv_info { + struct kvm_s390_pv_info_header header; + struct kvm_s390_pv_info_dump dump; + struct kvm_s390_pv_info_vm vm; + }; + +**subcommands:** + + KVM_PV_INFO_VM + This subcommand provides basic Ultravisor information for PV + hosts. These values are likely also exported as files in the sysfs + firmware UV query interface but they are more easily available to + programs in this API. + + The installed calls and feature_indication members provide the + installed UV calls and the UV's other feature indications. + + The max_* members provide information about the maximum number of PV + vcpus, PV guests and PV guest memory size. + + :: + + struct kvm_s390_pv_info_vm { + __u64 inst_calls_list[4]; + __u64 max_cpus; + __u64 max_guests; + __u64 max_guest_addr; + __u64 feature_indication; + }; + + + KVM_PV_INFO_DUMP + This subcommand provides information related to dumping PV guests. + + :: + + struct kvm_s390_pv_info_dump { + __u64 dump_cpu_buffer_len; + __u64 dump_config_mem_buffer_per_1m; + __u64 dump_config_finalize_len; + }; + +KVM_PV_DUMP + :Capability: KVM_CAP_S390_PROTECTED_DUMP + + Presents an API that provides calls which facilitate dumping a + protected VM. + + :: + + struct kvm_s390_pv_dmp { + __u64 subcmd; + __u64 buff_addr; + __u64 buff_len; + __u64 gaddr; /* For dump storage state */ + }; + + **subcommands:** + + KVM_PV_DUMP_INIT + Initializes the dump process of a protected VM. If this call does + not succeed all other subcommands will fail with -EINVAL. This + subcommand will return -EINVAL if a dump process has not yet been + completed. + + Not all PV vms can be dumped, the owner needs to set `dump + allowed` PCF bit 34 in the SE header to allow dumping. + + KVM_PV_DUMP_CONFIG_STOR_STATE + Stores `buff_len` bytes of tweak component values starting with + the 1MB block specified by the absolute guest address + (`gaddr`). `buff_len` needs to be `conf_dump_storage_state_len` + aligned and at least >= the `conf_dump_storage_state_len` value + provided by the dump uv_info data. buff_user might be written to + even if an error rc is returned. For instance if we encounter a + fault after writing the first page of data. + + KVM_PV_DUMP_COMPLETE + If the subcommand succeeds it completes the dump process and lets + KVM_PV_DUMP_INIT be called again. + + On success `conf_dump_finalize_len` bytes of completion data will be + stored to the `buff_addr`. The completion data contains a key + derivation seed, IV, tweak nonce and encryption keys as well as an + authentication tag all of which are needed to decrypt the dump at a + later time. + + 4.126 KVM_X86_SET_MSR_FILTER ---------------------------- @@ -5802,6 +5912,32 @@ of CPUID leaf 0xD on the host. This ioctl injects an event channel interrupt directly to the guest vCPU. +4.136 KVM_S390_PV_CPU_COMMAND +----------------------------- + +:Capability: KVM_CAP_S390_PROTECTED_DUMP +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: none +:Returns: 0 on success, < 0 on error + +This ioctl closely mirrors `KVM_S390_PV_COMMAND` but handles requests +for vcpus. It re-uses the kvm_s390_pv_dmp struct and hence also shares +the command ids. + +**command:** + +KVM_PV_DUMP + Presents an API that provides calls which facilitate dumping a vcpu + of a protected VM. + +**subcommand:** + +KVM_PV_DUMP_CPU + Provides encrypted dump data like register values. + The length of the returned data is provided by uv_info.guest_cpu_stor_len. + + 5. The kvm_run structure ======================== @@ -7954,6 +8090,20 @@ should adjust CPUID leaf 0xA to reflect that the PMU is disabled. When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. +8.37 KVM_CAP_S390_PROTECTED_DUMP +-------------------------------- + +:Capability: KVM_CAP_S390_PROTECTED_DUMP +:Architectures: s390 +:Type: vm + +This capability indicates that KVM and the Ultravisor support dumping +PV guests. The `KVM_PV_DUMP` command is available for the +`KVM_S390_PV_COMMAND` ioctl and the `KVM_PV_INFO` command provides +dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is +available and supports the `KVM_PV_DUMP_CPU` subcommand. + + 9. Known KVM API problems ========================= -- cgit v1.2.3 From b0f46280d3fcd59a65cfae9742fa5172362af893 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 May 2022 16:36:29 +0000 Subject: Documentation/virt/kvm/api.rst: Explain rc/rrc delivery Let's explain in which situations the rc/rrc will set in struct kvm_pv_cmd so it's clear that the struct members should be set to 0. rc/rrc are independent of the IOCTL return code. Signed-off-by: Janosch Frank Acked-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220517163629.3443-12-frankja@linux.ibm.com Message-Id: <20220517163629.3443-12-frankja@linux.ibm.com> Signed-off-by: Christian Borntraeger --- Documentation/virt/kvm/api.rst | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 47d3064f5b79..0dd6d23c32ee 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5127,6 +5127,14 @@ into ESA mode. This reset is a superset of the initial reset. __u32 reserved[3]; }; +**Ultravisor return codes** +The Ultravisor return (reason) codes are provided by the kernel if a +Ultravisor call has been executed to achieve the results expected by +the command. Therefore they are independent of the IOCTL return +code. If KVM changes `rc`, its value will always be greater than 0 +hence setting it to 0 before issuing a PV command is advised to be +able to detect a change of `rc`. + **cmd values:** KVM_PV_ENABLE -- cgit v1.2.3 From 35875316384b71d23dc2a45a969732fc8cab16af Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Tue, 19 Apr 2022 23:44:44 +0800 Subject: KVM: x86: Allow userspace to set maximum VCPU id for VM Introduce new max_vcpu_ids in KVM for x86 architecture. Userspace can assign maximum possible vcpu id for current VM session using KVM_CAP_MAX_VCPU_ID of KVM_ENABLE_CAP ioctl(). This is done for x86 only because the sole use case is to guide memory allocation for PID-pointer table, a structure needed to enable VMX IPI. By default, max_vcpu_ids set as KVM_MAX_VCPU_IDS. Suggested-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Zeng Guang Message-Id: <20220419154444.11888-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 94d73ec52aba..421479a67da5 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7494,6 +7494,27 @@ The valid bits in cap.args[0] are: generate a #UD within the guest. =================================== ============================================ +7.32 KVM_CAP_MAX_VCPU_ID +------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] - maximum APIC ID value set for current VM +:Returns: 0 on success, -EINVAL if args[0] is beyond KVM_MAX_VCPU_IDS + supported in KVM or if it has been set. + +This capability allows userspace to specify maximum possible APIC ID +assigned for current VM session prior to the creation of vCPUs, saving +memory for data structures indexed by the APIC ID. Userspace is able +to calculate the limit to APIC ID values from designated +CPU topology. + +The value can be changed only until KVM_ENABLE_CAP is set to a nonzero +value or until a vCPU is created. Upon creation of the first vCPU, +if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM +uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as +the maximum APIC ID. + 8. Other capabilities. ====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8109805b5429..9dc8d6d0a67d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1243,6 +1243,12 @@ struct kvm_arch { hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; #endif + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. + */ + u32 max_vcpu_ids; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c2e2b6d1767..25fbb90c7c93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6087,6 +6087,20 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -11246,6 +11260,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + return 0; } -- cgit v1.2.3 From ed2351174e38ad4febbbc0dba802803e6cff8ae0 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Tue, 24 May 2022 21:56:21 +0800 Subject: KVM: x86: Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault For the triple fault sythesized by KVM, e.g. the RSM path or nested_vmx_abort(), if KVM exits to userspace before the request is serviced, userspace could migrate the VM and lose the triple fault. Extend KVM_{G,S}ET_VCPU_EVENTS to support pending triple fault with a new event KVM_VCPUEVENT_VALID_FAULT_FAULT so that userspace can save and restore the triple fault event. This extension is guarded by a new KVM capability KVM_CAP_TRIPLE_FAULT_EVENT. Note that in the set_vcpu_events path, userspace is able to set/clear the triple fault request through triple_fault.pending field. Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-2-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 8 ++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm.h | 6 +++++- arch/x86/kvm/x86.c | 21 ++++++++++++++++++++- include/uapi/linux/kvm.h | 1 + 5 files changed, 36 insertions(+), 2 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 421479a67da5..f67e367c4059 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1150,6 +1150,10 @@ The following bits are defined in the flags field: fields contain a valid state. This bit will be set whenever KVM_CAP_EXCEPTION_PAYLOAD is enabled. +- KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the + triple_fault_pending field contains a valid state. This bit will + be set whenever KVM_CAP_TRIPLE_FAULT_EVENT is enabled. + ARM64: ^^^^^^ @@ -1245,6 +1249,10 @@ can be set in the flags field to signal that the exception_has_payload, exception_payload, and exception.pending fields contain a valid state and shall be written into the VCPU. +If KVM_CAP_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT +can be set in flags field to signal that the triple_fault field contains +a valid state and shall be written into the VCPU. + ARM64: ^^^^^^ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 032278f0ee6d..d6c62276e131 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1174,6 +1174,8 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; /* diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..24c807c8d5f7 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -325,6 +325,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +360,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8dfdef9e52f..422fbb0d7518 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4296,6 +4296,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4942,6 +4943,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4955,7 +4960,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5028,6 +5034,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -6029,6 +6044,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c4a32910b88a..ca799319acfd 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1158,6 +1158,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #define KVM_CAP_S390_PROTECTED_DUMP 217 +#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 2f4073e08f4cc5a41e35d777c240aaadd0257051 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 24 May 2022 21:56:24 +0800 Subject: KVM: VMX: Enable Notify VM exit There are cases that malicious virtual machines can cause CPU stuck (due to event windows don't open up), e.g., infinite loop in microcode when nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. It leads the CPU to be unavailable to host or other VMs. VMM can enable notify VM exit that a VM exit generated if no event window occurs in VM non-root mode for a specified amount of time (notify window). Feature enabling: - The new vmcs field SECONDARY_EXEC_NOTIFY_VM_EXITING is introduced to enable this feature. VMM can set NOTIFY_WINDOW vmcs field to adjust the expected notify window. - Add a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT so that user space can query and enable this feature in per-VM scope. The argument is a 64bit value: bits 63:32 are used for notify window, and bits 31:0 are for flags. Current supported flags: - KVM_X86_NOTIFY_VMEXIT_ENABLED: enable the feature with the notify window provided. - KVM_X86_NOTIFY_VMEXIT_USER: exit to userspace once the exits happen. - It's safe to even set notify window to zero since an internal hardware threshold is added to vmcs.notify_window. VM exit handling: - Introduce a vcpu state notify_window_exits to records the count of notify VM exits and expose it through the debugfs. - Notify VM exit can happen incident to delivery of a vector event. Allow it in KVM. - Exit to userspace unconditionally for handling when VM_CONTEXT_INVALID bit is set. Nested handling - Nested notify VM exits are not supported yet. Keep the same notify window control in vmcs02 as vmcs01, so that L1 can't escape the restriction of notify VM exits through launching L2 VM. Notify VM exit is defined in latest Intel Architecture Instruction Set Extensions Programming Reference, chapter 9.2. Co-developed-by: Xiaoyao Li Signed-off-by: Xiaoyao Li Signed-off-by: Tao Xu Co-developed-by: Chenyi Qiang Signed-off-by: Chenyi Qiang Message-Id: <20220524135624.22988-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 7 ++++++ arch/x86/include/asm/vmx.h | 7 ++++++ arch/x86/include/asm/vmxfeatures.h | 1 + arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/vmx/capabilities.h | 6 +++++ arch/x86/kvm/vmx/nested.c | 8 +++++++ arch/x86/kvm/vmx/vmx.c | 40 +++++++++++++++++++++++++++++-- arch/x86/kvm/x86.c | 22 ++++++++++++++++- arch/x86/kvm/x86.h | 7 ++++++ include/uapi/linux/kvm.h | 11 +++++++++ 11 files changed, 158 insertions(+), 4 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f67e367c4059..30e31a886422 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6557,6 +6557,26 @@ array field represents return values. The userspace should update the return values of SBI call before resuming the VCPU. For more details on RISC-V SBI spec refer, https://github.com/riscv/riscv-sbi-doc. +:: + + /* KVM_EXIT_NOTIFY */ + struct { + #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; + +Used on x86 systems. When the VM capability KVM_CAP_X86_NOTIFY_VMEXIT is +enabled, a VM exit generated if no event window occurs in VM non-root mode +for a specified amount of time. Once KVM_X86_NOTIFY_VMEXIT_USER is set when +enabling the cap, it would exit to userspace with the exit reason +KVM_EXIT_NOTIFY for further handling. The "flags" field contains more +detailed info. + +The valid value for 'flags' is: + + - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid + in VMCS. It would run into unknown result if resume the target VM. + :: /* Fix the size of the union. */ @@ -7523,6 +7543,35 @@ if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as the maximum APIC ID. +7.33 KVM_CAP_X86_NOTIFY_VMEXIT +------------------------------ + +:Architectures: x86 +:Target: VM +:Parameters: args[0] is the value of notify window as well as some flags +:Returns: 0 on success, -EINVAL if args[0] contains invalid flags or notify + VM exit is unsupported. + +Bits 63:32 of args[0] are used for notify window. +Bits 31:0 of args[0] are for some flags. Valid bits are:: + + #define KVM_X86_NOTIFY_VMEXIT_ENABLED (1 << 0) + #define KVM_X86_NOTIFY_VMEXIT_USER (1 << 1) + +This capability allows userspace to configure the notify VM exit on/off +in per-VM scope during VM creation. Notify VM exit is disabled by default. +When userspace sets KVM_X86_NOTIFY_VMEXIT_ENABLED bit in args[0], VMM will +enable this feature with the notify window provided, which will generate +a VM exit if no event window occurs in VM non-root mode for a specified of +time (notify window). + +If KVM_X86_NOTIFY_VMEXIT_USER is set in args[0], upon notify VM exits happen, +KVM would exit to userspace for handling. + +This capability is aimed to mitigate the threat that malicious VMs can +cause CPU stuck (due to event windows don't open up) and make the CPU +unavailable to host or other VMs. + 8. Other capabilities. ====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4e00bca08cfa..6cf5d77d7896 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -65,6 +65,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -1178,6 +1181,9 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1325,6 +1331,7 @@ struct kvm_vcpu_stat { u64 directed_yield_attempted; u64 directed_yield_successful; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 89d2172787c5..c371ef695fcc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -75,6 +75,7 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) /* * Definitions of Tertiary Processor-Based VM-Execution Controls. @@ -280,6 +281,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -564,6 +566,11 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) +/* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) + /* * VM-instruction error numbers */ diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index 589608c157bf..c6a7eed03914 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -85,6 +85,7 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ /* Tertiary Processor-Based VM-Execution Controls, word 3 */ #define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f14c4bef97e0..2d3f13b18714 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -436,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5c5f4e3762f5..7d8cd0ebcc75 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2133,6 +2133,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2177,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based @@ -6112,6 +6117,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6d631941ac1a..2e00890d752a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2499,7 +2499,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -4417,6 +4418,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } @@ -4498,6 +4502,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -5784,6 +5791,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5841,6 +5874,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -6214,7 +6248,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -8137,6 +8172,7 @@ static __init int hardware_setup(void) kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; kvm_caps.tsc_scaling_ratio_frac_bits = 48; kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53e5f2ad2422..a8014233fd57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -284,7 +284,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_ICOUNTER(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -4402,6 +4403,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -6125,6 +6129,22 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 359d0454ad28..501b884b8cc4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -21,6 +21,8 @@ struct kvm_caps { u64 default_tsc_scaling_ratio; /* bus lock detection supported? */ bool has_bus_lock_exit; + /* notify VM exit supported? */ + bool has_notify_vmexit; u64 supported_mce_cap; u64 supported_xcr0; @@ -364,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } +static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) +{ + return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; +} + enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. */ KVM_HANDLING_IRQ = 1, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ca799319acfd..7569b4ec199c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -270,6 +270,7 @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_NOTIFY 36 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -496,6 +497,11 @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_NOTIFY */ + struct { +#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; /* Fix the size of the union. */ char padding[256]; }; @@ -1159,6 +1165,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 +#define KVM_CAP_X86_NOTIFY_VMEXIT 219 #ifdef KVM_CAP_IRQ_ROUTING @@ -2174,4 +2181,8 @@ struct kvm_stats_desc { /* Available with KVM_CAP_S390_PROTECTED_DUMP */ #define KVM_S390_PV_CPU_COMMAND _IOWR(KVMIO, 0xd0, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */ +#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0) +#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From b172862241b4849985c3e0e86cfb05d61e4a841d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Tue, 31 May 2022 13:44:21 +0100 Subject: KVM: x86: PIT: Preserve state of speaker port data bit Currently the state of the speaker port (0x61) data bit (bit 1) is not saved in the exported state (kvm_pit_state2) and hence is lost when re-constructing guest state. This patch removes the 'speaker_data_port' field from kvm_kpit_state and instead tracks the state using a new KVM_PIT_FLAGS_SPEAKER_DATA_ON flag defined in the API. Signed-off-by: Paul Durrant Message-Id: <20220531124421.1427-1-pdurrant@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 +++- arch/x86/include/uapi/asm/kvm.h | 3 ++- arch/x86/kvm/i8254.c | 10 +++++++--- arch/x86/kvm/i8254.h | 1 - 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 30e31a886422..9cbbfdb663b6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3006,7 +3006,9 @@ KVM_CREATE_PIT2. The state is returned in the following structure:: Valid flags are:: /* disable PIT in HPET legacy mode */ - #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + /* speaker port data bit enabled */ + #define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 This IOCTL replaces the obsolete KVM_GET_PIT. diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 24c807c8d5f7..50a4e787d5e6 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -306,7 +306,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1c83076091af..e0a7a0e7a73c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; + if (val & (1 << 1)) + pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; + else + pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; @@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); + ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | + pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | + (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 394d9527da7e..a768212ba821 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -29,7 +29,6 @@ struct kvm_kpit_state { bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - u32 speaker_data_on; struct mutex lock; atomic_t reinject; -- cgit v1.2.3 From 8deb03e75f6048b33b80025b6475c92975670c5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Jun 2022 12:16:53 -0700 Subject: KVM: Fix references to non-existent KVM_CAP_TRIPLE_FAULT_EVENT The x86-only KVM_CAP_TRIPLE_FAULT_EVENT was (appropriately) renamed to KVM_CAP_X86_TRIPLE_FAULT_EVENT when the patches were applied, but the docs and selftests got left behind. Fix them. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 ++-- tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9cbbfdb663b6..84c486ce6279 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1152,7 +1152,7 @@ The following bits are defined in the flags field: - KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the triple_fault_pending field contains a valid state. This bit will - be set whenever KVM_CAP_TRIPLE_FAULT_EVENT is enabled. + be set whenever KVM_CAP_X86_TRIPLE_FAULT_EVENT is enabled. ARM64: ^^^^^^ @@ -1249,7 +1249,7 @@ can be set in the flags field to signal that the exception_has_payload, exception_payload, and exception.pending fields contain a valid state and shall be written into the VCPU. -If KVM_CAP_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT +If KVM_CAP_X86_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT can be set in flags field to signal that the triple_fault field contains a valid state and shall be written into the VCPU. diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c index 6e1de0631ce9..66378140764d 100644 --- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -47,7 +47,7 @@ int main(void) struct ucall uc; struct kvm_enable_cap cap = { - .cap = KVM_CAP_TRIPLE_FAULT_EVENT, + .cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT, .args = {1} }; @@ -56,8 +56,8 @@ int main(void) exit(KSFT_SKIP); } - if (!kvm_check_cap(KVM_CAP_TRIPLE_FAULT_EVENT)) { - print_skip("KVM_CAP_TRIPLE_FAULT_EVENT not supported"); + if (!kvm_check_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)) { + print_skip("KVM_CAP_X86_TRIPLE_FAULT_EVENT not supported"); exit(KSFT_SKIP); } -- cgit v1.2.3 From bfbcc81bb82cbbad8bf4e40cea274f42b50674e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jun 2022 22:45:12 +0000 Subject: KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior Add a quirk for KVM's behavior of emulating intercepted MONITOR/MWAIT instructions a NOPs regardless of whether or not they are supported in guest CPUID. KVM's current behavior was likely motiviated by a certain fruity operating system that expects MONITOR/MWAIT to be supported unconditionally and blindly executes MONITOR/MWAIT without first checking CPUID. And because KVM does NOT advertise MONITOR/MWAIT to userspace, that's effectively the default setup for any VMM that regurgitates KVM_GET_SUPPORTED_CPUID to KVM_SET_CPUID2. Note, this quirk interacts with KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT. The behavior is actually desirable, as userspace VMMs that want to unconditionally hide MONITOR/MWAIT from the guest can leave the MISC_ENABLE quirk enabled. Signed-off-by: Sean Christopherson Message-Id: <20220608224516.3788274-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 13 +++++++++++++ arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/x86.c | 30 +++++++++++++++++++----------- 4 files changed, 35 insertions(+), 12 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 84c486ce6279..320cb04f7bd9 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7522,6 +7522,19 @@ The valid bits in cap.args[0] are: hypercall instructions. Executing the incorrect hypercall instruction will generate a #UD within the guest. + +KVM_X86_QUIRK_MWAIT_NEVER_FAULTS By default, KVM emulates MONITOR/MWAIT (if + they are intercepted) as NOPs regardless of + whether or not MONITOR/MWAIT are supported + according to guest CPUID. When this quirk + is disabled and KVM_X86_DISABLE_EXITS_MWAIT + is not set (MONITOR/MWAIT are intercepted), + KVM will inject a #UD on MONITOR/MWAIT if + they're unsupported per guest CPUID. Note, + KVM will modify MONITOR/MWAIT support in + guest CPUID on writes to MISC_ENABLE if + KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is + disabled. =================================== ============================================ 7.32 KVM_CAP_MAX_VCPU_ID diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1038ccb7056a..e37727a74d0a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2076,6 +2076,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN) + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 50a4e787d5e6..ee3896416c68 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b70fe8560881..9b9f9f65c548 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2036,13 +2036,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2050,11 +2043,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -3884,8 +3892,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); /* - * Userspace is allowed to read MSRs that KVM reports in - * KVM_GET_MSR_INDEX_LIST, even if an MSR isn't fully supported. + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. */ if (!msr_info->host_initiated) return 1; -- cgit v1.2.3 From 084cc29f8bbb034cf30a7ee07a816c115e0c28df Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 13 Jun 2022 21:25:21 +0000 Subject: KVM: x86/MMU: Allow NX huge pages to be disabled on a per-vm basis In some cases, the NX hugepage mitigation for iTLB multihit is not needed for all guests on a host. Allow disabling the mitigation on a per-VM basis to avoid the performance hit of NX hugepages on trusted workloads. In order to disable NX hugepages on a VM, ensure that the userspace actor has permission to reboot the system. Since disabling NX hugepages would allow a guest to crash the system, it is similar to reboot permissions. Ideally, KVM would require userspace to prove it has access to KVM's nx_huge_pages module param, e.g. so that userspace can opt out without needing full reboot permissions. But getting access to the module param file info is difficult because it is buried in layers of sysfs and module glue. Requiring CAP_SYS_BOOT is sufficient for all known use cases. Suggested-by: Jim Mattson Reviewed-by: David Matlack Reviewed-by: Peter Xu Signed-off-by: Ben Gardon Message-Id: <20220613212523.3436117-9-bgardon@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu/mmu_internal.h | 7 ++++--- arch/x86/kvm/mmu/spte.c | 7 ++++--- arch/x86/kvm/mmu/spte.h | 3 ++- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 8 files changed, 60 insertions(+), 8 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 320cb04f7bd9..bafaeedd455c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8206,6 +8206,22 @@ PV guests. The `KVM_PV_DUMP` command is available for the dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is available and supports the `KVM_PV_DUMP_CPU` subcommand. +8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +--------------------------- + +:Capability KVM_CAP_VM_DISABLE_NX_HUGE_PAGES +:Architectures: x86 +:Type: vm +:Parameters: arg[0] must be 0. +:Returns 0 on success, -EPERM if the userspace process does not + have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been + created. + +This capability disables the NX huge pages mitigation for iTLB MULTIHIT. + +The capability has no effect if the nx_huge_pages module parameter is not set. + +This capability may only be set before any vCPUs are created. 9. Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e37727a74d0a..7e4c31b57a75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1336,6 +1336,8 @@ struct kvm_arch { * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. */ u32 max_vcpu_ids; + + bool disable_nx_huge_pages; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5e1e3c8f8aaa..bb9d12ac0db3 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -155,9 +155,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -256,7 +256,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 242e4828d7df..db294c1beea2 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,7 +147,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -246,7 +246,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index) { u64 child_spte; int child_level; @@ -274,7 +275,7 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) * When splitting to a 4K page, mark the page executable as the * NX hugepage mitigation no longer applies. */ - if (is_nx_huge_page_enabled()) + if (is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 121c5eaaec77..256f90587e8d 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -421,7 +421,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, int huge_level, + int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1ea40809ef1f..522e2532343b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1478,7 +1478,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, level, i); /* * Replace the huge spte with a pointer to the populated lower level diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b3b2ea8ee0..7ce0c6fe166d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4324,6 +4324,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -6184,6 +6185,35 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7569b4ec199c..a36e78710382 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1166,6 +1166,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED_DUMP 217 #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 +#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From db1c875e0539518e3d5fe9876ef50975cf4476bb Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 6 Jun 2022 16:33:24 -0400 Subject: KVM: s390: add KVM_S390_ZPCI_OP to manage guest zPCI devices The KVM_S390_ZPCI_OP ioctl provides a mechanism for managing hardware-assisted virtualization features for s390x zPCI passthrough. Add the first 2 operations, which can be used to enable/disable the specified device for Adapter Event Notification interpretation. Signed-off-by: Matthew Rosato Acked-by: Pierre Morel Reviewed-by: Thomas Huth Link: https://lore.kernel.org/r/20220606203325.110625-21-mjrosato@linux.ibm.com Signed-off-by: Christian Borntraeger --- Documentation/virt/kvm/api.rst | 47 +++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 16 ++++++++ arch/s390/kvm/pci.c | 85 ++++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/pci.h | 2 + include/uapi/linux/kvm.h | 31 +++++++++++++++ 5 files changed, 181 insertions(+) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 11e00a46c610..d58354e9af8f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5802,6 +5802,53 @@ of CPUID leaf 0xD on the host. This ioctl injects an event channel interrupt directly to the guest vCPU. +4.137 KVM_S390_ZPCI_OP +-------------------- + +:Capability: KVM_CAP_S390_ZPCI_OP +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_zpci_op (in) +:Returns: 0 on success, <0 on error + +Used to manage hardware-assisted virtualization features for zPCI devices. + +Parameters are specified via the following structure:: + + struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; + }; + +The type of operation is specified in the "op" field. +KVM_S390_ZPCIOP_REG_AEN is used to register the VM for adapter event +notification interpretation, which will allow firmware delivery of adapter +events directly to the vm, with KVM providing a backup delivery mechanism; +KVM_S390_ZPCIOP_DEREG_AEN is used to subsequently disable interpretation of +adapter event notifications. + +The target zPCI function must also be specified via the "fh" field. For the +KVM_S390_ZPCIOP_REG_AEN operation, additional information to establish firmware +delivery must be provided via the "reg_aen" struct. + +The "pad" and "reserved" fields may be used for future extensions and should be +set to 0s by userspace. + 5. The kvm_run structure ======================== diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4758bb731199..f214e0fc62ed 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -618,6 +618,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; default: r = 0; } @@ -2629,6 +2632,19 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 24211741deb0..4946fb7757d6 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -585,6 +585,91 @@ void kvm_s390_pci_clear_list(struct kvm *kvm) spin_unlock(&kvm->arch.kzdev_list_lock); } +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + int kvm_s390_pci_init(void) { aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h index fb2b91b76e0c..0351382e990f 100644 --- a/arch/s390/kvm/pci.h +++ b/arch/s390/kvm/pci.h @@ -59,6 +59,8 @@ void kvm_s390_pci_aen_exit(void); void kvm_s390_pci_init_list(struct kvm *kvm); void kvm_s390_pci_clear_list(struct kvm *kvm); +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + int kvm_s390_pci_init(void); void kvm_s390_pci_exit(void); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..2f302e2287d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1157,6 +1157,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_TSC_CONTROL 214 #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_ARM_SYSTEM_SUSPEND 216 +#define KVM_CAP_S390_ZPCI_OP 221 #ifdef KVM_CAP_IRQ_ROUTING @@ -2118,4 +2119,34 @@ struct kvm_stats_desc { /* Available with KVM_CAP_XSAVE2 */ #define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) +/* Available with KVM_CAP_S390_ZPCI_OP */ +#define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From 5efab5cdf06b932ba7a53264b60f7d5c6ec87edf Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 12 Jul 2022 16:29:54 +0700 Subject: Documentation: kvm: extend KVM_S390_ZPCI_OP subheading underline Stephen Rothwell reported the htmldocs warning: Documentation/virt/kvm/api.rst:5959: WARNING: Title underline too short. 4.137 KVM_S390_ZPCI_OP -------------------- The warning is due to subheading underline on KVM_S390_ZPCI_OP section is short of 2 dashes. Extend the underline to fix the warning. Link: https://lore.kernel.org/linux-next/20220711205557.183c3b14@canb.auug.org.au/ Fixes: a0c4d1109d6cc5 ("KVM: s390: add KVM_S390_ZPCI_OP to manage guest zPCI devices") Reported-by: Stephen Rothwell Cc: Paolo Bonzini Cc: Jonathan Corbet Cc: Pierre Morel Cc: Thomas Huth Cc: Matthew Rosato Cc: Christian Borntraeger Cc: Janosch Frank Cc: kvm@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20220712092954.142027-4-bagasdotme@gmail.com Signed-off-by: Christian Borntraeger --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5abc0c1a5aff..5be5cc59869d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5956,7 +5956,7 @@ KVM_PV_DUMP_CPU The length of the returned data is provided by uv_info.guest_cpu_stor_len. 4.137 KVM_S390_ZPCI_OP --------------------- +---------------------- :Capability: KVM_CAP_S390_ZPCI_OP :Architectures: s390 -- cgit v1.2.3 From 43bb9e000ea4c62154c01844771fea25b8b83520 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Jul 2022 22:57:53 +0000 Subject: KVM: x86: Tweak name of MONITOR/MWAIT #UD quirk to make it #UD specific Add a "UD" clause to KVM_X86_QUIRK_MWAIT_NEVER_FAULTS to make it clear that the quirk only controls the #UD behavior of MONITOR/MWAIT. KVM doesn't currently enforce fault checks when MONITOR/MWAIT are supported, but that could change in the future. SVM also has a virtualization hole in that it checks all faults before intercepts, and so "never faults" is already a lie when running on SVM. Fixes: bfbcc81bb82c ("KVM: x86: Add a quirk for KVM's "MONITOR/MWAIT are NOPs!" behavior") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220711225753.1073989-4-seanjc@google.com --- Documentation/virt/kvm/api.rst | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/uapi/asm/kvm.h | 2 +- arch/x86/kvm/x86.c | 2 +- tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bafaeedd455c..cd9361f22530 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7523,7 +7523,7 @@ The valid bits in cap.args[0] are: incorrect hypercall instruction will generate a #UD within the guest. -KVM_X86_QUIRK_MWAIT_NEVER_FAULTS By default, KVM emulates MONITOR/MWAIT (if +KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if they are intercepted) as NOPs regardless of whether or not MONITOR/MWAIT are supported according to guest CPUID. When this quirk diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd6a26f7d46c..d4ece7bf2124 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2096,6 +2096,6 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ee3896416c68..a0c0ab0c898e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,7 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) -#define KVM_X86_QUIRK_MWAIT_NEVER_FAULTS (1 << 6) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67dcaa670874..db46c060acb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2046,7 +2046,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_FAULTS) && + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) return kvm_handle_invalid_op(vcpu); diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c index 6a4ebcdfa374..094c68d744c0 100644 --- a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) disabled_quirks = 0; if (testcase & MWAIT_QUIRK_DISABLED) - disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_FAULTS; + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; if (testcase & MISC_ENABLES_QUIRK_DISABLED) disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); -- cgit v1.2.3 From f5ecfee94493475783074e86ded10a0499d779fc Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 14 Jul 2022 21:43:34 +0200 Subject: KVM: s390: resetting the Topology-Change-Report During a subsystem reset the Topology-Change-Report is cleared. Let's give userland the possibility to clear the MTCR in the case of a subsystem reset. To migrate the MTCR, we give userland the possibility to query the MTCR state. We indicate KVM support for the CPU topology facility with a new KVM capability: KVM_CAP_S390_CPU_TOPOLOGY. Signed-off-by: Pierre Morel Reviewed-by: Janis Schoetterl-Glausch Reviewed-by: Janosch Frank Message-Id: <20220714194334.127812-1-pmorel@linux.ibm.com> Link: https://lore.kernel.org/all/20220714194334.127812-1-pmorel@linux.ibm.com/ [frankja@linux.ibm.com: Simple conflict resolution in Documentation/virt/kvm/api.rst] Signed-off-by: Janosch Frank --- Documentation/virt/kvm/api.rst | 25 ++++++++++++++++++++ arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 51 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 78 insertions(+) (limited to 'Documentation/virt/kvm/api.rst') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5be5cc59869d..3c4551a2f6d0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8269,6 +8269,31 @@ The capability has no effect if the nx_huge_pages module parameter is not set. This capability may only be set before any vCPUs are created. +8.39 KVM_CAP_S390_CPU_TOPOLOGY +------------------------------ + +:Capability: KVM_CAP_S390_CPU_TOPOLOGY +:Architectures: s390 +:Type: vm + +This capability indicates that KVM will provide the S390 CPU Topology +facility which consist of the interpretation of the PTF instruction for +the function code 2 along with interception and forwarding of both the +PTF instruction with function codes 0 or 1 and the STSI(15,1,x) +instruction to the userland hypervisor. + +The stfle facility 11, CPU Topology facility, should not be indicated +to the guest without this capability. + +When this capability is present, KVM provides a new attribute group +on vm fd, KVM_S390_VM_CPU_TOPOLOGY. +This new attribute allows to get, set or clear the Modified Change +Topology Report (MTCR) bit of the SCA through the kvm_device_attr +structure. + +When getting the Modified Change Topology Report value, the attr->addr +must point to a byte where the value will be stored or retrieved from. + 9. Known KVM API problems ========================= diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7a6b14874d65..a73cf01a1606 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5d18b66a08c9..edfd4bbd0cba 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -642,6 +642,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_ZPCI_OP: r = kvm_s390_pci_interp_allowed(); break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } @@ -853,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -1789,6 +1806,31 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) read_unlock(&kvm->arch.sca_lock); } +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1809,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1834,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1907,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 20817dd7f2f1..7e06194129e3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1168,6 +1168,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_NOTIFY_VMEXIT 219 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 +#define KVM_CAP_S390_CPU_TOPOLOGY 222 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3