From db205f7e1edc8d8f0880f0218d3a03b13fe94af3 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 21 Sep 2022 15:15:22 +0000 Subject: KVM: x86: Add a VALID_MASK for the MSR exit reason flags Add the mask KVM_MSR_EXIT_REASON_VALID_MASK for the MSR exit reason flags. This simplifies checks that validate these flags, and makes it easier to introduce new flags in the future. No functional change intended. Signed-off-by: Aaron Lewis Message-Id: <20220921151525.904162-3-aaronlewis@google.com> Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139a..7fea12369245 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -485,6 +485,9 @@ struct kvm_run { #define KVM_MSR_EXIT_REASON_INVAL (1 << 0) #define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) #define KVM_MSR_EXIT_REASON_FILTER (1 << 2) +#define KVM_MSR_EXIT_REASON_VALID_MASK (KVM_MSR_EXIT_REASON_INVAL | \ + KVM_MSR_EXIT_REASON_UNKNOWN | \ + KVM_MSR_EXIT_REASON_FILTER) __u32 reason; /* kernel -> user */ __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ -- cgit v1.2.3 From 86bdf3ebcfe1ded055282536fecce13001874740 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Nov 2022 18:49:10 +0800 Subject: KVM: Support dirty ring in conjunction with bitmap ARM64 needs to dirty memory outside of a VCPU context when VGIC/ITS is enabled. It's conflicting with that ring-based dirty page tracking always requires a running VCPU context. Introduce a new flavor of dirty ring that requires the use of both VCPU dirty rings and a dirty bitmap. The expectation is that for non-VCPU sources of dirty memory (such as the VGIC/ITS on arm64), KVM writes to the dirty bitmap. Userspace should scan the dirty bitmap before migrating the VM to the target. Use an additional capability to advertise this behavior. The newly added capability (KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP) can't be enabled before KVM_CAP_DIRTY_LOG_RING_ACQ_REL on ARM64. In this way, the newly added capability is treated as an extension of KVM_CAP_DIRTY_LOG_RING_ACQ_REL. Suggested-by: Marc Zyngier Suggested-by: Peter Xu Co-developed-by: Oliver Upton Signed-off-by: Oliver Upton Signed-off-by: Gavin Shan Acked-by: Peter Xu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221110104914.31280-4-gshan@redhat.com --- Documentation/virt/kvm/api.rst | 34 +++++++++++--- Documentation/virt/kvm/devices/arm-vgic-its.rst | 5 +- include/linux/kvm_dirty_ring.h | 7 +++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 6 +++ virt/kvm/dirty_ring.c | 14 ++++++ virt/kvm/kvm_main.c | 61 +++++++++++++++++++++---- 8 files changed, 112 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eee9f857a986..1f1b09aa6db4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8003,13 +8003,6 @@ flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one needs to kick the vcpu out of KVM_RUN using a signal. The resulting vmexit ensures that all dirty GFNs are flushed to the dirty rings. -NOTE: the capability KVM_CAP_DIRTY_LOG_RING and the corresponding -ioctl KVM_RESET_DIRTY_RINGS are mutual exclusive to the existing ioctls -KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG. After enabling -KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual -machine will switch to ring-buffer dirty page tracking and further -KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail. - NOTE: KVM_CAP_DIRTY_LOG_RING_ACQ_REL is the only capability that should be exposed by weakly ordered architecture, in order to indicate the additional memory ordering requirements imposed on userspace when @@ -8018,6 +8011,33 @@ Architecture with TSO-like ordering (such as x86) are allowed to expose both KVM_CAP_DIRTY_LOG_RING and KVM_CAP_DIRTY_LOG_RING_ACQ_REL to userspace. +After enabling the dirty rings, the userspace needs to detect the +capability of KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP to see whether the +ring structures can be backed by per-slot bitmaps. With this capability +advertised, it means the architecture can dirty guest pages without +vcpu/ring context, so that some of the dirty information will still be +maintained in the bitmap structure. KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP +can't be enabled if the capability of KVM_CAP_DIRTY_LOG_RING_ACQ_REL +hasn't been enabled, or any memslot has been existing. + +Note that the bitmap here is only a backup of the ring structure. The +use of the ring and bitmap combination is only beneficial if there is +only a very small amount of memory that is dirtied out of vcpu/ring +context. Otherwise, the stand-alone per-slot bitmap mechanism needs to +be considered. + +To collect dirty bits in the backup bitmap, userspace can use the same +KVM_GET_DIRTY_LOG ioctl. KVM_CLEAR_DIRTY_LOG isn't needed as long as all +the generation of the dirty bits is done in a single pass. Collecting +the dirty bitmap should be the very last thing that the VMM does before +considering the state as complete. VMM needs to ensure that the dirty +state is final and avoid missing dirty pages from another ioctl ordered +after the bitmap collection. + +NOTE: One example of using the backup bitmap is saving arm64 vgic/its +tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on +KVM device "kvm-arm-vgic-its" when dirty ring is enabled. + 8.30 KVM_CAP_XEN_HVM -------------------- diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst index d257eddbae29..e053124f77c4 100644 --- a/Documentation/virt/kvm/devices/arm-vgic-its.rst +++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst @@ -52,7 +52,10 @@ KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEV_ARM_ITS_SAVE_TABLES save the ITS table data into guest RAM, at the location provisioned - by the guest in corresponding registers/table entries. + by the guest in corresponding registers/table entries. Should userspace + require a form of dirty tracking to identify which pages are modified + by the saving process, it should use a bitmap even if using another + mechanism to track the memory dirtied by the vCPUs. The layout of the tables in guest memory defines an ABI. The entries are laid out in little endian format as described in the last paragraph. diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 199ead37b104..4862c98d80d3 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -37,6 +37,11 @@ static inline u32 kvm_dirty_ring_get_rsvd_entries(void) return 0; } +static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) +{ + return true; +} + static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) { @@ -67,6 +72,8 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ int kvm_cpu_dirty_log_size(void); +bool kvm_use_dirty_bitmap(struct kvm *kvm); +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); u32 kvm_dirty_ring_get_rsvd_entries(void); int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 648d663f32c4..db83f63f4e61 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -779,6 +779,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool dirty_ring_with_bitmap; bool vm_bugged; bool vm_dead; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139a..c87b5882d7ae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 +#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 800f9470e36b..9fb1ff6f19e5 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -33,6 +33,12 @@ config HAVE_KVM_DIRTY_RING_ACQ_REL bool select HAVE_KVM_DIRTY_RING +# Allow enabling both the dirty bitmap and dirty ring. Only architectures +# that need to dirty memory outside of a vCPU context should select this. +config NEED_KVM_DIRTY_RING_WITH_BITMAP + bool + depends on HAVE_KVM_DIRTY_RING + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index fecbb7d75ad2..c1cd7dfe4a90 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -21,6 +21,20 @@ u32 kvm_dirty_ring_get_rsvd_entries(void) return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); } +bool kvm_use_dirty_bitmap(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->slots_lock); + + return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap; +} + +#ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) +{ + return false; +} +#endif + static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) { return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 04b22d2f99d8..be40d1ce6e91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1617,7 +1617,7 @@ static int kvm_prepare_memory_region(struct kvm *kvm, new->dirty_bitmap = NULL; else if (old && old->dirty_bitmap) new->dirty_bitmap = old->dirty_bitmap; - else if (!kvm->dirty_ring_size) { + else if (kvm_use_dirty_bitmap(kvm)) { r = kvm_alloc_dirty_bitmap(new); if (r) return r; @@ -2060,8 +2060,8 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, unsigned long n; unsigned long any = 0; - /* Dirty ring tracking is exclusive to dirty log tracking */ - if (kvm->dirty_ring_size) + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; *memslot = NULL; @@ -2125,8 +2125,8 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) unsigned long *dirty_bitmap_buffer; bool flush; - /* Dirty ring tracking is exclusive to dirty log tracking */ - if (kvm->dirty_ring_size) + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; @@ -2237,8 +2237,8 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, unsigned long *dirty_bitmap_buffer; bool flush; - /* Dirty ring tracking is exclusive to dirty log tracking */ - if (kvm->dirty_ring_size) + /* Dirty ring tracking may be exclusive to dirty log tracking */ + if (!kvm_use_dirty_bitmap(kvm)) return -ENXIO; as_id = log->slot >> 16; @@ -3305,7 +3305,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); #ifdef CONFIG_HAVE_KVM_DIRTY_RING - if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) + if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) + return; + + if (WARN_ON_ONCE(!kvm_arch_allow_write_without_running_vcpu(kvm) && !vcpu)) return; #endif @@ -3313,7 +3316,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; - if (kvm->dirty_ring_size) + if (kvm->dirty_ring_size && vcpu) kvm_dirty_ring_push(vcpu, slot, rel_gfn); else set_bit_le(rel_gfn, memslot->dirty_bitmap); @@ -4482,6 +4485,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; +#endif +#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP + case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: #endif case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: @@ -4558,6 +4564,20 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; } +static bool kvm_are_all_memslots_empty(struct kvm *kvm) +{ + int i; + + lockdep_assert_held(&kvm->slots_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) + return false; + } + + return true; +} + static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -4588,6 +4608,29 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return -EINVAL; return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); + case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: { + int r = -EINVAL; + + if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) || + !kvm->dirty_ring_size || cap->flags) + return r; + + mutex_lock(&kvm->slots_lock); + + /* + * For simplicity, allow enabling ring+bitmap if and only if + * there are no memslots, e.g. to ensure all memslots allocate + * a bitmap after the capability is enabled. + */ + if (kvm_are_all_memslots_empty(kvm)) { + kvm->dirty_ring_with_bitmap = true; + r = 0; + } + + mutex_unlock(&kvm->slots_lock); + + return r; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } -- cgit v1.2.3 From fb491d5500a7ca551e49bc32d9b19d226023f68d Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:27 +0100 Subject: KVM: s390: pv: asynchronous destroy for reboot Until now, destroying a protected guest was an entirely synchronous operation that could potentially take a very long time, depending on the size of the guest, due to the time needed to clean up the address space from protected pages. This patch implements an asynchronous destroy mechanism, that allows a protected guest to reboot significantly faster than previously. This is achieved by clearing the pages of the old guest in background. In case of reboot, the new guest will be able to run in the same address space almost immediately. The old protected guest is then only destroyed when all of its memory has been destroyed or otherwise made non protected. Two new PV commands are added for the KVM_S390_PV_COMMAND ioctl: KVM_PV_ASYNC_CLEANUP_PREPARE: set aside the current protected VM for later asynchronous teardown. The current KVM VM will then continue immediately as non-protected. If a protected VM had already been set aside for asynchronous teardown, but without starting the teardown process, this call will fail. There can be at most one VM set aside at any time. Once it is set aside, the protected VM only exists in the context of the Ultravisor, it is not associated with the KVM VM anymore. Its protected CPUs have already been destroyed, but not its memory. This command can be issued again immediately after starting KVM_PV_ASYNC_CLEANUP_PERFORM, without having to wait for completion. KVM_PV_ASYNC_CLEANUP_PERFORM: tears down the protected VM previously set aside using KVM_PV_ASYNC_CLEANUP_PREPARE. Ideally the KVM_PV_ASYNC_CLEANUP_PERFORM PV command should be issued by userspace from a separate thread. If a fatal signal is received (or if the process terminates naturally), the command will terminate immediately without completing. All protected VMs whose teardown was interrupted will be put in the need_cleanup list. The rest of the normal KVM teardown process will take care of properly cleaning up all remaining protected VMs, including the ones on the need_cleanup list. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Janosch Frank Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20221111170632.77622-2-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-2-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/include/asm/kvm_host.h | 2 + arch/s390/kvm/kvm-s390.c | 49 +++++-- arch/s390/kvm/kvm-s390.h | 3 + arch/s390/kvm/pv.c | 295 +++++++++++++++++++++++++++++++++++++-- include/uapi/linux/kvm.h | 2 + 5 files changed, 333 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 21f1339a4197..d67ce719d16a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -942,6 +942,8 @@ struct kvm_s390_pv { unsigned long stor_base; void *stor_var; bool dumping; + void *set_aside; + struct list_head need_cleanup; struct mmu_notifier mmu_notifier; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bd6e0201bfe5..f0abaaf7eea4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -209,6 +209,8 @@ unsigned int diag9c_forwarding_hz; module_param(diag9c_forwarding_hz, uint, 0644); MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); +static int async_destroy; + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -2504,9 +2506,13 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; int r = 0; u16 dummy; - void __user *argp = (void __user *)cmd->data; + + if (need_lock) + mutex_lock(&kvm->lock); switch (cmd->cmd) { case KVM_PV_ENABLE: { @@ -2540,6 +2546,31 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); break; } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; case KVM_PV_DISABLE: { r = -EINVAL; if (!kvm_s390_pv_is_protected(kvm)) @@ -2553,7 +2584,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) */ if (r) break; - r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); /* no need to block service interrupts any more */ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); @@ -2703,6 +2734,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) default: r = -ENOTTY; } + if (need_lock) + mutex_unlock(&kvm->lock); + return r; } @@ -2907,9 +2941,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EINVAL; break; } - mutex_lock(&kvm->lock); + /* must be called without kvm->lock */ r = kvm_s390_handle_pv(kvm, &args); - mutex_unlock(&kvm->lock); if (copy_to_user(argp, &args, sizeof(args))) { r = -EFAULT; break; @@ -3228,6 +3261,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_vsie_init(kvm); if (use_gisa) kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -3272,11 +3307,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* * We are already at the end of life and kvm->lock is not taken. * This is ok as the file descriptor is closed by now and nobody - * can mess with the pv state. To avoid lockdep_assert_held from - * complaining we do not use kvm_s390_pv_is_protected. + * can mess with the pv state. */ - if (kvm_s390_pv_get_handle(kvm)) - kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); /* * Remove the mmu notifier only when the whole KVM VM is torn down, * and only if one was registered to begin with. If the VM is diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index a60d1e5c44cd..826754937ae4 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -244,6 +244,9 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 48c4f57d5d76..5f958fcf6283 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -18,6 +18,29 @@ #include #include "kvm-s390.h" +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. + */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + static void kvm_s390_clear_pv_state(struct kvm *kvm) { kvm->arch.pv.handle = 0; @@ -161,23 +184,150 @@ out_err: return -ENOMEM; } -/* this should not fail, but if it does, we must not free the donated memory */ -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) { int cc; - cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), - UVC_CMD_DESTROY_SEC_CONF, rc, rrc); - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; /* - * if the mm still has a mapping, make all its pages accessible - * before destroying the guest + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. */ - if (mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); - mmput(kvm->mm); + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. + */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. + */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) { + kfree(priv); + return -ENOMEM; } + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -191,6 +341,131 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return cc ? -EIO : 0; } +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */ + atomic_inc(&kvm->mm->context.protected_count); + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Cleanup all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside_vm. Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardowm, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, struct mm_struct *mm) { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139a..b3701b23ca18 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1740,6 +1740,8 @@ enum pv_cmd_id { KVM_PV_UNSHARE_ALL, KVM_PV_INFO, KVM_PV_DUMP, + KVM_PV_ASYNC_CLEANUP_PREPARE, + KVM_PV_ASYNC_CLEANUP_PERFORM, }; struct kvm_pv_cmd { -- cgit v1.2.3 From 8c516b25d6e9c70e6d76627932b14b0ef03a82c4 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 11 Nov 2022 18:06:29 +0100 Subject: KVM: s390: pv: add KVM_CAP_S390_PROTECTED_ASYNC_DISABLE Add KVM_CAP_S390_PROTECTED_ASYNC_DISABLE to signal that the KVM_PV_ASYNC_DISABLE and KVM_PV_ASYNC_DISABLE_PREPARE commands for the KVM_S390_PV_COMMAND ioctl are available. Signed-off-by: Claudio Imbrenda Reviewed-by: Nico Boehr Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20221111170632.77622-4-imbrenda@linux.ibm.com Message-Id: <20221111170632.77622-4-imbrenda@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 3 +++ include/uapi/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f0abaaf7eea4..b6cc7d2935c0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -618,6 +618,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b3701b23ca18..d3f86a280858 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 +#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From d8ba8ba4c801b794f47852a6f1821ea48f83b5d1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 27 Nov 2022 12:22:10 +0000 Subject: KVM: x86/xen: Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured Closer inspection of the Xen code shows that we aren't supposed to be using the XEN_RUNSTATE_UPDATE flag unconditionally. It should be explicitly enabled by guests through the HYPERVISOR_vm_assist hypercall. If we randomly set the top bit of ->state_entry_time for a guest that hasn't asked for it and doesn't expect it, that could make the runtimes fail to add up and confuse the guest. Without the flag it's perfectly safe for a vCPU to read its own vcpu_runstate_info; just not for one vCPU to read *another's*. I briefly pondered adding a word for the whole set of VMASST_TYPE_* flags but the only one we care about for HVM guests is this, so it seemed a bit pointless. Signed-off-by: David Woodhouse Message-Id: <20221127122210.248427-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 34 ++++++++++--- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 3 +- arch/x86/kvm/xen.c | 57 +++++++++++++++++----- include/uapi/linux/kvm.h | 4 ++ .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 14 ++++++ 6 files changed, 93 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9175d41e8081..5617bc4f899f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5339,6 +5339,7 @@ KVM_PV_ASYNC_CLEANUP_PERFORM union { __u8 long_mode; __u8 vector; + __u8 runstate_update_flag; struct { __u64 gfn; } shared_info; @@ -5416,6 +5417,14 @@ KVM_XEN_ATTR_TYPE_XEN_VERSION event channel delivery, so responding within the kernel without exiting to userspace is beneficial. +KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG + This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates + support for KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG. It enables the + XEN_RUNSTATE_UPDATE flag which allows guest vCPUs to safely read + other vCPUs' vcpu_runstate_info. Xen guests enable this feature via + the VM_ASST_TYPE_runstate_update_flag of the HYPERVISOR_vm_assist + hypercall. + 4.127 KVM_XEN_HVM_GET_ATTR -------------------------- @@ -8059,12 +8068,13 @@ to userspace. This capability indicates the features that Xen supports for hosting Xen PVHVM guests. Valid flags are:: - #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) - #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) - #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) - #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) - #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) - #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) + #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) + #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) + #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) + #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) + #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) + #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG ioctl is available, for the guest to set its hypercall page. @@ -8096,6 +8106,18 @@ KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes. related to event channel delivery, timers, and the XENVER_version interception. +The KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG flag indicates that KVM supports +the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute in the KVM_XEN_SET_ATTR +and KVM_XEN_GET_ATTR ioctls. This controls whether KVM will set the +XEN_RUNSTATE_UPDATE flag in guest memory mapped vcpu_runstate_info during +updates of the runstate information. Note that versions of KVM which support +the RUNSTATE feature above, but not thie RUNSTATE_UPDATE_FLAG feature, will +always set the XEN_RUNSTATE_UPDATE flag when updating the guest structure, +which is perhaps counterintuitive. When this flag is advertised, KVM will +behave more correctly, not using the XEN_RUNSTATE_UPDATE flag until/unless +specifically enabled (by the guest making the hypercall, causing the VMM +to enable the KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG attribute). + 8.31 KVM_CAP_PPC_MULTITCE ------------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 70af7240a1d5..283cbb83d6ae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1113,6 +1113,7 @@ struct msr_bitmap_range { struct kvm_xen { u32 xen_version; bool long_mode; + bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 72ac6bf05c8b..59fd55badd73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4431,7 +4431,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) - r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + r |= KVM_XEN_HVM_CONFIG_RUNSTATE | + KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; break; #endif case KVM_CAP_SYNC_REGS: diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index cfc1c07bc78f..7acac5dfe2f8 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -179,7 +179,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) struct vcpu_runstate_info rs; unsigned long flags; size_t times_ofs; - uint8_t *update_bit; + uint8_t *update_bit = NULL; + uint64_t entry_time; uint64_t *rs_times; int *rs_state; @@ -297,7 +298,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) */ rs_state = gpc1->khva; rs_times = gpc1->khva + times_ofs; - update_bit = ((void *)(&rs_times[1])) - 1; + if (v->kvm->arch.xen.runstate_update_flag) + update_bit = ((void *)(&rs_times[1])) - 1; } else { /* * The guest's runstate_info is split across two pages and we @@ -351,12 +353,14 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * The update_bit is still directly in the guest memory, * via one GPC or the other. */ - if (user_len1 >= times_ofs + sizeof(uint64_t)) - update_bit = gpc1->khva + times_ofs + - sizeof(uint64_t) - 1; - else - update_bit = gpc2->khva + times_ofs + - sizeof(uint64_t) - 1 - user_len1; + if (v->kvm->arch.xen.runstate_update_flag) { + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; + } #ifdef CONFIG_X86_64 /* @@ -376,8 +380,12 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * different cache line to the rest of the 64-bit word, due to * the (lack of) alignment constraints. */ - *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; - smp_wmb(); + entry_time = vx->runstate_entry_time; + if (update_bit) { + entry_time |= XEN_RUNSTATE_UPDATE; + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; + smp_wmb(); + } /* * Now assemble the actual structure, either on our kernel stack @@ -385,7 +393,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * rs_times pointers were set up above. */ *rs_state = vx->current_runstate; - rs_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; + rs_times[0] = entry_time; memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); /* For the split case, we have to then copy it to the guest. */ @@ -396,8 +404,11 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) smp_wmb(); /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ - *update_bit = vx->runstate_entry_time >> 56; - smp_wmb(); + if (update_bit) { + entry_time &= ~XEN_RUNSTATE_UPDATE; + *update_bit = entry_time >> 56; + smp_wmb(); + } if (user_len2) read_unlock(&gpc2->lock); @@ -619,6 +630,17 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + mutex_lock(&kvm->lock); + kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } @@ -656,6 +678,15 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag; + r = 0; + break; + default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 88448397642c..64dfe9c07c87 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1271,6 +1271,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) struct kvm_xen_hvm_config { __u32 flags; @@ -1776,6 +1777,7 @@ struct kvm_xen_hvm_attr { union { __u8 long_mode; __u8 vector; + __u8 runstate_update_flag; struct { __u64 gfn; } shared_info; @@ -1816,6 +1818,8 @@ struct kvm_xen_hvm_attr { /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ #define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 #define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 /* Per-vCPU Xen attributes */ #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 7f39815f1772..c9b0110d73f3 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -440,6 +440,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO); bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG); bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); @@ -475,6 +476,19 @@ int main(int argc, char *argv[]) }; vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + if (do_runstate_flag) { + struct kvm_xen_hvm_attr ruf = { + .type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG, + .u.runstate_update_flag = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf); + + ruf.u.runstate_update_flag = 0; + vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf); + TEST_ASSERT(ruf.u.runstate_update_flag == 1, + "Failed to read back RUNSTATE_UPDATE_FLAG attr"); + } + struct kvm_xen_hvm_attr ha = { .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE, -- cgit v1.2.3 From 61e15f871241ee86f217320909005cd022dd844f Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:08 +0100 Subject: KVM: Delete all references to removed KVM_SET_MEMORY_REGION ioctl The documentation says that the ioctl has been deprecated, but it has been actually removed and the remaining references are just left overs. Suggested-by: Sean Christopherson Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-2-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ---------------- include/uapi/linux/kvm.h | 12 ------------ tools/include/uapi/linux/kvm.h | 12 ------------ 3 files changed, 40 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5617bc4f899f..850e187c0a38 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -272,18 +272,6 @@ the VCPU file descriptor can be mmap-ed, including: KVM_CAP_DIRTY_LOG_RING, see section 8.3. -4.6 KVM_SET_MEMORY_REGION -------------------------- - -:Capability: basic -:Architectures: all -:Type: vm ioctl -:Parameters: struct kvm_memory_region (in) -:Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - - 4.7 KVM_CREATE_VCPU ------------------- @@ -1377,10 +1365,6 @@ the memory region are automatically reflected into the guest. For example, an mmap() that affects the region will be made visible immediately. Another example is madvise(MADV_DROP). -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - 4.36 KVM_SET_TSS_ADDR --------------------- diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 64dfe9c07c87..c338ca2c972d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -86,14 +86,6 @@ struct kvm_debug_guest { /* *** End of deprecated interfaces *** */ -/* for KVM_CREATE_MEMORY_REGION */ -struct kvm_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ -}; - /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -1442,10 +1434,6 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; -/* - * ioctls for VM fds - */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 0d5d4419139a..8899201d5964 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -86,14 +86,6 @@ struct kvm_debug_guest { /* *** End of deprecated interfaces *** */ -/* for KVM_CREATE_MEMORY_REGION */ -struct kvm_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ -}; - /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -1437,10 +1429,6 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; -/* - * ioctls for VM fds - */ -#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. -- cgit v1.2.3 From 66a9221d73e71199a120ca12ea5bfaac2aa670a3 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:09 +0100 Subject: KVM: Delete all references to removed KVM_SET_MEMORY_ALIAS ioctl The documentation says that the ioctl has been deprecated, but it has been actually removed and the remaining references are just left overs. Suggested-by: Sean Christopherson Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-3-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 ----------- arch/x86/include/uapi/asm/kvm.h | 8 -------- include/uapi/linux/kvm.h | 2 -- tools/arch/x86/include/uapi/asm/kvm.h | 8 -------- tools/include/uapi/linux/kvm.h | 2 -- 5 files changed, 31 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 850e187c0a38..07e8a42d839a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -356,17 +356,6 @@ see the description of the capability. Note that the Xen shared info page, if configured, shall always be assumed to be dirty. KVM will not explicitly mark it such. -4.9 KVM_SET_MEMORY_ALIAS ------------------------- - -:Capability: basic -:Architectures: x86 -:Type: vm ioctl -:Parameters: struct kvm_memory_alias (in) -:Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - 4.10 KVM_RUN ------------ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c6df6b16a088..e48deab8901d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -53,14 +53,6 @@ /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c338ca2c972d..ce9183765616 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1440,8 +1440,6 @@ struct kvm_vfio_spapr_tce { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) -/* KVM_SET_MEMORY_ALIAS is obsolete: */ -#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 46de10a809ec..649e50a8f9dd 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -53,14 +53,6 @@ /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 8899201d5964..6ba2928f8f18 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1435,8 +1435,6 @@ struct kvm_vfio_spapr_tce { */ #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) -/* KVM_SET_MEMORY_ALIAS is obsolete: */ -#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \ -- cgit v1.2.3 From 30ee198ce42d60101620d33f8bc70c3234798365 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 2 Dec 2022 11:50:10 +0100 Subject: KVM: Reference to kvm_userspace_memory_region in doc and comments There are still references to the removed kvm_memory_region data structure but the doc and comments should mention struct kvm_userspace_memory_region instead, since that is what's used by the ioctl that replaced the old one and this data structure support the same set of flags. Signed-off-by: Javier Martinez Canillas Message-Id: <20221202105011.185147-4-javierm@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 +- include/linux/kvm_host.h | 4 ++-- include/uapi/linux/kvm.h | 6 +++--- tools/include/uapi/linux/kvm.h | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 07e8a42d839a..92e14823619a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1309,7 +1309,7 @@ yet and must be cleared on entry. __u64 userspace_addr; /* start of the userspace allocated memory */ }; - /* for kvm_memory_region::flags */ + /* for kvm_userspace_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8f874a964313..a5a82b536774 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -50,8 +50,8 @@ #endif /* - * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used - * in kvm, other bits are visible for userspace which are defined in + * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally + * used in kvm, other bits are visible for userspace which are defined in * include/linux/kvm_h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ce9183765616..03708ce10bda 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -96,9 +96,9 @@ struct kvm_userspace_memory_region { }; /* - * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, - * other bits are reserved for kvm internal use which are defined in - * include/linux/kvm_host.h. + * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for + * userspace, other bits are reserved for kvm internal use which are defined + * in include/linux/kvm_host.h. */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6ba2928f8f18..21d6d29502e4 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -96,9 +96,9 @@ struct kvm_userspace_memory_region { }; /* - * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, - * other bits are reserved for kvm internal use which are defined in - * include/linux/kvm_host.h. + * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for + * userspace, other bits are reserved for kvm internal use which are defined + *in include/linux/kvm_host.h. */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) -- cgit v1.2.3