From cd48ebc5c4f2e94830b238f035ebf04f1c3a4433 Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Thu, 22 Sep 2022 20:35:07 +0800 Subject: vfio/mlx5: Switch to use module_pci_driver() macro Since the PCI core provides the helper macro module_pci_driver(), we can replace the module_init()/module_exit() boilerplate with it. Signed-off-by: Shang XiaoJing Reviewed-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220922123507.11222-1-shangxiaojing@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a2..457138b92f13 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -676,18 +676,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy "); -- cgit v1.2.3 From e67e070632a665c932d534b8b800477bb3111449 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Tue, 18 Oct 2022 12:28:25 -0300 Subject: vfio: platform: Do not pass return buffer to ACPI _RST method The ACPI _RST method has no return value, so there is no need to pass a return buffer to acpi_evaluate_object(). Fixes: d30daa33ec1d ("vfio: platform: call _RST method when using ACPI") Signed-off-by: Rafael Mendonca Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20221018152825.891032-1-rafaelmendsr@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 55dc4f43c31e..1a0a238ffa35 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -72,12 +72,11 @@ static int vfio_platform_acpi_call_reset(struct vfio_platform_device *vdev, const char **extra_dbg) { #ifdef CONFIG_ACPI - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct device *dev = vdev->device; acpi_handle handle = ACPI_HANDLE(dev); acpi_status acpi_ret; - acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer); + acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL); if (ACPI_FAILURE(acpi_ret)) { if (extra_dbg) *extra_dbg = acpi_format_exception(acpi_ret); -- cgit v1.2.3 From ea00d4ededcd8639f9e814513426cfeccdd3aaf0 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 25 Oct 2022 20:31:13 +0100 Subject: vfio/iova_bitmap: Explicitly include linux/slab.h kzalloc/kzfree are used, so include `slab.h`. While it happens to work without it, that is only due to commit 8b9f3ac5b01d ("fs: introduce alloc_inode_sb() to allocate filesystems specific inode"), which pulls it in indirectly via the chain ./include/linux/mm.h -> ./include/linux/huge_mm.h -> ./include/linux/fs.h -> ./include/linux/slab.h. Make the include explicit in case any of these indirect dependencies is dropped or changed for entirely different reasons, as happened recently (i.e. <= v5.18) prior to the commit above.
Signed-off-by: Joao Martins Link: https://lore.kernel.org/r/20221025193114.58695-2-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 6631e8befe1b..56816ba1be9b 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -5,6 +5,7 @@ */ #include <linux/iova_bitmap.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/highmem.h> #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) -- cgit v1.2.3 From f38044e5ef58ad0346fdabd7027ea5c1e1a3b624 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 25 Oct 2022 20:31:14 +0100 Subject: vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps iova_bitmap_set() doesn't consider the end of the page boundary when the first bitmap page offset isn't zero, and wrongly changes the consecutive page right after. Consequently this leads to dirty pages reported by the device going missing, as seen from the VMM. The current logic iterates over a given number of base pages and clamps it to the remaining indexes to iterate in the last page. Instead of having to consider extra pages to pin (e.g. first and extra pages), just handle the first page as its own range and let the rest of the bitmap be handled as if it was base page aligned. This is done by changing iova_bitmap_mapped_remaining() to return PAGE_SIZE - pgoff (on the first bitmap page), which leads to pgoff being set to 0 on the following iterations. Fixes: 58ccf0190d19 ("vfio: Add an IOVA bitmap support") Reported-by: Avihai Horon Tested-by: Avihai Horon Signed-off-by: Joao Martins Link: https://lore.kernel.org/r/20221025193114.58695-3-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index 56816ba1be9b..de6d6ea5c496 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -296,11 +296,15 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) */ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { - unsigned long remaining; + unsigned long remaining, bytes; + + /* Cap to one page in the first iteration, if PAGE_SIZE unaligned. */ + bytes = !bitmap->mapped.pgoff ? bitmap->mapped.npages << PAGE_SHIFT : + PAGE_SIZE - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, - (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); + bytes / sizeof(*bitmap->bitmap)); return remaining; } -- cgit v1.2.3 From d1104f9327df9b26901b97cd026949f80ccab0d3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:06 +0100 Subject: vfio/ccw: replace vfio_init_device with _alloc_ Now that we have a reasonable separation of structs that follow the subchannel and mdev lifecycles, there's no reason we can't call the official vfio_alloc_device routine for our private data, and behave like everyone else.
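For reference, the general shape of the pattern being adopted is sketched below; my_device, my_probe and my_dev_ops are hypothetical placeholder names, not code from this patch:

static const struct vfio_device_ops my_dev_ops = {
	/* .init, .release, .open_device, ... */
};

struct my_device {
	struct vfio_device vdev;	/* embedded vfio_device member */
	void *priv;			/* driver-private state */
};

static int my_probe(struct mdev_device *mdev)
{
	struct my_device *my;

	/*
	 * vfio_alloc_device() sizes and allocates the containing
	 * structure and initializes the embedded vfio_device in one
	 * step, replacing the old kzalloc() + vfio_init_device() pair.
	 */
	my = vfio_alloc_device(my_device, vdev, &mdev->dev, &my_dev_ops);
	if (IS_ERR(my))
		return PTR_ERR(my);

	/* ... on any later failure: vfio_put_device(&my->vdev) ... */
	return 0;
}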
Signed-off-by: Eric Farman Reviewed-by: Kevin Tian Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-7-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 18 ------------------ drivers/s390/cio/vfio_ccw_ops.c | 28 ++++++++++++++++++---------- drivers/s390/cio/vfio_ccw_private.h | 2 -- drivers/vfio/vfio_main.c | 10 +++++----- include/linux/vfio.h | 2 -- 5 files changed, 23 insertions(+), 37 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 9fbd1b27a1ac..c2a65808605a 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -152,24 +152,6 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -void vfio_ccw_free_private(struct vfio_ccw_private *private) -{ - struct vfio_ccw_crw *crw, *temp; - - list_for_each_entry_safe(crw, temp, &private->crw, next) { - list_del(&crw->next); - kfree(crw); - } - - kmem_cache_free(vfio_ccw_crw_region, private->crw_region); - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); - kmem_cache_free(vfio_ccw_io_region, private->io_region); - kfree(private->cp.guest_cp); - mutex_destroy(&private->io_mutex); - kfree(private); -} - static void vfio_ccw_free_parent(struct device *dev) { struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 8a929a9cf3c6..1155f8bcedd9 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -102,15 +102,10 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) struct vfio_ccw_private *private; int ret; - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) - return -ENOMEM; - - ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); - if (ret) { - kfree(private); - return ret; - } + private = vfio_alloc_device(vfio_ccw_private, vdev, &mdev->dev, + &vfio_ccw_dev_ops); + if (IS_ERR(private)) + return PTR_ERR(private); dev_set_drvdata(&parent->dev, private); @@ -135,8 +130,21 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = container_of(vdev, struct vfio_ccw_private, vdev); + struct vfio_ccw_crw *crw, *temp; + + list_for_each_entry_safe(crw, temp, &private->crw, next) { + list_del(&crw->next); + kfree(crw); + } + + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); + kmem_cache_free(vfio_ccw_io_region, private->io_region); + kfree(private->cp.guest_cp); + mutex_destroy(&private->io_mutex); - vfio_ccw_free_private(private); + vfio_free_device(vdev); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 2278fd38d34e..b441ae6700fd 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -131,8 +131,6 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch); void vfio_ccw_sch_io_todo(struct work_struct *work); void vfio_ccw_crw_todo(struct work_struct *work); -void vfio_ccw_free_private(struct vfio_ccw_private *private); - extern struct mdev_driver vfio_ccw_mdev_driver; /* diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 
2d168793d4e1..2901b8ad5be9 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -348,6 +348,9 @@ static void vfio_device_release(struct device *dev) device->ops->release(device); } +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); + /* * Allocate and initialize vfio_device so it can be registered to vfio * core. @@ -386,11 +389,9 @@ EXPORT_SYMBOL_GPL(_vfio_alloc_device); /* * Initialize a vfio_device so it can be registered to vfio core. - * - * Only vfio-ccw driver should call this interface. */ -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops) +static int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) { int ret; @@ -422,7 +423,6 @@ out_uninit: ida_free(&vfio.device_ida, device->index); return ret; } -EXPORT_SYMBOL_GPL(vfio_init_device); /* * The helper called by driver @release callback to free the device diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd..ba809268a48e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,8 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -int vfio_init_device(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { -- cgit v1.2.3 From 913447d06f032a9e9c84870bec0b1adb8c588f29 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Fri, 4 Nov 2022 15:20:07 +0100 Subject: vfio: Remove vfio_free_device With the "mess" sorted out, we should be able to inline the vfio_free_device call introduced by commit cb9ff3f3b84c ("vfio: Add helpers for unifying vfio_device life cycle") and remove them from driver release callbacks. 
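With the core doing the final free, a driver's @release callback shrinks to cleaning up only its own members. A rough sketch of the resulting shape (hypothetical my_release/my_device names):

static void my_release(struct vfio_device *vdev)
{
	struct my_device *my = container_of(vdev, struct my_device, vdev);

	kfree(my->priv);	/* driver-private resources only */
	/*
	 * No vfio_free_device()/kvfree() here: the vfio core frees the
	 * containing structure after this callback returns, and a
	 * driver with nothing to clean can omit @release entirely, as
	 * vfio-ap does after this change.
	 */
}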
Signed-off-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Reviewed-by: Tony Krowiak # vfio-ap part Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221104142007.1314999-8-farman@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 1 - drivers/s390/cio/vfio_ccw_ops.c | 2 -- drivers/s390/crypto/vfio_ap_ops.c | 6 ------ drivers/vfio/fsl-mc/vfio_fsl_mc.c | 1 - drivers/vfio/pci/vfio_pci_core.c | 1 - drivers/vfio/platform/vfio_amba.c | 1 - drivers/vfio/platform/vfio_platform.c | 1 - drivers/vfio/vfio_main.c | 22 ++++------------------ include/linux/vfio.h | 1 - samples/vfio-mdev/mbochs.c | 1 - samples/vfio-mdev/mdpy.c | 1 - samples/vfio-mdev/mtty.c | 1 - 12 files changed, 4 insertions(+), 35 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7a45e5360caf..eee6805e67de 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1461,7 +1461,6 @@ static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); intel_gvt_destroy_vgpu(vgpu); - vfio_free_device(vfio_dev); } static const struct vfio_device_ops intel_vgpu_dev_ops = { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 1155f8bcedd9..598a3814d428 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -143,8 +143,6 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) kmem_cache_free(vfio_ccw_io_region, private->io_region); kfree(private->cp.guest_cp); mutex_destroy(&private->io_mutex); - - vfio_free_device(vdev); } static void vfio_ccw_mdev_remove(struct mdev_device *mdev) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0b4cc8c597ae..f108c0f14712 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -765,11 +765,6 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) } } -static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) -{ - vfio_free_device(vdev); -} - static void vfio_ap_mdev_remove(struct mdev_device *mdev) { struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); @@ -1784,7 +1779,6 @@ static const struct attribute_group vfio_queue_attr_group = { static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { .init = vfio_ap_mdev_init_dev, - .release = vfio_ap_mdev_release_dev, .open_device = vfio_ap_mdev_open_device, .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index b16874e913e4..7b8889f55007 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -568,7 +568,6 @@ static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev) vfio_fsl_uninit_device(vdev); mutex_destroy(&vdev->igate); - vfio_free_device(core_vdev); } static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index badc9d828cac..9be2d5be5d95 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2109,7 +2109,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); - vfio_free_device(core_vdev); } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); diff --git 
a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index eaea63e5294c..18faf2678b99 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -95,7 +95,6 @@ static void vfio_amba_release_dev(struct vfio_device *core_vdev) vfio_platform_release_common(vdev); kfree(vdev->name); - vfio_free_device(core_vdev); } static void vfio_amba_remove(struct amba_device *adev) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 82cedcebfd90..9910451dc341 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -83,7 +83,6 @@ static void vfio_platform_release_dev(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_platform_device, vdev); vfio_platform_release_common(vdev); - vfio_free_device(core_vdev); } static int vfio_platform_remove(struct platform_device *pdev) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2901b8ad5be9..9835757e2bee 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -339,13 +339,10 @@ static void vfio_device_release(struct device *dev) vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); - /* - * kvfree() cannot be done here due to a life cycle mess in - * vfio-ccw. Before the ccw part is fixed all drivers are - * required to support @release and call vfio_free_device() - * from there. - */ - device->ops->release(device); + if (device->ops->release) + device->ops->release(device); + + kvfree(device); } static int vfio_init_device(struct vfio_device *device, struct device *dev, @@ -424,17 +421,6 @@ out_uninit: return ret; } -/* - * The helper called by driver @release callback to free the device - * structure. Drivers which don't have private data to clean can - * simply use this helper as its @release. 
- */ -void vfio_free_device(struct vfio_device *device) -{ - kvfree(device); -} -EXPORT_SYMBOL_GPL(vfio_free_device); - static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ba809268a48e..e7480154825e 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -176,7 +176,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, dev, ops), \ struct dev_struct, member) -void vfio_free_device(struct vfio_device *device); static inline void vfio_put_device(struct vfio_device *device) { put_device(&device->device); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 117a8d799f71..8b5a3a778a25 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -594,7 +594,6 @@ static void mbochs_release_dev(struct vfio_device *vdev) atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); kfree(mdev_state->pages); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mbochs_remove(struct mdev_device *mdev) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 946e8cfde6fd..721fb06c6413 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -283,7 +283,6 @@ static void mdpy_release_dev(struct vfio_device *vdev) vfree(mdev_state->memblk); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mdpy_remove(struct mdev_device *mdev) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index e72085fc1376..3c2a421b9b69 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -784,7 +784,6 @@ static void mtty_release_dev(struct vfio_device *vdev) atomic_add(mdev_state->nr_ports, &mdev_avail_ports); kfree(mdev_state->vconfig); - vfio_free_device(vdev); } static void mtty_remove(struct mdev_device *mdev) -- cgit v1.2.3 From 4e016f969529f2aec0545e90119e7eb3cb124c46 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:18 +0200 Subject: vfio: Add an option to get migration data size Add an option to get migration data size by introducing a new migration feature named VFIO_DEVICE_FEATURE_MIG_DATA_SIZE. Upon VFIO_DEVICE_FEATURE_GET the estimated data length that will be required to complete STOP_COPY is returned. This option may better enable user space to consider before moving to STOP_COPY whether it can meet the downtime SLA based on the returned data. The patch also includes the implementation for mlx5 and hisi for this new option to make it feature complete for the existing drivers in this area. 
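From userspace the new option is reached through the existing VFIO_DEVICE_FEATURE ioctl; a rough usage sketch (device_fd is assumed to be an open vfio device fd, error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_stop_copy_size(int device_fd, __u64 *length)
{
	/* Feature header followed by the feature-specific payload. */
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_mig_data_size)]
		__attribute__((aligned(8)));
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_data_size *size_data =
		(void *)feature->data;

	memset(buf, 0, sizeof(buf));
	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	*length = size_data->stop_copy_length;	/* estimated bytes */
	return 0;
}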
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20221106174630.25909-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 9 ++++++++ drivers/vfio/pci/mlx5/main.c | 18 +++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 3 ++- drivers/vfio/vfio_main.c | 32 ++++++++++++++++++++++++++ include/linux/vfio.h | 5 ++++ include/uapi/linux/vfio.h | 13 +++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7..0c0c0c7f0521 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -957,6 +957,14 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, return res; } +static int +hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = sizeof(struct acc_vf_data); + return 0; +} + static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) @@ -1213,6 +1221,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_set_state = hisi_acc_vfio_pci_set_device_state, .migration_get_state = hisi_acc_vfio_pci_get_device_state, + .migration_get_data_size = hisi_acc_vfio_pci_get_data_size, }; static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 457138b92f13..6e9cf2aacc52 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -512,6 +512,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +594,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9be2d5be5d95..189d4930c276 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2127,7 +2127,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (vdev->vdev.mig_ops) { if (!(vdev->vdev.mig_ops->migration_get_state && - vdev->vdev.mig_ops->migration_set_state) || + vdev->vdev.mig_ops->migration_set_state && + vdev->vdev.mig_ops->migration_get_data_size) || !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) return -EINVAL; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9835757e2bee..662e267a3e13 100644 --- a/drivers/vfio/vfio_main.c +++ 
b/drivers/vfio/vfio_main.c @@ -1242,6 +1242,34 @@ out_copy: return 0; } +static int +vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + struct vfio_device_feature_mig_data_size data_size = {}; + unsigned long stop_copy_length; + int ret; + + if (!device->mig_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(data_size)); + if (ret != 1) + return ret; + + ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); + if (ret) + return ret; + + data_size.stop_copy_length = stop_copy_length; + if (copy_to_user(arg, &data_size, sizeof(data_size))) + return -EFAULT; + + return 0; +} + static int vfio_ioctl_device_feature_migration(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) @@ -1469,6 +1497,10 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_logging_report( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: + return vfio_ioctl_device_feature_migration_data_size( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7480154825e..43b67e46a2cb 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -107,6 +107,9 @@ struct vfio_device_ops { * @migration_get_state: Optional callback to get the migration state for * devices that support migration. It's mandatory for * VFIO_DEVICE_FEATURE_MIGRATION migration support. + * @migration_get_data_size: Optional callback to get the estimated data + * length that will be required to complete stop copy. It's mandatory for + * VFIO_DEVICE_FEATURE_MIGRATION migration support. */ struct vfio_migration_ops { struct file *(*migration_set_state)( @@ -114,6 +117,8 @@ struct vfio_migration_ops { enum vfio_device_mig_state new_state); int (*migration_get_state)(struct vfio_device *device, enum vfio_device_mig_state *curr_state); + int (*migration_get_data_size)(struct vfio_device *device, + unsigned long *stop_copy_length); }; /** diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index d7d8e0922376..3e45dbaf190e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1128,6 +1128,19 @@ struct vfio_device_feature_dma_logging_report { #define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 +/* + * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will + * be required to complete stop copy. + * + * Note: Can be called on each device state. + */ + +struct vfio_device_feature_mig_data_size { + __aligned_u64 stop_copy_length; +}; + +#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- cgit v1.2.3 From 2f5d8cef45c30edcf3972d345f606df563d3a48e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 6 Nov 2022 19:46:19 +0200 Subject: vfio/mlx5: Fix a typo in mlx5vf_cmd_load_vhca_state() Fix a typo in mlx5vf_cmd_load_vhca_state() to use the 'load' memory layout. As in/out sizes are equal for save and load commands there wasn't any functional issue. 
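For context on why the mix-up was benign: MLX5_ST_SZ_DW() only derives an array length in dwords from an mlx5_ifc layout struct, and the save and load layouts happen to be the same size. A simplified sketch (the macro body is paraphrased from the mlx5 headers, not part of this patch):

/* Simplified: array length in 32-bit dwords for a given layout. */
#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)

/*
 * save_vhca_state_{in,out} and load_vhca_state_{in,out} have equal
 * sizes, so naming the wrong layout still produced correctly sized
 * buffers; only the naming was wrong.
 */
u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};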
Fixes: f1d98f346ee3 ("vfio/mlx5: Expose migration commands over mlx5 device") Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20221106174630.25909-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5..0848bc905d3e 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -378,8 +378,8 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { struct mlx5_core_dev *mdev; - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; u32 pdn, mkey; int err; -- cgit v1.2.3 From b058ea3ab5afea873ab8d976277539ca9e43869a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 29 Nov 2022 13:12:35 +0000 Subject: vfio/iova_bitmap: refactor iova_bitmap_set() to better handle page boundaries Commit f38044e5ef58 ("vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps") had fixed the unaligned bitmaps by capping the remaining iterable set at the start of the bitmap. However, that merely worked around iova_bitmap_set() incorrectly setting bits across a page boundary. Fix this by reworking the loop inside iova_bitmap_set() to iterate over a range of bits to set (cur_bit .. last_bit) which may span different pinned pages, updating @page_idx and @offset as it sets the bits. The previous cap to the first page is now always accounted for, rather than only when there is a non-zero pgoff. While at it, make @page_idx, @offset and @nbits unsigned int, given that they won't exceed 512 and 4096 respectively (even a bigger PAGE_SIZE or a smaller struct page size won't push them past the above 32-bit max). Also, delete the stale kdoc on the Return type. Cc: Avihai Horon Fixes: f38044e5ef58 ("vfio/iova_bitmap: Fix PAGE_SIZE unaligned bitmaps") Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Tested-by: Avihai Horon Link: https://lore.kernel.org/r/20221129131235.38880-1-joao.m.martins@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/iova_bitmap.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c index de6d6ea5c496..0848f920efb7 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/vfio/iova_bitmap.c @@ -296,9 +296,7 @@ static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) { unsigned long remaining, bytes; - /* Cap to one page in the first iteration, if PAGE_SIZE unaligned. */ - bytes = !bitmap->mapped.pgoff ? bitmap->mapped.npages << PAGE_SHIFT : - PAGE_SIZE - bitmap->mapped.pgoff; + bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; remaining = min_t(unsigned long, remaining, @@ -399,29 +397,27 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, * Set the bits corresponding to the range [iova .. iova+length-1] in * the user bitmap. * - * Return: The number of bits set.
*/ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; - unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); - unsigned long page_idx = offset / BITS_PER_PAGE; - unsigned long page_offset = mapped->pgoff; - void *kaddr; - - offset = offset % BITS_PER_PAGE; + unsigned long cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { - unsigned long size = min(BITS_PER_PAGE - offset, nbits); + unsigned int page_idx = cur_bit / BITS_PER_PAGE; + unsigned int offset = cur_bit % BITS_PER_PAGE; + unsigned int nbits = min(BITS_PER_PAGE - offset, + last_bit - cur_bit + 1); + void *kaddr; kaddr = kmap_local_page(mapped->pages[page_idx]); - bitmap_set(kaddr + page_offset, offset, size); + bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); - page_offset = offset = 0; - nbits -= size; - page_idx++; - } while (nbits > 0); + cur_bit += nbits; + } while (cur_bit <= last_bit); } EXPORT_SYMBOL_GPL(iova_bitmap_set); -- cgit v1.2.3 From 8f8bcc8c720c360885639de66fe69756febed824 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:16 -0400 Subject: vfio/pci: Move all the SPAPR PCI specific logic to vfio_pci_core.ko The vfio_spapr_pci_eeh_open/release() functions are one line wrappers around an arch function. Just call them directly. This eliminates some weird exported symbols that don't need to exist. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 11 +++++++++-- drivers/vfio/vfio_spapr_eeh.c | 13 ------------- include/linux/vfio.h | 11 ----------- 3 files changed, 9 insertions(+), 26 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 189d4930c276..56501e7ef564 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#include +#endif #include "vfio_pci_priv.h" @@ -686,7 +689,9 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } - vfio_spapr_pci_eeh_release(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_release(vdev->pdev); +#endif vfio_pci_core_disable(vdev); mutex_lock(&vdev->igate); @@ -705,7 +710,9 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); - vfio_spapr_pci_eeh_open(vdev->pdev); +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) + eeh_dev_open(vdev->pdev); +#endif if (vdev->sriov_pf_core_dev) { mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index 67f55ac1d459..c9d102aafbcd 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -15,19 +15,6 @@ #define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" #define DRIVER_DESC "VFIO IOMMU SPAPR EEH" -/* We might build address mapping here for "fast" path later */ -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ - 
eeh_dev_open(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); - -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ - eeh_dev_release(pdev); -} -EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); - long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 43b67e46a2cb..9378ca79d548 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,21 +233,10 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -struct pci_dev; #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); -void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg); #else -static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) -{ -} - -static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) -{ -} - static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) -- cgit v1.2.3 From e5c38a203eb4343993e889eb69f5386f085f25ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:17 -0400 Subject: vfio/spapr: Move VFIO_CHECK_EXTENSION into tce_iommu_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PPC64 kconfig is a bit of a rats nest, but it turns out that if CONFIG_SPAPR_TCE_IOMMU is on then EEH must be too: config SPAPR_TCE_IOMMU bool "sPAPR TCE IOMMU Support" depends on PPC_POWERNV || PPC_PSERIES select IOMMU_API help Enables bits of IOMMU API required by VFIO. The iommu_ops is not implemented as it is not necessary for VFIO. config PPC_POWERNV select FORCE_PCI config PPC_PSERIES select FORCE_PCI config EEH bool depends on (PPC_POWERNV || PPC_PSERIES) && PCI default y So, just open code the call to eeh_enabled() into tce_iommu_ioctl(). Reviewed-by: Christoph Hellwig Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 10 ++++------ drivers/vfio/vfio_spapr_eeh.c | 6 ------ 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 169f07ac162d..73cec2beae70 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -785,14 +785,12 @@ static long tce_iommu_ioctl(void *iommu_data, switch (arg) { case VFIO_SPAPR_TCE_IOMMU: case VFIO_SPAPR_TCE_v2_IOMMU: - ret = 1; - break; + return 1; + case VFIO_EEH: + return eeh_enabled(); default: - ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); - break; + return 0; } - - return (ret < 0) ? 0 : ret; } /* diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index c9d102aafbcd..221b1b637e18 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -24,12 +24,6 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, long ret = -EINVAL; switch (cmd) { - case VFIO_CHECK_EXTENSION: - if (arg == VFIO_EEH) - ret = eeh_enabled() ? 
1 : 0; - else - ret = 0; - break; case VFIO_EEH_PE_OP: pe = eeh_iommu_group_to_pe(group); if (!pe) -- cgit v1.2.3 From e276e25819b8a173a21947720bb0a548c0b724b7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:18 -0400 Subject: vfio: Move vfio_spapr_iommu_eeh_ioctl into vfio_iommu_spapr_tce.c As with the previous patch EEH is always enabled if SPAPR_TCE_IOMMU, so move this last bit of code into the main module. Now that this function only processes VFIO_EEH_PE_OP remove a level of indenting as well, it is only called by a case statement that already checked VFIO_EEH_PE_OP. This eliminates an unnecessary module and SPAPR code in a global header. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 1 - drivers/vfio/vfio_iommu_spapr_tce.c | 55 ++++++++++++++++++++++- drivers/vfio/vfio_spapr_eeh.c | 88 ------------------------------------- include/linux/vfio.h | 12 ----- 4 files changed, 53 insertions(+), 103 deletions(-) delete mode 100644 drivers/vfio/vfio_spapr_eeh.c (limited to 'drivers/vfio') diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b693a1169286..50b8e8e3fb10 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -10,7 +10,6 @@ vfio-y += vfio_main.o \ obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o -obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 73cec2beae70..60a50ce8701e 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -4,6 +4,7 @@ * * Copyright (C) 2013 IBM Corp. All rights reserved. * Author: Alexey Kardashevskiy + * Copyright Gavin Shan, IBM Corporation 2014. * * Derived from original vfio_iommu_type1.c: * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 
@@ -773,6 +774,57 @@ static long tce_iommu_create_default_window(struct tce_container *container) return ret; } +static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group, + unsigned long arg) +{ + struct eeh_pe *pe; + struct vfio_eeh_pe_op op; + unsigned long minsz; + + pe = eeh_iommu_group_to_pe(group); + if (!pe) + return -ENODEV; + + minsz = offsetofend(struct vfio_eeh_pe_op, op); + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + if (op.argsz < minsz || op.flags) + return -EINVAL; + + switch (op.op) { + case VFIO_EEH_PE_DISABLE: + return eeh_pe_set_option(pe, EEH_OPT_DISABLE); + case VFIO_EEH_PE_ENABLE: + return eeh_pe_set_option(pe, EEH_OPT_ENABLE); + case VFIO_EEH_PE_UNFREEZE_IO: + return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); + case VFIO_EEH_PE_UNFREEZE_DMA: + return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); + case VFIO_EEH_PE_GET_STATE: + return eeh_pe_get_state(pe); + break; + case VFIO_EEH_PE_RESET_DEACTIVATE: + return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); + case VFIO_EEH_PE_RESET_HOT: + return eeh_pe_reset(pe, EEH_RESET_HOT, true); + case VFIO_EEH_PE_RESET_FUNDAMENTAL: + return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); + case VFIO_EEH_PE_CONFIGURE: + return eeh_pe_configure(pe); + case VFIO_EEH_PE_INJECT_ERR: + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); + if (op.argsz < minsz) + return -EINVAL; + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + + return eeh_pe_inject_err(pe, op.err.type, op.err.func, + op.err.addr, op.err.mask); + default: + return -EINVAL; + } +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -1044,8 +1096,7 @@ static long tce_iommu_ioctl(void *iommu_data, ret = 0; list_for_each_entry(tcegrp, &container->group_list, next) { - ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, - cmd, arg); + ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg); if (ret) return ret; } diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c deleted file mode 100644 index 221b1b637e18..000000000000 --- a/drivers/vfio/vfio_spapr_eeh.c +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * EEH functionality support for VFIO devices. The feature is only - * available on sPAPR compatible platforms. - * - * Copyright Gavin Shan, IBM Corporation 2014. 
- */ - -#include -#include -#include -#include - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" -#define DRIVER_DESC "VFIO IOMMU SPAPR EEH" - -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, unsigned long arg) -{ - struct eeh_pe *pe; - struct vfio_eeh_pe_op op; - unsigned long minsz; - long ret = -EINVAL; - - switch (cmd) { - case VFIO_EEH_PE_OP: - pe = eeh_iommu_group_to_pe(group); - if (!pe) - return -ENODEV; - - minsz = offsetofend(struct vfio_eeh_pe_op, op); - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - if (op.argsz < minsz || op.flags) - return -EINVAL; - - switch (op.op) { - case VFIO_EEH_PE_DISABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); - break; - case VFIO_EEH_PE_ENABLE: - ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); - break; - case VFIO_EEH_PE_UNFREEZE_IO: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); - break; - case VFIO_EEH_PE_UNFREEZE_DMA: - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); - break; - case VFIO_EEH_PE_GET_STATE: - ret = eeh_pe_get_state(pe); - break; - case VFIO_EEH_PE_RESET_DEACTIVATE: - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); - break; - case VFIO_EEH_PE_RESET_HOT: - ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); - break; - case VFIO_EEH_PE_RESET_FUNDAMENTAL: - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); - break; - case VFIO_EEH_PE_CONFIGURE: - ret = eeh_pe_configure(pe); - break; - case VFIO_EEH_PE_INJECT_ERR: - minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); - if (op.argsz < minsz) - return -EINVAL; - if (copy_from_user(&op, (void __user *)arg, minsz)) - return -EFAULT; - - ret = eeh_pe_inject_err(pe, op.err.type, op.err.func, - op.err.addr, op.err.mask); - break; - default: - ret = -EINVAL; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 9378ca79d548..b4d5d4ca3d7d 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -233,18 +233,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, int max_irq_type, size_t *data_size); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) -long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, - unsigned long arg); -#else -static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, - unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} -#endif /* CONFIG_VFIO_SPAPR_EEH */ - /* * IRQfd - generic */ -- cgit v1.2.3 From 20601c45a0fa20bbb5545f4dd69f4f18448f4973 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:19 -0400 Subject: vfio: Remove CONFIG_VFIO_SPAPR_EEH We don't need a kconfig symbol for this, just directly test CONFIG_EEH in the few places that need it. 
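The replacement is the standard IS_ENABLED() guard; schematically (a generic sketch mirroring the hunks below, with example_finish_enable as a placeholder name):

#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif

static void example_finish_enable(struct pci_dev *pdev)
{
	/*
	 * IS_ENABLED(CONFIG_EEH) is 1 for =y or =m and 0 otherwise, so
	 * non-powerpc builds compile this out without needing a
	 * dedicated Kconfig symbol.
	 */
#if IS_ENABLED(CONFIG_EEH)
	eeh_dev_open(pdev);
#endif
}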
Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 5 ----- drivers/vfio/pci/vfio_pci_core.c | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 86c381ceb9a1..d25b91adfd64 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -20,11 +20,6 @@ config VFIO_IOMMU_SPAPR_TCE depends on SPAPR_TCE_IOMMU default VFIO -config VFIO_SPAPR_EEH - tristate - depends on EEH && VFIO_IOMMU_SPAPR_TCE - default VFIO - config VFIO_VIRQFD tristate select EVENTFD diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 56501e7ef564..f9365a5bc961 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,7 +27,7 @@ #include #include #include -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) #include #endif @@ -689,7 +689,7 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) vdev->sriov_pf_core_dev->vf_token->users--; mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); } -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif vfio_pci_core_disable(vdev); @@ -710,7 +710,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); -#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) +#if IS_ENABLED(CONFIG_EEH) eeh_dev_open(vdev->pdev); #endif -- cgit v1.2.3 From e2d55709398e62cf53e5c7df3758ae52cc62d63a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 5 Dec 2022 11:29:20 -0400 Subject: vfio: Fold vfio_virqfd.ko into vfio.ko This is only 1.8k, putting it in its own module is not really necessary. The kconfig infrastructure is still there to completely remove it for systems that are trying for small footprint. Put it in the main vfio.ko module now that kbuild can support multiple .c files. 
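The companion idiom for an optional object folded into a single module is a header stub, sketched generically below (hypothetical CONFIG_FOO/foo_* names; the real vfio_virqfd_init()/vfio_virqfd_exit() stubs appear in the vfio.h hunk that follows):

#if IS_ENABLED(CONFIG_FOO)
int __init foo_init(void);
void foo_exit(void);
#else
/* Compiled out: callers link against zero-cost inline stubs. */
static inline int __init foo_init(void)
{
	return 0;
}
static inline void foo_exit(void)
{
}
#endif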
Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/5-v5-fc5346cacfd4+4c482-vfio_modules_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 2 +- drivers/vfio/Makefile | 4 +--- drivers/vfio/vfio.h | 13 +++++++++++++ drivers/vfio/vfio_main.c | 7 +++++++ drivers/vfio/virqfd.c | 17 +++-------------- 5 files changed, 25 insertions(+), 18 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index d25b91adfd64..0b8d53f63c7e 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -21,7 +21,7 @@ config VFIO_IOMMU_SPAPR_TCE default VFIO config VFIO_VIRQFD - tristate + bool select EVENTFD default n diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 50b8e8e3fb10..0721ed4831c9 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,13 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -vfio_virqfd-y := virqfd.o - obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ iova_bitmap.o \ container.o +vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o -obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bcad54bbab08..a7113b4baaa2 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -124,6 +124,19 @@ long vfio_container_ioctl_check_extension(struct vfio_container *container, int __init vfio_container_init(void); void vfio_container_cleanup(void); +#if IS_ENABLED(CONFIG_VFIO_VIRQFD) +int __init vfio_virqfd_init(void); +void vfio_virqfd_exit(void); +#else +static inline int __init vfio_virqfd_init(void) +{ + return 0; +} +static inline void vfio_virqfd_exit(void) +{ +} +#endif + #ifdef CONFIG_VFIO_NOIOMMU extern bool vfio_noiommu __read_mostly; #else diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 662e267a3e13..7f88569c3eba 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1832,6 +1832,10 @@ static int __init vfio_init(void) if (ret) return ret; + ret = vfio_virqfd_init(); + if (ret) + goto err_virqfd; + /* /dev/vfio/$GROUP */ vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { @@ -1862,6 +1866,8 @@ err_dev_class: class_destroy(vfio.class); vfio.class = NULL; err_group_class: + vfio_virqfd_exit(); +err_virqfd: vfio_container_cleanup(); return ret; } @@ -1876,6 +1882,7 @@ static void __exit vfio_cleanup(void) class_destroy(vfio.device_class); vfio.device_class = NULL; class_destroy(vfio.class); + vfio_virqfd_exit(); vfio_container_cleanup(); vfio.class = NULL; xa_destroy(&vfio_device_set_xa); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 414e98d82b02..497a17b37865 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -12,15 +12,12 @@ #include #include #include - -#define DRIVER_VERSION "0.1" -#define DRIVER_AUTHOR "Alex Williamson " -#define DRIVER_DESC "IRQFD support for VFIO bus drivers" +#include "vfio.h" static struct workqueue_struct *vfio_irqfd_cleanup_wq; static DEFINE_SPINLOCK(virqfd_lock); -static int __init vfio_virqfd_init(void) +int __init vfio_virqfd_init(void) { vfio_irqfd_cleanup_wq = create_singlethread_workqueue("vfio-irqfd-cleanup"); @@ -30,7 +27,7 @@ static int __init vfio_virqfd_init(void) return 0; } -static void __exit vfio_virqfd_exit(void) +void vfio_virqfd_exit(void) { destroy_workqueue(vfio_irqfd_cleanup_wq); } @@ -216,11 +213,3 @@ void vfio_virqfd_disable(struct virqfd 
**pvirqfd) flush_workqueue(vfio_irqfd_cleanup_wq); } EXPORT_SYMBOL_GPL(vfio_virqfd_disable); - -module_init(vfio_virqfd_init); -module_exit(vfio_virqfd_exit); - -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -- cgit v1.2.3 From 4db52602a6074e9cc523500b8304600ff63e7b85 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Dec 2022 10:34:26 +0200 Subject: vfio: Extend the device migration protocol with PRE_COPY The optional PRE_COPY states open the saving data transfer FD before reaching STOP_COPY and allows the device to dirty track internal state changes with the general idea to reduce the volume of data transferred in the STOP_COPY stage. While in PRE_COPY the device remains RUNNING, but the saving FD is open. Only if the device also supports RUNNING_P2P can it support PRE_COPY_P2P, which halts P2P transfers while continuing the saving FD. PRE_COPY, with P2P support, requires the driver to implement 7 new arcs and exists as an optional FSM branch between RUNNING and STOP_COPY: RUNNING -> PRE_COPY -> PRE_COPY_P2P -> STOP_COPY A new ioctl VFIO_MIG_GET_PRECOPY_INFO is provided to allow userspace to query the progress of the precopy operation in the driver with the idea it will judge to move to STOP_COPY at least once the initial data set is transferred, and possibly after the dirty size has shrunk appropriately. This ioctl is valid only in PRE_COPY states and kernel driver should return -EINVAL from any other migration state. Compared to the v1 clarification, STOP_COPY -> PRE_COPY is blocked and to be defined in future. We also split the pending_bytes report into the initial and sustaining values, e.g.: initial_bytes and dirty_bytes. initial_bytes: Amount of initial precopy data. dirty_bytes: Device state changes relative to data previously retrieved. These fields are not required to have any bearing to STOP_COPY phase. It is recommended to leave PRE_COPY for STOP_COPY only after the initial_bytes field reaches zero. Leaving PRE_COPY earlier might make things slower. 
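A rough userspace sketch of the intended polling loop (data_fd is the saving FD returned on entering PRE_COPY; struct vfio_precopy_info and VFIO_MIG_GET_PRECOPY_INFO are the uapi additions of this series; error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns 1 once it looks reasonable to move on to STOP_COPY. */
static int precopy_initial_done(int data_fd)
{
	struct vfio_precopy_info info;

	memset(&info, 0, sizeof(info));
	info.argsz = sizeof(info);

	/* Only valid in PRE_COPY/PRE_COPY_P2P; -EINVAL elsewhere. */
	if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -1;

	/*
	 * Keep streaming while initial data remains; dirty_bytes may
	 * keep growing as the device continues to run.
	 */
	return info.initial_bytes == 0;
}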
Signed-off-by: Jason Gunthorpe Signed-off-by: Shay Drory Reviewed-by: Kevin Tian Reviewed-by: Shameer Kolothum Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 74 +++++++++++++++++++++++++++- include/uapi/linux/vfio.h | 123 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 191 insertions(+), 6 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 7f88569c3eba..03dbcd3d96f0 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1042,7 +1042,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state new_fsm, enum vfio_device_mig_state *next_fsm) { - enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; /* * The coding in this table requires the driver to implement the * following FSM arcs: @@ -1057,30 +1057,65 @@ int vfio_mig_get_next_state(struct vfio_device *device, * RUNNING_P2P -> RUNNING * RUNNING_P2P -> STOP * STOP -> RUNNING_P2P - * Without P2P the driver must implement: + * + * If precopy is supported then the driver must support these additional + * FSM arcs: + * RUNNING -> PRE_COPY + * PRE_COPY -> RUNNING + * PRE_COPY -> STOP_COPY + * However, if precopy and P2P are supported together then the driver + * must support these additional arcs beyond the P2P arcs above: + * PRE_COPY -> RUNNING + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * PRE_COPY_P2P -> RUNNING_P2P + * PRE_COPY_P2P -> STOP_COPY + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * + * Without P2P and precopy the driver must implement: * RUNNING -> STOP * STOP -> RUNNING * * The coding will step through multiple states for some combination * transitions; if all optional features are supported, this means the * following ones: + * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY + * PRE_COPY -> RUNNING -> RUNNING_P2P + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING + * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P -> STOP + * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING * RESUMING -> STOP -> RUNNING_P2P + * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P * RESUMING -> STOP -> RUNNING_P2P -> RUNNING + * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * RESUMING -> STOP -> STOP_COPY + * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P * RUNNING -> RUNNING_P2P -> STOP * RUNNING -> RUNNING_P2P -> STOP -> RESUMING * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY + * RUNNING_P2P -> RUNNING -> PRE_COPY * RUNNING_P2P -> STOP -> RESUMING * RUNNING_P2P -> STOP -> STOP_COPY + * STOP -> RUNNING_P2P -> PRE_COPY_P2P * STOP -> RUNNING_P2P -> RUNNING + * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY * STOP_COPY -> STOP -> RESUMING * STOP_COPY -> STOP -> RUNNING_P2P * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING + * + * The following transitions are blocked: + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P */ static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1089,14 +1124,38 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, }, + [VFIO_DEVICE_STATE_PRE_COPY] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, + }, [VFIO_DEVICE_STATE_STOP_COPY] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1105,6 +1164,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RESUMING] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, @@ -1113,6 +1174,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_RUNNING_P2P] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, @@ -1121,6 +1184,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, [VFIO_DEVICE_STATE_ERROR] = { [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, + 
[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, @@ -1131,6 +1196,11 @@ int vfio_mig_get_next_state(struct vfio_device *device, static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, + [VFIO_DEVICE_STATE_PRE_COPY] = + VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY, [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, [VFIO_DEVICE_STATE_RUNNING_P2P] = diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 3e45dbaf190e..23105eb036fa 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -819,12 +819,20 @@ struct vfio_device_feature { * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P * is supported in addition to the STOP_COPY states. * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that + * PRE_COPY is supported in addition to the STOP_COPY states. + * + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY + * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported + * in addition to the STOP_COPY states. + * * Other combinations of flags have behavior to be defined in the future. */ struct vfio_device_feature_migration { __aligned_u64 flags; #define VFIO_MIGRATION_STOP_COPY (1 << 0) #define VFIO_MIGRATION_P2P (1 << 1) +#define VFIO_MIGRATION_PRE_COPY (1 << 2) }; #define VFIO_DEVICE_FEATURE_MIGRATION 1 @@ -875,8 +883,13 @@ struct vfio_device_feature_mig_state { * RESUMING - The device is stopped and is loading a new internal state * ERROR - The device has failed and must be reset * - * And 1 optional state to support VFIO_MIGRATION_P2P: + * And optional states to support VFIO_MIGRATION_P2P: * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA + * And VFIO_MIGRATION_PRE_COPY: + * PRE_COPY - The device is running normally but tracking internal state + * changes + * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY: + * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA * * The FSM takes actions on the arcs between FSM states. The driver implements * the following behavior for the FSM arcs: @@ -908,20 +921,48 @@ struct vfio_device_feature_mig_state { * * To abort a RESUMING session the device must be reset. * + * PRE_COPY -> RUNNING * RUNNING_P2P -> RUNNING * While in RUNNING the device is fully operational, the device may generate * interrupts, DMA, respond to MMIO, all vfio device regions are functional, * and the device may advance its internal state. * + * The PRE_COPY arc will terminate a data transfer session. + * + * PRE_COPY_P2P -> RUNNING_P2P * RUNNING -> RUNNING_P2P * STOP -> RUNNING_P2P * While in RUNNING_P2P the device is partially running in the P2P quiescent * state defined below. * + * The PRE_COPY_P2P arc will terminate a data transfer session. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P * STOP -> STOP_COPY - * This arc begin the process of saving the device state and will return a - * new data_fd. + * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states + * which share a data transfer session. 
Moving between these states alters + * what is streamed in the session, but does not terminate or otherwise affect + * the associated fd. + * + * These arcs begin the process of saving the device state and will return a + * new data_fd. The migration driver may perform actions such as enabling + * dirty logging of device state when entering PRE_COPY or PRE_COPY_P2P. + * + * Each arc does not change the device operation, the device remains + * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below + * in PRE_COPY_P2P -> STOP_COPY. + * + * PRE_COPY -> PRE_COPY_P2P + * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above. + * However, while in the PRE_COPY_P2P state, the device is partially running + * in the P2P quiescent state defined below, like RUNNING_P2P. + * + * PRE_COPY_P2P -> PRE_COPY + * This arc allows returning the device to a full RUNNING behavior while + * continuing all the behaviors of PRE_COPY. + * + * PRE_COPY_P2P -> STOP_COPY * While in the STOP_COPY state the device has the same behavior as STOP * with the addition that the data transfer session continues to stream the * migration state. End of stream on the FD indicates the entire device @@ -939,6 +980,13 @@ struct vfio_device_feature_mig_state { * device state for this arc if required to prepare the device to receive the * migration data. + * + * STOP_COPY -> PRE_COPY + * STOP_COPY -> PRE_COPY_P2P + * These arcs are not permitted and return an error if requested. Future + * revisions of this API may define behaviors for these arcs, in this case + * support will be discoverable by a new flag in + * VFIO_DEVICE_FEATURE_MIGRATION. + * * any -> ERROR * ERROR cannot be specified as a device state, however any transition request * can be failed with an errno return and may then move the device_state into @@ -950,7 +998,7 @@ struct vfio_device_feature_mig_state { * The optional peer to peer (P2P) quiescent state is intended to be a quiescent * state for the device for the purposes of managing multiple devices within a * user context where peer-to-peer DMA between devices may be active. The - * RUNNING_P2P states must prevent the device from initiating + * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating * any new P2P DMA transactions. If the device can identify P2P transactions * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration * driver must complete any such outstanding operations prior to completing the @@ -963,6 +1011,8 @@ struct vfio_device_feature_mig_state { * above FSM arcs. As there are multiple paths through the FSM arcs the path * should be selected based on the following rules: * - Select the shortest path. + * - The path cannot have saving group states as interior arcs, only + * starting/end states. * Refer to vfio_mig_get_next_state() for the result of the algorithm. * * The automatic transit through the FSM arcs that make up the combination @@ -976,6 +1026,9 @@ struct vfio_device_feature_mig_state { * support them. The user can discover if these states are supported by using * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can * avoid knowing about these optional states if the kernel driver supports them. + * + * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY + * is not present.
*/ enum vfio_device_mig_state { VFIO_DEVICE_STATE_ERROR = 0, @@ -984,8 +1037,70 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_STOP_COPY = 3, VFIO_DEVICE_STATE_RESUMING = 4, VFIO_DEVICE_STATE_RUNNING_P2P = 5, + VFIO_DEVICE_STATE_PRE_COPY = 6, + VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, +}; + +/** + * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21) + * + * This ioctl is used on the migration data FD in the precopy phase of the + * migration data transfer. It returns an estimate of the current data sizes + * remaining to be transferred. It allows the user to judge when it is + * appropriate to leave PRE_COPY for STOP_COPY. + * + * This ioctl is valid only in PRE_COPY states and the kernel driver should + * return -EINVAL from any other migration state. + * + * The vfio_precopy_info data structure returned by this ioctl provides + * estimates of data available from the device during the PRE_COPY states. + * This estimate is split into two categories, initial_bytes and + * dirty_bytes. + * + * The initial_bytes field indicates the amount of initial precopy + * data available from the device. This field should have a non-zero initial + * value and decrease as migration data is read from the device. + * It is recommended to leave PRE_COPY for STOP_COPY only after this field + * reaches zero. Leaving PRE_COPY earlier might make things slower. + * + * The dirty_bytes field tracks device state changes relative to data + * previously retrieved. This field starts at zero and may increase as + * the internal device state is modified or decrease as that modified + * state is read from the device. + * + * Userspace may use the combination of these fields to estimate the + * potential data size available during the PRE_COPY phases, as well as + * trends relative to the rate the device is dirtying its internal + * state, but these fields are not required to have any bearing relative + * to the data size available during the STOP_COPY phase. + * + * Drivers have a lot of flexibility in when and what they transfer during the + * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO. + * + * During pre-copy the migration data FD has a temporary "end of stream" that is + * reached when both initial_bytes and dirty_bytes are zero. For instance, this + * may indicate that the device is idle and not currently dirtying any internal + * state. When read() is done on this temporary end of stream the kernel driver + * should return ENOMSG from read(). Userspace can wait for more data (which may + * never come) by using poll(). + * + * Once in STOP_COPY the migration data FD has a permanent end of stream + * signaled in the usual way by read() always returning 0 and poll always + * returning readable. ENOMSG may not be returned in STOP_COPY. + * Support for this ioctl is mandatory if a driver claims to support + * VFIO_MIGRATION_PRE_COPY. + * + * Return: 0 on success, -1 and errno set on failure. + */ +struct vfio_precopy_info { + __u32 argsz; + __u32 flags; + __aligned_u64 initial_bytes; + __aligned_u64 dirty_bytes; }; +#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) + /* * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power * state with the platform-based power management. Device use of lower power -- cgit v1.2.3 From 0e7caa65d707b93fbb4322c6313f739fa9103dfa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:27 +0200 Subject: vfio/mlx5: Enforce a single SAVE command at a time Enforce a single SAVE command at a time.
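(Usage sketch for the VFIO_MIG_GET_PRECOPY_INFO ioctl documented in the patch above: a minimal userspace polling helper. The data_fd parameter and the exit policy are illustrative assumptions, not part of the ABI.)

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/*
	 * Sketch: decide when a PRE_COPY session has drained its initial
	 * data. data_fd is assumed to be the migration data FD returned on
	 * entering PRE_COPY; leaving for STOP_COPY once initial_bytes
	 * reaches zero follows the recommendation above.
	 */
	static int precopy_initial_drained(int data_fd)
	{
		struct vfio_precopy_info info = { .argsz = sizeof(info) };

		if (ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info) < 0)
			return -1;	/* -1/errno, e.g. EINVAL outside PRE_COPY */

		return info.initial_bytes == 0;
	}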
As the SAVE command is an asynchronous one, we must enforce running only a single command at a time. This will preserve ordering between multiple calls and protect from races on the migration file data structure. This is required by the next patches in the series, where, as part of PRE_COPY, we may have multiple images to be saved and multiple SAVE commands may be issued from different flows. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 6 ++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 7 +++++++ 3 files changed, 14 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0848bc905d3e..55ee8036f59c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -281,6 +281,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } @@ -321,6 +322,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return -ENOTCONN; mdev = mvdev->mdev; + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + err = mlx5_core_alloc_pd(mdev, &pdn); if (err) return err; @@ -371,6 +376,7 @@ err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e5..8ffa7699872c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -37,6 +37,7 @@ struct mlx5_vf_migration_file { unsigned long last_offset; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 6e9cf2aacc52..0d71ebb2a972 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -245,6 +245,13 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, -- cgit v1.2.3 From 9945a67ea4b30657dd998c7fbbea1b3950747168 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:28 +0200 Subject: vfio/mlx5: Refactor PD usage This patch refactors PD usage so that its life cycle matches that of the migration file, instead of allocating and destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED and a single PD can simply be reused.
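(In outline, the PD life cycle after this change looks as follows; a sketch reusing the helper names from the patch below, with error handling elided.)

	/* once, when the migration file is created */
	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	migf->mvdev = mvdev;
	err = mlx5vf_cmd_alloc_pd(migf);	/* PD kept in migf->pdn */

	/* every subsequent SAVE/LOAD reuses the same PD */
	err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey);

	/* once, when the migration file is cleaned up */
	mlx5vf_cmd_dealloc_pd(migf);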
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 53 +++++++++++++++++++++++++++++--------------- drivers/vfio/pci/mlx5/cmd.h | 5 ++++- drivers/vfio/pci/mlx5/main.c | 44 ++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 31 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 55ee8036f59c..a97eac49e3d6 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -279,7 +279,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5_core_destroy_mkey(mdev, async_data->mkey); dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -314,7 +313,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; struct mlx5_core_dev *mdev; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -326,16 +325,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); if (err) goto err_dma_map; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -357,7 +352,6 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, migf->total_length = 0; get_file(migf->filp); async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -375,7 +369,6 @@ err_out: err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); complete(&migf->save_comp); return err; } @@ -386,7 +379,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -400,15 +393,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, } mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); if (err) - goto err_reg; + goto end; - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); + err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -424,13 +413,41 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, mlx5_core_destroy_mkey(mdev, mkey); err_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); end: mutex_unlock(&migf->lock); return err; } +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); + return err; +} + +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) 
+ return; + + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + + WARN_ON(migf->mvdev->mdev_detach); + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8ffa7699872c..ba760f956d53 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -16,7 +16,6 @@ struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 pdn; u32 mkey; void *out; }; @@ -27,6 +26,7 @@ struct mlx5_vf_migration_file { u8 disabled:1; u8 is_err:1; + u32 pdn; struct sg_append_table table; size_t total_length; size_t allocated_length; @@ -127,6 +127,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 0d71ebb2a972..1916f7c1468c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -236,12 +236,15 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -257,20 +260,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &migf->total_length); if (ret) - goto out_free; + goto out_pd; ret = mlx5vf_add_migration_pages( migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); if (ret) - goto out_free; + goto out_pd; - migf->mvdev = mvdev; ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_disable_fd(migf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } @@ -352,6 +360,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -360,20 +369,30 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_free: + fput(migf->filp); +end: + 
kfree(migf); + return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -381,6 +400,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } -- cgit v1.2.3 From 91454f8b9bf4ce6be1d9a0b4de401bc3c6313a95 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:29 +0200 Subject: vfio/mlx5: Refactor MKEY usage This patch refactors MKEY usage so that its life cycle matches that of the migration file, instead of allocating and destroying it upon each SAVE/LOAD command. This is a preparation step towards the PRE_COPY series where multiple images will be SAVED/LOADED. We achieve this with a new struct named mlx5_vhca_data_buffer, which holds the MKEY and its related data, such as the sg_append_table, allocated_length, etc. The above fields were moved out of the migration file's main struct into the dedicated mlx5_vhca_data_buffer struct, with the proper helpers in place. For now we have a single mlx5_vhca_data_buffer per migration file. However, in coming patches we'll have multiple such buffers to support multiple images. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 162 +++++++++++++++++++++++++++---------------- drivers/vfio/pci/mlx5/cmd.h | 37 +++++++--- drivers/vfio/pci/mlx5/main.c | 92 +++++++++++++----------- 3 files changed, 178 insertions(+), 113 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index a97eac49e3d6..ed4c472d2eae 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -210,11 +210,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +232,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,20 +255,99 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ?
migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { @@ -276,9 +355,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -292,7 +368,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, + WRITE_ONCE(migf->buf->length, MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size)); wake_up_interruptible(&migf->poll_wait); @@ -307,39 +383,28 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 mkey; int err; 
lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; - MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; async_data->out = kvzalloc(out_size, GFP_KERNEL); @@ -348,10 +413,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; get_file(migf->filp); - async_data->mkey = mkey; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -365,57 +427,33 @@ err_exec: fput(migf->filp); kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; - u32 mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; - } - - mdev = mvdev->mdev; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); + err = mlx5vf_dma_data_buffer(buf); if (err) - goto end; - - err = _create_mkey(mdev, migf->pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; + return err; MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); - - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); - - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -end: - mutex_unlock(&migf->lock); - return err; + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); } int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) @@ -445,6 +483,10 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) WARN_ON(migf->mvdev->mdev_detach); + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index ba760f956d53..b0f08dfc8120 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,11 +12,25 @@ #include #include +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; 
+ u64 length; + u64 allocated_length; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; int status; - u32 mkey; void *out; }; @@ -27,14 +41,7 @@ struct mlx5_vf_migration_file { u8 is_err:1; u32 pdn; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + struct mlx5_vhca_data_buffer *buf; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -124,12 +131,20 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 1916f7c1468c..5f694fce854c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -33,7 +33,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) } static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg = migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry 
+= i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); migf->disabled = true; - migf->total_length = 0; - migf->allocated_length = 0; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -136,6 +128,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; ssize_t done = 0; if (pos) @@ -144,16 +137,16 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || migf->is_err)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { done = -EAGAIN; goto out_unlock; } - if (*pos > migf->total_length) { + if (*pos > vhca_buf->length) { done = -EINVAL; goto out_unlock; } @@ -162,7 +155,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); + len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { size_t page_offset; struct page *page; @@ -171,7 +164,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -208,7 +201,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->disabled || migf->is_err) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -227,6 +220,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -257,22 +252,23 @@ 
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; + } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; + migf->buf = buf; return migf; out_save: - mlx5vf_disable_fd(migf); + mlx5vf_free_data_buffer(buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -286,6 +282,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; loff_t requested_length; ssize_t done = 0; @@ -306,10 +303,10 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; } - if (migf->allocated_length < requested_length) { + if (vhca_buf->allocated_length < requested_length) { done = mlx5vf_add_migration_pages( - migf, - DIV_ROUND_UP(requested_length - migf->allocated_length, + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, PAGE_SIZE)); if (done) goto out_unlock; @@ -323,7 +320,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, int ret; page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); + page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); if (!page) { if (done == 0) done = -EINVAL; @@ -342,7 +339,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, len -= page_len; done += page_len; buf += page_len; - migf->total_length += page_len; + vhca_buf->length += page_len; } out_unlock: mutex_unlock(&migf->lock); @@ -360,6 +357,7 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -378,9 +376,18 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); return migf; +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); end: @@ -474,7 +481,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); + mvdev->resuming_migf, + mvdev->resuming_migf->buf); if (ret) return ERR_PTR(ret); mlx5vf_disable_fds(mvdev); -- cgit v1.2.3 From 8b599d143419669e57da3881d8293f17809688d7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:30 +0200 Subject: vfio/mlx5: Refactor migration file state Refactor the migration file state to be an enum whose values are mutually exclusive. As part of that, drop the 'disabled' state, since 'error' is functionally equivalent.
Next patches from the series will extend this enum for other relevant states. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 2 +- drivers/vfio/pci/mlx5/cmd.h | 7 +++++-- drivers/vfio/pci/mlx5/main.c | 11 ++++++----- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index ed4c472d2eae..fcba12326185 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,7 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index b0f08dfc8120..14403e654e4e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,6 +12,10 @@ #include #include +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, +}; + struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; @@ -37,8 +41,7 @@ struct mlx5vf_async_data { struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; u32 pdn; struct mlx5_vhca_data_buffer *buf; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 5f694fce854c..d95646c2f010 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -109,7 +109,7 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); - migf->disabled = true; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -137,7 +137,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || migf->is_err)) + READ_ONCE(vhca_buf->length) || + migf->state == MLX5_MIGF_STATE_ERROR)) return -ERESTARTSYS; } @@ -150,7 +151,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, done = -EINVAL; goto out_unlock; } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } @@ -199,7 +200,7 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; else if (READ_ONCE(migf->buf->length)) pollflags = EPOLLIN | EPOLLRDNORM; @@ -298,7 +299,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, return -ENOMEM; mutex_lock(&migf->lock); - if (migf->disabled) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } -- cgit v1.2.3 From c668878381b5702f867ec7f43ee3b74259c6ea03 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:31 +0200 Subject: vfio/mlx5: Refactor to use queue based data chunks Refactor to use queue based data chunks on the migration file. The SAVE command adds a chunk to the tail of the queue while the read() API finds the required chunk and returns its data. 
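(In outline, the producer/consumer pairing this creates; a sketch using the names from the diff below, with locking context and error handling abbreviated.)

	/* producer: the SAVE completion callback queues the finished chunk */
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	wake_up_interruptible(&migf->poll_wait);

	/* consumer: read() serves the chunk covering the current file position */
	buf = list_first_entry(&migf->buf_list,
			       struct mlx5_vhca_data_buffer, buf_elm);
	if (*pos >= buf->start_pos && *pos < buf->start_pos + buf->length)
		/* copy this chunk's pages out to userspace */;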
In case the queue is empty but the state of the migration file is MLX5_MIGF_STATE_COMPLETE, read() will not block but will return 0 to indicate end of file. This is a step towards maintaining multiple images and their metadata (i.e. headers) on the migration file as part of the next patches in the series. Note: at this point we still use a single chunk on the migration file, but it becomes ready to support multiple chunks. Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 24 +++++-- drivers/vfio/pci/mlx5/cmd.h | 5 ++ drivers/vfio/pci/mlx5/main.c | 145 +++++++++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 38 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fcba12326185..0e36b4c8c816 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,6 +351,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { + migf->buf = async_data->buf; migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -368,9 +369,15 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->buf->length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + unsigned long flags; + + async_data->buf->length = + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -407,6 +414,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); async_data = &migf->async_data; + async_data->buf = buf; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -479,14 +487,22 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { - lockdep_assert_held(&migf->mvdev->state_mutex); + struct mlx5_vhca_data_buffer *entry; + lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); if (migf->buf) { mlx5vf_free_data_buffer(migf->buf); migf->buf = NULL; } + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + mlx5vf_cmd_dealloc_pd(migf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 14403e654e4e..6e594689566e 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -14,6 +14,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_COMPLETE, }; struct mlx5_vhca_data_buffer { @@ -24,6 +25,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ struct scatterlist *last_offset_sg; @@ -34,6 +36,7 @@ struct mlx5_vhca_data_buffer { struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; int status; void
*out; }; @@ -45,6 +48,8 @@ struct mlx5_vf_migration_file { u32 pdn; struct mlx5_vhca_data_buffer *buf; + spinlock_t list_lock; + struct list_head buf_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95646c2f010..ca16425811c4 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -124,11 +124,90 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream-based FD, we expect the data to always be on the + * first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -137,53 +216,47 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(vhca_buf->length) || - migf->state == MLX5_MIGF_STATE_ERROR)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > vhca_buf->length) { - done = -EINVAL; - goto out_unlock; - } if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, vhca_buf->length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8
*from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -202,7 +275,8 @@ static __poll_t mlx5vf_save_poll(struct file *filp, mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->buf->length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); @@ -253,6 +327,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); if (ret) goto out_pd; @@ -266,7 +342,6 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); if (ret) goto out_save; - migf->buf = buf; return migf; out_save: mlx5vf_free_data_buffer(buf); @@ -386,6 +461,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf = buf; stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + spin_lock_init(&migf->list_lock); return migf; out_pd: mlx5vf_cmd_dealloc_pd(migf); -- cgit v1.2.3 From 3319d287f4c04b9deece8ea00e27a70bbe32941b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:32 +0200 Subject: vfio/mlx5: Introduce device transitions of PRE_COPY In order to support PRE_COPY, the mlx5 driver transfers multiple states (images) of the device, e.g. the source VF can save and transfer multiple states, and the target VF will load them in that order. The device is saving three kinds of states: 1) Initial state - when the device moves to PRE_COPY state. 2) Middle state - during PRE_COPY phase via VFIO_MIG_GET_PRECOPY_INFO. There can be multiple states of this type. 3) Final state - when the device moves to STOP_COPY state. After moving to PRE_COPY state, the user holds the saving migf FD and can use it. For example, the user can start transferring data via the read() callback. Also, the user can switch from PRE_COPY to STOP_COPY whenever they see fit. This will invoke saving of the final state. This means that the mlx5 VFIO device can be switched to STOP_COPY without transferring any data in PRE_COPY state. Therefore, when the device moves to STOP_COPY, mlx5 will store the final state on a dedicated queue entry on the list.
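(Schematically, the three kinds of states map onto SAVE commands roughly as below; the inc/track flags follow the prototypes in this patch, but the exact per-arc pairing is an illustrative assumption.)

	/* 1) RUNNING -> PRE_COPY: initial state, tracking enabled */
	mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false /* inc */, true /* track */);

	/* 2) middle states while in PRE_COPY, sized by the incremental query */
	mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, MLX5VF_QUERY_INC);
	mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);

	/* 3) PRE_COPY(_P2P) -> STOP_COPY: final state; !track marks the last chunk */
	mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);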
Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 96 ++++++++++++++++++++++++++++++++++++++++---- drivers/vfio/pci/mlx5/cmd.h | 16 +++++++- drivers/vfio/pci/mlx5/main.c | 90 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 184 insertions(+), 18 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0e36b4c8c816..5fcece201d4c 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,7 +63,7 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -59,6 +77,8 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -342,6 +362,56 @@ end: return ERR_PTR(ret); } +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if 
(buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. + */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -351,7 +421,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { - migf->buf = async_data->buf; + mlx5vf_put_data_buffer(async_data->buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -369,15 +439,19 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t image_size; unsigned long flags; - async_data->buf->length = - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size); + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->state = MLX5_MIGF_STATE_COMPLETE; + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); } @@ -391,7 +465,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf) + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; @@ -412,9 +487,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -497,6 +575,8 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + list_splice(&migf->avail_list, &migf->buf_list); + while ((entry = list_first_entry_or_null(&migf->buf_list, struct mlx5_vhca_data_buffer, buf_elm))) { list_del(&entry->buf_elm); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6e594689566e..34e61c7aa23d 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -38,6 +38,7 @@ struct mlx5vf_async_data { struct work_struct work; struct mlx5_vhca_data_buffer *buf; int status; + u8 last_chunk:1; void *out; }; @@ -47,9 +48,11 @@ struct mlx5_vf_migration_file { enum mlx5_vf_migf_state state; u32 pdn; + loff_t max_pos; struct 
mlx5_vhca_data_buffer *buf; spinlock_t list_lock; struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; struct completion save_comp; @@ -129,10 +132,14 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -140,7 +147,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, - struct mlx5_vhca_data_buffer *buf); + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf); @@ -151,6 +159,10 @@ struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index ca16425811c4..9cabba456044 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -195,6 +195,7 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, if (*pos >= vhca_buf->start_pos + vhca_buf->length) { spin_lock_irq(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); spin_unlock_irq(&vhca_buf->migf->list_lock); } @@ -283,6 +284,16 @@ static __poll_t mlx5vf_save_poll(struct file *filp, return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. 
+ */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, @@ -291,8 +302,42 @@ static const struct file_operations mlx5vf_save_fops = { .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; @@ -328,8 +373,9 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); if (ret) goto out_pd; @@ -339,7 +385,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); if (ret) goto out_save; return migf; @@ -462,6 +508,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); return migf; out_pd: @@ -514,7 +561,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) @@ -522,7 +570,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) @@ -533,7 +582,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct mlx5_vf_migration_file *migf; - migf = mlx5vf_pci_save_device_data(mvdev); + migf = mlx5vf_pci_save_device_data(mvdev, false); if 
(IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -541,7 +590,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { mlx5vf_disable_fds(mvdev); return NULL; } @@ -567,6 +619,28 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct mlx5_vf_migration_file *migf; + + migf = mlx5vf_pci_save_device_data(mvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + mvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = mlx5vf_cmd_suspend_vhca(mvdev, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); + if (ret) + return ERR_PTR(ret); + ret = mlx5vf_pci_save_device_inc_data(mvdev); + return ret ? ERR_PTR(ret) : NULL; + } + /* * vfio_mig_get_next_state() does not use arcs other than the above */ @@ -635,7 +709,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, mutex_lock(&mvdev->state_mutex); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size); + &state_size, 0); if (!ret) *stop_copy_length = state_size; mlx5vf_state_mutex_unlock(mvdev); -- cgit v1.2.3 From 0c9a38fee8b210a8dfd3f177526daac567ec9265 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:33 +0200 Subject: vfio/mlx5: Introduce SW headers for migration states As mentioned in the previous patches, mlx5 is transferring multiple states when the PRE_COPY protocol is used. This states mechanism requires the target VM to know the states' size in order to execute multiple loads. Therefore, add SW header, with the needed information, for each saved state the source VM is transferring to the target VM. This patch implements the source VM handling of the headers, following patch will implement the target VM handling of the headers. 
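To make the framing concrete, below is a rough userspace-side sketch of
parsing one such header. The field layout mirrors struct
mlx5_vf_migration_header from the patch; the struct name, helper and its
error convention are illustrative assumptions, not a kernel API:

#include <stdint.h>
#include <string.h>
#include <endian.h>

/* Mirrors struct mlx5_vf_migration_header: two little-endian 64-bit
 * fields placed in front of every saved image on the stream. */
struct mig_stream_header {
	uint64_t image_size;	/* __le64 on the wire */
	uint64_t flags;		/* must be zero with this protocol */
};

static int parse_mig_header(const void *raw, uint64_t *image_size)
{
	struct mig_stream_header hdr;

	memcpy(&hdr, raw, sizeof(hdr));
	if (le64toh(hdr.flags))
		return -1;	/* unknown protocol extension */
	*image_size = le64toh(hdr.image_size);
	return 0;
}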
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 56 +++++++++++++++++++++++++++++++++++++++++--- drivers/vfio/pci/mlx5/cmd.h | 13 ++++++++++ drivers/vfio/pci/mlx5/main.c | 2 +- 3 files changed, 67 insertions(+), 4 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5fcece201d4c..160fa38fc78d 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -351,9 +351,11 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, if (ret) goto end; - ret = mlx5vf_dma_data_buffer(buf); - if (ret) - goto end; + if (dma_dir != DMA_NONE) { + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } } return buf; @@ -422,6 +424,8 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mutex_lock(&migf->lock); if (async_data->status) { mlx5vf_put_data_buffer(async_data->buf); + if (async_data->header_buf) + mlx5vf_put_data_buffer(async_data->header_buf); migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } @@ -431,6 +435,32 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) fput(migf->filp); } +static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, + size_t image_size) +{ + struct mlx5_vf_migration_file *migf = header_buf->migf; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + + header.image_size = cpu_to_le64(image_size); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->header_image_size = image_size; + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + return 0; +} + static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, @@ -444,6 +474,11 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->header_buf) { + status = add_buf_header(async_data->header_buf, image_size); + if (status) + goto err; + } async_data->buf->length = image_size; async_data->buf->start_pos = migf->max_pos; migf->max_pos += async_data->buf->length; @@ -455,6 +490,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) wake_up_interruptible(&migf->poll_wait); } +err: /* * The error and the cleanup flows can't run from an * interrupt context @@ -470,6 +506,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; int err; @@ -499,6 +536,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, goto err_out; } + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } + } + + async_data->header_buf = 
header_buf;
+
 	get_file(migf->filp);
 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
 			       async_data->out,
@@ -510,7 +557,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	return 0;

 err_exec:
+	if (header_buf)
+		mlx5vf_put_data_buffer(header_buf);
 	fput(migf->filp);
+err_free:
 	kvfree(async_data->out);
 err_out:
 	complete(&migf->save_comp);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 34e61c7aa23d..3e36ccca820a 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -12,16 +12,26 @@
 #include
 #include

+#define MLX5VF_PRE_COPY_SUPP(mvdev) \
+	((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
+
 enum mlx5_vf_migf_state {
 	MLX5_MIGF_STATE_ERROR = 1,
 	MLX5_MIGF_STATE_COMPLETE,
 };

+struct mlx5_vf_migration_header {
+	__le64 image_size;
+	/* For future use in case we may need to change the kernel protocol */
+	__le64 flags;
+};
+
 struct mlx5_vhca_data_buffer {
 	struct sg_append_table table;
 	loff_t start_pos;
 	u64 length;
 	u64 allocated_length;
+	u64 header_image_size;
 	u32 mkey;
 	enum dma_data_direction dma_dir;
 	u8 dmaed:1;
@@ -37,6 +47,7 @@ struct mlx5vf_async_data {
 	struct mlx5_async_work cb_work;
 	struct work_struct work;
 	struct mlx5_vhca_data_buffer *buf;
+	struct mlx5_vhca_data_buffer *header_buf;
 	int status;
 	u8 last_chunk:1;
 	void *out;
@@ -165,6 +176,8 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
 			       unsigned int npages);
+struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+				       unsigned long offset);
 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 9cabba456044..9a36e36ec33b 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -32,7 +32,7 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
 			    core_device);
 }

-static struct page *
+struct page *
 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
 			  unsigned long offset)
 {
-- cgit v1.2.3


From 0dce165b1adf8d7f67030bb257e00107db8022de Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Tue, 6 Dec 2022 10:34:34 +0200
Subject: vfio/mlx5: Introduce vfio precopy ioctl implementation

The vfio precopy ioctl returns an estimate of the data available for
transfer from the device.

Whenever a user issues VFIO_MIG_GET_PRECOPY_INFO, track the current state
of the device and, if needed, append the dirty data to the transfer FD
data. This is done by saving a middle state.

As mlx5 runs the SAVE command asynchronously, make sure to query for
incremental data only once there is no active save command. Running both
in parallel might end up with a failure in the incremental query command
on an un-tracked vhca.

Also, a middle state will be saved only after the previous state has
finished its SAVE command and has been fully transferred; this prevents
endless use of resources.
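From userspace, the estimate is queried on the saving migf FD. A minimal
sketch, assuming the VFIO_MIG_GET_PRECOPY_INFO definitions from
<linux/vfio.h>; the helper and its error convention are illustrative:

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Returns 0 and fills @initial/@dirty with the current estimates, or
 * -1 on error (e.g. the device is not in PRE_COPY/PRE_COPY_P2P). */
static int query_precopy(int migf_fd, unsigned long long *initial,
			 unsigned long long *dirty)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };

	if (ioctl(migf_fd, VFIO_MIG_GET_PRECOPY_INFO, &info) < 0)
		return -1;
	*initial = info.initial_bytes;	/* must drain before STOP_COPY */
	*dirty = info.dirty_bytes;	/* estimate only; may keep growing */
	return 0;
}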
Co-developed-by: Shay Drory Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 16 +++++++ drivers/vfio/pci/mlx5/main.c | 111 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 160fa38fc78d..12e74ecebe64 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -67,12 +67,25 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; + bool inc = query_flags & MLX5VF_QUERY_INC; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the + * incremental query command on un-tracked vhca. + */ + if (inc) { + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); + if (ret) + return ret; + } + MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); @@ -82,6 +95,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); + if (inc) + complete(&mvdev->saving_migf->save_comp); + if (ret) return ret; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9a36e36ec33b..2c8ac763057c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -294,10 +294,121 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + size_t inc_length = 0; + bool end_of_data; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * We can't issue a SAVE command when the device is suspended, so as + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra + * bytes that can't be read. + */ + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { + /* + * Once the query returns it's guaranteed that there is no + * active SAVE command. + * As so, the other code below is safe with the proper locks. 
+ */ + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, + MLX5VF_QUERY_INC); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + if (buf->start_pos == 0) { + info.initial_bytes = buf->header_image_size - *pos; + } else if (buf->start_pos == + sizeof(struct mlx5_vf_migration_header)) { + /* First data buffer following the header */ + info.initial_bytes = buf->start_pos + + buf->length - *pos; + } else { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + + info.dirty_bytes = inc_length; + } + + if (!end_of_data || !inc_length) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state, save a new state to be ready for. + */ + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + +done: + mlx5vf_state_mutex_unlock(mvdev); + return copy_to_user((void __user *)arg, &info, minsz); +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, .poll = mlx5vf_save_poll, + .unlocked_ioctl = mlx5vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, .llseek = no_llseek, }; -- cgit v1.2.3 From 81156c27271c4a6c594e492c8d119fbacfc99f36 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 10:34:35 +0200 Subject: vfio/mlx5: Consider temporary end of stream as part of PRE_COPY During PRE_COPY the migration data FD may have a temporary "end of stream" that is reached when the initial_bytes were read and no other dirty data exists yet. For instance, this may indicate that the device is idle and not currently dirtying any internal state. When read() is done on this temporary end of stream the kernel driver should return ENOMSG from read(). Userspace can wait for more data or consider moving to STOP_COPY. To not block the user upon read() and let it get ENOMSG we add a new state named MLX5_MIGF_STATE_PRE_COPY on the migration file. In addition, we add the MLX5_MIGF_STATE_SAVE_LAST state to block the read() once we call the last SAVE upon moving to STOP_COPY. Any further error will be marked with MLX5_MIGF_STATE_ERROR and the user won't be blocked. 
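A source-side read loop that tolerates this temporary end of stream could
look like the sketch below; the enum and helper are illustrative, only the
ENOMSG convention comes from the patch:

#include <errno.h>
#include <unistd.h>

enum precopy_rd { PRECOPY_DATA, PRECOPY_IDLE, PRECOPY_EOF, PRECOPY_ERR };

static enum precopy_rd read_precopy(int migf_fd, void *buf, size_t len,
				    ssize_t *out)
{
	*out = read(migf_fd, buf, len);
	if (*out > 0)
		return PRECOPY_DATA;	/* forward these bytes */
	if (*out == 0)
		return PRECOPY_EOF;	/* final state fully consumed */
	if (errno == ENOMSG)
		return PRECOPY_IDLE;	/* temporary end: poll again or
					 * move to STOP_COPY */
	return PRECOPY_ERR;		/* EAGAIN (O_NONBLOCK) or error */
}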
Reviewed-by: Jason Gunthorpe
Signed-off-by: Yishai Hadas
Link: https://lore.kernel.org/r/20221206083438.37807-12-yishaih@nvidia.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c  | 7 +++++--
 drivers/vfio/pci/mlx5/cmd.h  | 2 ++
 drivers/vfio/pci/mlx5/main.c | 7 +++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'drivers/vfio')

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 12e74ecebe64..f6293da033cc 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -501,8 +501,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
 		spin_lock_irqsave(&migf->list_lock, flags);
 		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
 		spin_unlock_irqrestore(&migf->list_lock, flags);
-		if (async_data->last_chunk)
-			migf->state = MLX5_MIGF_STATE_COMPLETE;
+		migf->state = async_data->last_chunk ?
+			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
 		wake_up_interruptible(&migf->poll_wait);
 	}

@@ -561,6 +561,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 		}
 	}

+	if (async_data->last_chunk)
+		migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
 	async_data->header_buf = header_buf;
 	get_file(migf->filp);
 	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 3e36ccca820a..d048f23977dd 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -17,6 +17,8 @@
 enum mlx5_vf_migf_state {
 	MLX5_MIGF_STATE_ERROR = 1,
+	MLX5_MIGF_STATE_PRE_COPY,
+	MLX5_MIGF_STATE_SAVE_LAST,
 	MLX5_MIGF_STATE_COMPLETE,
 };

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 2c8ac763057c..44b1543c751c 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (wait_event_interruptible(migf->poll_wait,
 			     !list_empty(&migf->buf_list) ||
 			     migf->state == MLX5_MIGF_STATE_ERROR ||
+			     migf->state == MLX5_MIGF_STATE_PRE_COPY ||
 			     migf->state == MLX5_MIGF_STATE_COMPLETE))
 			return -ERESTARTSYS;
 	}
@@ -236,6 +237,12 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 						   &end_of_data);
 		if (first_loop_call) {
 			first_loop_call = false;
+			/* Temporary end of file as part of PRE_COPY */
+			if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) {
+				done = -ENOMSG;
+				goto out_unlock;
+			}
+
 			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
 				if (filp->f_flags & O_NONBLOCK) {
 					done = -EAGAIN;
 					goto out_unlock;
 				}
 			}
-- cgit v1.2.3


From 34e2f27143d1b373f088e805f7e11cdf778f791d Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Tue, 6 Dec 2022 10:34:36 +0200
Subject: vfio/mlx5: Introduce multiple loads

In order to support PRE_COPY, the mlx5 driver transfers multiple states
(images) of the device. e.g.: the source VF can save and transfer multiple
states, and the target VF will load them in that order.

This patch implements the changes for the target VF to decompose the
header for each state and to write and load multiple states.
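On the target, the VMM does not need to understand the framing at all: it
can write the saved stream verbatim into the resuming FD and let the
driver's load state machine (READ_HEADER -> PREP_IMAGE -> READ_IMAGE ->
LOAD_IMAGE) do the decomposition. A hedged sketch, handling only short
writes:

#include <unistd.h>

/* Write one saved chunk (headers included) into the resuming FD. */
static int forward_state(int resuming_fd, const char *buf, size_t len)
{
	while (len) {
		ssize_t n = write(resuming_fd, buf, len);

		if (n < 0)
			return -1;	/* e.g. device rejected the image */
		buf += n;
		len -= n;
	}
	return 0;
}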
Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-13-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 13 +- drivers/vfio/pci/mlx5/cmd.h | 10 ++ drivers/vfio/pci/mlx5/main.c | 279 ++++++++++++++++++++++++++++++++++++------- 3 files changed, 257 insertions(+), 45 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index f6293da033cc..993749818d90 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -598,9 +598,11 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (mvdev->mdev_detach) return -ENOTCONN; - err = mlx5vf_dma_data_buffer(buf); - if (err) - return err; + if (!buf->dmaed) { + err = mlx5vf_dma_data_buffer(buf); + if (err) + return err; + } MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); @@ -644,6 +646,11 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) migf->buf = NULL; } + if (migf->buf_header) { + mlx5vf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + list_splice(&migf->avail_list, &migf->buf_list); while ((entry = list_first_entry_or_null(&migf->buf_list, diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index d048f23977dd..7729eac8c78c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -22,6 +22,14 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_COMPLETE, }; +enum mlx5_vf_load_state { + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, + MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_IMAGE, + MLX5_VF_LOAD_STATE_READ_IMAGE, + MLX5_VF_LOAD_STATE_LOAD_IMAGE, +}; + struct mlx5_vf_migration_header { __le64 image_size; /* For future use in case we may need to change the kernel protocol */ @@ -60,9 +68,11 @@ struct mlx5_vf_migration_file { struct mutex lock; enum mlx5_vf_migf_state state; + enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *buf_header; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 44b1543c751c..5a669b73994a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -518,13 +518,162 @@ end: return ERR_PTR(ret); } +static int +mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static int +mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, + loff_t requested_length, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + int ret; + + if (requested_length > MAX_MIGRATION_SIZE) + return -ENOMEM; + + if (vhca_buf->allocated_length < requested_length) { + ret = mlx5vf_add_migration_pages( + vhca_buf, 
+ DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + return ret; + } + + while (*len) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, + done); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t +mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + size_t image_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == image_size) { + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; + migf->max_pos += image_size; + *has_work = true; + } + + return 0; +} + +static int +mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page *page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { + u64 flags; + + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + ret = -ENOMEM; + goto end; + } + + flags = le64_to_cpup((__le64 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, flags))); + if (flags) { + ret = -EOPNOTSUPP; + goto end; + } + + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + migf->max_pos += vhca_buf->length; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; loff_t requested_length; + bool has_work = false; ssize_t done = 0; + int ret = 0; if (pos) return -ESPIPE; @@ -534,56 +683,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - if (requested_length > MAX_MIGRATION_SIZE) - return -ENOMEM; - + mutex_lock(&migf->mvdev->state_mutex); mutex_lock(&migf->lock); if (migf->state == MLX5_MIGF_STATE_ERROR) { - done = -ENODEV; + ret = -ENODEV; goto out_unlock; } - if (vhca_buf->allocated_length < requested_length) { - done = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); - if (done) - goto out_unlock; - } + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case MLX5_VF_LOAD_STATE_READ_HEADER: + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, + &buf, &len, pos, + &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_PREP_IMAGE: + { + u64 size = vhca_buf_header->header_image_size; + + if 
(vhca_buf->allocated_length < size) { + mlx5vf_free_data_buffer(vhca_buf); + + migf->buf = mlx5vf_alloc_data_buffer(migf, + size, DMA_TO_DEVICE); + if (IS_ERR(migf->buf)) { + ret = PTR_ERR(migf->buf); + migf->buf = NULL; + goto out_unlock; + } - while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *to_buff; - int ret; + vhca_buf = migf->buf; + } - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; - goto out_unlock; + vhca_buf->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; + break; } + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: + ret = mlx5vf_resume_read_image_no_header(vhca_buf, + requested_length, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_READ_IMAGE: + ret = mlx5vf_resume_read_image(migf, vhca_buf, + vhca_buf_header->header_image_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); + if (ret) + goto out_unlock; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - to_buff = kmap_local_page(page); - ret = copy_from_user(to_buff + page_offset, buf, page_len); - kunmap_local(to_buff); - if (ret) { - done = -EFAULT; - goto out_unlock; + /* prep header buf for next image */ + vhca_buf_header->length = 0; + vhca_buf_header->header_image_size = 0; + /* prep data buf for next image */ + vhca_buf->length = 0; + + break; + default: + break; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; - vhca_buf->length += page_len; } + out_unlock: + if (ret) + migf->state = MLX5_MIGF_STATE_ERROR; mutex_unlock(&migf->lock); - return done; + mlx5vf_state_mutex_unlock(migf->mvdev); + return ret ? 
ret : done;
 }

 static const struct file_operations mlx5vf_resume_fops = {
@@ -623,12 +799,29 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
 	}

 	migf->buf = buf;
+	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+		buf = mlx5vf_alloc_data_buffer(migf,
+			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+		if (IS_ERR(buf)) {
+			ret = PTR_ERR(buf);
+			goto out_buf;
+		}
+
+		migf->buf_header = buf;
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+	} else {
+		/* Initial state will be to read the image */
+		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+	}
+
 	stream_open(migf->filp->f_inode, migf->filp);
 	mutex_init(&migf->lock);
 	INIT_LIST_HEAD(&migf->buf_list);
 	INIT_LIST_HEAD(&migf->avail_list);
 	spin_lock_init(&migf->list_lock);
 	return migf;
+out_buf:
+	mlx5vf_free_data_buffer(buf);
 out_pd:
 	mlx5vf_cmd_dealloc_pd(migf);
 out_free:
@@ -728,11 +921,13 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
 	}

 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
-		ret = mlx5vf_cmd_load_vhca_state(mvdev,
-						 mvdev->resuming_migf,
-						 mvdev->resuming_migf->buf);
-		if (ret)
-			return ERR_PTR(ret);
+		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+			ret = mlx5vf_cmd_load_vhca_state(mvdev,
+							 mvdev->resuming_migf,
+							 mvdev->resuming_migf->buf);
+			if (ret)
+				return ERR_PTR(ret);
+		}
 		mlx5vf_disable_fds(mvdev);
 		return NULL;
 	}
-- cgit v1.2.3


From d6e18a4bec431c181a60d32876c6c89955b2a4f8 Mon Sep 17 00:00:00 2001
From: Shay Drory
Date: Tue, 6 Dec 2022 10:34:37 +0200
Subject: vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error

Before a SAVE command is issued, a QUERY command is issued in order to
know the device data size.
In case PRE_COPY is used, the above commands are issued while the device
is running. Thus, it is possible that the device state changes
significantly between the QUERY and the SAVE commands, causing the SAVE
to fail.
Currently, if a SAVE command fails, the driver fails the migration. In
the above case, don't fail the migration, but don't allow new SAVEs to
be executed while the device is in a RUNNING state.
Once the device is moved to STOP_COPY, SAVE can be executed again and
the full device state will be read.
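A VMM-side policy that meshes with this fallback might look like the
sketch below: keep iterating only while the device reports pending precopy
data, otherwise move to STOP_COPY, where the full final image is still
guaranteed. query_precopy() and set_mig_state() are the illustrative
helpers sketched earlier in this series; the idle-round threshold is an
arbitrary assumption:

#define MAX_IDLE_ROUNDS 10	/* arbitrary example policy */

/* Returns 1 once STOP_COPY was entered, 0 to stay in PRE_COPY,
 * -1 on error. */
static int maybe_finish_precopy(int device_fd, int migf_fd, int idle_rounds)
{
	unsigned long long initial, dirty;

	if (query_precopy(migf_fd, &initial, &dirty) < 0)
		return -1;
	if ((initial || dirty) && idle_rounds < MAX_IDLE_ROUNDS)
		return 0;			/* keep iterating */
	return set_mig_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY) < 0 ?
	       -1 : 1;
}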
Signed-off-by: Shay Drory Reviewed-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20221206083438.37807-14-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 27 ++++++++++++++++++++++++++- drivers/vfio/pci/mlx5/cmd.h | 2 ++ drivers/vfio/pci/mlx5/main.c | 6 ++++-- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 993749818d90..01ef695e9441 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -84,6 +84,19 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); if (ret) return ret; + if (mvdev->saving_migf->state == + MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* + * In case we had a PRE_COPY error, only query full + * image for final image + */ + if (!(query_flags & MLX5VF_QUERY_FINAL)) { + *state_size = 0; + complete(&mvdev->saving_migf->save_comp); + return 0; + } + query_flags &= ~MLX5VF_QUERY_INC; + } } MLX5_SET(query_vhca_migration_state_in, in, opcode, @@ -442,7 +455,10 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - migf->state = MLX5_MIGF_STATE_ERROR; + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; + else + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); @@ -511,6 +527,8 @@ err: * The error and the cleanup flows can't run from an * interrupt context */ + if (status == -EREMOTEIO) + status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } @@ -534,6 +552,13 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) + /* + * In case we had a PRE_COPY error, SAVE is triggered only for + * the final image, read device full image. 
+	 */
+		inc = false;
+
 	MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE);
 	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 7729eac8c78c..5483171d57ad 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -17,6 +17,7 @@
 enum mlx5_vf_migf_state {
 	MLX5_MIGF_STATE_ERROR = 1,
+	MLX5_MIGF_STATE_PRE_COPY_ERROR,
 	MLX5_MIGF_STATE_PRE_COPY,
 	MLX5_MIGF_STATE_SAVE_LAST,
 	MLX5_MIGF_STATE_COMPLETE,
@@ -157,6 +158,7 @@ struct mlx5vf_pci_core_device {

 enum {
 	MLX5VF_QUERY_INC = (1UL << 0),
+	MLX5VF_QUERY_FINAL = (1UL << 1),
 };

diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 5a669b73994a..cd90eb86128c 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -219,6 +219,7 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (wait_event_interruptible(migf->poll_wait,
 			     !list_empty(&migf->buf_list) ||
 			     migf->state == MLX5_MIGF_STATE_ERROR ||
+			     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
 			     migf->state == MLX5_MIGF_STATE_PRE_COPY ||
 			     migf->state == MLX5_MIGF_STATE_COMPLETE))
 			return -ERESTARTSYS;
@@ -238,7 +239,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
 		if (first_loop_call) {
 			first_loop_call = false;
 			/* Temporary end of file as part of PRE_COPY */
-			if (end_of_data && migf->state == MLX5_MIGF_STATE_PRE_COPY) {
+			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
 				done = -ENOMSG;
 				goto out_unlock;
 			}
@@ -431,7 +433,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
 		return -ENODEV;

 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
-						    MLX5VF_QUERY_INC);
+				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
 	if (ret)
 		goto err;

-- cgit v1.2.3


From ccc2a52e464d1c983efede1a6d44728c151cb2ed Mon Sep 17 00:00:00 2001
From: Shay Drory
Date: Tue, 6 Dec 2022 10:34:38 +0200
Subject: vfio/mlx5: Enable MIGRATION_PRE_COPY flag

Now that everything has been set up for MIGRATION_PRE_COPY, enable it.

Signed-off-by: Shay Drory
Reviewed-by: Jason Gunthorpe
Signed-off-by: Yishai Hadas
Link: https://lore.kernel.org/r/20221206083438.37807-15-yishaih@nvidia.com
Signed-off-by: Alex Williamson
---
 drivers/vfio/pci/mlx5/cmd.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/vfio')

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 01ef695e9441..64e68d13cb98 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -222,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
 	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
 		mvdev->core_device.vdev.log_ops = log_ops;

+	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+		mvdev->core_device.vdev.migration_flags |=
+			VFIO_MIGRATION_PRE_COPY;
+
 end:
 	mlx5_vf_put_core_dev(mvdev->mdev);
 }
-- cgit v1.2.3


From 64ffbbb1e948876dd9044c32a3ab8a790662b9bb Mon Sep 17 00:00:00 2001
From: Shameer Kolothum
Date: Wed, 23 Nov 2022 11:32:33 +0000
Subject: hisi_acc_vfio_pci: Add support for precopy IOCTL

The PRECOPY IOCTL in the case of the HiSilicon ACC driver can be used to
perform the device compatibility check earlier during migration.
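As the diff below shows, in PRE_COPY this driver exposes only the device
match data: dirty_bytes stays 0 and initial_bytes counts down as the data
is read. A hedged userspace sketch of pulling that match data early
(definitions from <linux/vfio.h>; helper name and error handling are
illustrative):

#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

static ssize_t read_match_data(int migf_fd, void *buf, size_t buflen)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };

	if (ioctl(migf_fd, VFIO_MIG_GET_PRECOPY_INFO, &info) < 0)
		return -1;
	if (info.initial_bytes > buflen)
		return -1;
	/* ship these bytes to the target before stop-and-copy */
	return read(migf_fd, buf, info.initial_bytes);
}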
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 52 ++++++++++++++++++++++++++ drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 53 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0c0c0c7f0521..f3b74a06edb6 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -764,9 +764,58 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; return migf; } +static long hisi_acc_vf_precopy_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct hisi_acc_vf_migration_file *migf = filp->private_data; + struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info; + unsigned long minsz; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&hisi_acc_vdev->state_mutex); + if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { + mutex_unlock(&hisi_acc_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + if (*pos > migf->total_length) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->total_length - *pos; + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +out: + mutex_unlock(&migf->lock); + mutex_unlock(&hisi_acc_vdev->state_mutex); + return ret; +} + static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { @@ -807,6 +856,8 @@ out_unlock: static const struct file_operations hisi_acc_vf_save_fops = { .owner = THIS_MODULE, .read = hisi_acc_vf_save_read, + .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = hisi_acc_vf_release_file, .llseek = no_llseek, }; @@ -832,6 +883,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + migf->hisi_acc_vdev = hisi_acc_vdev; ret = vf_qm_state_save(hisi_acc_vdev, migf); if (ret) { diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 67343325b320..11d51345f5b5 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -91,6 +91,7 @@ struct hisi_acc_vf_migration_file { struct mutex lock; bool disabled; + struct hisi_acc_vf_core_device *hisi_acc_vdev; struct acc_vf_data vf_data; size_t total_length; }; -- cgit v1.2.3 From d9a871e4a143047d1d84a606772af319f11516f9 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:34 +0000 Subject: hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions The saving_migf is open in PRE_COPY state if it is supported and reads initial device match data. hisi_acc_vf_stop_copy() is refactored to make use of common code. 
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index f3b74a06edb6..c8658636a84c 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -863,7 +863,7 @@ static const struct file_operations hisi_acc_vf_save_fops = { }; static struct hisi_acc_vf_migration_file * -hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; int ret; @@ -885,7 +885,7 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) mutex_init(&migf->lock); migf->hisi_acc_vdev = hisi_acc_vdev; - ret = vf_qm_state_save(hisi_acc_vdev, migf); + ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data); if (ret) { fput(migf->filp); return ERR_PTR(ret); @@ -894,6 +894,44 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) return migf; } +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + + migf->total_length = QM_MATCH_SIZE; + return migf; +} + +static struct hisi_acc_vf_migration_file * +hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open) +{ + int ret; + struct hisi_acc_vf_migration_file *migf = NULL; + + if (open) { + /* + * Userspace didn't use PRECOPY support. Hence saving_migf + * is not opened yet. + */ + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); + if (IS_ERR(migf)) + return migf; + } else { + migf = hisi_acc_vdev->saving_migf; + } + + ret = vf_qm_state_save(hisi_acc_vdev, migf); + if (ret) + return ERR_PTR(ret); + + return open ? 
migf : NULL; +} + static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct device *dev = &hisi_acc_vdev->vf_dev->dev; @@ -921,6 +959,31 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 cur = hisi_acc_vdev->mig_state; int ret; + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) { + struct hisi_acc_vf_migration_file *migf; + + migf = hisi_acc_vf_pre_copy(hisi_acc_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + hisi_acc_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct hisi_acc_vf_migration_file *migf; + + ret = hisi_acc_vf_stop_device(hisi_acc_vdev); + if (ret) + return ERR_PTR(ret); + + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + return NULL; + } + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) { ret = hisi_acc_vf_stop_device(hisi_acc_vdev); if (ret) @@ -931,7 +994,7 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct hisi_acc_vf_migration_file *migf; - migf = hisi_acc_vf_stop_copy(hisi_acc_vdev); + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -963,6 +1026,11 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev, return NULL; } + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) { + hisi_acc_vf_disable_fds(hisi_acc_vdev); + return NULL; + } + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) { hisi_acc_vf_start_device(hisi_acc_vdev); return NULL; -- cgit v1.2.3 From 190125adcad4c5850fd74ecd697e20a446b74ed8 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:35 +0000 Subject: hisi_acc_vfio_pci: Move the dev compatibility tests for early check Instead of waiting till data transfer is complete to perform dev compatibility, do it as soon as we have enough data to perform the check. This will be useful when we enable the support for PRE_COPY. 
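With the check moved into the resume write path (see the diff below), an
incompatible device pair surfaces as a failed write() on the resuming FD
as soon as the first QM_MATCH_SIZE bytes have arrived, rather than at
final load time. A hedged target-side sketch; treating EFAULT as the
compatibility failure follows the -EFAULT this patch returns, but EFAULT
can also mean a bad user buffer:

#include <errno.h>
#include <unistd.h>

static int target_write_chunk(int resuming_fd, const void *buf, size_t len)
{
	ssize_t n = write(resuming_fd, buf, len);

	if (n < 0 && errno == EFAULT)
		return -1;	/* devices incompatible: abort migration */
	return n < 0 ? -1 : 0;
}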
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 19 +++++++------------ drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 + 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index c8658636a84c..9a51f41e1d2a 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -360,8 +360,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, u32 que_iso_state; int ret; - if (migf->total_length < QM_MATCH_SIZE) - return -EINVAL; + if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) + return 0; if (vf_data->acc_magic != ACC_DEV_MAGIC) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); @@ -406,6 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + hisi_acc_vdev->match_done = true; return 0; } @@ -493,10 +494,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct device *dev = &vf_qm->pdev->dev; int ret; - ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data); - if (ret) - return ret; - if (unlikely(qm_wait_dev_not_ready(vf_qm))) { /* Update state and return with match data */ vf_data->vf_qm_state = QM_NOT_READY; @@ -673,12 +670,6 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; int ret; - /* Check dev compatibility */ - ret = vf_qm_check_match(hisi_acc_vdev, migf); - if (ret) { - dev_err(dev, "failed to match the VF!\n"); - return ret; - } /* Recover data to VF */ ret = vf_qm_load_data(hisi_acc_vdev, migf); if (ret) { @@ -732,6 +723,10 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu *pos += len; done = len; migf->total_length += len; + + ret = vf_qm_check_match(migf->hisi_acc_vdev, migf); + if (ret) + done = -EFAULT; out_unlock: mutex_unlock(&migf->lock); return done; diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 11d51345f5b5..dcabfeec6ca1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -98,6 +98,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; + u8 match_done:1; u8 deferred_reset:1; /* For migration state */ struct mutex state_mutex; -- cgit v1.2.3 From f2240b4441cc5bd87820371ee79ad7c9125e76b6 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 23 Nov 2022 11:32:36 +0000 Subject: hisi_acc_vfio_pci: Enable PRE_COPY flag Now that we have everything to support the PRE_COPY state, enable it. 
Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20221123113236.896-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 9a51f41e1d2a..51941bb4f31f 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1351,7 +1351,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->vf_dev = pdev; mutex_init(&hisi_acc_vdev->state_mutex); - core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; return vfio_pci_core_init_dev(core_vdev); -- cgit v1.2.3 From fe3dd71db2b81c202bc80532bbe0e07238a45ed9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:01:26 +0300 Subject: vfio/mlx5: fix error code in mlx5vf_precopy_ioctl() The copy_to_user() function returns the number of bytes remaining to be copied but we want to return a negative error code here. Fixes: 0dce165b1adf ("vfio/mlx5: Introduce vfio precopy ioctl implementation") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKVknlf5Z5NPtU@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index cd90eb86128c..94f7a0fd10e8 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -404,7 +404,10 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, done: mlx5vf_state_mutex_unlock(mvdev); - return copy_to_user((void __user *)arg, &info, minsz); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + err_migf_unlock: mutex_unlock(&migf->lock); err_state_unlock: -- cgit v1.2.3 From 70be6f322860d322ebcd120cf0c05402ead5c6de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Dec 2022 19:02:17 +0300 Subject: vfio/mlx5: error pointer dereference in error handling This code frees the wrong "buf" variable and results in an error pointer dereference. Fixes: 34e2f27143d1 ("vfio/mlx5: Introduce multiple loads") Signed-off-by: Dan Carpenter Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/Y5IKia5SaiVxYmG5@kili Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 94f7a0fd10e8..031ac8cc215d 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -826,7 +826,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(buf); + mlx5vf_free_data_buffer(migf->buf); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: -- cgit v1.2.3
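The two fixes above share one pattern worth spelling out: copy_to_user()
returns the number of bytes it could NOT copy, never a negative errno, so
its result must not be returned from an ioctl directly. A kernel-style
sketch of the corrected convention (the helper name is illustrative):

#include <linux/uaccess.h>
#include <linux/vfio.h>

static long report_precopy_info(void __user *uarg,
				const struct vfio_precopy_info *info,
				unsigned long minsz)
{
	/* Collapse any partial copy into -EFAULT instead of leaking the
	 * positive remaining-byte count to userspace. */
	if (copy_to_user(uarg, info, minsz))
		return -EFAULT;
	return 0;
}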