summary refs log tree commit diff stats
path: root/drivers/vfio/pci/mlx5/cmd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vfio/pci/mlx5/cmd.c')
-rw-r--r-- drivers/vfio/pci/mlx5/cmd.c 413
1 files changed, 340 insertions, 73 deletions
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c604b70437a5..64e68d13cb98 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
/*
 * Issue the SUSPEND_VHCA command for mvdev->vhca_id with the given op_mod.
 *
 * The caller must hold mvdev->state_mutex. When a saving migration file
 * exists, the command is only executed after its save_comp completion has
 * been obtained, and save_comp is completed again afterwards.
 *
 * Return: -ENOTCONN if mdev_detach is set, the error from the interruptible
 * wait on save_comp, otherwise the result of mlx5_cmd_exec_inout().
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
+ int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel might end up with a failure in the save
+ * command once it tries to turn on 'tracking' on a suspended device.
+ */
+ if (migf) {
+ err = wait_for_completion_interruptible(&migf->save_comp);
+ if (err)
+ return err;
+ }
+
MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
- return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ if (migf) /* pairs with the wait on save_comp above */
+ complete(&migf->save_comp);
+
+ return err;
}
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
@@ -45,23 +63,54 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size)
+ size_t *state_size, u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+ bool inc = query_flags & MLX5VF_QUERY_INC;
int ret;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel might end up with a failure in the
+ * incremental query command on an un-tracked vhca.
+ */
+ if (inc) {
+ ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+ if (ret)
+ return ret;
+ if (mvdev->saving_migf->state ==
+ MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /*
+ * In case we had a PRE_COPY error, only query full
+ * image for final image
+ */
+ if (!(query_flags & MLX5VF_QUERY_FINAL)) {
+ *state_size = 0;
+ complete(&mvdev->saving_migf->save_comp);
+ return 0;
+ }
+ query_flags &= ~MLX5VF_QUERY_INC;
+ }
+ }
+
MLX5_SET(query_vhca_migration_state_in, in, opcode,
MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+ MLX5_SET(query_vhca_migration_state_in, in, incremental,
+ query_flags & MLX5VF_QUERY_INC);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
+ if (inc)
+ complete(&mvdev->saving_migf->save_comp);
+
if (ret)
return ret;
@@ -173,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
mvdev->core_device.vdev.log_ops = log_ops;
+ if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+ mvdev->core_device.vdev.migration_flags |=
+ VFIO_MIGRATION_PRE_COPY;
+
end:
mlx5_vf_put_core_dev(mvdev->mdev);
}
@@ -210,11 +264,11 @@ err_exec:
}
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf,
struct mlx5_vhca_recv_buf *recv_buf,
u32 *mkey)
{
- size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) :
+ size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
recv_buf->npages;
int err = 0, inlen;
__be64 *mtt;
@@ -232,10 +286,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
DIV_ROUND_UP(npages, 2));
mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
- if (migf) {
+ if (buf) {
struct sg_dma_page_iter dma_iter;
- for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
+ for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
} else {
int i;
@@ -255,35 +309,195 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, qpn, 0xffffff);
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
- MLX5_SET64(mkc, mkc, len,
- migf ? migf->total_length : (npages * PAGE_SIZE));
+ MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
kvfree(in);
return err;
}
+/*
+ * DMA-map buf's scatter-gather table in direction buf->dma_dir and create an
+ * mkey for it using the migration file's PD (buf->migf->pdn). On success the
+ * mkey is stored in buf->mkey and buf->dmaed is set to true.
+ *
+ * The caller must hold mvdev->state_mutex.
+ *
+ * Return: 0 on success, -ENOTCONN if mdev_detach is set, -EINVAL if buf is
+ * already marked dmaed or its allocated_length is zero, otherwise the error
+ * returned by dma_map_sgtable() or _create_mkey().
+ */ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
+ int ret;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ if (buf->dmaed || !buf->allocated_length)
+ return -EINVAL;
+
+ ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+ if (ret)
+ return ret;
+
+ ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+ if (ret)
+ goto err;
+
+ buf->dmaed = true;
+
+ return 0;
+err: /* mkey creation failed: undo the DMA mapping done above */
+ dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+ return ret;
+}
+
+/*
+ * Release buf completely. If the buffer was DMA-mapped (buf->dmaed), its mkey
+ * is destroyed and the scatter-gather table is unmapped first. Then every
+ * page referenced by the table is freed, the append table itself is freed and
+ * buf is kfree()'d, so buf must not be used after this call.
+ *
+ * The caller must hold mvdev->state_mutex; a warning is raised (WARN_ON) if
+ * mdev_detach is set when this is called.
+ */ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ struct mlx5_vf_migration_file *migf = buf->migf;
+ struct sg_page_iter sg_iter;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ WARN_ON(migf->mvdev->mdev_detach);
+
+ if (buf->dmaed) { /* tear down in reverse order of mlx5vf_dma_data_buffer() */
+ mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+ dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
+ buf->dma_dir, 0);
+ }
+
+ /* Undo alloc_pages_bulk_array() */
+ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&buf->table);
+ kfree(buf);
+}
+
+/*
+ * Allocate a zero-initialized data buffer bound to migf with direction
+ * dma_dir.
+ *
+ * When length is non-zero, DIV_ROUND_UP_ULL(length, PAGE_SIZE) pages are
+ * added through mlx5vf_add_migration_pages() and, unless dma_dir is DMA_NONE,
+ * the buffer is DMA-mapped through mlx5vf_dma_data_buffer(). When length is
+ * zero the buffer is returned without pages and without a mapping.
+ *
+ * If either helper fails, the partially built buffer is released with
+ * mlx5vf_free_data_buffer(). Both mlx5vf_dma_data_buffer() and
+ * mlx5vf_free_data_buffer() assert that mvdev->state_mutex is held.
+ *
+ * Return: the new buffer, ERR_PTR(-ENOMEM) if the descriptor cannot be
+ * allocated, or ERR_PTR() of the helper's error code.
+ */ struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length,
+ enum dma_data_direction dma_dir)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ buf->dma_dir = dma_dir;
+ buf->migf = migf;
+ if (length) {
+ ret = mlx5vf_add_migration_pages(buf,
+ DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (ret)
+ goto end;
+
+ if (dma_dir != DMA_NONE) {
+ ret = mlx5vf_dma_data_buffer(buf);
+ if (ret)
+ goto end;
+ }
+ }
+
+ return buf;
+end: /* error path only: the success path returns above */
+ mlx5vf_free_data_buffer(buf);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Hand buf back to its migration file: append it to the tail of
+ * buf->migf->avail_list while holding migf->list_lock (IRQs disabled), so
+ * that mlx5vf_get_data_buffer() can pick it up again. The buffer itself is
+ * not freed or unmapped here.
+ */ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ spin_lock_irq(&buf->migf->list_lock);
+ list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+ spin_unlock_irq(&buf->migf->list_lock);
+}
+
+/*
+ * Get a data buffer of at least length bytes for direction dma_dir.
+ *
+ * migf->avail_list is scanned under migf->list_lock. Every entry whose
+ * dma_dir matches is removed from the list: the first one with
+ * allocated_length >= length is returned, while smaller ones are collected
+ * on a local list and freed before returning. If no entry fits, a new buffer
+ * is created with mlx5vf_alloc_data_buffer().
+ *
+ * The caller must hold mvdev->state_mutex.
+ *
+ * Return: the buffer, ERR_PTR(-ENOTCONN) if mdev_detach is set, or the
+ * ERR_PTR() returned by mlx5vf_alloc_data_buffer().
+ */ struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir)
+{
+ struct mlx5_vhca_data_buffer *buf, *temp_buf;
+ struct list_head free_list;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return ERR_PTR(-ENOTCONN);
+
+ INIT_LIST_HEAD(&free_list);
+
+ spin_lock_irq(&migf->list_lock);
+ list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+ if (buf->dma_dir == dma_dir) {
+ list_del_init(&buf->buf_elm);
+ if (buf->allocated_length >= length) {
+ spin_unlock_irq(&migf->list_lock);
+ goto found;
+ }
+ /*
+ * Prevent holding redundant buffers. Move the buffer to
+ * a local free list and call mlx5vf_free_data_buffer(),
+ * which might sleep, at the end, once the spin lock
+ * (&migf->list_lock) is no longer held.
+ */
+ list_add(&buf->buf_elm, &free_list);
+ }
+ }
+ spin_unlock_irq(&migf->list_lock);
+ buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+
+found: /* reached on both paths; list_lock is not held here */
+ while ((temp_buf = list_first_entry_or_null(&free_list,
+ struct mlx5_vhca_data_buffer, buf_elm))) {
+ list_del(&temp_buf->buf_elm);
+ mlx5vf_free_data_buffer(temp_buf);
+ }
+
+ return buf;
+}
+
/*
 * Work callback that operates on the mlx5vf_async_data embedded in a
 * migration file (recovered from _work via container_of()).
 *
 * If async_data->status is non-zero then, under migf->lock, the data buffer
 * and the optional header buffer are put back on the available list,
 * migf->state is set to MLX5_MIGF_STATE_PRE_COPY_ERROR for
 * MLX5_CMD_STAT_BAD_RES_STATE_ERR or to MLX5_MIGF_STATE_ERROR for any other
 * status, and waiters on migf->poll_wait are woken.
 *
 * In all cases async_data->out is freed, migf->save_comp is completed and a
 * reference on migf->filp is dropped.
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
struct mlx5vf_async_data *async_data = container_of(_work,
struct mlx5vf_async_data, work);
struct mlx5_vf_migration_file *migf = container_of(async_data,
struct mlx5_vf_migration_file, async_data);
- struct mlx5_core_dev *mdev = migf->mvdev->mdev;
mutex_lock(&migf->lock);
if (async_data->status) {
- migf->is_err = true;
+ mlx5vf_put_data_buffer(async_data->buf);
+ if (async_data->header_buf)
+ mlx5vf_put_data_buffer(async_data->header_buf);
+ if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+ migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
+ else
+ migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
mutex_unlock(&migf->lock);
-
- mlx5_core_destroy_mkey(mdev, async_data->mkey);
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
- mlx5_core_dealloc_pd(mdev, async_data->pdn);
kvfree(async_data->out);
+ complete(&migf->save_comp); /* lets waiters on save_comp proceed */
fput(migf->filp);
}
+/*
+ * Fill header_buf with a migration header describing an image of image_size
+ * bytes and queue it on the migration file's buf_list.
+ *
+ * The header (image_size stored as little-endian 64-bit, all other fields
+ * zero) is copied to the start of the page returned by
+ * mlx5vf_get_migration_page(header_buf, 0). header_buf->length becomes
+ * sizeof(header), header_image_size records image_size, start_pos is set to
+ * the current migf->max_pos and migf->max_pos is advanced by the header
+ * length. The list insertion is done under migf->list_lock with IRQ state
+ * saved and restored.
+ *
+ * Return: 0 on success, -EINVAL if no page is available at offset 0.
+ */ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
+ size_t image_size)
+{
+ struct mlx5_vf_migration_file *migf = header_buf->migf;
+ struct mlx5_vf_migration_header header = {};
+ unsigned long flags;
+ struct page *page;
+ u8 *to_buff;
+
+ header.image_size = cpu_to_le64(image_size);
+ page = mlx5vf_get_migration_page(header_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ kunmap_local(to_buff);
+ header_buf->length = sizeof(header);
+ header_buf->header_image_size = image_size;
+ header_buf->start_pos = header_buf->migf->max_pos; /* same object as migf->max_pos */
+ migf->max_pos += header_buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ return 0;
+}
+
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
struct mlx5vf_async_data *async_data = container_of(context,
@@ -292,67 +506,96 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
- WRITE_ONCE(migf->total_length,
- MLX5_GET(save_vhca_state_out, async_data->out,
- actual_image_size));
+ size_t image_size;
+ unsigned long flags;
+
+ image_size = MLX5_GET(save_vhca_state_out, async_data->out,
+ actual_image_size);
+ if (async_data->header_buf) {
+ status = add_buf_header(async_data->header_buf, image_size);
+ if (status)
+ goto err;
+ }
+ async_data->buf->length = image_size;
+ async_data->buf->start_pos = migf->max_pos;
+ migf->max_pos += async_data->buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ migf->state = async_data->last_chunk ?
+ MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
wake_up_interruptible(&migf->poll_wait);
}
+err:
/*
* The error and the cleanup flows can't run from an
* interrupt context
*/
+ if (status == -EREMOTEIO)
+ status = MLX5_GET(save_vhca_state_out, async_data->out, status);
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track)
{
u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
+ struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
- struct mlx5_core_dev *mdev;
- u32 pdn, mkey;
int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
- mdev = mvdev->mdev;
- err = mlx5_core_alloc_pd(mdev, &pdn);
+ err = wait_for_completion_interruptible(&migf->save_comp);
if (err)
return err;
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
- 0);
- if (err)
- goto err_dma_map;
-
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
- if (err)
- goto err_create_mkey;
+ if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
+ /*
+ * In case we had a PRE_COPY error, SAVE is triggered only for
+ * the final image, read device full image.
+ */
+ inc = false;
MLX5_SET(save_vhca_state_in, in, opcode,
MLX5_CMD_OP_SAVE_VHCA_STATE);
MLX5_SET(save_vhca_state_in, in, op_mod, 0);
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
- MLX5_SET(save_vhca_state_in, in, mkey, mkey);
- MLX5_SET(save_vhca_state_in, in, size, migf->total_length);
+ MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, incremental, inc);
+ MLX5_SET(save_vhca_state_in, in, set_track, track);
async_data = &migf->async_data;
+ async_data->buf = buf;
+ async_data->last_chunk = !track;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
goto err_out;
}
- /* no data exists till the callback comes back */
- migf->total_length = 0;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
+ }
+ }
+
+ if (async_data->last_chunk)
+ migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
+ async_data->header_buf = header_buf;
get_file(migf->filp);
- async_data->mkey = mkey;
- async_data->pdn = pdn;
err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
async_data->out,
out_size, mlx5vf_save_callback,
@@ -363,68 +606,92 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
err_exec:
+ if (header_buf)
+ mlx5vf_put_data_buffer(header_buf);
fput(migf->filp);
+err_free:
kvfree(async_data->out);
err_out:
- mlx5_core_destroy_mkey(mdev, mkey);
-err_create_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
-err_dma_map:
- mlx5_core_dealloc_pd(mdev, pdn);
+ complete(&migf->save_comp);
return err;
}
/*
 * Issue the LOAD_VHCA_STATE command for mvdev->vhca_id, pointing the device
 * at buf through buf->mkey and passing buf->length as the image size.
 *
 * If buf has not been DMA-mapped yet (buf->dmaed is false) it is mapped first
 * with mlx5vf_dma_data_buffer(). The migf parameter is not referenced by the
 * added (+) lines of this function.
 *
 * The caller must hold mvdev->state_mutex.
 *
 * Return: -ENOTCONN if mdev_detach is set, the error from
 * mlx5vf_dma_data_buffer(), otherwise the result of mlx5_cmd_exec_inout().
 */
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf)
{
- struct mlx5_core_dev *mdev;
- u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {};
- u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
- u32 pdn, mkey;
+ u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
- mutex_lock(&migf->lock);
- if (!migf->total_length) {
- err = -EINVAL;
- goto end;
+ if (!buf->dmaed) { /* map lazily: the mkey is needed for the command below */
+ err = mlx5vf_dma_data_buffer(buf);
+ if (err)
+ return err;
}
- mdev = mvdev->mdev;
- err = mlx5_core_alloc_pd(mdev, &pdn);
- if (err)
- goto end;
-
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
- if (err)
- goto err_reg;
-
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
- if (err)
- goto err_mkey;
-
MLX5_SET(load_vhca_state_in, in, opcode,
MLX5_CMD_OP_LOAD_VHCA_STATE);
MLX5_SET(load_vhca_state_in, in, op_mod, 0);
MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
- MLX5_SET(load_vhca_state_in, in, mkey, mkey);
- MLX5_SET(load_vhca_state_in, in, size, migf->total_length);
+ MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(load_vhca_state_in, in, size, buf->length);
+ return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
+}
- err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out);
+/*
+ * Allocate a PD on the core device and store its number in migf->pdn.
+ *
+ * The caller must hold mvdev->state_mutex.
+ *
+ * Return: -ENOTCONN if mdev_detach is set, otherwise the result of
+ * mlx5_core_alloc_pd().
+ */ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ int err;
- mlx5_core_destroy_mkey(mdev, mkey);
-err_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
-err_reg:
- mlx5_core_dealloc_pd(mdev, pdn);
-end:
- mutex_unlock(&migf->lock);
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
return err;
}
+/*
+ * Release the PD recorded in migf->pdn with mlx5_core_dealloc_pd().
+ *
+ * The caller must hold mvdev->state_mutex. Nothing is done when mdev_detach
+ * is set.
+ */ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return;
+
+ mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
+}
+
+/*
+ * Free every data buffer held by migf and then release its PD.
+ *
+ * migf->buf and migf->buf_header are freed if set and their pointers are
+ * reset to NULL. avail_list is then spliced onto buf_list and every buffer on
+ * the combined list is unlinked and freed with mlx5vf_free_data_buffer().
+ * Finally mlx5vf_cmd_dealloc_pd() is called.
+ *
+ * The caller must hold mvdev->state_mutex; a warning is raised (WARN_ON) if
+ * mdev_detach is set when this is called. The lists are walked here without
+ * taking migf->list_lock.
+ */ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
+{
+ struct mlx5_vhca_data_buffer *entry;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ WARN_ON(migf->mvdev->mdev_detach);
+
+ if (migf->buf) {
+ mlx5vf_free_data_buffer(migf->buf);
+ migf->buf = NULL;
+ }
+
+ if (migf->buf_header) {
+ mlx5vf_free_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+ }
+
+ list_splice(&migf->avail_list, &migf->buf_list); /* one list to drain below */
+
+ while ((entry = list_first_entry_or_null(&migf->buf_list,
+ struct mlx5_vhca_data_buffer, buf_elm))) {
+ list_del(&entry->buf_elm);
+ mlx5vf_free_data_buffer(entry);
+ }
+
+ mlx5vf_cmd_dealloc_pd(migf);
+}
+
static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
u32 req_nodes)
{