From 1068be7bd4b05ca41a6a8de724f52a9c87861412 Mon Sep 17 00:00:00 2001 From: "Bryant G. Ly" Date: Tue, 6 Jun 2017 09:28:49 -0500 Subject: tcmu: Add netlink for device reconfiguration This gives tcmu the ability to handle events that can cause reconfiguration, such as resize, path changes, write_cache, etc... Signed-off-by: Bryant G. Ly Reviewed-By: Mike Christie Signed-off-by: Nicholas Bellinger --- include/uapi/linux/target_core_user.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index af17b4154ef6..403a61faada0 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -130,6 +130,7 @@ enum tcmu_genl_cmd { TCMU_CMD_UNSPEC, TCMU_CMD_ADDED_DEVICE, TCMU_CMD_REMOVED_DEVICE, + TCMU_CMD_RECONFIG_DEVICE, __TCMU_CMD_MAX, }; #define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) -- cgit v1.2.3 From 8a45885c1514cdae2ee64b5ac03ffc00a1a8a9d7 Mon Sep 17 00:00:00 2001 From: "Bryant G. Ly" Date: Tue, 6 Jun 2017 09:28:52 -0500 Subject: tcmu: Add Type of reconfig into netlink This patch adds more info about the attribute being changed, so that usersapce can easily figure out what is happening. Signed-off-by: Bryant G. Ly Reviewed-By: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 20 ++++++++++++++------ include/uapi/linux/target_core_user.h | 8 ++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 7c6475731895..afc1fd6bacaf 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1176,7 +1176,8 @@ static int tcmu_release(struct uio_info *info, struct inode *inode) return 0; } -static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int minor) +static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, + int minor, int type) { struct sk_buff *skb; void *msg_header; @@ -1198,6 +1199,10 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int mino if (ret < 0) goto free_skb; + ret = nla_put_u32(skb, TCMU_ATTR_TYPE, type); + if (ret < 0) + goto free_skb; + genlmsg_end(skb, msg_header); ret = genlmsg_multicast_allns(&tcmu_genl_family, skb, 0, @@ -1301,7 +1306,7 @@ static int tcmu_configure_device(struct se_device *dev) kref_get(&udev->kref); ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor); + udev->uio_info.uio_dev->minor, NO_RECONFIG); if (ret) goto err_netlink; @@ -1383,7 +1388,7 @@ static void tcmu_free_device(struct se_device *dev) if (tcmu_dev_configured(udev)) { tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor); + udev->uio_info.uio_dev->minor, NO_RECONFIG); uio_unregister_device(&udev->uio_info); } @@ -1577,7 +1582,8 @@ static ssize_t tcmu_dev_path_store(struct config_item *item, const char *page, if (tcmu_dev_configured(udev)) { ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor); + udev->uio_info.uio_dev->minor, + CONFIG_PATH); if (ret) { pr_err("Unable to reconfigure device\n"); return ret; @@ -1615,7 +1621,8 @@ static ssize_t tcmu_dev_size_store(struct config_item *item, const char *page, if (tcmu_dev_configured(udev)) { ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor); + udev->uio_info.uio_dev->minor, + CONFIG_SIZE); if (ret) { pr_err("Unable to reconfigure device\n"); return ret; @@ -1654,7 +1661,8 @@ static ssize_t tcmu_emulate_write_cache_store(struct config_item *item, if (tcmu_dev_configured(udev)) { ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor); + udev->uio_info.uio_dev->minor, + CONFIG_WRITECACHE); if (ret) { pr_err("Unable to reconfigure device\n"); return ret; diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 403a61faada0..5b00e3500005 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -139,8 +139,16 @@ enum tcmu_genl_attr { TCMU_ATTR_UNSPEC, TCMU_ATTR_DEVICE, TCMU_ATTR_MINOR, + TCMU_ATTR_TYPE, __TCMU_ATTR_MAX, }; #define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) +enum tcmu_reconfig_types { + NO_RECONFIG, + CONFIG_PATH, + CONFIG_SIZE, + CONFIG_WRITECACHE, +}; + #endif -- cgit v1.2.3 From 2d76443e02f260d7a5bd0ede1851ae5534f0c68d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 12 Jun 2017 01:34:28 -0500 Subject: tcmu: reconfigure netlink attr changes 1. TCMU_ATTR_TYPE is too generic when it describes only the reconfiguration type, so rename to TCMU_ATTR_RECONFIG_TYPE. 2. Only return the reconfig type when it is a TCMU_CMD_RECONFIG_DEVICE command. 3. CONFIG_* type is not needed. We can pass the value along with an ATTR to userspace, so it does not need to read sysfs/configfs. 4. Fix leak in tcmu_dev_path_store and rename to dev_config to reflect it is more than just a path that can be changed. 6. Don't update kernel struct value if netlink sending fails. Signed-off-by: Mike Christie Reviewed-by: "Bryant G. Ly" Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 73 +++++++++++++++++++++-------------- include/uapi/linux/target_core_user.h | 12 ++---- 2 files changed, 48 insertions(+), 37 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 6322269d9e85..ca5b081295db 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1177,7 +1177,8 @@ static int tcmu_release(struct uio_info *info, struct inode *inode) } static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, - int minor, int type) + int minor, int reconfig_attr, + const void *reconfig_data) { struct sk_buff *skb; void *msg_header; @@ -1199,9 +1200,27 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, if (ret < 0) goto free_skb; - ret = nla_put_u32(skb, TCMU_ATTR_TYPE, type); - if (ret < 0) - goto free_skb; + if (cmd == TCMU_CMD_RECONFIG_DEVICE) { + switch (reconfig_attr) { + case TCMU_ATTR_DEV_CFG: + ret = nla_put_string(skb, reconfig_attr, reconfig_data); + break; + case TCMU_ATTR_DEV_SIZE: + ret = nla_put_u64_64bit(skb, reconfig_attr, + *((u64 *)reconfig_data), + TCMU_ATTR_PAD); + break; + case TCMU_ATTR_WRITECACHE: + ret = nla_put_u8(skb, reconfig_attr, + *((u8 *)reconfig_data)); + break; + default: + BUG(); + } + + if (ret < 0) + goto free_skb; + } genlmsg_end(skb, msg_header); @@ -1306,7 +1325,7 @@ static int tcmu_configure_device(struct se_device *dev) kref_get(&udev->kref); ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor, NO_RECONFIG); + udev->uio_info.uio_dev->minor, 0, NULL); if (ret) goto err_netlink; @@ -1388,7 +1407,7 @@ static void tcmu_free_device(struct se_device *dev) if (tcmu_dev_configured(udev)) { tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor, NO_RECONFIG); + udev->uio_info.uio_dev->minor, 0, NULL); uio_unregister_device(&udev->uio_info); } @@ -1553,7 +1572,7 @@ static ssize_t tcmu_cmd_time_out_store(struct config_item *item, const char *pag } CONFIGFS_ATTR(tcmu_, cmd_time_out); -static ssize_t tcmu_dev_path_show(struct config_item *item, char *page) +static ssize_t tcmu_dev_config_show(struct config_item *item, char *page) { struct se_dev_attrib *da = container_of(to_config_group(item), struct se_dev_attrib, da_group); @@ -1562,37 +1581,34 @@ static ssize_t tcmu_dev_path_show(struct config_item *item, char *page) return snprintf(page, PAGE_SIZE, "%s\n", udev->dev_config); } -static ssize_t tcmu_dev_path_store(struct config_item *item, const char *page, - size_t count) +static ssize_t tcmu_dev_config_store(struct config_item *item, const char *page, + size_t count) { struct se_dev_attrib *da = container_of(to_config_group(item), struct se_dev_attrib, da_group); struct tcmu_dev *udev = TCMU_DEV(da->da_dev); - char *copy = NULL; - int ret; + int ret, len; - copy = kstrdup(page, GFP_KERNEL); - if (!copy) { - kfree(copy); + len = strlen(page); + if (!len || len > TCMU_CONFIG_LEN - 1) return -EINVAL; - } - strlcpy(udev->dev_config, copy, TCMU_CONFIG_LEN); /* Check if device has been configured before */ if (tcmu_dev_configured(udev)) { ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, udev->uio_info.name, udev->uio_info.uio_dev->minor, - CONFIG_PATH); + TCMU_ATTR_DEV_CFG, page); if (ret) { pr_err("Unable to reconfigure device\n"); return ret; } } + strlcpy(udev->dev_config, page, TCMU_CONFIG_LEN); return count; } -CONFIGFS_ATTR(tcmu_, dev_path); +CONFIGFS_ATTR(tcmu_, dev_config); static ssize_t tcmu_dev_size_show(struct config_item *item, char *page) { @@ -1609,26 +1625,25 @@ static ssize_t tcmu_dev_size_store(struct config_item *item, const char *page, struct se_dev_attrib *da = container_of(to_config_group(item), struct se_dev_attrib, da_group); struct tcmu_dev *udev = TCMU_DEV(da->da_dev); - unsigned long val; + u64 val; int ret; - ret = kstrtoul(page, 0, &val); + ret = kstrtou64(page, 0, &val); if (ret < 0) return ret; - udev->dev_size = val; /* Check if device has been configured before */ if (tcmu_dev_configured(udev)) { ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, udev->uio_info.name, udev->uio_info.uio_dev->minor, - CONFIG_SIZE); + TCMU_ATTR_DEV_SIZE, &val); if (ret) { pr_err("Unable to reconfigure device\n"); return ret; } } - + udev->dev_size = val; return count; } CONFIGFS_ATTR(tcmu_, dev_size); @@ -1648,33 +1663,33 @@ static ssize_t tcmu_emulate_write_cache_store(struct config_item *item, struct se_dev_attrib *da = container_of(to_config_group(item), struct se_dev_attrib, da_group); struct tcmu_dev *udev = TCMU_DEV(da->da_dev); - int val; + u8 val; int ret; - ret = kstrtouint(page, 0, &val); + ret = kstrtou8(page, 0, &val); if (ret < 0) return ret; - da->emulate_write_cache = val; - /* Check if device has been configured before */ if (tcmu_dev_configured(udev)) { ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, udev->uio_info.name, udev->uio_info.uio_dev->minor, - CONFIG_WRITECACHE); + TCMU_ATTR_WRITECACHE, &val); if (ret) { pr_err("Unable to reconfigure device\n"); return ret; } } + + da->emulate_write_cache = val; return count; } CONFIGFS_ATTR(tcmu_, emulate_write_cache); static struct configfs_attribute *tcmu_attrib_attrs[] = { &tcmu_attr_cmd_time_out, - &tcmu_attr_dev_path, + &tcmu_attr_dev_config, &tcmu_attr_dev_size, &tcmu_attr_emulate_write_cache, NULL, diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 5b00e3500005..4bfc9a1b635c 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -139,16 +139,12 @@ enum tcmu_genl_attr { TCMU_ATTR_UNSPEC, TCMU_ATTR_DEVICE, TCMU_ATTR_MINOR, - TCMU_ATTR_TYPE, + TCMU_ATTR_PAD, + TCMU_ATTR_DEV_CFG, + TCMU_ATTR_DEV_SIZE, + TCMU_ATTR_WRITECACHE, __TCMU_ATTR_MAX, }; #define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) -enum tcmu_reconfig_types { - NO_RECONFIG, - CONFIG_PATH, - CONFIG_SIZE, - CONFIG_WRITECACHE, -}; - #endif -- cgit v1.2.3 From b3af66e24393f03ef81db17a11387d9e6174bd01 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 23 Jun 2017 01:18:15 -0500 Subject: tcmu: perfom device add, del and reconfig synchronously This makes the device add, del reconfig operations sync. It fixes the issue where for add and reconfig, we do not know if userspace successfully completely the operation, so we leave invalid kernel structs or report incorrect status for the config/reconfig operations. Signed-off-by: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 213 ++++++++++++++++++++++++++++++---- include/uapi/linux/target_core_user.h | 7 ++ 2 files changed, 200 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index e58127b8db8a..e080cd1a8fde 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -87,6 +87,8 @@ /* Default maximum of the global data blocks(512K * PAGE_SIZE) */ #define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024) +static u8 tcmu_kern_cmd_reply_supported; + static struct device *tcmu_root_device; struct tcmu_hba { @@ -95,6 +97,13 @@ struct tcmu_hba { #define TCMU_CONFIG_LEN 256 +struct tcmu_nl_cmd { + /* wake up thread waiting for reply */ + struct completion complete; + int cmd; + int status; +}; + struct tcmu_dev { struct list_head node; struct kref kref; @@ -135,6 +144,11 @@ struct tcmu_dev { struct timer_list timeout; unsigned int cmd_time_out; + spinlock_t nl_cmd_lock; + struct tcmu_nl_cmd curr_nl_cmd; + /* wake up threads waiting on curr_nl_cmd */ + wait_queue_head_t nl_cmd_wq; + char dev_config[TCMU_CONFIG_LEN]; }; @@ -178,16 +192,128 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { [TCMU_MCGRP_CONFIG] = { .name = "config", }, }; +static struct nla_policy tcmu_attr_policy[TCMU_ATTR_MAX+1] = { + [TCMU_ATTR_DEVICE] = { .type = NLA_STRING }, + [TCMU_ATTR_MINOR] = { .type = NLA_U32 }, + [TCMU_ATTR_CMD_STATUS] = { .type = NLA_S32 }, + [TCMU_ATTR_DEVICE_ID] = { .type = NLA_U32 }, + [TCMU_ATTR_SUPP_KERN_CMD_REPLY] = { .type = NLA_U8 }, +}; + +static int tcmu_genl_cmd_done(struct genl_info *info, int completed_cmd) +{ + struct se_device *dev; + struct tcmu_dev *udev; + struct tcmu_nl_cmd *nl_cmd; + int dev_id, rc, ret = 0; + bool is_removed = (completed_cmd == TCMU_CMD_REMOVED_DEVICE); + + if (!info->attrs[TCMU_ATTR_CMD_STATUS] || + !info->attrs[TCMU_ATTR_DEVICE_ID]) { + printk(KERN_ERR "TCMU_ATTR_CMD_STATUS or TCMU_ATTR_DEVICE_ID not set, doing nothing\n"); + return -EINVAL; + } + + dev_id = nla_get_u32(info->attrs[TCMU_ATTR_DEVICE_ID]); + rc = nla_get_s32(info->attrs[TCMU_ATTR_CMD_STATUS]); + + dev = target_find_device(dev_id, !is_removed); + if (!dev) { + printk(KERN_ERR "tcmu nl cmd %u/%u completion could not find device with dev id %u.\n", + completed_cmd, rc, dev_id); + return -ENODEV; + } + udev = TCMU_DEV(dev); + + spin_lock(&udev->nl_cmd_lock); + nl_cmd = &udev->curr_nl_cmd; + + pr_debug("genl cmd done got id %d curr %d done %d rc %d\n", dev_id, + nl_cmd->cmd, completed_cmd, rc); + + if (nl_cmd->cmd != completed_cmd) { + printk(KERN_ERR "Mismatched commands (Expecting reply for %d. Current %d).\n", + completed_cmd, nl_cmd->cmd); + ret = -EINVAL; + } else { + nl_cmd->status = rc; + } + + spin_unlock(&udev->nl_cmd_lock); + if (!is_removed) + target_undepend_item(&dev->dev_group.cg_item); + if (!ret) + complete(&nl_cmd->complete); + return ret; +} + +static int tcmu_genl_rm_dev_done(struct sk_buff *skb, struct genl_info *info) +{ + return tcmu_genl_cmd_done(info, TCMU_CMD_REMOVED_DEVICE); +} + +static int tcmu_genl_add_dev_done(struct sk_buff *skb, struct genl_info *info) +{ + return tcmu_genl_cmd_done(info, TCMU_CMD_ADDED_DEVICE); +} + +static int tcmu_genl_reconfig_dev_done(struct sk_buff *skb, + struct genl_info *info) +{ + return tcmu_genl_cmd_done(info, TCMU_CMD_RECONFIG_DEVICE); +} + +static int tcmu_genl_set_features(struct sk_buff *skb, struct genl_info *info) +{ + if (info->attrs[TCMU_ATTR_SUPP_KERN_CMD_REPLY]) { + tcmu_kern_cmd_reply_supported = + nla_get_u8(info->attrs[TCMU_ATTR_SUPP_KERN_CMD_REPLY]); + printk(KERN_INFO "tcmu daemon: command reply support %u.\n", + tcmu_kern_cmd_reply_supported); + } + + return 0; +} + +static const struct genl_ops tcmu_genl_ops[] = { + { + .cmd = TCMU_CMD_SET_FEATURES, + .flags = GENL_ADMIN_PERM, + .policy = tcmu_attr_policy, + .doit = tcmu_genl_set_features, + }, + { + .cmd = TCMU_CMD_ADDED_DEVICE_DONE, + .flags = GENL_ADMIN_PERM, + .policy = tcmu_attr_policy, + .doit = tcmu_genl_add_dev_done, + }, + { + .cmd = TCMU_CMD_REMOVED_DEVICE_DONE, + .flags = GENL_ADMIN_PERM, + .policy = tcmu_attr_policy, + .doit = tcmu_genl_rm_dev_done, + }, + { + .cmd = TCMU_CMD_RECONFIG_DEVICE_DONE, + .flags = GENL_ADMIN_PERM, + .policy = tcmu_attr_policy, + .doit = tcmu_genl_reconfig_dev_done, + }, +}; + /* Our generic netlink family */ static struct genl_family tcmu_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "TCM-USER", - .version = 1, + .version = 2, .maxattr = TCMU_ATTR_MAX, .mcgrps = tcmu_mcgrps, .n_mcgrps = ARRAY_SIZE(tcmu_mcgrps), .netnsok = true, + .ops = tcmu_genl_ops, + .n_ops = ARRAY_SIZE(tcmu_genl_ops), }; #define tcmu_cmd_set_dbi_cur(cmd, index) ((cmd)->dbi_cur = (index)) @@ -989,6 +1115,9 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) setup_timer(&udev->timeout, tcmu_device_timedout, (unsigned long)udev); + init_waitqueue_head(&udev->nl_cmd_wq); + spin_lock_init(&udev->nl_cmd_lock); + return &udev->se_dev; } @@ -1176,9 +1305,54 @@ static int tcmu_release(struct uio_info *info, struct inode *inode) return 0; } -static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, - int minor, int reconfig_attr, - const void *reconfig_data) +static void tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd) +{ + struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd; + + if (!tcmu_kern_cmd_reply_supported) + return; +relock: + spin_lock(&udev->nl_cmd_lock); + + if (nl_cmd->cmd != TCMU_CMD_UNSPEC) { + spin_unlock(&udev->nl_cmd_lock); + pr_debug("sleeping for open nl cmd\n"); + wait_event(udev->nl_cmd_wq, (nl_cmd->cmd == TCMU_CMD_UNSPEC)); + goto relock; + } + + memset(nl_cmd, 0, sizeof(*nl_cmd)); + nl_cmd->cmd = cmd; + init_completion(&nl_cmd->complete); + + spin_unlock(&udev->nl_cmd_lock); +} + +static int tcmu_wait_genl_cmd_reply(struct tcmu_dev *udev) +{ + struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd; + int ret; + DEFINE_WAIT(__wait); + + if (!tcmu_kern_cmd_reply_supported) + return 0; + + pr_debug("sleeping for nl reply\n"); + wait_for_completion(&nl_cmd->complete); + + spin_lock(&udev->nl_cmd_lock); + nl_cmd->cmd = TCMU_CMD_UNSPEC; + ret = nl_cmd->status; + nl_cmd->status = 0; + spin_unlock(&udev->nl_cmd_lock); + + wake_up_all(&udev->nl_cmd_wq); + + return ret;; +} + +static int tcmu_netlink_event(struct tcmu_dev *udev, enum tcmu_genl_cmd cmd, + int reconfig_attr, const void *reconfig_data) { struct sk_buff *skb; void *msg_header; @@ -1192,11 +1366,15 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, if (!msg_header) goto free_skb; - ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + ret = nla_put_string(skb, TCMU_ATTR_DEVICE, udev->uio_info.name); if (ret < 0) goto free_skb; - ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + ret = nla_put_u32(skb, TCMU_ATTR_MINOR, udev->uio_info.uio_dev->minor); + if (ret < 0) + goto free_skb; + + ret = nla_put_u32(skb, TCMU_ATTR_DEVICE_ID, udev->se_dev.dev_index); if (ret < 0) goto free_skb; @@ -1224,12 +1402,15 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, genlmsg_end(skb, msg_header); + tcmu_init_genl_cmd_reply(udev, cmd); + ret = genlmsg_multicast_allns(&tcmu_genl_family, skb, 0, TCMU_MCGRP_CONFIG, GFP_KERNEL); - /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; + if (!ret) + ret = tcmu_wait_genl_cmd_reply(udev); return ret; free_skb: @@ -1324,8 +1505,7 @@ static int tcmu_configure_device(struct se_device *dev) */ kref_get(&udev->kref); - ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor, 0, NULL); + ret = tcmu_netlink_event(udev, TCMU_CMD_ADDED_DEVICE, 0, NULL); if (ret) goto err_netlink; @@ -1414,8 +1594,7 @@ static void tcmu_destroy_device(struct se_device *dev) tcmu_blocks_release(udev); if (tcmu_dev_configured(udev)) { - tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, - udev->uio_info.uio_dev->minor, 0, NULL); + tcmu_netlink_event(udev, TCMU_CMD_REMOVED_DEVICE, 0, NULL); uio_unregister_device(&udev->uio_info); } @@ -1600,9 +1779,7 @@ static ssize_t tcmu_dev_config_store(struct config_item *item, const char *page, /* Check if device has been configured before */ if (tcmu_dev_configured(udev)) { - ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, - udev->uio_info.name, - udev->uio_info.uio_dev->minor, + ret = tcmu_netlink_event(udev, TCMU_CMD_RECONFIG_DEVICE, TCMU_ATTR_DEV_CFG, page); if (ret) { pr_err("Unable to reconfigure device\n"); @@ -1639,9 +1816,7 @@ static ssize_t tcmu_dev_size_store(struct config_item *item, const char *page, /* Check if device has been configured before */ if (tcmu_dev_configured(udev)) { - ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, - udev->uio_info.name, - udev->uio_info.uio_dev->minor, + ret = tcmu_netlink_event(udev, TCMU_CMD_RECONFIG_DEVICE, TCMU_ATTR_DEV_SIZE, &val); if (ret) { pr_err("Unable to reconfigure device\n"); @@ -1677,9 +1852,7 @@ static ssize_t tcmu_emulate_write_cache_store(struct config_item *item, /* Check if device has been configured before */ if (tcmu_dev_configured(udev)) { - ret = tcmu_netlink_event(TCMU_CMD_RECONFIG_DEVICE, - udev->uio_info.name, - udev->uio_info.uio_dev->minor, + ret = tcmu_netlink_event(udev, TCMU_CMD_RECONFIG_DEVICE, TCMU_ATTR_WRITECACHE, &val); if (ret) { pr_err("Unable to reconfigure device\n"); diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 4bfc9a1b635c..24a1c4ec2248 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -131,6 +131,10 @@ enum tcmu_genl_cmd { TCMU_CMD_ADDED_DEVICE, TCMU_CMD_REMOVED_DEVICE, TCMU_CMD_RECONFIG_DEVICE, + TCMU_CMD_ADDED_DEVICE_DONE, + TCMU_CMD_REMOVED_DEVICE_DONE, + TCMU_CMD_RECONFIG_DEVICE_DONE, + TCMU_CMD_SET_FEATURES, __TCMU_CMD_MAX, }; #define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) @@ -143,6 +147,9 @@ enum tcmu_genl_attr { TCMU_ATTR_DEV_CFG, TCMU_ATTR_DEV_SIZE, TCMU_ATTR_WRITECACHE, + TCMU_ATTR_CMD_STATUS, + TCMU_ATTR_DEVICE_ID, + TCMU_ATTR_SUPP_KERN_CMD_REPLY, __TCMU_ATTR_MAX, }; #define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) -- cgit v1.2.3 From 949c033694864082db9b3f5304723a6d7407f8e2 Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Tue, 11 Jul 2017 00:22:33 +0300 Subject: KVM: s390: Fix KVM_S390_GET_CMMA_BITS ioctl definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of KVM_S390_GET_CMMA_BITS, the kernel does not only read struct kvm_s390_cmma_log passed from userspace (which constitutes _IOC_WRITE), it also writes back a return value (which constitutes _IOC_READ) making this an _IOWR ioctl instead of _IOW. Fixes: 4036e387 ("KVM: s390: ioctls to get and set guest storage attributes") Signed-off-by: Gleb Fotengauer-Malinovskiy Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář --- include/uapi/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c0b6dfec5f87..ebd604c222d8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1351,7 +1351,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_X86_SMM */ #define KVM_SMI _IO(KVMIO, 0xb7) /* Available with KVM_CAP_S390_CMMA_MIGRATION */ -#define KVM_S390_GET_CMMA_BITS _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log) +#define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -- cgit v1.2.3 From 0791e3644e5ef21646fe565b9061788d05ec71d4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 12 Jul 2017 14:34:28 -0700 Subject: kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files With current epoll architecture target files are addressed with file_struct and file descriptor number, where the last is not unique. Moreover files can be transferred from another process via unix socket, added into queue and closed then so we won't find this descriptor in the task fdinfo list. Thus to checkpoint and restore such processes CRIU needs to find out where exactly the target file is present to add it into epoll queue. For this sake one can use kcmp call where some particular target file from the queue is compared with arbitrary file passed as an argument. Because epoll target files can have same file descriptor number but different file_struct a caller should explicitly specify the offset within. To test if some particular file is matching entry inside epoll one have to - fill kcmp_epoll_slot structure with epoll file descriptor, target file number and target file offset (in case if only one target is present then it should be 0) - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot) - the kernel fetch file pointer matching file descriptor @fd of pid1 - lookups for file struct in epoll queue of pid2 and returns traditional 0,1,2 result for sorting purpose Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com Signed-off-by: Cyrill Gorcunov Acked-by: Andrey Vagin Cc: Al Viro Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Jason Baron Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 42 ++++++++++++++++++++++++++++++++++ include/linux/eventpoll.h | 3 +++ include/uapi/linux/kcmp.h | 10 +++++++++ kernel/kcmp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+) (limited to 'include/uapi') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 322904c3ebdf..e7e9901c3790 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1077,6 +1077,48 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) +{ + struct rb_node *rbp; + struct epitem *epi; + + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (epi->ffd.fd == tfd) { + if (toff == 0) + return epi; + else + toff--; + } + cond_resched(); + } + + return NULL; +} + +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, + unsigned long toff) +{ + struct file *file_raw; + struct eventpoll *ep; + struct epitem *epi; + + if (!is_file_epoll(file)) + return ERR_PTR(-EINVAL); + + ep = file->private_data; + + mutex_lock(&ep->mtx); + epi = ep_find_tfd(ep, tfd, toff); + if (epi) + file_raw = epi->ffd.file; + else + file_raw = ERR_PTR(-ENOENT); + mutex_unlock(&ep->mtx); + + return file_raw; +} + /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 6daf6d4971f6..d8625d214ea7 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,6 +14,7 @@ #define _LINUX_EVENTPOLL_H #include +#include /* Forward declarations to avoid compiler errors */ @@ -22,6 +23,8 @@ struct file; #ifdef CONFIG_EPOLL +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h index 84df14b37360..481e103da78e 100644 --- a/include/uapi/linux/kcmp.h +++ b/include/uapi/linux/kcmp.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_KCMP_H #define _UAPI_LINUX_KCMP_H +#include + /* Comparison type */ enum kcmp_type { KCMP_FILE, @@ -10,8 +12,16 @@ enum kcmp_type { KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, + KCMP_EPOLL_TFD, KCMP_TYPES, }; +/* Slot for KCMP_EPOLL_TFD */ +struct kcmp_epoll_slot { + __u32 efd; /* epoll file descriptor */ + __u32 tfd; /* target file number */ + __u32 toff; /* target offset within same numbered sequence */ +}; + #endif /* _UAPI_LINUX_KCMP_H */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } else + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From 2cd648c110b5570c3280bd645797658cabbe5f5c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 12 Jul 2017 14:34:44 -0700 Subject: include/linux/sem.h: correctly document sem_ctime sem_ctime is initialized to the semget() time and then updated at every semctl() that changes the array. Thus it does not represent the time of the last change. Especially, semop() calls are only stored in sem_otime, not in sem_ctime. This is already described in ipc/sem.c, I just overlooked that there is a comment in include/linux/sem.h and man semctl(2) as well. So: Correct wrong comments. Link: http://lkml.kernel.org/r/20170515171912.6298-4-manfred@colorfullife.com Signed-off-by: Manfred Spraul Cc: Kees Cook Cc: <1vier1@web.de> Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 2 +- include/uapi/linux/sem.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/sem.h b/include/linux/sem.h index 9db14093b73c..be5cf2ea14ad 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -30,7 +30,7 @@ struct sem { /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ - time_t sem_ctime; /* last change time */ + time_t sem_ctime; /* create/last semctl() time */ struct list_head pending_alter; /* pending operations */ /* that alter the array */ struct list_head pending_const; /* pending complex operations */ diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index dd73b908b2f3..67eb90361692 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -23,7 +23,7 @@ struct semid_ds { struct ipc_perm sem_perm; /* permissions .. see ipc.h */ __kernel_time_t sem_otime; /* last semop time */ - __kernel_time_t sem_ctime; /* last change time */ + __kernel_time_t sem_ctime; /* create/last semctl() time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct sem_queue *sem_pending; /* pending operations to be processed */ struct sem_queue **sem_pending_last; /* last pending operation */ -- cgit v1.2.3 From efc479e6900c22bad9a2b649d13405ed9cde2d53 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Thu, 22 Jun 2017 16:51:01 +0300 Subject: kvm: x86: hyperv: add KVM_CAP_HYPERV_SYNIC2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a flaw in the Hyper-V SynIC implementation in KVM: when message page or event flags page is enabled by setting the corresponding msr, KVM zeroes it out. This is problematic because on migration the corresponding MSRs are loaded on the destination, so the content of those pages is lost. This went unnoticed so far because the only user of those pages was in-KVM hyperv synic timers, which could continue working despite that zeroing. Newer QEMU uses those pages for Hyper-V VMBus implementation, and zeroing them breaks the migration. Besides, in newer QEMU the content of those pages is fully managed by QEMU, so zeroing them is undesirable even when writing the MSRs from the guest side. To support this new scheme, introduce a new capability, KVM_CAP_HYPERV_SYNIC2, which, when enabled, makes sure that the synic pages aren't zeroed out in KVM. Signed-off-by: Roman Kagan Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/api.txt | 9 +++++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/hyperv.c | 13 +++++++++---- arch/x86/kvm/hyperv.h | 2 +- arch/x86/kvm/x86.c | 7 ++++++- include/uapi/linux/kvm.h | 1 + 6 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3a9831b72945..78ac577c9378 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4329,3 +4329,12 @@ Querying this capability returns a bitmap indicating the possible virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N (counting from the right) is set, then a virtual SMT mode of 2^N is available. + +8.11 KVM_CAP_HYPERV_SYNIC2 + +Architectures: x86 + +This capability enables a newer version of Hyper-V Synthetic interrupt +controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM +doesn't clear SynIC message and event flags pages when they are enabled by +writing to the respective MSRs. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ef37d0dc61bd..9d8de5dd7546 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -462,6 +462,7 @@ struct kvm_vcpu_hv_synic { DECLARE_BITMAP(auto_eoi_bitmap, 256); DECLARE_BITMAP(vec_bitmap, 256); bool active; + bool dont_zero_synic_pages; }; /* Hyper-V per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ebae57ac5902..a8084406707e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -221,7 +221,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic->version = data; break; case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) + if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -232,7 +233,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) + if ((data & HV_SYNIC_SIMP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -687,14 +689,17 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + /* * Hyper-V SynIC auto EOI SINT's are * not compatible with APICV, so deactivate APICV */ kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; + synic->active = true; + synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index cd1119538add..12f65fe1011d 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -56,7 +56,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08aa5e442aa7..4f41c5222ecd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2659,6 +2659,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: + case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -3382,10 +3383,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC2: + if (cap->args[0]) + return -EINVAL; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - return kvm_hv_activate_synic(vcpu); + return kvm_hv_activate_synic(vcpu, cap->cap == + KVM_CAP_HYPERV_SYNIC2); default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ebd604c222d8..38b2cfbc8112 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -927,6 +927,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_CMMA_MIGRATION 145 #define KVM_CAP_PPC_FWNMI 146 #define KVM_CAP_PPC_SMT_POSSIBLE 147 +#define KVM_CAP_HYPERV_SYNIC2 148 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From d3457c877b14aaee8c52923eedf05a3b78af0476 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Fri, 14 Jul 2017 17:13:20 +0300 Subject: kvm: x86: hyperv: make VP_INDEX managed by userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V identifies vCPUs by Virtual Processor Index, which can be queried via HV_X64_MSR_VP_INDEX msr. It is defined by the spec as a sequential number which can't exceed the maximum number of vCPUs per VM. APIC ids can be sparse and thus aren't a valid replacement for VP indices. Current KVM uses its internal vcpu index as VP_INDEX. However, to make it predictable and persistent across VM migrations, the userspace has to control the value of VP_INDEX. This patch achieves that, by storing vp_index explicitly on vcpu, and allowing HV_X64_MSR_VP_INDEX to be set from the host side. For compatibility it's initialized to KVM vcpu index. Also a few variables are renamed to make clear distinction betweed this Hyper-V vp_index and KVM vcpu_id (== APIC id). Besides, a new capability, KVM_CAP_HYPERV_VP_INDEX, is added to allow the userspace to skip attempting msr writes where unsupported, to avoid spamming error logs. Signed-off-by: Roman Kagan Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/api.txt | 9 +++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/hyperv.c | 54 +++++++++++++++++++++++++-------------- arch/x86/kvm/hyperv.h | 1 + arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm.h | 1 + 6 files changed, 50 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 78ac577c9378..e63a35fafef0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4338,3 +4338,12 @@ This capability enables a newer version of Hyper-V Synthetic interrupt controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM doesn't clear SynIC message and event flags pages when they are enabled by writing to the respective MSRs. + +8.12 KVM_CAP_HYPERV_VP_INDEX + +Architectures: x86 + +This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its +value is used to denote the target vcpu for a SynIC interrupt. For +compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this +capability is absent, userspace can still query this msr's value. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da3261e384d3..87ac4fba6d8e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -467,6 +467,7 @@ struct kvm_vcpu_hv_synic { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { + u32 vp_index; u64 hv_vapic; s64 runtime_offset; struct kvm_vcpu_hv_synic synic; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8084406707e..2695a34fa1c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -106,14 +106,27 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, return 0; } -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + if (vpidx < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, vpidx); + if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + return NULL; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); + vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu) return NULL; synic = vcpu_to_synic(vcpu); @@ -320,11 +333,11 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -343,11 +356,11 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) kvm_hv_notify_acked_sint(vcpu, i); } -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -689,6 +702,13 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + + hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); +} + int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); @@ -983,6 +1003,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { + case HV_X64_MSR_VP_INDEX: + if (!host) + return 1; + hv->vp_index = (u32)data; + break; case HV_X64_MSR_APIC_ASSIST_PAGE: { u64 gfn; unsigned long addr; @@ -1094,18 +1119,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } + case HV_X64_MSR_VP_INDEX: + data = hv->vp_index; break; - } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 12f65fe1011d..e637631a9574 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -59,6 +59,7 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6753f0982791..5b8f07889f6a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2666,6 +2666,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: + case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -7688,6 +7689,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) struct msr_data msr; struct kvm *kvm = vcpu->kvm; + kvm_hv_vcpu_postcreate(vcpu); + if (vcpu_load(vcpu)) return; msr.data = 0x0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 38b2cfbc8112..6cd63c18708a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -928,6 +928,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_FWNMI 146 #define KVM_CAP_PPC_SMT_POSSIBLE 147 #define KVM_CAP_HYPERV_SYNIC2 148 +#define KVM_CAP_HYPERV_VP_INDEX 149 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3