From ce9fe18abb7c86a71b545e1cdd60fe333bf462a3 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 26 Nov 2020 11:47:16 +0100 Subject: block/rnbd-clt: Make path parameter optional for map_device During map_device if the given session exists, then the path parameter is not used. In such a case, the path parameter is redundant. This commit makes the path parameter optional for map_device. When the path parameter is not given, if the session exists then that is used to establish the rtrs connection. If the session does not exist, and the path parameter is also missing, then map_device fails. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 1 - drivers/block/rnbd/rnbd-clt.c | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 4f4474eecadb..e7b41ec7cd6a 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -37,7 +37,6 @@ enum { }; static const unsigned int rnbd_opt_mandatory[] = { - RNBD_OPT_PATH, RNBD_OPT_DEV_PATH, RNBD_OPT_SESSNAME, }; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 8b2411ccbda9..edefa0761a81 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1193,6 +1193,12 @@ find_and_get_or_create_sess(const char *sessname, else if (!first) return sess; + if (!path_cnt) { + pr_err("Session %s not found, and path parameter not given", sessname); + err = -ENXIO; + goto put_sess; + } + rtrs_ops = (struct rtrs_clt_ops) { .priv = sess, .link_ev = rnbd_clt_link_ev, -- cgit v1.2.3 From 91f4acb2801ce4985483b0fa174bbe995d105417 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Nov 2020 11:47:17 +0100 Subject: block/rnbd-clt: support mapping two devices with the same name from different servers Previously, we can't map same device name from different sessions 
due to the limitation of sysfs naming mechanism. root@clt2:~# ls -l /sys/class/rnbd-client/ctl/devices/ total 0 lrwxrwxrwx 1 root 0 Sep 2 16:31 !dev!nullb1 -> ../../../block/rnbd0 We only use the device name in the above, which means a device with the same name can't be mapped from another server. To address the issue, the sessname is appended to the node to differentiate where the device comes from. Also, we need to check whether the pathname exists in a specific session instead of searching for it in the global sess_list. Signed-off-by: Guoqing Jiang Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 ++++ drivers/block/rnbd/rnbd-clt.c | 13 ++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e7b41ec7cd6a..5d3c3c80dab4 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -480,6 +480,10 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, if (ret >= len) return -ENAMETOOLONG; + ret = snprintf(buf, len, "%s@%s", buf, dev->sess->sessname); + if (ret >= len) + return -ENAMETOOLONG; + return 0; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index edefa0761a81..1bb495e50931 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1410,13 +1410,16 @@ out_alloc: return ERR_PTR(ret); } -static bool __exists_dev(const char *pathname) +static bool __exists_dev(const char *pathname, const char *sessname) { struct rnbd_clt_session *sess; struct rnbd_clt_dev *dev; bool found = false; list_for_each_entry(sess, &sess_list, list) { + if (sessname && strncmp(sess->sessname, sessname, + sizeof(sess->sessname))) + continue; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { if (!strncmp(dev->pathname, pathname, @@ -1433,12 +1436,12 @@ 
static bool __exists_dev(const char *pathname) return found; } -static bool exists_devpath(const char *pathname) +static bool exists_devpath(const char *pathname, const char *sessname) { bool found; mutex_lock(&sess_lock); - found = __exists_dev(pathname); + found = __exists_dev(pathname, sessname); mutex_unlock(&sess_lock); return found; @@ -1451,7 +1454,7 @@ static bool insert_dev_if_not_exists_devpath(const char *pathname, bool found; mutex_lock(&sess_lock); - found = __exists_dev(pathname); + found = __exists_dev(pathname, sess->sessname); if (!found) { mutex_lock(&sess->lock); list_add_tail(&dev->list, &sess->devs_list); @@ -1481,7 +1484,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, struct rnbd_clt_dev *dev; int ret; - if (exists_devpath(pathname)) + if (unlikely(exists_devpath(pathname, sessname))) return ERR_PTR(-EEXIST); sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr); -- cgit v1.2.3 From 786998050cbc8ead32e6e9fcda2facb3bf3d198d Mon Sep 17 00:00:00 2001 From: Lutz Pogrell Date: Thu, 26 Nov 2020 11:47:20 +0100 Subject: block/rnbd-srv: close a mapped device from server side. The forceful close of an exported device is required for the use case, when the client side hangs, is crashed, or is not accessible. There have been cases observed, where only some of the devices are to be cleaned up, but the session shall remain. When the device is to be exported to a different client host, server side cleanup is required. 
Signed-off-by: Lutz Pogrell Signed-off-by: Jack Wang Reviewed-by: Gioh Kim Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 38 ++++++++++++++++++++++++++++++++++++- drivers/block/rnbd/rnbd-srv.c | 19 +++++++++++++++++-- drivers/block/rnbd/rnbd-srv.h | 4 +++- 3 files changed, 57 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 106775c074d1..08ffb492ebfa 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -120,10 +120,46 @@ static ssize_t mapping_path_show(struct kobject *kobj, static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = __ATTR_RO(mapping_path); +static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + if (!sysfs_streq(buf, "1")) { + rnbd_srv_err(sess_dev, "%s: invalid value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + + rnbd_srv_info(sess_dev, "force close requested\n"); + + /* first remove sysfs itself to avoid deadlock */ + sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); + rnbd_srv_sess_dev_force_close(sess_dev); + + return count; +} + +static struct kobj_attribute rnbd_srv_dev_session_force_close_attr = + __ATTR(force_close, 0644, + rnbd_srv_dev_session_force_close_show, + rnbd_srv_dev_session_force_close_store); + static struct attribute *rnbd_srv_default_dev_sessions_attrs[] = { &rnbd_srv_dev_session_access_mode_attr.attr, &rnbd_srv_dev_session_ro_attr.attr, &rnbd_srv_dev_session_mapping_path_attr.attr, + &rnbd_srv_dev_session_force_close_attr.attr, NULL, }; @@ 
-145,7 +181,7 @@ static void rnbd_srv_sess_dev_release(struct kobject *kobj) struct rnbd_srv_sess_dev *sess_dev; sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - rnbd_destroy_sess_dev(sess_dev); + rnbd_destroy_sess_dev(sess_dev, sess_dev->keep_id); } static struct kobj_type rnbd_srv_sess_dev_ktype = { diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index e1bc8b4cd592..d1ee72ed8384 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -212,12 +212,20 @@ static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev) kref_put(&dev->kref, destroy_device_cb); } -void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) { DECLARE_COMPLETION_ONSTACK(dc); - xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); + if (keep_id) + /* free the resources for the id but don't */ + /* allow to re-use the id itself because it */ + /* is still used by the client */ + xa_cmpxchg(&sess_dev->sess->index_idr, sess_dev->device_id, + sess_dev, NULL, 0); + else + xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); synchronize_rcu(); + sess_dev->destroy_comp = &dc; rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ @@ -328,6 +336,13 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, } } +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) +{ + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + sess_dev->keep_id = true; + +} + static int process_msg_close(struct rtrs_srv *rtrs, struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 5a8544b5e74f..b157371c25ed 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -56,6 +56,7 @@ struct rnbd_srv_sess_dev { struct rnbd_srv_dev *dev; struct kobject kobj; u32 device_id; + bool keep_id; fmode_t 
open_flags; struct kref kref; struct completion *destroy_comp; @@ -63,6 +64,7 @@ struct rnbd_srv_sess_dev { enum rnbd_access_mode access_mode; }; +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev); /* rnbd-srv-sysfs.c */ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, @@ -73,6 +75,6 @@ int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); int rnbd_srv_create_sysfs_files(void); void rnbd_srv_destroy_sysfs_files(void); -void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev); +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id); #endif /* RNBD_SRV_H */ -- cgit v1.2.3 From d3a95ccaaf4df94743a958c90ab85f4703e3a687 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Nov 2020 11:47:22 +0100 Subject: block/rnbd: call kobject_put in the failure path Per the comment of kobject_init_and_add, we need to clean up the memory by calling kobject_put. Also, we need to call kobject_del in the other failure cases if kobject_init_and_add doesn't fail. 
Signed-off-by: Guoqing Jiang Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 +++- drivers/block/rnbd/rnbd-srv-sysfs.c | 28 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 5d3c3c80dab4..e3c3270b0cee 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -450,9 +450,11 @@ static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) ret = kobject_init_and_add(&dev->kobj, &rnbd_dev_ktype, gd_kobj, "%s", "rnbd"); - if (ret) + if (ret) { rnbd_clt_err(dev, "Failed to create device sysfs dir, err: %d\n", ret); + kobject_put(&dev->kobj); + } return ret; } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 08ffb492ebfa..05ffe488ddc6 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -47,13 +47,17 @@ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, rnbd_devs_kobj, dev_name); - if (ret) + if (ret) { + kobject_put(&dev->dev_kobj); return ret; + } dev->dev_sessions_kobj = kobject_create_and_add("sessions", &dev->dev_kobj); - if (!dev->dev_sessions_kobj) - goto put_dev_kobj; + if (!dev->dev_sessions_kobj) { + ret = -ENOMEM; + goto free_dev_kobj; + } bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj; ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev"); @@ -64,7 +68,8 @@ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, put_sess_kobj: kobject_put(dev->dev_sessions_kobj); -put_dev_kobj: +free_dev_kobj: + kobject_del(&dev->dev_kobj); kobject_put(&dev->dev_kobj); return ret; } @@ -196,18 +201,17 @@ int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) ret = kobject_init_and_add(&sess_dev->kobj, &rnbd_srv_sess_dev_ktype, 
sess_dev->dev->dev_sessions_kobj, "%s", sess_dev->sess->sessname); - if (ret) + if (ret) { + kobject_put(&sess_dev->kobj); return ret; + } ret = sysfs_create_group(&sess_dev->kobj, &rnbd_srv_default_dev_session_attr_group); - if (ret) - goto err; - - return 0; - -err: - kobject_put(&sess_dev->kobj); + if (ret) { + kobject_del(&sess_dev->kobj); + kobject_put(&sess_dev->kobj); + } return ret; } -- cgit v1.2.3 From 64e8a6ece1a5b1fa21316918053d068baeac84af Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 26 Nov 2020 11:47:23 +0100 Subject: block/rnbd-clt: Dynamically alloc buffer for pathname & blk_symlink_name For every rnbd_clt_dev, we alloc the pathname and blk_symlink_name statically to NAME_MAX which is 255 bytes. In most of the cases we only need less than 10 bytes, so 500 bytes per block device are wasted. This commit dynamically allocates memory buffer for pathname and blk_symlink_name. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Reviewed-by: Lutz Pogrell Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 12 ++++++++++-- drivers/block/rnbd/rnbd-clt.c | 14 +++++++++++--- drivers/block/rnbd/rnbd-clt.h | 4 ++-- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e3c3270b0cee..c3c96a567568 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -434,6 +434,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) */ if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) { sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + kfree(dev->blk_symlink_name); module_put(THIS_MODULE); } } @@ -492,10 +493,17 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) { struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; - int ret; + int ret, len; + + len = strlen(dev->pathname) + 
strlen(dev->sess->sessname) + 2; + dev->blk_symlink_name = kzalloc(len, GFP_KERNEL); + if (!dev->blk_symlink_name) { + rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n"); + goto out_err; + } ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, - sizeof(dev->blk_symlink_name)); + len); if (ret) { rnbd_clt_err(dev, "Failed to get /sys/block symlink path, err: %d\n", ret); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 1bb495e50931..34bc6083b58d 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -59,6 +59,7 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) ida_simple_remove(&index_ida, dev->clt_device_id); mutex_unlock(&ida_lock); kfree(dev->hw_queues); + kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); mutex_destroy(&dev->lock); kfree(dev); @@ -1387,10 +1388,17 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, pathname, sess->sessname, ret); goto out_queues; } + + dev->pathname = kzalloc(strlen(pathname) + 1, GFP_KERNEL); + if (!dev->pathname) { + ret = -ENOMEM; + goto out_queues; + } + strlcpy(dev->pathname, pathname, strlen(pathname) + 1); + dev->clt_device_id = ret; dev->sess = sess; dev->access_mode = access_mode; - strlcpy(dev->pathname, pathname, sizeof(dev->pathname)); mutex_init(&dev->lock); refcount_set(&dev->refcount, 1); dev->dev_state = DEV_STATE_INIT; @@ -1422,8 +1430,8 @@ static bool __exists_dev(const char *pathname, const char *sessname) continue; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { - if (!strncmp(dev->pathname, pathname, - sizeof(dev->pathname))) { + if (strlen(dev->pathname) == strlen(pathname) && + !strcmp(dev->pathname, pathname)) { found = true; break; } diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index ed33654aa486..b193d5904050 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -108,7 +108,7 @@ struct rnbd_clt_dev { u32 
clt_device_id; struct mutex lock; enum rnbd_clt_dev_state dev_state; - char pathname[NAME_MAX]; + char *pathname; enum rnbd_access_mode access_mode; bool read_only; bool rotational; @@ -126,7 +126,7 @@ struct rnbd_clt_dev { struct list_head list; struct gendisk *gd; struct kobject kobj; - char blk_symlink_name[NAME_MAX]; + char *blk_symlink_name; refcount_t refcount; struct work_struct unmap_on_rmmod_work; }; -- cgit v1.2.3 From 733c15bd3a944b8eeaacdddf061759b6a83dd3f4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Dec 2020 14:54:46 +0000 Subject: block/rnbd: fix a null pointer dereference on dev->blk_symlink_name Currently, in the case where dev->blk_symlink_name fails to be allocated, the error return path attempts to set an end-of-string character to the unallocated dev->blk_symlink_name, causing a null pointer dereference error. Fix this by returning with an explicit ENOMEM error (which was also missing in the original code, as ret was not initialized). Fixes: 1eb54f8f5dd8 ("block/rnbd: client: sysfs interface functions") Signed-off-by: Colin Ian King Addresses-Coverity: ("Dereference after null check") Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index c3c96a567568..a7caeedeb198 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -499,7 +499,7 @@ static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) dev->blk_symlink_name = kzalloc(len, GFP_KERNEL); if (!dev->blk_symlink_name) { rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n"); - goto out_err; + return -ENOMEM; } ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, -- cgit v1.2.3 From 0ebcdd702f49aeb0ad2e2d894f8c124a0acc6e23 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:11 +0900 Subject: null_blk: Fix zone size 
initialization A null_blk device with zoned mode enabled is currently initialized with a number of zones equal to the device capacity divided by the zone size, without considering if the device capacity is a multiple of the zone size. If the zone size is not a divisor of the capacity, the zones end up not covering the entire capacity, potentially resulting in out of bounds accesses to the zone array. Fix this by adding one last smaller zone with a size equal to the remainder of the disk capacity divided by the zone size if the capacity is not a multiple of the zone size. For such a smaller last zone, the zone capacity is also checked so that it does not exceed the smaller zone size. Reported-by: Naohiro Aota Fixes: ca4b2a011948 ("null_blk: add zone support") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index beb34b4f76b0..1d0370d91fe7 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -6,8 +6,7 @@ #define CREATE_TRACE_POINTS #include "null_blk_trace.h" -/* zone_size in MBs to sectors. 
*/ -#define ZONE_SIZE_SHIFT 11 +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) { @@ -16,7 +15,7 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { - sector_t dev_size = (sector_t)dev->size * 1024 * 1024; + sector_t dev_capacity_sects, zone_capacity_sects; sector_t sector = 0; unsigned int i; @@ -38,9 +37,13 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } - dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; - dev->nr_zones = dev_size >> - (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); + zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), GFP_KERNEL | __GFP_ZERO); if (!dev->zones) @@ -101,8 +104,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) struct blk_zone *zone = &dev->zones[i]; zone->start = zone->wp = sector; - zone->len = dev->zone_size_sects; - zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; + zone->capacity = + min_t(sector_t, zone->len, zone_capacity_sects); zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; -- cgit v1.2.3 From 2e896d89510f23927ec393bee1e0570db3d5a6c6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:12 +0900 Subject: null_blk: Fail zone append to conventional zones Conventional zones do not have a write pointer and so cannot accept 
zone append writes. Make sure to fail any zone append write command issued to a conventional zone. Reported-by: Naohiro Aota Fixes: e0489ed5daeb ("null_blk: Support REQ_OP_ZONE_APPEND") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 1d0370d91fe7..172f720b8d63 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -339,8 +339,11 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, trace_nullb_zone_op(cmd, zno, zone->cond); - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (append) + return BLK_STS_IOERR; return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + } null_lock_zone(dev, zno); -- cgit v1.2.3 From 2b8b7ed7f3fc2b1536a0add3941ae159529d23bd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:14 +0900 Subject: null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents the parallel execution on multiple queue of IO requests to different zones. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. 
null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information. This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none scheduler is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 28 ++++- drivers/block/null_blk_zoned.c | 280 ++++++++++++++++++++++++----------------- 2 files changed, 188 insertions(+), 120 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index c24d9b5ad81a..14546ead1d66 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include struct nullb_cmd { struct request *rq; @@ -32,6 +34,26 @@ struct nullb_queue { struct nullb_cmd *cmds; }; +struct nullb_zone { + /* + * Zone lock to prevent concurrent modification of a zone write + * pointer position and condition: with memory backing, a write + * command execution may sleep on memory allocation. For this case, + * use mutex as the zone lock. 
Otherwise, use the spinlock for + * locking the zone. + */ + union { + spinlock_t spinlock; + struct mutex mutex; + }; + enum blk_zone_type type; + enum blk_zone_cond cond; + sector_t start; + sector_t wp; + unsigned int len; + unsigned int capacity; +}; + struct nullb_device { struct nullb *nullb; struct config_item item; @@ -45,10 +67,10 @@ struct nullb_device { unsigned int nr_zones_imp_open; unsigned int nr_zones_exp_open; unsigned int nr_zones_closed; - struct blk_zone *zones; + struct nullb_zone *zones; sector_t zone_size_sects; - spinlock_t zone_lock; - unsigned long *zone_locks; + bool need_zone_res_mgmt; + spinlock_t zone_res_lock; unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 172f720b8d63..4d5c0b938618 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -13,9 +13,49 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } +static inline void null_lock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_lock_irq(&dev->zone_res_lock); +} + +static inline void null_unlock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_unlock_irq(&dev->zone_res_lock); +} + +static inline void null_init_zone_lock(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_init(&zone->spinlock); + else + mutex_init(&zone->mutex); +} + +static inline void null_lock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_irq(&zone->spinlock); + else + mutex_lock(&zone->mutex); +} + +static inline void null_unlock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_unlock_irq(&zone->spinlock); + else + mutex_unlock(&zone->mutex); +} + int 
null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { sector_t dev_capacity_sects, zone_capacity_sects; + struct nullb_zone *zone; sector_t sector = 0; unsigned int i; @@ -44,26 +84,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) if (dev_capacity_sects & (dev->zone_size_sects - 1)) dev->nr_zones++; - dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), - GFP_KERNEL | __GFP_ZERO); + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), + GFP_KERNEL | __GFP_ZERO); if (!dev->zones) return -ENOMEM; - /* - * With memory backing, the zone_lock spinlock needs to be temporarily - * released to avoid scheduling in atomic context. To guarantee zone - * information protection, use a bitmap to lock zones with - * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing - * implies that the queue is marked with BLK_MQ_F_BLOCKING. - */ - spin_lock_init(&dev->zone_lock); - if (dev->memory_backed) { - dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); - if (!dev->zone_locks) { - kvfree(dev->zones); - return -ENOMEM; - } - } + spin_lock_init(&dev->zone_res_lock); if (dev->zone_nr_conv >= dev->nr_zones) { dev->zone_nr_conv = dev->nr_zones - 1; @@ -86,10 +112,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); } + dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; for (i = 0; i < dev->zone_nr_conv; i++) { - struct blk_zone *zone = &dev->zones[i]; + zone = &dev->zones[i]; + null_init_zone_lock(dev, zone); zone->start = sector; zone->len = dev->zone_size_sects; zone->capacity = zone->len; @@ -101,8 +129,9 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) } for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - struct blk_zone *zone = &dev->zones[i]; + zone = &dev->zones[i]; + null_init_zone_lock(dev, zone); zone->start = 
zone->wp = sector; if (zone->start + dev->zone_size_sects > dev_capacity_sects) zone->len = dev_capacity_sects - zone->start; @@ -147,32 +176,17 @@ int null_register_zoned_dev(struct nullb *nullb) void null_free_zoned_dev(struct nullb_device *dev) { - bitmap_free(dev->zone_locks); kvfree(dev->zones); } -static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) -{ - if (dev->memory_backed) - wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&dev->zone_lock); -} - -static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) -{ - spin_unlock_irq(&dev->zone_lock); - - if (dev->memory_backed) - clear_and_wake_up_bit(zno, dev->zone_locks); -} - int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i, zno; - struct blk_zone zone; + unsigned int first_zone, i; + struct nullb_zone *zone; + struct blk_zone blkz; int error; first_zone = null_zone_no(dev, sector); @@ -182,19 +196,25 @@ int null_report_zones(struct gendisk *disk, sector_t sector, nr_zones = min(nr_zones, dev->nr_zones - first_zone); trace_nullb_report_zones(nullb, nr_zones); - zno = first_zone; - for (i = 0; i < nr_zones; i++, zno++) { + memset(&blkz, 0, sizeof(struct blk_zone)); + zone = &dev->zones[first_zone]; + for (i = 0; i < nr_zones; i++, zone++) { /* * Stacked DM target drivers will remap the zone information by * modifying the zone information passed to the report callback. * So use a local copy to avoid corruption of the device zone * array. 
*/ - null_lock_zone(dev, zno); - memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); - null_unlock_zone(dev, zno); - - error = cb(&zone, i, data); + null_lock_zone(dev, zone); + blkz.start = zone->start; + blkz.len = zone->len; + blkz.wp = zone->wp; + blkz.type = zone->type; + blkz.cond = zone->cond; + blkz.capacity = zone->capacity; + null_unlock_zone(dev, zone); + + error = cb(&blkz, i, data); if (error) return error; } @@ -210,7 +230,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len) { struct nullb_device *dev = nullb->dev; - struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; unsigned int nr_sectors = len >> SECTOR_SHIFT; /* Read must be below the write pointer position */ @@ -224,11 +244,9 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t __null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) { - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - switch (zone->cond) { case BLK_ZONE_COND_CLOSED: /* close operation on closed is not an error */ @@ -261,7 +279,7 @@ static void null_close_first_imp_zone(struct nullb_device *dev) for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - null_close_zone(dev, &dev->zones[i]); + __null_close_zone(dev, &dev->zones[i]); return; } } @@ -310,7 +328,8 @@ static blk_status_t null_check_open(struct nullb_device *dev) * it is not certain that closing an implicit open zone will allow a new zone * to be opened, since we might already be at the active limit capacity. 
*/ -static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_check_zone_resources(struct nullb_device *dev, + struct nullb_zone *zone) { blk_status_t ret; @@ -334,7 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, { struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zno]; + struct nullb_zone *zone = &dev->zones[zno]; blk_status_t ret; trace_nullb_zone_op(cmd, zno, zone->cond); @@ -345,26 +364,12 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); } - null_lock_zone(dev, zno); + null_lock_zone(dev, zone); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: + if (zone->cond == BLK_ZONE_COND_FULL) { /* Cannot write to a full zone */ ret = BLK_STS_IOERR; goto unlock; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - break; - default: - /* Invalid zone condition */ - ret = BLK_STS_IOERR; - goto unlock; } /* @@ -389,60 +394,69 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, goto unlock; } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) { + null_lock_zone_res(dev); + + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + null_unlock_zone_res(dev); + goto unlock; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + 
zone->cond = BLK_ZONE_COND_IMP_OPEN; + + null_unlock_zone_res(dev); } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - /* - * Memory backing allocation may sleep: release the zone_lock spinlock - * to avoid scheduling in atomic context. Zone operation atomicity is - * still guaranteed through the zone_locks bitmap. - */ - if (dev->memory_backed) - spin_unlock_irq(&dev->zone_lock); ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (dev->memory_backed) - spin_lock_irq(&dev->zone_lock); - if (ret != BLK_STS_OK) goto unlock; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { + null_lock_zone_res(dev); if (zone->cond == BLK_ZONE_COND_EXP_OPEN) dev->nr_zones_exp_open--; else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) dev->nr_zones_imp_open--; zone->cond = BLK_ZONE_COND_FULL; + null_unlock_zone_res(dev); } + ret = BLK_STS_OK; unlock: - null_unlock_zone(dev, zno); + null_unlock_zone(dev, zone); return ret; } -static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_open_zone(struct nullb_device *dev, + struct nullb_zone *zone) { - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: /* open operation on exp open is not an error */ - return BLK_STS_OK; + goto unlock; case BLK_ZONE_COND_EMPTY: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -450,35 +464,57 @@ static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zo case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } zone->cond 
= BLK_ZONE_COND_EXP_OPEN; dev->nr_zones_exp_open++; - return BLK_STS_OK; +unlock: + null_unlock_zone_res(dev); + + return ret; } -static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) { blk_status_t ret; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + ret = __null_close_zone(dev, zone); + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_FULL: /* finish operation on full is not an error */ - return BLK_STS_OK; + goto unlock; case BLK_ZONE_COND_EMPTY: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -489,27 +525,35 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone * case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; dev->nr_zones_closed--; break; default: - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; - return BLK_STS_OK; +unlock: + null_unlock_zone_res(dev); + + return ret; } -static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_reset_zone(struct nullb_device *dev, + struct nullb_zone *zone) { if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_EMPTY: /* reset operation on empty is not an error */ + null_unlock_zone_res(dev); return BLK_STS_OK; case BLK_ZONE_COND_IMP_OPEN: 
dev->nr_zones_imp_open--; @@ -523,12 +567,15 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *z case BLK_ZONE_COND_FULL: break; default: + null_unlock_zone_res(dev); return BLK_STS_IOERR; } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; + null_unlock_zone_res(dev); + return BLK_STS_OK; } @@ -537,19 +584,19 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, { struct nullb_device *dev = cmd->nq->dev; unsigned int zone_no; - struct blk_zone *zone; + struct nullb_zone *zone; blk_status_t ret; size_t i; if (op == REQ_OP_ZONE_RESET_ALL) { for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - null_lock_zone(dev, i); zone = &dev->zones[i]; + null_lock_zone(dev, zone); if (zone->cond != BLK_ZONE_COND_EMPTY) { null_reset_zone(dev, zone); trace_nullb_zone_op(cmd, i, zone->cond); } - null_unlock_zone(dev, i); + null_unlock_zone(dev, zone); } return BLK_STS_OK; } @@ -557,7 +604,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, zone_no = null_zone_no(dev, sector); zone = &dev->zones[zone_no]; - null_lock_zone(dev, zone_no); + null_lock_zone(dev, zone); switch (op) { case REQ_OP_ZONE_RESET: @@ -580,7 +627,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, if (ret == BLK_STS_OK) trace_nullb_zone_op(cmd, zone_no, zone->cond); - null_unlock_zone(dev, zone_no); + null_unlock_zone(dev, zone); return ret; } @@ -588,29 +635,28 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, sector_t nr_sectors) { - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); + struct nullb_device *dev; + struct nullb_zone *zone; blk_status_t sts; switch (op) { case REQ_OP_WRITE: - sts = null_zone_write(cmd, sector, nr_sectors, false); - break; + return null_zone_write(cmd, sector, nr_sectors, false); case REQ_OP_ZONE_APPEND: 
- sts = null_zone_write(cmd, sector, nr_sectors, true); - break; + return null_zone_write(cmd, sector, nr_sectors, true); case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - sts = null_zone_mgmt(cmd, op, sector); - break; + return null_zone_mgmt(cmd, op, sector); default: - null_lock_zone(dev, zno); + dev = cmd->nq->dev; + zone = &dev->zones[null_zone_no(dev, sector)]; + + null_lock_zone(dev, zone); sts = null_process_cmd(cmd, op, sector, nr_sectors); - null_unlock_zone(dev, zno); + null_unlock_zone(dev, zone); + return sts; } - - return sts; } -- cgit v1.2.3 From 2e8c6e0e1d2d65562c637940747cfa30559f976a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:15 +0900 Subject: null_blk: Improve implicit zone close When open zone resource management is enabled, that is, when a null_blk zoned device is created with zone_max_open different than 0, implicitly or explicitly opening a zone may require implicitly closing a zone that is already implicitly open. This operation is done using the function null_close_first_imp_zone(), which searches for an implicitly open zone to close starting from the first sequential zone. This implementation is simple but may result in the same zone being constantly implicitly closed and then implicitly reopened on write, namely, the lowest numbered zone that is being written. Avoid this by starting the search for an implicitly open zone from the zone following the last zone that was implicitly closed. The function null_close_first_imp_zone() is renamed null_close_imp_open_zone().
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_zoned.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 14546ead1d66..29a8817fadfc 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -67,6 +67,7 @@ struct nullb_device { unsigned int nr_zones_imp_open; unsigned int nr_zones_exp_open; unsigned int nr_zones_closed; + unsigned int imp_close_zone_no; struct nullb_zone *zones; sector_t zone_size_sects; bool need_zone_res_mgmt; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 4d5c0b938618..4dad8748a61d 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -113,6 +113,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) pr_info("zone_max_open limit disabled, limit >= zone count\n"); } dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; + dev->imp_close_zone_no = dev->zone_nr_conv; for (i = 0; i < dev->zone_nr_conv; i++) { zone = &dev->zones[i]; @@ -273,13 +274,24 @@ static blk_status_t __null_close_zone(struct nullb_device *dev, return BLK_STS_OK; } -static void null_close_first_imp_zone(struct nullb_device *dev) +static void null_close_imp_open_zone(struct nullb_device *dev) { - unsigned int i; + struct nullb_zone *zone; + unsigned int zno, i; + + zno = dev->imp_close_zone_no; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, &dev->zones[i]); + zone = &dev->zones[zno]; + zno++; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + __null_close_zone(dev, zone); + dev->imp_close_zone_no = zno; return; } } @@ -307,7 +319,7 @@ static 
blk_status_t null_check_open(struct nullb_device *dev) if (dev->nr_zones_imp_open) { if (null_check_active(dev) == BLK_STS_OK) { - null_close_first_imp_zone(dev); + null_close_imp_open_zone(dev); return BLK_STS_OK; } } -- cgit v1.2.3 From 49c7089f3ded981fcea387f853fa394788e60fb2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:16 +0900 Subject: null_blk: cleanup discard handling null_handle_discard() is called from both null_handle_rq() and null_handle_bio(). As these functions are only passed a nullb_cmd structure, this forces pointer dereferences to identify the discard operation code and to access the sector range to be discarded. Simplify all this by changing the interface of the functions null_handle_discard() and null_handle_memory_backed() to pass along the operation code, operation start sector and number of sectors. With this change null_handle_discard() can be called directly from null_handle_memory_backed(). Also add a message warning that the discard configuration attribute has no effect when memory backing is disabled. No functional change is introduced by this patch.
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 4685ea401d5b..a223bee24e76 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1076,13 +1076,16 @@ static void nullb_fill_pattern(struct nullb *nullb, struct page *page, kunmap_atomic(dst); } -static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) +static blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) { + struct nullb *nullb = dev->nullb; + size_t n = nr_sectors << SECTOR_SHIFT; size_t temp; spin_lock_irq(&nullb->lock); while (n > 0) { - temp = min_t(size_t, n, nullb->dev->blocksize); + temp = min_t(size_t, n, dev->blocksize); null_free_sector(nullb, sector, false); if (null_cache_active(nullb)) null_free_sector(nullb, sector, true); @@ -1090,6 +1093,8 @@ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) n -= temp; } spin_unlock_irq(&nullb->lock); + + return BLK_STS_OK; } static int null_handle_flush(struct nullb *nullb) @@ -1149,17 +1154,10 @@ static int null_handle_rq(struct nullb_cmd *cmd) struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; - sector_t sector; + sector_t sector = blk_rq_pos(rq); struct req_iterator iter; struct bio_vec bvec; - sector = blk_rq_pos(rq); - - if (req_op(rq) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, blk_rq_bytes(rq)); - return 0; - } - spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; @@ -1183,18 +1181,10 @@ static int null_handle_bio(struct nullb_cmd *cmd) struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; - sector_t sector; + sector_t sector = 
bio->bi_iter.bi_sector; struct bio_vec bvec; struct bvec_iter iter; - sector = bio->bi_iter.bi_sector; - - if (bio_op(bio) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, - bio_sectors(bio) << SECTOR_SHIFT); - return 0; - } - spin_lock_irq(&nullb->lock); bio_for_each_segment(bvec, bio, iter) { len = bvec.bv_len; @@ -1263,11 +1253,16 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op) + enum req_opf op, + sector_t sector, + sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; int err; + if (op == REQ_OP_DISCARD) + return null_handle_discard(dev, sector, nr_sectors); + if (dev->queue_mode == NULL_Q_BIO) err = null_handle_bio(cmd); else @@ -1343,7 +1338,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, } if (dev->memory_backed) - return null_handle_memory_backed(cmd, op); + return null_handle_memory_backed(cmd, op, sector, nr_sectors); return BLK_STS_OK; } @@ -1589,6 +1584,12 @@ static void null_config_discard(struct nullb *nullb) if (nullb->dev->discard == false) return; + if (!nullb->dev->memory_backed) { + nullb->dev->discard = false; + pr_info("discard option is ignored without memory backing\n"); + return; + } + if (nullb->dev->zoned) { nullb->dev->discard = false; pr_info("discard option is ignored in zoned mode\n"); -- cgit v1.2.3 From 0ec4d913ac69ec86757eec117fc2733018552aa7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:17 +0900 Subject: null_blk: discard zones on reset When memory backing is enabled, use null_handle_discard() to free the backing memory used by a zone when the zone is being reset. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 2 ++ drivers/block/null_blk_main.c | 4 ++-- drivers/block/null_blk_zoned.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 29a8817fadfc..63000aeeb2f3 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -116,6 +116,8 @@ struct nullb { char disk_name[DISK_NAME_LEN]; }; +blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, + sector_t nr_sectors); blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, unsigned int nr_sectors); diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index a223bee24e76..b758b9366630 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1076,8 +1076,8 @@ static void nullb_fill_pattern(struct nullb *nullb, struct page *page, kunmap_atomic(dst); } -static blk_status_t null_handle_discard(struct nullb_device *dev, - sector_t sector, sector_t nr_sectors) +blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) { struct nullb *nullb = dev->nullb; size_t n = nr_sectors << SECTOR_SHIFT; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 4dad8748a61d..65464f7559e0 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -588,6 +588,9 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, null_unlock_zone_res(dev); + if (dev->memory_backed) + return null_handle_discard(dev, zone->start, zone->len); + return BLK_STS_OK; } -- cgit v1.2.3 From ea17fd354ca8afd3e8962a77236b1a9a59262fdd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:18 +0900 Subject: null_blk: Allow controlling max_hw_sectors limit Add the module option and configfs 
attribute max_sectors to allow configuring the maximum size of a command issued to a null_blk device. This allows exercising the block layer BIO splitting with different limits than the default BLK_SAFE_MAX_SECTORS. This is also useful for testing the zone append write path of file systems as the max_hw_sectors limit value is also used for the max_zone_append_sectors limit. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_main.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 63000aeeb2f3..83504f3cc9d6 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -85,6 +85,7 @@ struct nullb_device { unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ unsigned int blocksize; /* block size */ + unsigned int max_sectors; /* Max sectors per command */ unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index b758b9366630..5357c3a4a36f 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -152,6 +152,10 @@ static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); +static int g_max_sectors; +module_param_named(max_sectors, g_max_sectors, int, 0444); +MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); + static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -346,6 +350,7 @@ NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); 
NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(max_sectors, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); @@ -463,6 +468,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_home_node, &nullb_device_attr_queue_mode, &nullb_device_attr_blocksize, + &nullb_device_attr_max_sectors, &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, @@ -533,7 +539,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -588,6 +594,7 @@ static struct nullb_device *null_alloc_dev(void) dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; + dev->max_sectors = g_max_sectors; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; @@ -1867,6 +1874,11 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_logical_block_size(nullb->q, dev->blocksize); blk_queue_physical_block_size(nullb->q, dev->blocksize); + if (!dev->max_sectors) + dev->max_sectors = queue_max_hw_sectors(nullb->q); + dev->max_sectors = min_t(unsigned int, dev->max_sectors, + BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); null_config_discard(nullb); @@ -1910,6 +1922,12 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } + if (g_max_sectors > BLK_DEF_MAX_SECTORS) { + pr_warn("invalid max sectors\n"); + pr_warn("defaults max sectors to %u\n", 
BLK_DEF_MAX_SECTORS); + g_max_sectors = BLK_DEF_MAX_SECTORS; + } + if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; -- cgit v1.2.3 From eebf34a85c8c724676eba502d15202854f199b05 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:19 +0900 Subject: null_blk: Move driver into its own directory Move null_blk driver code into the new sub-directory drivers/block/null_blk. Suggested-by: Bart Van Assche Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 8 +- drivers/block/Makefile | 7 +- drivers/block/null_blk.h | 162 --- drivers/block/null_blk/Kconfig | 12 + drivers/block/null_blk/Makefile | 11 + drivers/block/null_blk/main.c | 2031 +++++++++++++++++++++++++++++++++++++ drivers/block/null_blk/null_blk.h | 162 +++ drivers/block/null_blk/trace.c | 21 + drivers/block/null_blk/trace.h | 79 ++ drivers/block/null_blk/zoned.c | 677 +++++++++++++ drivers/block/null_blk_main.c | 2031 ------------------------------------- drivers/block/null_blk_trace.c | 21 - drivers/block/null_blk_trace.h | 79 -- drivers/block/null_blk_zoned.c | 677 ------------- 14 files changed, 2995 insertions(+), 2983 deletions(-) delete mode 100644 drivers/block/null_blk.h create mode 100644 drivers/block/null_blk/Kconfig create mode 100644 drivers/block/null_blk/Makefile create mode 100644 drivers/block/null_blk/main.c create mode 100644 drivers/block/null_blk/null_blk.h create mode 100644 drivers/block/null_blk/trace.c create mode 100644 drivers/block/null_blk/trace.h create mode 100644 drivers/block/null_blk/zoned.c delete mode 100644 drivers/block/null_blk_main.c delete mode 100644 drivers/block/null_blk_trace.c delete mode 100644 drivers/block/null_blk_trace.h delete mode 100644 drivers/block/null_blk_zoned.c (limited to 'drivers/block') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ecceaaa1a66f..262326973ee0 100644 
--- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -16,13 +16,7 @@ menuconfig BLK_DEV if BLK_DEV -config BLK_DEV_NULL_BLK - tristate "Null test block driver" - select CONFIGFS_FS - -config BLK_DEV_NULL_BLK_FAULT_INJECTION - bool "Support fault injection for Null test block driver" - depends on BLK_DEV_NULL_BLK && FAULT_INJECTION +source "drivers/block/null_blk/Kconfig" config BLK_DEV_FD tristate "Normal floppy disk support" diff --git a/drivers/block/Makefile b/drivers/block/Makefile index e1f63117ee94..a3170859e01d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,12 +41,7 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ -obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o -null_blk-objs := null_blk_main.o -ifeq ($(CONFIG_BLK_DEV_ZONED), y) -null_blk-$(CONFIG_TRACING) += null_blk_trace.o -endif -null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h deleted file mode 100644 index 83504f3cc9d6..000000000000 --- a/drivers/block/null_blk.h +++ /dev/null @@ -1,162 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BLK_NULL_BLK_H -#define __BLK_NULL_BLK_H - -#undef pr_fmt -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct nullb_cmd { - struct request *rq; - struct bio *bio; - unsigned int tag; - blk_status_t error; - struct nullb_queue *nq; - struct hrtimer timer; -}; - -struct nullb_queue { - unsigned long *tag_map; - wait_queue_head_t wait; - unsigned int queue_depth; - struct nullb_device *dev; - unsigned int requeue_selection; - - struct nullb_cmd *cmds; -}; - -struct nullb_zone { - /* - * Zone lock to prevent concurrent modification of a zone write - * pointer position and condition: with memory backing, a write - * command 
execution may sleep on memory allocation. For this case, - * use mutex as the zone lock. Otherwise, use the spinlock for - * locking the zone. - */ - union { - spinlock_t spinlock; - struct mutex mutex; - }; - enum blk_zone_type type; - enum blk_zone_cond cond; - sector_t start; - sector_t wp; - unsigned int len; - unsigned int capacity; -}; - -struct nullb_device { - struct nullb *nullb; - struct config_item item; - struct radix_tree_root data; /* data stored in the disk */ - struct radix_tree_root cache; /* disk cache data */ - unsigned long flags; /* device flags */ - unsigned int curr_cache; - struct badblocks badblocks; - - unsigned int nr_zones; - unsigned int nr_zones_imp_open; - unsigned int nr_zones_exp_open; - unsigned int nr_zones_closed; - unsigned int imp_close_zone_no; - struct nullb_zone *zones; - sector_t zone_size_sects; - bool need_zone_res_mgmt; - spinlock_t zone_res_lock; - - unsigned long size; /* device size in MB */ - unsigned long completion_nsec; /* time in ns to complete a request */ - unsigned long cache_size; /* disk cache size in MB */ - unsigned long zone_size; /* zone size in MB if device is zoned */ - unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ - unsigned int zone_nr_conv; /* number of conventional zones */ - unsigned int zone_max_open; /* max number of open zones */ - unsigned int zone_max_active; /* max number of active zones */ - unsigned int submit_queues; /* number of submission queues */ - unsigned int home_node; /* home node for the device */ - unsigned int queue_mode; /* block interface */ - unsigned int blocksize; /* block size */ - unsigned int max_sectors; /* Max sectors per command */ - unsigned int irqmode; /* IRQ completion handler */ - unsigned int hw_queue_depth; /* queue depth */ - unsigned int index; /* index of the disk, only valid with a disk */ - unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ - bool blocking; /* blocking blk-mq device */ - bool use_per_node_hctx; /* use 
per-node allocation for hardware context */ - bool power; /* power on/off the device */ - bool memory_backed; /* if data is stored in memory */ - bool discard; /* if support discard */ - bool zoned; /* if device is zoned */ -}; - -struct nullb { - struct nullb_device *dev; - struct list_head list; - unsigned int index; - struct request_queue *q; - struct gendisk *disk; - struct blk_mq_tag_set *tag_set; - struct blk_mq_tag_set __tag_set; - unsigned int queue_depth; - atomic_long_t cur_bytes; - struct hrtimer bw_timer; - unsigned long cache_flush_pos; - spinlock_t lock; - - struct nullb_queue *queues; - unsigned int nr_queues; - char disk_name[DISK_NAME_LEN]; -}; - -blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, - sector_t nr_sectors); -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors); - -#ifdef CONFIG_BLK_DEV_ZONED -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); -int null_register_zoned_dev(struct nullb *nullb); -void null_free_zoned_dev(struct nullb_device *dev); -int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - sector_t nr_sectors); -size_t null_zone_valid_read_len(struct nullb *nullb, - sector_t sector, unsigned int len); -#else -static inline int null_init_zoned_dev(struct nullb_device *dev, - struct request_queue *q) -{ - pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); - return -EINVAL; -} -static inline int null_register_zoned_dev(struct nullb *nullb) -{ - return -ENODEV; -} -static inline void null_free_zoned_dev(struct nullb_device *dev) {} -static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, sector_t nr_sectors) -{ - return BLK_STS_NOTSUPP; -} -static inline size_t null_zone_valid_read_len(struct nullb *nullb, - 
sector_t sector, - unsigned int len) -{ - return len; -} -#define null_report_zones NULL -#endif /* CONFIG_BLK_DEV_ZONED */ -#endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig new file mode 100644 index 000000000000..6bf1f8ca20a2 --- /dev/null +++ b/drivers/block/null_blk/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Null block device driver configuration +# + +config BLK_DEV_NULL_BLK + tristate "Null test block driver" + select CONFIGFS_FS + +config BLK_DEV_NULL_BLK_FAULT_INJECTION + bool "Support fault injection for Null test block driver" + depends on BLK_DEV_NULL_BLK && FAULT_INJECTION diff --git a/drivers/block/null_blk/Makefile b/drivers/block/null_blk/Makefile new file mode 100644 index 000000000000..84c36e512ab8 --- /dev/null +++ b/drivers/block/null_blk/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +# needed for trace events +ccflags-y += -I$(src) + +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +null_blk-objs := main.o +ifeq ($(CONFIG_BLK_DEV_ZONED), y) +null_blk-$(CONFIG_TRACING) += trace.o +endif +null_blk-$(CONFIG_BLK_DEV_ZONED) += zoned.o diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c new file mode 100644 index 000000000000..5357c3a4a36f --- /dev/null +++ b/drivers/block/null_blk/main.c @@ -0,0 +1,2031 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Add configfs and memory store: Kyungchan Koh and + * Shaohua Li + */ +#include + +#include +#include +#include +#include +#include "null_blk.h" + +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + +#define FREE_BATCH 16 + +#define TICKS_PER_SEC 50ULL +#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static DECLARE_FAULT_ATTR(null_timeout_attr); +static DECLARE_FAULT_ATTR(null_requeue_attr); +static 
DECLARE_FAULT_ATTR(null_init_hctx_attr); +#endif + +static inline u64 mb_per_tick(int mbps) +{ + return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); +} + +/* + * Status flags for nullb_device. + * + * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. + * UP: Device is currently on and visible in userspace. + * THROTTLED: Device is being throttled. + * CACHE: Device is using a write-back cache. + */ +enum nullb_device_flags { + NULLB_DEV_FL_CONFIGURED = 0, + NULLB_DEV_FL_UP = 1, + NULLB_DEV_FL_THROTTLED = 2, + NULLB_DEV_FL_CACHE = 3, +}; + +#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) +/* + * nullb_page is a page in memory for nullb devices. + * + * @page: The page holding the data. + * @bitmap: The bitmap represents which sector in the page has data. + * Each bit represents one block size. For example, sector 8 + * will use the 7th bit + * The highest 2 bits of bitmap are for special purpose. LOCK means the cache + * page is being flushing to storage. FREE means the cache page is freed and + * should be skipped from flushing to storage. 
Please see + * null_make_cache_space + */ +struct nullb_page { + struct page *page; + DECLARE_BITMAP(bitmap, MAP_SZ); +}; +#define NULLB_PAGE_LOCK (MAP_SZ - 1) +#define NULLB_PAGE_FREE (MAP_SZ - 2) + +static LIST_HEAD(nullb_list); +static struct mutex lock; +static int null_major; +static DEFINE_IDA(nullb_indexes); +static struct blk_mq_tag_set tag_set; + +enum { + NULL_IRQ_NONE = 0, + NULL_IRQ_SOFTIRQ = 1, + NULL_IRQ_TIMER = 2, +}; + +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + +static int g_no_sched; +module_param_named(no_sched, g_no_sched, int, 0444); +MODULE_PARM_DESC(no_sched, "No io scheduler"); + +static int g_submit_queues = 1; +module_param_named(submit_queues, g_submit_queues, int, 0444); +MODULE_PARM_DESC(submit_queues, "Number of submission queues"); + +static int g_home_node = NUMA_NO_NODE; +module_param_named(home_node, g_home_node, int, 0444); +MODULE_PARM_DESC(home_node, "Home node for the device"); + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +/* + * For more details about fault injection, please refer to + * Documentation/fault-injection/fault-injection.rst. + */ +static char g_timeout_str[80]; +module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); +MODULE_PARM_DESC(timeout, "Fault injection. timeout=,,,"); + +static char g_requeue_str[80]; +module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); +MODULE_PARM_DESC(requeue, "Fault injection. requeue=,,,"); + +static char g_init_hctx_str[80]; +module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); +MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. 
init_hctx=,,,"); +#endif + +static int g_queue_mode = NULL_Q_MQ; + +static int null_param_store_val(const char *str, int *val, int min, int max) +{ + int ret, new_val; + + ret = kstrtoint(str, 10, &new_val); + if (ret) + return -EINVAL; + + if (new_val < min || new_val > max) + return -EINVAL; + + *val = new_val; + return 0; +} + +static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); +} + +static const struct kernel_param_ops null_queue_mode_param_ops = { + .set = null_set_queue_mode, + .get = param_get_int, +}; + +device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); +MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); + +static int g_gb = 250; +module_param_named(gb, g_gb, int, 0444); +MODULE_PARM_DESC(gb, "Size in GB"); + +static int g_bs = 512; +module_param_named(bs, g_bs, int, 0444); +MODULE_PARM_DESC(bs, "Block size (in bytes)"); + +static int g_max_sectors; +module_param_named(max_sectors, g_max_sectors, int, 0444); +MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); + +static unsigned int nr_devices = 1; +module_param(nr_devices, uint, 0444); +MODULE_PARM_DESC(nr_devices, "Number of devices to register"); + +static bool g_blocking; +module_param_named(blocking, g_blocking, bool, 0444); +MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); + +static bool shared_tags; +module_param(shared_tags, bool, 0444); +MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); + +static bool g_shared_tag_bitmap; +module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); +MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); + +static int g_irqmode = NULL_IRQ_SOFTIRQ; + +static int null_set_irqmode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &g_irqmode, 
NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static const struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); +MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); + +static unsigned long g_completion_nsec = 10000; +module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); +MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); + +static int g_hw_queue_depth = 64; +module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); + +static bool g_use_per_node_hctx; +module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); +MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); + +static bool g_zoned; +module_param_named(zoned, g_zoned, bool, S_IRUGO); +MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); + +static unsigned long g_zone_size = 256; +module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); + +static unsigned long g_zone_capacity; +module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); + +static unsigned int g_zone_nr_conv; +module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); +MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); + +static unsigned int g_zone_max_open; +module_param_named(zone_max_open, g_zone_max_open, uint, 0444); +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. 
Default: 0 (no limit)"); + +static unsigned int g_zone_max_active; +module_param_named(zone_max_active, g_zone_max_active, uint, 0444); +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); + +static struct nullb_device *null_alloc_dev(void); +static void null_free_dev(struct nullb_device *dev); +static void null_del_dev(struct nullb *nullb); +static int null_add_dev(struct nullb_device *dev); +static void null_free_device_storage(struct nullb_device *dev, bool is_cache); + +static inline struct nullb_device *to_nullb_device(struct config_item *item) +{ + return item ? container_of(item, struct nullb_device, item) : NULL; +} + +static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%lu\n", val); +} + +static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static ssize_t nullb_device_uint_attr_store(unsigned int *val, + const char *page, size_t count) +{ + unsigned int tmp; + int result; + + result = kstrtouint(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_ulong_attr_store(unsigned long *val, + const char *page, size_t count) +{ + int result; + unsigned long tmp; + + result = kstrtoul(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, + size_t count) +{ + bool tmp; + int result; + + result = kstrtobool(page, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +/* The following macro should only be used with TYPE = {uint, ulong, bool}. 
*/ +#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ +static ssize_t \ +nullb_device_##NAME##_show(struct config_item *item, char *page) \ +{ \ + return nullb_device_##TYPE##_attr_show( \ + to_nullb_device(item)->NAME, page); \ +} \ +static ssize_t \ +nullb_device_##NAME##_store(struct config_item *item, const char *page, \ + size_t count) \ +{ \ + int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ + struct nullb_device *dev = to_nullb_device(item); \ + TYPE new_value = 0; \ + int ret; \ + \ + ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ + if (ret < 0) \ + return ret; \ + if (apply_fn) \ + ret = apply_fn(dev, new_value); \ + else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ + ret = -EBUSY; \ + if (ret < 0) \ + return ret; \ + dev->NAME = new_value; \ + return count; \ +} \ +CONFIGFS_ATTR(nullb_device_, NAME); + +static int nullb_apply_submit_queues(struct nullb_device *dev, + unsigned int submit_queues) +{ + struct nullb *nullb = dev->nullb; + struct blk_mq_tag_set *set; + + if (!nullb) + return 0; + + /* + * Make sure that null_init_hctx() does not access nullb->queues[] past + * the end of that array. + */ + if (submit_queues > nr_cpu_ids) + return -EINVAL; + set = nullb->tag_set; + blk_mq_update_nr_hw_queues(set, submit_queues); + return set->nr_hw_queues == submit_queues ? 
0 : -ENOMEM; +} + +NULLB_DEVICE_ATTR(size, ulong, NULL); +NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); +NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +NULLB_DEVICE_ATTR(home_node, uint, NULL); +NULLB_DEVICE_ATTR(queue_mode, uint, NULL); +NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(irqmode, uint, NULL); +NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); +NULLB_DEVICE_ATTR(index, uint, NULL); +NULLB_DEVICE_ATTR(blocking, bool, NULL); +NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); +NULLB_DEVICE_ATTR(memory_backed, bool, NULL); +NULLB_DEVICE_ATTR(discard, bool, NULL); +NULLB_DEVICE_ATTR(mbps, uint, NULL); +NULLB_DEVICE_ATTR(cache_size, ulong, NULL); +NULLB_DEVICE_ATTR(zoned, bool, NULL); +NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); +NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); + +static ssize_t nullb_device_power_show(struct config_item *item, char *page) +{ + return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); +} + +static ssize_t nullb_device_power_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *dev = to_nullb_device(item); + bool newp = false; + ssize_t ret; + + ret = nullb_device_bool_attr_store(&newp, page, count); + if (ret < 0) + return ret; + + if (!dev->power && newp) { + if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) + return count; + if (null_add_dev(dev)) { + clear_bit(NULLB_DEV_FL_UP, &dev->flags); + return -ENOMEM; + } + + set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + dev->power = newp; + } else if (dev->power && !newp) { + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = newp; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + } + + return count; +} + 
+CONFIGFS_ATTR(nullb_device_, power); + +static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) +{ + struct nullb_device *t_dev = to_nullb_device(item); + + return badblocks_show(&t_dev->badblocks, page, 0); +} + +static ssize_t nullb_device_badblocks_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *t_dev = to_nullb_device(item); + char *orig, *buf, *tmp; + u64 start, end; + int ret; + + orig = kstrndup(page, count, GFP_KERNEL); + if (!orig) + return -ENOMEM; + + buf = strstrip(orig); + + ret = -EINVAL; + if (buf[0] != '+' && buf[0] != '-') + goto out; + tmp = strchr(&buf[1], '-'); + if (!tmp) + goto out; + *tmp = '\0'; + ret = kstrtoull(buf + 1, 0, &start); + if (ret) + goto out; + ret = kstrtoull(tmp + 1, 0, &end); + if (ret) + goto out; + ret = -EINVAL; + if (start > end) + goto out; + /* enable badblocks */ + cmpxchg(&t_dev->badblocks.shift, -1, 0); + if (buf[0] == '+') + ret = badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1); + else + ret = badblocks_clear(&t_dev->badblocks, start, + end - start + 1); + if (ret == 0) + ret = count; +out: + kfree(orig); + return ret; +} +CONFIGFS_ATTR(nullb_device_, badblocks); + +static struct configfs_attribute *nullb_device_attrs[] = { + &nullb_device_attr_size, + &nullb_device_attr_completion_nsec, + &nullb_device_attr_submit_queues, + &nullb_device_attr_home_node, + &nullb_device_attr_queue_mode, + &nullb_device_attr_blocksize, + &nullb_device_attr_max_sectors, + &nullb_device_attr_irqmode, + &nullb_device_attr_hw_queue_depth, + &nullb_device_attr_index, + &nullb_device_attr_blocking, + &nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_power, + &nullb_device_attr_memory_backed, + &nullb_device_attr_discard, + &nullb_device_attr_mbps, + &nullb_device_attr_cache_size, + &nullb_device_attr_badblocks, + &nullb_device_attr_zoned, + &nullb_device_attr_zone_size, + &nullb_device_attr_zone_capacity, + &nullb_device_attr_zone_nr_conv, + 
&nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_max_active, + NULL, +}; + +static void nullb_device_release(struct config_item *item) +{ + struct nullb_device *dev = to_nullb_device(item); + + null_free_device_storage(dev, false); + null_free_dev(dev); +} + +static struct configfs_item_operations nullb_device_ops = { + .release = nullb_device_release, +}; + +static const struct config_item_type nullb_device_type = { + .ct_item_ops = &nullb_device_ops, + .ct_attrs = nullb_device_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct +config_item *nullb_group_make_item(struct config_group *group, const char *name) +{ + struct nullb_device *dev; + + dev = null_alloc_dev(); + if (!dev) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&dev->item, name, &nullb_device_type); + + return &dev->item; +} + +static void +nullb_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct nullb_device *dev = to_nullb_device(item); + + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = false; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + + config_item_put(item); +} + +static ssize_t memb_group_features_show(struct config_item *item, char *page) +{ + return snprintf(page, PAGE_SIZE, + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); +} + +CONFIGFS_ATTR_RO(memb_group_, features); + +static struct configfs_attribute *nullb_group_attrs[] = { + &memb_group_attr_features, + NULL, +}; + +static struct configfs_group_operations nullb_group_ops = { + .make_item = nullb_group_make_item, + .drop_item = nullb_group_drop_item, +}; + +static const struct config_item_type nullb_group_type = { + .ct_group_ops = &nullb_group_ops, + .ct_attrs = nullb_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nullb_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "nullb", + 
.ci_type = &nullb_group_type, + }, + }, +}; + +static inline int null_cache_active(struct nullb *nullb) +{ + return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); +} + +static struct nullb_device *null_alloc_dev(void) +{ + struct nullb_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); + INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); + if (badblocks_init(&dev->badblocks, 0)) { + kfree(dev); + return NULL; + } + + dev->size = g_gb * 1024; + dev->completion_nsec = g_completion_nsec; + dev->submit_queues = g_submit_queues; + dev->home_node = g_home_node; + dev->queue_mode = g_queue_mode; + dev->blocksize = g_bs; + dev->max_sectors = g_max_sectors; + dev->irqmode = g_irqmode; + dev->hw_queue_depth = g_hw_queue_depth; + dev->blocking = g_blocking; + dev->use_per_node_hctx = g_use_per_node_hctx; + dev->zoned = g_zoned; + dev->zone_size = g_zone_size; + dev->zone_capacity = g_zone_capacity; + dev->zone_nr_conv = g_zone_nr_conv; + dev->zone_max_open = g_zone_max_open; + dev->zone_max_active = g_zone_max_active; + return dev; +} + +static void null_free_dev(struct nullb_device *dev) +{ + if (!dev) + return; + + null_free_zoned_dev(dev); + badblocks_exit(&dev->badblocks); + kfree(dev); +} + +static void put_tag(struct nullb_queue *nq, unsigned int tag) +{ + clear_bit_unlock(tag, nq->tag_map); + + if (waitqueue_active(&nq->wait)) + wake_up(&nq->wait); +} + +static unsigned int get_tag(struct nullb_queue *nq) +{ + unsigned int tag; + + do { + tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); + if (tag >= nq->queue_depth) + return -1U; + } while (test_and_set_bit_lock(tag, nq->tag_map)); + + return tag; +} + +static void free_cmd(struct nullb_cmd *cmd) +{ + put_tag(cmd->nq, cmd->tag); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); + +static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + unsigned int tag; + + tag = get_tag(nq); 
+ if (tag != -1U) { + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } + return cmd; + } + + return NULL; +} + +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +{ + struct nullb_cmd *cmd; + DEFINE_WAIT(wait); + + cmd = __alloc_cmd(nq); + if (cmd || !can_wait) + return cmd; + + do { + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + cmd = __alloc_cmd(nq); + if (cmd) + break; + + io_schedule(); + } while (1); + + finish_wait(&nq->wait, &wait); + return cmd; +} + +static void end_cmd(struct nullb_cmd *cmd) +{ + int queue_mode = cmd->nq->dev->queue_mode; + + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_end_request(cmd->rq, cmd->error); + return; + case NULL_Q_BIO: + cmd->bio->bi_status = cmd->error; + bio_endio(cmd->bio); + break; + } + + free_cmd(cmd); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +{ + end_cmd(container_of(timer, struct nullb_cmd, timer)); + + return HRTIMER_NORESTART; +} + +static void null_cmd_end_timer(struct nullb_cmd *cmd) +{ + ktime_t kt = cmd->nq->dev->completion_nsec; + + hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); +} + +static void null_complete_rq(struct request *rq) +{ + end_cmd(blk_mq_rq_to_pdu(rq)); +} + +static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +{ + struct nullb_page *t_page; + + t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + if (!t_page) + goto out; + + t_page->page = alloc_pages(gfp_flags, 0); + if (!t_page->page) + goto out_freepage; + + memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); + return t_page; +out_freepage: + kfree(t_page); +out: + return NULL; +} + +static void null_free_page(struct nullb_page *t_page) +{ + __set_bit(NULLB_PAGE_FREE, t_page->bitmap); + if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) + return; + 
__free_page(t_page->page); + kfree(t_page); +} + +static bool null_page_empty(struct nullb_page *page) +{ + int size = MAP_SZ - 2; + + return find_first_bit(page->bitmap, size) == size; +} + +static void null_free_sector(struct nullb *nullb, sector_t sector, + bool is_cache) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page, *ret; + struct radix_tree_root *root; + + root = is_cache ? &nullb->dev->cache : &nullb->dev->data; + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + t_page = radix_tree_lookup(root, idx); + if (t_page) { + __clear_bit(sector_bit, t_page->bitmap); + + if (null_page_empty(t_page)) { + ret = radix_tree_delete_item(root, idx, t_page); + WARN_ON(ret != t_page); + null_free_page(ret); + if (is_cache) + nullb->dev->curr_cache -= PAGE_SIZE; + } + } +} + +static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, + struct nullb_page *t_page, bool is_cache) +{ + struct radix_tree_root *root; + + root = is_cache ? &nullb->dev->cache : &nullb->dev->data; + + if (radix_tree_insert(root, idx, t_page)) { + null_free_page(t_page); + t_page = radix_tree_lookup(root, idx); + WARN_ON(!t_page || t_page->page->index != idx); + } else if (is_cache) + nullb->dev->curr_cache += PAGE_SIZE; + + return t_page; +} + +static void null_free_device_storage(struct nullb_device *dev, bool is_cache) +{ + unsigned long pos = 0; + int nr_pages; + struct nullb_page *ret, *t_pages[FREE_BATCH]; + struct radix_tree_root *root; + + root = is_cache ? 
&dev->cache : &dev->data; + + do { + int i; + + nr_pages = radix_tree_gang_lookup(root, + (void **)t_pages, pos, FREE_BATCH); + + for (i = 0; i < nr_pages; i++) { + pos = t_pages[i]->page->index; + ret = radix_tree_delete_item(root, pos, t_pages[i]); + WARN_ON(ret != t_pages[i]); + null_free_page(ret); + } + + pos++; + } while (nr_pages == FREE_BATCH); + + if (is_cache) + dev->curr_cache = 0; +} + +static struct nullb_page *__null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool is_cache) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page; + struct radix_tree_root *root; + + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + root = is_cache ? &nullb->dev->cache : &nullb->dev->data; + t_page = radix_tree_lookup(root, idx); + WARN_ON(t_page && t_page->page->index != idx); + + if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) + return t_page; + + return NULL; +} + +static struct nullb_page *null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool ignore_cache) +{ + struct nullb_page *page = NULL; + + if (!ignore_cache) + page = __null_lookup_page(nullb, sector, for_write, true); + if (page) + return page; + return __null_lookup_page(nullb, sector, for_write, false); +} + +static struct nullb_page *null_insert_page(struct nullb *nullb, + sector_t sector, bool ignore_cache) + __releases(&nullb->lock) + __acquires(&nullb->lock) +{ + u64 idx; + struct nullb_page *t_page; + + t_page = null_lookup_page(nullb, sector, true, ignore_cache); + if (t_page) + return t_page; + + spin_unlock_irq(&nullb->lock); + + t_page = null_alloc_page(GFP_NOIO); + if (!t_page) + goto out_lock; + + if (radix_tree_preload(GFP_NOIO)) + goto out_freepage; + + spin_lock_irq(&nullb->lock); + idx = sector >> PAGE_SECTORS_SHIFT; + t_page->page->index = idx; + t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); + radix_tree_preload_end(); + + return t_page; +out_freepage: + 
null_free_page(t_page); +out_lock: + spin_lock_irq(&nullb->lock); + return null_lookup_page(nullb, sector, true, ignore_cache); +} + +static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) +{ + int i; + unsigned int offset; + u64 idx; + struct nullb_page *t_page, *ret; + void *dst, *src; + + idx = c_page->page->index; + + t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); + + __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); + if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { + null_free_page(c_page); + if (t_page && null_page_empty(t_page)) { + ret = radix_tree_delete_item(&nullb->dev->data, + idx, t_page); + null_free_page(t_page); + } + return 0; + } + + if (!t_page) + return -ENOMEM; + + src = kmap_atomic(c_page->page); + dst = kmap_atomic(t_page->page); + + for (i = 0; i < PAGE_SECTORS; + i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { + if (test_bit(i, c_page->bitmap)) { + offset = (i << SECTOR_SHIFT); + memcpy(dst + offset, src + offset, + nullb->dev->blocksize); + __set_bit(i, t_page->bitmap); + } + } + + kunmap_atomic(dst); + kunmap_atomic(src); + + ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); + null_free_page(ret); + nullb->dev->curr_cache -= PAGE_SIZE; + + return 0; +} + +static int null_make_cache_space(struct nullb *nullb, unsigned long n) +{ + int i, err, nr_pages; + struct nullb_page *c_pages[FREE_BATCH]; + unsigned long flushed = 0, one_round; + +again: + if ((nullb->dev->cache_size * 1024 * 1024) > + nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) + return 0; + + nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, + (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); + /* + * nullb_flush_cache_page could unlock before using the c_pages. 
To + * avoid race, we don't allow page free + */ + for (i = 0; i < nr_pages; i++) { + nullb->cache_flush_pos = c_pages[i]->page->index; + /* + * We found the page which is being flushed to disk by other + * threads + */ + if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) + c_pages[i] = NULL; + else + __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); + } + + one_round = 0; + for (i = 0; i < nr_pages; i++) { + if (c_pages[i] == NULL) + continue; + err = null_flush_cache_page(nullb, c_pages[i]); + if (err) + return err; + one_round++; + } + flushed += one_round << PAGE_SHIFT; + + if (n > flushed) { + if (nr_pages == 0) + nullb->cache_flush_pos = 0; + if (one_round == 0) { + /* give other threads a chance */ + spin_unlock_irq(&nullb->lock); + spin_lock_irq(&nullb->lock); + } + goto again; + } + return 0; +} + +static int copy_to_nullb(struct nullb *nullb, struct page *source, + unsigned int off, sector_t sector, size_t n, bool is_fua) +{ + size_t temp, count = 0; + unsigned int offset; + struct nullb_page *t_page; + void *dst, *src; + + while (count < n) { + temp = min_t(size_t, nullb->dev->blocksize, n - count); + + if (null_cache_active(nullb) && !is_fua) + null_make_cache_space(nullb, PAGE_SIZE); + + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; + t_page = null_insert_page(nullb, sector, + !null_cache_active(nullb) || is_fua); + if (!t_page) + return -ENOSPC; + + src = kmap_atomic(source); + dst = kmap_atomic(t_page->page); + memcpy(dst + offset, src + off + count, temp); + kunmap_atomic(dst); + kunmap_atomic(src); + + __set_bit(sector & SECTOR_MASK, t_page->bitmap); + + if (is_fua) + null_free_sector(nullb, sector, true); + + count += temp; + sector += temp >> SECTOR_SHIFT; + } + return 0; +} + +static int copy_from_nullb(struct nullb *nullb, struct page *dest, + unsigned int off, sector_t sector, size_t n) +{ + size_t temp, count = 0; + unsigned int offset; + struct nullb_page *t_page; + void *dst, *src; + + while (count < n) { + temp = min_t(size_t, 
nullb->dev->blocksize, n - count); + + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; + t_page = null_lookup_page(nullb, sector, false, + !null_cache_active(nullb)); + + dst = kmap_atomic(dest); + if (!t_page) { + memset(dst + off + count, 0, temp); + goto next; + } + src = kmap_atomic(t_page->page); + memcpy(dst + off + count, src + offset, temp); + kunmap_atomic(src); +next: + kunmap_atomic(dst); + + count += temp; + sector += temp >> SECTOR_SHIFT; + } + return 0; +} + +static void nullb_fill_pattern(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off) +{ + void *dst; + + dst = kmap_atomic(page); + memset(dst + off, 0xFF, len); + kunmap_atomic(dst); +} + +blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) +{ + struct nullb *nullb = dev->nullb; + size_t n = nr_sectors << SECTOR_SHIFT; + size_t temp; + + spin_lock_irq(&nullb->lock); + while (n > 0) { + temp = min_t(size_t, n, dev->blocksize); + null_free_sector(nullb, sector, false); + if (null_cache_active(nullb)) + null_free_sector(nullb, sector, true); + sector += temp >> SECTOR_SHIFT; + n -= temp; + } + spin_unlock_irq(&nullb->lock); + + return BLK_STS_OK; +} + +static int null_handle_flush(struct nullb *nullb) +{ + int err; + + if (!null_cache_active(nullb)) + return 0; + + spin_lock_irq(&nullb->lock); + while (true) { + err = null_make_cache_space(nullb, + nullb->dev->cache_size * 1024 * 1024); + if (err || nullb->dev->curr_cache == 0) + break; + } + + WARN_ON(!radix_tree_empty(&nullb->dev->cache)); + spin_unlock_irq(&nullb->lock); + return err; +} + +static int null_transfer(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off, bool is_write, sector_t sector, + bool is_fua) +{ + struct nullb_device *dev = nullb->dev; + unsigned int valid_len = len; + int err = 0; + + if (!is_write) { + if (dev->zoned) + valid_len = null_zone_valid_read_len(nullb, + sector, len); + + if (valid_len) { + err = 
copy_from_nullb(nullb, page, off, + sector, valid_len); + off += valid_len; + len -= valid_len; + } + + if (len) + nullb_fill_pattern(nullb, page, len, off); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + err = copy_to_nullb(nullb, page, off, sector, len, is_fua); + } + + return err; +} + +static int null_handle_rq(struct nullb_cmd *cmd) +{ + struct request *rq = cmd->rq; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector = blk_rq_pos(rq); + struct req_iterator iter; + struct bio_vec bvec; + + spin_lock_irq(&nullb->lock); + rq_for_each_segment(bvec, rq, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(req_op(rq)), sector, + rq->cmd_flags & REQ_FUA); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + + return 0; +} + +static int null_handle_bio(struct nullb_cmd *cmd) +{ + struct bio *bio = cmd->bio; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector = bio->bi_iter.bi_sector; + struct bio_vec bvec; + struct bvec_iter iter; + + spin_lock_irq(&nullb->lock); + bio_for_each_segment(bvec, bio, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(bio_op(bio)), sector, + bio->bi_opf & REQ_FUA); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + return 0; +} + +static void null_stop_queue(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_stop_hw_queues(q); +} + +static void null_restart_queue_async(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_start_stopped_hw_queues(q, true); +} + +static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) +{ + 
struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts = BLK_STS_OK; + struct request *rq = cmd->rq; + + if (!hrtimer_active(&nullb->bw_timer)) + hrtimer_restart(&nullb->bw_timer); + + if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { + null_stop_queue(nullb); + /* race with timer */ + if (atomic_long_read(&nullb->cur_bytes) > 0) + null_restart_queue_async(nullb); + /* requeue request */ + sts = BLK_STS_DEV_RESOURCE; + } + return sts; +} + +static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, + sector_t sector, + sector_t nr_sectors) +{ + struct badblocks *bb = &cmd->nq->dev->badblocks; + sector_t first_bad; + int bad_sectors; + + if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_IOERR; + + return BLK_STS_OK; +} + +static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, + enum req_opf op, + sector_t sector, + sector_t nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + int err; + + if (op == REQ_OP_DISCARD) + return null_handle_discard(dev, sector, nr_sectors); + + if (dev->queue_mode == NULL_Q_BIO) + err = null_handle_bio(cmd); + else + err = null_handle_rq(cmd); + + return errno_to_blk_status(err); +} + +static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct bio *bio; + + if (dev->memory_backed) + return; + + if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { + zero_fill_bio(cmd->bio); + } else if (req_op(cmd->rq) == REQ_OP_READ) { + __rq_for_each_bio(bio, cmd->rq) + zero_fill_bio(bio); + } +} + +static inline void nullb_complete_cmd(struct nullb_cmd *cmd) +{ + /* + * Since root privileges are required to configure the null_blk + * driver, it is fine that this driver does not initialize the + * data buffers of read commands. 
Zero-initialize these buffers + * anyway if KMSAN is enabled to prevent that KMSAN complains + * about null_blk not initializing read data buffers. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + nullb_zero_read_cmd_buffer(cmd); + + /* Complete IO by inline, softirq or timer */ + switch (cmd->nq->dev->irqmode) { + case NULL_IRQ_SOFTIRQ: + switch (cmd->nq->dev->queue_mode) { + case NULL_Q_MQ: + if (likely(!blk_should_fake_timeout(cmd->rq->q))) + blk_mq_complete_request(cmd->rq); + break; + case NULL_Q_BIO: + /* + * XXX: no proper submitting cpu information available. + */ + end_cmd(cmd); + break; + } + break; + case NULL_IRQ_NONE: + end_cmd(cmd); + break; + case NULL_IRQ_TIMER: + null_cmd_end_timer(cmd); + break; + } +} + +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + blk_status_t ret; + + if (dev->badblocks.shift != -1) { + ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (ret != BLK_STS_OK) + return ret; + } + + if (dev->memory_backed) + return null_handle_memory_backed(cmd, op, sector, nr_sectors); + + return BLK_STS_OK; +} + +static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts; + + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { + sts = null_handle_throttled(cmd); + if (sts != BLK_STS_OK) + return sts; + } + + if (op == REQ_OP_FLUSH) { + cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + goto out; + } + + if (dev->zoned) + cmd->error = null_process_zoned_cmd(cmd, op, + sector, nr_sectors); + else + cmd->error = null_process_cmd(cmd, op, sector, nr_sectors); + +out: + nullb_complete_cmd(cmd); + return BLK_STS_OK; +} + +static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) +{ + struct nullb *nullb = container_of(timer, struct nullb, bw_timer); + ktime_t 
timer_interval = ktime_set(0, TIMER_INTERVAL); + unsigned int mbps = nullb->dev->mbps; + + if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) + return HRTIMER_NORESTART; + + atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); + null_restart_queue_async(nullb); + + hrtimer_forward_now(&nullb->bw_timer, timer_interval); + + return HRTIMER_RESTART; +} + +static void nullb_setup_bwtimer(struct nullb *nullb) +{ + ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); + + hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + nullb->bw_timer.function = nullb_bwtimer_fn; + atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); + hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); +} + +static struct nullb_queue *nullb_to_queue(struct nullb *nullb) +{ + int index = 0; + + if (nullb->nr_queues != 1) + index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); + + return &nullb->queues[index]; +} + +static blk_qc_t null_submit_bio(struct bio *bio) +{ + sector_t sector = bio->bi_iter.bi_sector; + sector_t nr_sectors = bio_sectors(bio); + struct nullb *nullb = bio->bi_disk->private_data; + struct nullb_queue *nq = nullb_to_queue(nullb); + struct nullb_cmd *cmd; + + cmd = alloc_cmd(nq, 1); + cmd->bio = bio; + + null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); + return BLK_QC_T_NONE; +} + +static bool should_timeout_request(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_timeout_str[0]) + return should_fail(&null_timeout_attr, 1); +#endif + return false; +} + +static bool should_requeue_request(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_requeue_str[0]) + return should_fail(&null_requeue_attr, 1); +#endif + return false; +} + +static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) +{ + pr_info("rq %p timed out\n", rq); + blk_mq_complete_request(rq); + return BLK_EH_DONE; +} + +static blk_status_t 
null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + struct nullb_queue *nq = hctx->driver_data; + sector_t nr_sectors = blk_rq_sectors(bd->rq); + sector_t sector = blk_rq_pos(bd->rq); + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } + cmd->rq = bd->rq; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + + blk_mq_start_request(bd->rq); + + if (should_requeue_request(bd->rq)) { + /* + * Alternate between hitting the core BUSY path, and the + * driver driven requeue path + */ + nq->requeue_selection++; + if (nq->requeue_selection & 1) + return BLK_STS_RESOURCE; + else { + blk_mq_requeue_request(bd->rq, true); + return BLK_STS_OK; + } + } + if (should_timeout_request(bd->rq)) + return BLK_STS_OK; + + return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); +} + +static void cleanup_queue(struct nullb_queue *nq) +{ + kfree(nq->tag_map); + kfree(nq->cmds); +} + +static void cleanup_queues(struct nullb *nullb) +{ + int i; + + for (i = 0; i < nullb->nr_queues; i++) + cleanup_queue(&nullb->queues[i]); + + kfree(nullb->queues); +} + +static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nullb_queue *nq = hctx->driver_data; + struct nullb *nullb = nq->dev->nullb; + + nullb->nr_queues--; +} + +static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +{ + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; + nq->dev = nullb->dev; +} + +static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, + unsigned int hctx_idx) +{ + struct nullb *nullb = hctx->queue->queuedata; + struct nullb_queue *nq; + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) + return -EFAULT; +#endif + + nq = 
&nullb->queues[hctx_idx]; + hctx->driver_data = nq; + null_init_queue(nullb, nq); + nullb->nr_queues++; + + return 0; +} + +static const struct blk_mq_ops null_mq_ops = { + .queue_rq = null_queue_rq, + .complete = null_complete_rq, + .timeout = null_timeout_rq, + .init_hctx = null_init_hctx, + .exit_hctx = null_exit_hctx, +}; + +static void null_del_dev(struct nullb *nullb) +{ + struct nullb_device *dev; + + if (!nullb) + return; + + dev = nullb->dev; + + ida_simple_remove(&nullb_indexes, nullb->index); + + list_del_init(&nullb->list); + + del_gendisk(nullb->disk); + + if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { + hrtimer_cancel(&nullb->bw_timer); + atomic_long_set(&nullb->cur_bytes, LONG_MAX); + null_restart_queue_async(nullb); + } + + blk_cleanup_queue(nullb->q); + if (dev->queue_mode == NULL_Q_MQ && + nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); + put_disk(nullb->disk); + cleanup_queues(nullb); + if (null_cache_active(nullb)) + null_free_device_storage(nullb->dev, true); + kfree(nullb); + dev->nullb = NULL; +} + +static void null_config_discard(struct nullb *nullb) +{ + if (nullb->dev->discard == false) + return; + + if (!nullb->dev->memory_backed) { + nullb->dev->discard = false; + pr_info("discard option is ignored without memory backing\n"); + return; + } + + if (nullb->dev->zoned) { + nullb->dev->discard = false; + pr_info("discard option is ignored in zoned mode\n"); + return; + } + + nullb->q->limits.discard_granularity = nullb->dev->blocksize; + nullb->q->limits.discard_alignment = nullb->dev->blocksize; + blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); +} + +static const struct block_device_operations null_bio_ops = { + .owner = THIS_MODULE, + .submit_bio = null_submit_bio, + .report_zones = null_report_zones, +}; + +static const struct block_device_operations null_rq_ops = { + .owner = THIS_MODULE, + .report_zones = null_report_zones, +}; + +static 
int setup_commands(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + int i, tag_size; + + nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); + if (!nq->cmds) + return -ENOMEM; + + tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; + nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); + if (!nq->tag_map) { + kfree(nq->cmds); + return -ENOMEM; + } + + for (i = 0; i < nq->queue_depth; i++) { + cmd = &nq->cmds[i]; + cmd->tag = -1U; + } + + return 0; +} + +static int setup_queues(struct nullb *nullb) +{ + nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), + GFP_KERNEL); + if (!nullb->queues) + return -ENOMEM; + + nullb->queue_depth = nullb->dev->hw_queue_depth; + + return 0; +} + +static int init_driver_queues(struct nullb *nullb) +{ + struct nullb_queue *nq; + int i, ret = 0; + + for (i = 0; i < nullb->dev->submit_queues; i++) { + nq = &nullb->queues[i]; + + null_init_queue(nullb, nq); + + ret = setup_commands(nq); + if (ret) + return ret; + nullb->nr_queues++; + } + return 0; +} + +static int null_gendisk_register(struct nullb *nullb) +{ + sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; + struct gendisk *disk; + + disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); + if (!disk) + return -ENOMEM; + set_capacity(disk, size); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + if (queue_is_mq(nullb->q)) + disk->fops = &null_rq_ops; + else + disk->fops = &null_bio_ops; + disk->private_data = nullb; + disk->queue = nullb->q; + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + if (nullb->dev->zoned) { + int ret = null_register_zoned_dev(nullb); + + if (ret) + return ret; + } + + add_disk(disk); + return 0; +} + +static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) +{ + set->ops = &null_mq_ops; + set->nr_hw_queues = nullb ? 
nullb->dev->submit_queues : + g_submit_queues; + set->queue_depth = nullb ? nullb->dev->hw_queue_depth : + g_hw_queue_depth; + set->numa_node = nullb ? nullb->dev->home_node : g_home_node; + set->cmd_size = sizeof(struct nullb_cmd); + set->flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + set->flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + set->driver_data = NULL; + + if ((nullb && nullb->dev->blocking) || g_blocking) + set->flags |= BLK_MQ_F_BLOCKING; + + return blk_mq_alloc_tag_set(set); +} + +static int null_validate_conf(struct nullb_device *dev) +{ + dev->blocksize = round_down(dev->blocksize, 512); + dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); + + if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { + if (dev->submit_queues != nr_online_nodes) + dev->submit_queues = nr_online_nodes; + } else if (dev->submit_queues > nr_cpu_ids) + dev->submit_queues = nr_cpu_ids; + else if (dev->submit_queues == 0) + dev->submit_queues = 1; + + dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); + dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); + + /* Do memory allocation, so set blocking */ + if (dev->memory_backed) + dev->blocking = true; + else /* cache is meaningless */ + dev->cache_size = 0; + dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, + dev->cache_size); + dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); + /* can not stop a queue */ + if (dev->queue_mode == NULL_Q_BIO) + dev->mbps = 0; + + if (dev->zoned && + (!dev->zone_size || !is_power_of_2(dev->zone_size))) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + + return 0; +} + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static bool __null_setup_fault(struct fault_attr *attr, char *str) +{ + if (!str[0]) + return true; + + if (!setup_fault_attr(attr, str)) + return false; + + attr->verbose = 0; + return true; +} +#endif + +static bool 
null_setup_fault(void) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) + return false; + if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) + return false; + if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) + return false; +#endif + return true; +} + +static int null_add_dev(struct nullb_device *dev) +{ + struct nullb *nullb; + int rv; + + rv = null_validate_conf(dev); + if (rv) + return rv; + + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); + if (!nullb) { + rv = -ENOMEM; + goto out; + } + nullb->dev = dev; + dev->nullb = nullb; + + spin_lock_init(&nullb->lock); + + rv = setup_queues(nullb); + if (rv) + goto out_free_nullb; + + if (dev->queue_mode == NULL_Q_MQ) { + if (shared_tags) { + nullb->tag_set = &tag_set; + rv = 0; + } else { + nullb->tag_set = &nullb->__tag_set; + rv = null_init_tag_set(nullb, nullb->tag_set); + } + + if (rv) + goto out_cleanup_queues; + + if (!null_setup_fault()) + goto out_cleanup_queues; + + nullb->tag_set->timeout = 5 * HZ; + nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); + if (IS_ERR(nullb->q)) { + rv = -ENOMEM; + goto out_cleanup_tags; + } + } else if (dev->queue_mode == NULL_Q_BIO) { + nullb->q = blk_alloc_queue(dev->home_node); + if (!nullb->q) { + rv = -ENOMEM; + goto out_cleanup_queues; + } + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; + } + + if (dev->mbps) { + set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); + nullb_setup_bwtimer(nullb); + } + + if (dev->cache_size > 0) { + set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); + blk_queue_write_cache(nullb->q, true, true); + } + + if (dev->zoned) { + rv = null_init_zoned_dev(dev, nullb->q); + if (rv) + goto out_cleanup_blk_queue; + } + + nullb->q->queuedata = nullb; + blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); + + mutex_lock(&lock); + nullb->index = ida_simple_get(&nullb_indexes, 
0, 0, GFP_KERNEL); + dev->index = nullb->index; + mutex_unlock(&lock); + + blk_queue_logical_block_size(nullb->q, dev->blocksize); + blk_queue_physical_block_size(nullb->q, dev->blocksize); + if (!dev->max_sectors) + dev->max_sectors = queue_max_hw_sectors(nullb->q); + dev->max_sectors = min_t(unsigned int, dev->max_sectors, + BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); + + null_config_discard(nullb); + + sprintf(nullb->disk_name, "nullb%d", nullb->index); + + rv = null_gendisk_register(nullb); + if (rv) + goto out_cleanup_zone; + + mutex_lock(&lock); + list_add_tail(&nullb->list, &nullb_list); + mutex_unlock(&lock); + + return 0; +out_cleanup_zone: + null_free_zoned_dev(dev); +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); + dev->nullb = NULL; +out: + return rv; +} + +static int __init null_init(void) +{ + int ret = 0; + unsigned int i; + struct nullb *nullb; + struct nullb_device *dev; + + if (g_bs > PAGE_SIZE) { + pr_warn("invalid block size\n"); + pr_warn("defaults block size to %lu\n", PAGE_SIZE); + g_bs = PAGE_SIZE; + } + + if (g_max_sectors > BLK_DEF_MAX_SECTORS) { + pr_warn("invalid max sectors\n"); + pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS); + g_max_sectors = BLK_DEF_MAX_SECTORS; + } + + if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { + pr_err("invalid home_node value\n"); + g_home_node = NUMA_NO_NODE; + } + + if (g_queue_mode == NULL_Q_RQ) { + pr_err("legacy IO path no longer available\n"); + return -EINVAL; + } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { + if (g_submit_queues != nr_online_nodes) { + pr_warn("submit_queues param is set to %u.\n", + nr_online_nodes); + g_submit_queues = nr_online_nodes; + } + } else if (g_submit_queues > nr_cpu_ids) + 
g_submit_queues = nr_cpu_ids; + else if (g_submit_queues <= 0) + g_submit_queues = 1; + + if (g_queue_mode == NULL_Q_MQ && shared_tags) { + ret = null_init_tag_set(NULL, &tag_set); + if (ret) + return ret; + } + + config_group_init(&nullb_subsys.su_group); + mutex_init(&nullb_subsys.su_mutex); + + ret = configfs_register_subsystem(&nullb_subsys); + if (ret) + goto err_tagset; + + mutex_init(&lock); + + null_major = register_blkdev(0, "nullb"); + if (null_major < 0) { + ret = null_major; + goto err_conf; + } + + for (i = 0; i < nr_devices; i++) { + dev = null_alloc_dev(); + if (!dev) { + ret = -ENOMEM; + goto err_dev; + } + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); + goto err_dev; + } + } + + pr_info("module loaded\n"); + return 0; + +err_dev: + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; + null_del_dev(nullb); + null_free_dev(dev); + } + unregister_blkdev(null_major, "nullb"); +err_conf: + configfs_unregister_subsystem(&nullb_subsys); +err_tagset: + if (g_queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); + return ret; +} + +static void __exit null_exit(void) +{ + struct nullb *nullb; + + configfs_unregister_subsystem(&nullb_subsys); + + unregister_blkdev(null_major, "nullb"); + + mutex_lock(&lock); + while (!list_empty(&nullb_list)) { + struct nullb_device *dev; + + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; + null_del_dev(nullb); + null_free_dev(dev); + } + mutex_unlock(&lock); + + if (g_queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); +} + +module_init(null_init); +module_exit(null_exit); + +MODULE_AUTHOR("Jens Axboe "); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h new file mode 100644 index 000000000000..83504f3cc9d6 --- /dev/null +++ b/drivers/block/null_blk/null_blk.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef 
__BLK_NULL_BLK_H +#define __BLK_NULL_BLK_H + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nullb_cmd { + struct request *rq; + struct bio *bio; + unsigned int tag; + blk_status_t error; + struct nullb_queue *nq; + struct hrtimer timer; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + struct nullb_device *dev; + unsigned int requeue_selection; + + struct nullb_cmd *cmds; +}; + +struct nullb_zone { + /* + * Zone lock to prevent concurrent modification of a zone write + * pointer position and condition: with memory backing, a write + * command execution may sleep on memory allocation. For this case, + * use mutex as the zone lock. Otherwise, use the spinlock for + * locking the zone. + */ + union { + spinlock_t spinlock; + struct mutex mutex; + }; + enum blk_zone_type type; + enum blk_zone_cond cond; + sector_t start; + sector_t wp; + unsigned int len; + unsigned int capacity; +}; + +struct nullb_device { + struct nullb *nullb; + struct config_item item; + struct radix_tree_root data; /* data stored in the disk */ + struct radix_tree_root cache; /* disk cache data */ + unsigned long flags; /* device flags */ + unsigned int curr_cache; + struct badblocks badblocks; + + unsigned int nr_zones; + unsigned int nr_zones_imp_open; + unsigned int nr_zones_exp_open; + unsigned int nr_zones_closed; + unsigned int imp_close_zone_no; + struct nullb_zone *zones; + sector_t zone_size_sects; + bool need_zone_res_mgmt; + spinlock_t zone_res_lock; + + unsigned long size; /* device size in MB */ + unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned long cache_size; /* disk cache size in MB */ + unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ + unsigned int zone_nr_conv; /* number of 
conventional zones */ + unsigned int zone_max_open; /* max number of open zones */ + unsigned int zone_max_active; /* max number of active zones */ + unsigned int submit_queues; /* number of submission queues */ + unsigned int home_node; /* home node for the device */ + unsigned int queue_mode; /* block interface */ + unsigned int blocksize; /* block size */ + unsigned int max_sectors; /* Max sectors per command */ + unsigned int irqmode; /* IRQ completion handler */ + unsigned int hw_queue_depth; /* queue depth */ + unsigned int index; /* index of the disk, only valid with a disk */ + unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ + bool blocking; /* blocking blk-mq device */ + bool use_per_node_hctx; /* use per-node allocation for hardware context */ + bool power; /* power on/off the device */ + bool memory_backed; /* if data is stored in memory */ + bool discard; /* if support discard */ + bool zoned; /* if device is zoned */ +}; + +struct nullb { + struct nullb_device *dev; + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; + unsigned int queue_depth; + atomic_long_t cur_bytes; + struct hrtimer bw_timer; + unsigned long cache_flush_pos; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; +}; + +blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, + sector_t nr_sectors); +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors); + +#ifdef CONFIG_BLK_DEV_ZONED +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +int null_register_zoned_dev(struct nullb *nullb); +void null_free_zoned_dev(struct nullb_device *dev); +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +blk_status_t null_process_zoned_cmd(struct 
nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors); +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len); +#else +/* Stubs used when CONFIG_BLK_DEV_ZONED is not enabled */ +static inline int null_init_zoned_dev(struct nullb_device *dev, + struct request_queue *q) +{ + pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); + return -EINVAL; +} +static inline int null_register_zoned_dev(struct nullb *nullb) +{ + return -ENODEV; +} +static inline void null_free_zoned_dev(struct nullb_device *dev) {} +static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, sector_t nr_sectors) +{ + return BLK_STS_NOTSUPP; +} +static inline size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, + unsigned int len) +{ + return len; +} +#define null_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ +#endif /* __BLK_NULL_BLK_H */ diff --git a/drivers/block/null_blk/trace.c b/drivers/block/null_blk/trace.c new file mode 100644 index 000000000000..3711cba16071 --- /dev/null +++ b/drivers/block/null_blk/trace.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * null_blk trace related helpers. + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#include "trace.h" + +/* + * Helper to use for all null_blk traces to extract disk name. + * + * Appends "disk=<name>, " to @p when @name is a non-empty string, then a + * terminating NUL byte, and returns the pointer obtained from + * trace_seq_buffer_ptr() before anything was appended. + */ +const char *nullb_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (name && *name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h new file mode 100644 index 000000000000..ce3b430e88c5 --- /dev/null +++ b/drivers/block/null_blk/trace.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * null_blk device driver tracepoints. + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nullb + +#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NULLB_H + +#include +#include + +#include "null_blk.h" + +const char *nullb_trace_disk_name(struct trace_seq *p, char *name); + +#define __print_disk_name(name) nullb_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + +TRACE_EVENT(nullb_zone_op, + TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, + unsigned int zone_cond), + TP_ARGS(cmd, zone_no, zone_cond), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(enum req_opf, op) + __field(unsigned int, zone_no) + __field(unsigned int, zone_cond) + ), + TP_fast_assign( + __entry->op = req_op(cmd->rq); + __entry->zone_no = zone_no; + __entry->zone_cond = zone_cond; + __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + ), + TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", + __print_disk_name(__entry->disk), + blk_op_str(__entry->op), + __entry->zone_no, + blk_zone_cond_str(__entry->zone_cond)) +); + +TRACE_EVENT(nullb_report_zones, + TP_PROTO(struct nullb *nullb, unsigned int nr_zones), + TP_ARGS(nullb, nr_zones), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(unsigned int, nr_zones) + ), + TP_fast_assign( + __entry->nr_zones = nr_zones; + __assign_disk_name(__entry->disk, nullb->disk); + ), + TP_printk("%s nr_zones=%u", + __print_disk_name(__entry->disk), __entry->nr_zones) +); + +#endif /* _TRACE_NULLB_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c new file mode 100644 index 000000000000..148b871f263b --- /dev/null +++ b/drivers/block/null_blk/zoned.c @@ -0,0 +1,677 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "null_blk.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) + +static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) +{ + return sect >> ilog2(dev->zone_size_sects); +} + +static inline void null_lock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_lock_irq(&dev->zone_res_lock); +} + +static inline void null_unlock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_unlock_irq(&dev->zone_res_lock); +} + +static inline void null_init_zone_lock(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_init(&zone->spinlock); + else + mutex_init(&zone->mutex); +} + +static inline void null_lock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_irq(&zone->spinlock); + else + mutex_lock(&zone->mutex); +} + +static inline void null_unlock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_unlock_irq(&zone->spinlock); + else + mutex_unlock(&zone->mutex); +} + +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) +{ + sector_t dev_capacity_sects, zone_capacity_sects; + struct nullb_zone *zone; + sector_t sector = 0; + unsigned int i; + + if (!is_power_of_2(dev->zone_size)) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + if (dev->zone_size > dev->size) { + pr_err("Zone size larger than device capacity\n"); + return -EINVAL; + } + + if (!dev->zone_capacity) + dev->zone_capacity 
= dev->zone_size; + + if (dev->zone_capacity > dev->zone_size) { + /* pr_fmt already prefixes every message with the module name */ + pr_err("zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); + return -EINVAL; + } + + zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + /* Round the zone count up so that a trailing partial zone is counted */ + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), + GFP_KERNEL | __GFP_ZERO); + if (!dev->zones) + return -ENOMEM; + + spin_lock_init(&dev->zone_res_lock); + + if (dev->zone_nr_conv >= dev->nr_zones) { + dev->zone_nr_conv = dev->nr_zones - 1; + pr_info("changed the number of conventional zones to %u\n", + dev->zone_nr_conv); + } + + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ + if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_active = 0; + pr_info("zone_max_active limit disabled, limit >= zone count\n"); + } + + /* Max open zones has to be <= max active zones */ + if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { + dev->zone_max_open = dev->zone_max_active; + /* Report the clamped limit itself, not the zone count */ + pr_info("changed the maximum number of open zones to %u\n", + dev->zone_max_open); + } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_open = 0; + pr_info("zone_max_open limit disabled, limit >= zone count\n"); + } + dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; + dev->imp_close_zone_no = dev->zone_nr_conv; + + /* Conventional zones come first: full-size capacity, no write pointer */ + for (i = 0; i < dev->zone_nr_conv; i++) { + zone = &dev->zones[i]; + + null_init_zone_lock(dev, zone); + zone->start = sector; + zone->len = dev->zone_size_sects; + zone->capacity = zone->len; + zone->wp = zone->start + zone->len; + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->cond = BLK_ZONE_COND_NOT_WP; + + sector += 
dev->zone_size_sects; + } + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + zone = &dev->zones[i]; + + null_init_zone_lock(dev, zone); + zone->start = zone->wp = sector; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; + zone->capacity = + min_t(sector_t, zone->len, zone_capacity_sects); + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone->cond = BLK_ZONE_COND_EMPTY; + + sector += dev->zone_size_sects; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + + return 0; +} + +int null_register_zoned_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + struct request_queue *q = nullb->q; + + if (queue_is_mq(q)) { + int ret = blk_revalidate_disk_zones(nullb->disk, NULL); + + if (ret) + return ret; + } else { + blk_queue_chunk_sectors(q, dev->zone_size_sects); + q->nr_zones = blkdev_nr_zones(nullb->disk); + } + + blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); + blk_queue_max_open_zones(q, dev->zone_max_open); + blk_queue_max_active_zones(q, dev->zone_max_active); + + return 0; +} + +void null_free_zoned_dev(struct nullb_device *dev) +{ + kvfree(dev->zones); +} + +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nullb *nullb = disk->private_data; + struct nullb_device *dev = nullb->dev; + unsigned int first_zone, i; + struct nullb_zone *zone; + struct blk_zone blkz; + int error; + + first_zone = null_zone_no(dev, sector); + if (first_zone >= dev->nr_zones) + return 0; + + nr_zones = min(nr_zones, dev->nr_zones - first_zone); + trace_nullb_report_zones(nullb, nr_zones); + + memset(&blkz, 0, sizeof(struct blk_zone)); + zone = &dev->zones[first_zone]; + for (i = 0; i < nr_zones; i++, zone++) { + /* + * Stacked DM target drivers will remap the zone 
information by + * modifying the zone information passed to the report callback. + * So use a local copy to avoid corruption of the device zone + * array. + */ + null_lock_zone(dev, zone); + blkz.start = zone->start; + blkz.len = zone->len; + blkz.wp = zone->wp; + blkz.type = zone->type; + blkz.cond = zone->cond; + blkz.capacity = zone->capacity; + null_unlock_zone(dev, zone); + + error = cb(&blkz, i, data); + if (error) + return error; + } + + return nr_zones; +} + +/* + * This is called in the case of memory backing from null_process_cmd() + * with the target zone already locked. + */ +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len) +{ + struct nullb_device *dev = nullb->dev; + struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + unsigned int nr_sectors = len >> SECTOR_SHIFT; + + /* Read must be below the write pointer position */ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || + sector + nr_sectors <= zone->wp) + return len; + + if (sector > zone->wp) + return 0; + + return (zone->wp - sector) << SECTOR_SHIFT; +} + +static blk_status_t __null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } + + return BLK_STS_OK; +} + +static void null_close_imp_open_zone(struct nullb_device *dev) +{ + struct nullb_zone *zone; + unsigned int zno, i; + + zno = dev->imp_close_zone_no; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + zone = 
&dev->zones[zno]; + zno++; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + __null_close_zone(dev, zone); + dev->imp_close_zone_no = zno; + return; + } + } +} + +static blk_status_t null_check_active(struct nullb_device *dev) +{ + if (!dev->zone_max_active) + return BLK_STS_OK; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + + dev->nr_zones_closed < dev->zone_max_active) + return BLK_STS_OK; + + return BLK_STS_ZONE_ACTIVE_RESOURCE; +} + +static blk_status_t null_check_open(struct nullb_device *dev) +{ + if (!dev->zone_max_open) + return BLK_STS_OK; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) + return BLK_STS_OK; + + if (dev->nr_zones_imp_open) { + if (null_check_active(dev) == BLK_STS_OK) { + null_close_imp_open_zone(dev); + return BLK_STS_OK; + } + } + + return BLK_STS_ZONE_OPEN_RESOURCE; +} + +/* + * This function matches the manage open zone resources function in the ZBC standard, + * with the addition of max active zones support (added in the ZNS standard). + * + * The function determines if a zone can transition to implicit open or explicit open, + * while maintaining the max open zone (and max active zone) limit(s). It may close an + * implicit open zone in order to make additional zone resources available. + * + * ZBC states that an implicit open zone shall be closed only if there is not + * room within the open limit. However, with the addition of an active limit, + * it is not certain that closing an implicit open zone will allow a new zone + * to be opened, since we might already be at the active limit capacity. 
+ */ +static blk_status_t null_check_zone_resources(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret; + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_active(dev); + if (ret != BLK_STS_OK) + return ret; + fallthrough; + case BLK_ZONE_COND_CLOSED: + return null_check_open(dev); + default: + /* Should never be called for other states */ + WARN_ON(1); + return BLK_STS_IOERR; + } +} + +static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, + unsigned int nr_sectors, bool append) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zno = null_zone_no(dev, sector); + struct nullb_zone *zone = &dev->zones[zno]; + blk_status_t ret; + + trace_nullb_zone_op(cmd, zno, zone->cond); + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (append) + return BLK_STS_IOERR; + return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + } + + null_lock_zone(dev, zone); + + if (zone->cond == BLK_ZONE_COND_FULL) { + /* Cannot write to a full zone */ + ret = BLK_STS_IOERR; + goto unlock; + } + + /* + * Regular writes must be at the write pointer position. + * Zone append writes are automatically issued at the write + * pointer and the position returned using the request or BIO + * sector. 
+ */ + if (append) { + sector = zone->wp; + if (cmd->bio) + cmd->bio->bi_iter.bi_sector = sector; + else + cmd->rq->__sector = sector; + } else if (sector != zone->wp) { + ret = BLK_STS_IOERR; + goto unlock; + } + + if (zone->wp + nr_sectors > zone->start + zone->capacity) { + ret = BLK_STS_IOERR; + goto unlock; + } + + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) { + null_lock_zone_res(dev); + + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + null_unlock_zone_res(dev); + goto unlock; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + null_unlock_zone_res(dev); + } + + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + if (ret != BLK_STS_OK) + goto unlock; + + zone->wp += nr_sectors; + if (zone->wp == zone->start + zone->capacity) { + null_lock_zone_res(dev); + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + zone->cond = BLK_ZONE_COND_FULL; + null_unlock_zone_res(dev); + } + + ret = BLK_STS_OK; + +unlock: + null_unlock_zone(dev, zone); + + return ret; +} + +static blk_status_t null_open_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + /* open operation on exp open is not an error */ + goto unlock; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != 
BLK_STS_OK) + goto unlock; + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + default: + ret = BLK_STS_IOERR; + goto unlock; + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + dev->nr_zones_exp_open++; + +unlock: + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + ret = __null_close_zone(dev, zone); + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* finish operation on full is not an error */ + goto unlock; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + dev->nr_zones_closed--; + break; + default: + ret = BLK_STS_IOERR; + goto unlock; + } + + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->len; + +unlock: + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_reset_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + /* reset operation on empty is not an error */ + null_unlock_zone_res(dev); + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + 
break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + break; + default: + null_unlock_zone_res(dev); + return BLK_STS_IOERR; + } + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + + null_unlock_zone_res(dev); + + if (dev->memory_backed) + return null_handle_discard(dev, zone->start, zone->len); + + return BLK_STS_OK; +} + +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zone_no; + struct nullb_zone *zone; + blk_status_t ret; + size_t i; + + if (op == REQ_OP_ZONE_RESET_ALL) { + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + zone = &dev->zones[i]; + null_lock_zone(dev, zone); + if (zone->cond != BLK_ZONE_COND_EMPTY) { + null_reset_zone(dev, zone); + trace_nullb_zone_op(cmd, i, zone->cond); + } + null_unlock_zone(dev, zone); + } + return BLK_STS_OK; + } + + zone_no = null_zone_no(dev, sector); + zone = &dev->zones[zone_no]; + + null_lock_zone(dev, zone); + + switch (op) { + case REQ_OP_ZONE_RESET: + ret = null_reset_zone(dev, zone); + break; + case REQ_OP_ZONE_OPEN: + ret = null_open_zone(dev, zone); + break; + case REQ_OP_ZONE_CLOSE: + ret = null_close_zone(dev, zone); + break; + case REQ_OP_ZONE_FINISH: + ret = null_finish_zone(dev, zone); + break; + default: + ret = BLK_STS_NOTSUPP; + break; + } + + if (ret == BLK_STS_OK) + trace_nullb_zone_op(cmd, zone_no, zone->cond); + + null_unlock_zone(dev, zone); + + return ret; +} + +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector, sector_t nr_sectors) +{ + struct nullb_device *dev; + struct nullb_zone *zone; + blk_status_t sts; + + switch (op) { + case REQ_OP_WRITE: + return null_zone_write(cmd, sector, nr_sectors, false); + case REQ_OP_ZONE_APPEND: + return null_zone_write(cmd, sector, nr_sectors, true); + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_OPEN: + case 
REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return null_zone_mgmt(cmd, op, sector); + default: + dev = cmd->nq->dev; + zone = &dev->zones[null_zone_no(dev, sector)]; + + null_lock_zone(dev, zone); + sts = null_process_cmd(cmd, op, sector, nr_sectors); + null_unlock_zone(dev, zone); + return sts; + } +} diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c deleted file mode 100644 index 5357c3a4a36f..000000000000 --- a/drivers/block/null_blk_main.c +++ /dev/null @@ -1,2031 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Add configfs and memory store: Kyungchan Koh and - * Shaohua Li - */ -#include - -#include -#include -#include -#include -#include "null_blk.h" - -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - -#define FREE_BATCH 16 - -#define TICKS_PER_SEC 50ULL -#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -static DECLARE_FAULT_ATTR(null_timeout_attr); -static DECLARE_FAULT_ATTR(null_requeue_attr); -static DECLARE_FAULT_ATTR(null_init_hctx_attr); -#endif - -static inline u64 mb_per_tick(int mbps) -{ - return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); -} - -/* - * Status flags for nullb_device. - * - * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. - * UP: Device is currently on and visible in userspace. - * THROTTLED: Device is being throttled. - * CACHE: Device is using a write-back cache. - */ -enum nullb_device_flags { - NULLB_DEV_FL_CONFIGURED = 0, - NULLB_DEV_FL_UP = 1, - NULLB_DEV_FL_THROTTLED = 2, - NULLB_DEV_FL_CACHE = 3, -}; - -#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) -/* - * nullb_page is a page in memory for nullb devices. - * - * @page: The page holding the data. - * @bitmap: The bitmap represents which sector in the page has data. - * Each bit represents one block size. 
For example, sector 8 - * will use the 7th bit - * The highest 2 bits of bitmap are for special purpose. LOCK means the cache - * page is being flushing to storage. FREE means the cache page is freed and - * should be skipped from flushing to storage. Please see - * null_make_cache_space - */ -struct nullb_page { - struct page *page; - DECLARE_BITMAP(bitmap, MAP_SZ); -}; -#define NULLB_PAGE_LOCK (MAP_SZ - 1) -#define NULLB_PAGE_FREE (MAP_SZ - 2) - -static LIST_HEAD(nullb_list); -static struct mutex lock; -static int null_major; -static DEFINE_IDA(nullb_indexes); -static struct blk_mq_tag_set tag_set; - -enum { - NULL_IRQ_NONE = 0, - NULL_IRQ_SOFTIRQ = 1, - NULL_IRQ_TIMER = 2, -}; - -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - -static int g_no_sched; -module_param_named(no_sched, g_no_sched, int, 0444); -MODULE_PARM_DESC(no_sched, "No io scheduler"); - -static int g_submit_queues = 1; -module_param_named(submit_queues, g_submit_queues, int, 0444); -MODULE_PARM_DESC(submit_queues, "Number of submission queues"); - -static int g_home_node = NUMA_NO_NODE; -module_param_named(home_node, g_home_node, int, 0444); -MODULE_PARM_DESC(home_node, "Home node for the device"); - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -/* - * For more details about fault injection, please refer to - * Documentation/fault-injection/fault-injection.rst. - */ -static char g_timeout_str[80]; -module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); -MODULE_PARM_DESC(timeout, "Fault injection. timeout=,,,"); - -static char g_requeue_str[80]; -module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); -MODULE_PARM_DESC(requeue, "Fault injection. requeue=,,,"); - -static char g_init_hctx_str[80]; -module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); -MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. 
init_hctx=,,,"); -#endif - -static int g_queue_mode = NULL_Q_MQ; - -static int null_param_store_val(const char *str, int *val, int min, int max) -{ - int ret, new_val; - - ret = kstrtoint(str, 10, &new_val); - if (ret) - return -EINVAL; - - if (new_val < min || new_val > max) - return -EINVAL; - - *val = new_val; - return 0; -} - -static int null_set_queue_mode(const char *str, const struct kernel_param *kp) -{ - return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); -} - -static const struct kernel_param_ops null_queue_mode_param_ops = { - .set = null_set_queue_mode, - .get = param_get_int, -}; - -device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); -MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); - -static int g_gb = 250; -module_param_named(gb, g_gb, int, 0444); -MODULE_PARM_DESC(gb, "Size in GB"); - -static int g_bs = 512; -module_param_named(bs, g_bs, int, 0444); -MODULE_PARM_DESC(bs, "Block size (in bytes)"); - -static int g_max_sectors; -module_param_named(max_sectors, g_max_sectors, int, 0444); -MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); - -static unsigned int nr_devices = 1; -module_param(nr_devices, uint, 0444); -MODULE_PARM_DESC(nr_devices, "Number of devices to register"); - -static bool g_blocking; -module_param_named(blocking, g_blocking, bool, 0444); -MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); - -static bool shared_tags; -module_param(shared_tags, bool, 0444); -MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); - -static bool g_shared_tag_bitmap; -module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); -MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); - -static int g_irqmode = NULL_IRQ_SOFTIRQ; - -static int null_set_irqmode(const char *str, const struct kernel_param *kp) -{ - return null_param_store_val(str, &g_irqmode, 
NULL_IRQ_NONE, - NULL_IRQ_TIMER); -} - -static const struct kernel_param_ops null_irqmode_param_ops = { - .set = null_set_irqmode, - .get = param_get_int, -}; - -device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); -MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); - -static unsigned long g_completion_nsec = 10000; -module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); -MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); - -static int g_hw_queue_depth = 64; -module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); -MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); - -static bool g_use_per_node_hctx; -module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); -MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); - -static bool g_zoned; -module_param_named(zoned, g_zoned, bool, S_IRUGO); -MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); - -static unsigned long g_zone_size = 256; -module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); -MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); - -static unsigned long g_zone_capacity; -module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); -MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); - -static unsigned int g_zone_nr_conv; -module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); -MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); - -static unsigned int g_zone_max_open; -module_param_named(zone_max_open, g_zone_max_open, uint, 0444); -MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. 
Default: 0 (no limit)"); - -static unsigned int g_zone_max_active; -module_param_named(zone_max_active, g_zone_max_active, uint, 0444); -MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); - -static struct nullb_device *null_alloc_dev(void); -static void null_free_dev(struct nullb_device *dev); -static void null_del_dev(struct nullb *nullb); -static int null_add_dev(struct nullb_device *dev); -static void null_free_device_storage(struct nullb_device *dev, bool is_cache); - -static inline struct nullb_device *to_nullb_device(struct config_item *item) -{ - return item ? container_of(item, struct nullb_device, item) : NULL; -} - -static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) -{ - return snprintf(page, PAGE_SIZE, "%u\n", val); -} - -static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, - char *page) -{ - return snprintf(page, PAGE_SIZE, "%lu\n", val); -} - -static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) -{ - return snprintf(page, PAGE_SIZE, "%u\n", val); -} - -static ssize_t nullb_device_uint_attr_store(unsigned int *val, - const char *page, size_t count) -{ - unsigned int tmp; - int result; - - result = kstrtouint(page, 0, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -static ssize_t nullb_device_ulong_attr_store(unsigned long *val, - const char *page, size_t count) -{ - int result; - unsigned long tmp; - - result = kstrtoul(page, 0, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, - size_t count) -{ - bool tmp; - int result; - - result = kstrtobool(page, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -/* The following macro should only be used with TYPE = {uint, ulong, bool}. 
*/ -#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ -static ssize_t \ -nullb_device_##NAME##_show(struct config_item *item, char *page) \ -{ \ - return nullb_device_##TYPE##_attr_show( \ - to_nullb_device(item)->NAME, page); \ -} \ -static ssize_t \ -nullb_device_##NAME##_store(struct config_item *item, const char *page, \ - size_t count) \ -{ \ - int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ - struct nullb_device *dev = to_nullb_device(item); \ - TYPE new_value = 0; \ - int ret; \ - \ - ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ - if (ret < 0) \ - return ret; \ - if (apply_fn) \ - ret = apply_fn(dev, new_value); \ - else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ - ret = -EBUSY; \ - if (ret < 0) \ - return ret; \ - dev->NAME = new_value; \ - return count; \ -} \ -CONFIGFS_ATTR(nullb_device_, NAME); - -static int nullb_apply_submit_queues(struct nullb_device *dev, - unsigned int submit_queues) -{ - struct nullb *nullb = dev->nullb; - struct blk_mq_tag_set *set; - - if (!nullb) - return 0; - - /* - * Make sure that null_init_hctx() does not access nullb->queues[] past - * the end of that array. - */ - if (submit_queues > nr_cpu_ids) - return -EINVAL; - set = nullb->tag_set; - blk_mq_update_nr_hw_queues(set, submit_queues); - return set->nr_hw_queues == submit_queues ? 
0 : -ENOMEM; -} - -NULLB_DEVICE_ATTR(size, ulong, NULL); -NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); -NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); -NULLB_DEVICE_ATTR(home_node, uint, NULL); -NULLB_DEVICE_ATTR(queue_mode, uint, NULL); -NULLB_DEVICE_ATTR(blocksize, uint, NULL); -NULLB_DEVICE_ATTR(max_sectors, uint, NULL); -NULLB_DEVICE_ATTR(irqmode, uint, NULL); -NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); -NULLB_DEVICE_ATTR(index, uint, NULL); -NULLB_DEVICE_ATTR(blocking, bool, NULL); -NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); -NULLB_DEVICE_ATTR(memory_backed, bool, NULL); -NULLB_DEVICE_ATTR(discard, bool, NULL); -NULLB_DEVICE_ATTR(mbps, uint, NULL); -NULLB_DEVICE_ATTR(cache_size, ulong, NULL); -NULLB_DEVICE_ATTR(zoned, bool, NULL); -NULLB_DEVICE_ATTR(zone_size, ulong, NULL); -NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); -NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); -NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); -NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); - -static ssize_t nullb_device_power_show(struct config_item *item, char *page) -{ - return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); -} - -static ssize_t nullb_device_power_store(struct config_item *item, - const char *page, size_t count) -{ - struct nullb_device *dev = to_nullb_device(item); - bool newp = false; - ssize_t ret; - - ret = nullb_device_bool_attr_store(&newp, page, count); - if (ret < 0) - return ret; - - if (!dev->power && newp) { - if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) - return count; - if (null_add_dev(dev)) { - clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return -ENOMEM; - } - - set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); - dev->power = newp; - } else if (dev->power && !newp) { - if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); - dev->power = newp; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - } - clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); - } - - return count; -} - 
-CONFIGFS_ATTR(nullb_device_, power); - -static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) -{ - struct nullb_device *t_dev = to_nullb_device(item); - - return badblocks_show(&t_dev->badblocks, page, 0); -} - -static ssize_t nullb_device_badblocks_store(struct config_item *item, - const char *page, size_t count) -{ - struct nullb_device *t_dev = to_nullb_device(item); - char *orig, *buf, *tmp; - u64 start, end; - int ret; - - orig = kstrndup(page, count, GFP_KERNEL); - if (!orig) - return -ENOMEM; - - buf = strstrip(orig); - - ret = -EINVAL; - if (buf[0] != '+' && buf[0] != '-') - goto out; - tmp = strchr(&buf[1], '-'); - if (!tmp) - goto out; - *tmp = '\0'; - ret = kstrtoull(buf + 1, 0, &start); - if (ret) - goto out; - ret = kstrtoull(tmp + 1, 0, &end); - if (ret) - goto out; - ret = -EINVAL; - if (start > end) - goto out; - /* enable badblocks */ - cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) - ret = count; -out: - kfree(orig); - return ret; -} -CONFIGFS_ATTR(nullb_device_, badblocks); - -static struct configfs_attribute *nullb_device_attrs[] = { - &nullb_device_attr_size, - &nullb_device_attr_completion_nsec, - &nullb_device_attr_submit_queues, - &nullb_device_attr_home_node, - &nullb_device_attr_queue_mode, - &nullb_device_attr_blocksize, - &nullb_device_attr_max_sectors, - &nullb_device_attr_irqmode, - &nullb_device_attr_hw_queue_depth, - &nullb_device_attr_index, - &nullb_device_attr_blocking, - &nullb_device_attr_use_per_node_hctx, - &nullb_device_attr_power, - &nullb_device_attr_memory_backed, - &nullb_device_attr_discard, - &nullb_device_attr_mbps, - &nullb_device_attr_cache_size, - &nullb_device_attr_badblocks, - &nullb_device_attr_zoned, - &nullb_device_attr_zone_size, - &nullb_device_attr_zone_capacity, - &nullb_device_attr_zone_nr_conv, - 
&nullb_device_attr_zone_max_open, - &nullb_device_attr_zone_max_active, - NULL, -}; - -static void nullb_device_release(struct config_item *item) -{ - struct nullb_device *dev = to_nullb_device(item); - - null_free_device_storage(dev, false); - null_free_dev(dev); -} - -static struct configfs_item_operations nullb_device_ops = { - .release = nullb_device_release, -}; - -static const struct config_item_type nullb_device_type = { - .ct_item_ops = &nullb_device_ops, - .ct_attrs = nullb_device_attrs, - .ct_owner = THIS_MODULE, -}; - -static struct -config_item *nullb_group_make_item(struct config_group *group, const char *name) -{ - struct nullb_device *dev; - - dev = null_alloc_dev(); - if (!dev) - return ERR_PTR(-ENOMEM); - - config_item_init_type_name(&dev->item, name, &nullb_device_type); - - return &dev->item; -} - -static void -nullb_group_drop_item(struct config_group *group, struct config_item *item) -{ - struct nullb_device *dev = to_nullb_device(item); - - if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); - dev->power = false; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - } - - config_item_put(item); -} - -static ssize_t memb_group_features_show(struct config_item *item, char *page) -{ - return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); -} - -CONFIGFS_ATTR_RO(memb_group_, features); - -static struct configfs_attribute *nullb_group_attrs[] = { - &memb_group_attr_features, - NULL, -}; - -static struct configfs_group_operations nullb_group_ops = { - .make_item = nullb_group_make_item, - .drop_item = nullb_group_drop_item, -}; - -static const struct config_item_type nullb_group_type = { - .ct_group_ops = &nullb_group_ops, - .ct_attrs = nullb_group_attrs, - .ct_owner = THIS_MODULE, -}; - -static struct configfs_subsystem nullb_subsys = { - .su_group = { - .cg_item = { - .ci_namebuf = "nullb", - 
.ci_type = &nullb_group_type, - }, - }, -}; - -static inline int null_cache_active(struct nullb *nullb) -{ - return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); -} - -static struct nullb_device *null_alloc_dev(void) -{ - struct nullb_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); - INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); - if (badblocks_init(&dev->badblocks, 0)) { - kfree(dev); - return NULL; - } - - dev->size = g_gb * 1024; - dev->completion_nsec = g_completion_nsec; - dev->submit_queues = g_submit_queues; - dev->home_node = g_home_node; - dev->queue_mode = g_queue_mode; - dev->blocksize = g_bs; - dev->max_sectors = g_max_sectors; - dev->irqmode = g_irqmode; - dev->hw_queue_depth = g_hw_queue_depth; - dev->blocking = g_blocking; - dev->use_per_node_hctx = g_use_per_node_hctx; - dev->zoned = g_zoned; - dev->zone_size = g_zone_size; - dev->zone_capacity = g_zone_capacity; - dev->zone_nr_conv = g_zone_nr_conv; - dev->zone_max_open = g_zone_max_open; - dev->zone_max_active = g_zone_max_active; - return dev; -} - -static void null_free_dev(struct nullb_device *dev) -{ - if (!dev) - return; - - null_free_zoned_dev(dev); - badblocks_exit(&dev->badblocks); - kfree(dev); -} - -static void put_tag(struct nullb_queue *nq, unsigned int tag) -{ - clear_bit_unlock(tag, nq->tag_map); - - if (waitqueue_active(&nq->wait)) - wake_up(&nq->wait); -} - -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - -static void free_cmd(struct nullb_cmd *cmd) -{ - put_tag(cmd->nq, cmd->tag); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); - -static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); 
- if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; - } - - return NULL; -} - -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) -{ - struct nullb_cmd *cmd; - DEFINE_WAIT(wait); - - cmd = __alloc_cmd(nq); - if (cmd || !can_wait) - return cmd; - - do { - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); - cmd = __alloc_cmd(nq); - if (cmd) - break; - - io_schedule(); - } while (1); - - finish_wait(&nq->wait, &wait); - return cmd; -} - -static void end_cmd(struct nullb_cmd *cmd) -{ - int queue_mode = cmd->nq->dev->queue_mode; - - switch (queue_mode) { - case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, cmd->error); - return; - case NULL_Q_BIO: - cmd->bio->bi_status = cmd->error; - bio_endio(cmd->bio); - break; - } - - free_cmd(cmd); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) -{ - end_cmd(container_of(timer, struct nullb_cmd, timer)); - - return HRTIMER_NORESTART; -} - -static void null_cmd_end_timer(struct nullb_cmd *cmd) -{ - ktime_t kt = cmd->nq->dev->completion_nsec; - - hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); -} - -static void null_complete_rq(struct request *rq) -{ - end_cmd(blk_mq_rq_to_pdu(rq)); -} - -static struct nullb_page *null_alloc_page(gfp_t gfp_flags) -{ - struct nullb_page *t_page; - - t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); - if (!t_page) - goto out; - - t_page->page = alloc_pages(gfp_flags, 0); - if (!t_page->page) - goto out_freepage; - - memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); - return t_page; -out_freepage: - kfree(t_page); -out: - return NULL; -} - -static void null_free_page(struct nullb_page *t_page) -{ - __set_bit(NULLB_PAGE_FREE, t_page->bitmap); - if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) - return; - 
__free_page(t_page->page); - kfree(t_page); -} - -static bool null_page_empty(struct nullb_page *page) -{ - int size = MAP_SZ - 2; - - return find_first_bit(page->bitmap, size) == size; -} - -static void null_free_sector(struct nullb *nullb, sector_t sector, - bool is_cache) -{ - unsigned int sector_bit; - u64 idx; - struct nullb_page *t_page, *ret; - struct radix_tree_root *root; - - root = is_cache ? &nullb->dev->cache : &nullb->dev->data; - idx = sector >> PAGE_SECTORS_SHIFT; - sector_bit = (sector & SECTOR_MASK); - - t_page = radix_tree_lookup(root, idx); - if (t_page) { - __clear_bit(sector_bit, t_page->bitmap); - - if (null_page_empty(t_page)) { - ret = radix_tree_delete_item(root, idx, t_page); - WARN_ON(ret != t_page); - null_free_page(ret); - if (is_cache) - nullb->dev->curr_cache -= PAGE_SIZE; - } - } -} - -static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, - struct nullb_page *t_page, bool is_cache) -{ - struct radix_tree_root *root; - - root = is_cache ? &nullb->dev->cache : &nullb->dev->data; - - if (radix_tree_insert(root, idx, t_page)) { - null_free_page(t_page); - t_page = radix_tree_lookup(root, idx); - WARN_ON(!t_page || t_page->page->index != idx); - } else if (is_cache) - nullb->dev->curr_cache += PAGE_SIZE; - - return t_page; -} - -static void null_free_device_storage(struct nullb_device *dev, bool is_cache) -{ - unsigned long pos = 0; - int nr_pages; - struct nullb_page *ret, *t_pages[FREE_BATCH]; - struct radix_tree_root *root; - - root = is_cache ? 
&dev->cache : &dev->data; - - do { - int i; - - nr_pages = radix_tree_gang_lookup(root, - (void **)t_pages, pos, FREE_BATCH); - - for (i = 0; i < nr_pages; i++) { - pos = t_pages[i]->page->index; - ret = radix_tree_delete_item(root, pos, t_pages[i]); - WARN_ON(ret != t_pages[i]); - null_free_page(ret); - } - - pos++; - } while (nr_pages == FREE_BATCH); - - if (is_cache) - dev->curr_cache = 0; -} - -static struct nullb_page *__null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write, bool is_cache) -{ - unsigned int sector_bit; - u64 idx; - struct nullb_page *t_page; - struct radix_tree_root *root; - - idx = sector >> PAGE_SECTORS_SHIFT; - sector_bit = (sector & SECTOR_MASK); - - root = is_cache ? &nullb->dev->cache : &nullb->dev->data; - t_page = radix_tree_lookup(root, idx); - WARN_ON(t_page && t_page->page->index != idx); - - if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) - return t_page; - - return NULL; -} - -static struct nullb_page *null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write, bool ignore_cache) -{ - struct nullb_page *page = NULL; - - if (!ignore_cache) - page = __null_lookup_page(nullb, sector, for_write, true); - if (page) - return page; - return __null_lookup_page(nullb, sector, for_write, false); -} - -static struct nullb_page *null_insert_page(struct nullb *nullb, - sector_t sector, bool ignore_cache) - __releases(&nullb->lock) - __acquires(&nullb->lock) -{ - u64 idx; - struct nullb_page *t_page; - - t_page = null_lookup_page(nullb, sector, true, ignore_cache); - if (t_page) - return t_page; - - spin_unlock_irq(&nullb->lock); - - t_page = null_alloc_page(GFP_NOIO); - if (!t_page) - goto out_lock; - - if (radix_tree_preload(GFP_NOIO)) - goto out_freepage; - - spin_lock_irq(&nullb->lock); - idx = sector >> PAGE_SECTORS_SHIFT; - t_page->page->index = idx; - t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); - radix_tree_preload_end(); - - return t_page; -out_freepage: - 
null_free_page(t_page); -out_lock: - spin_lock_irq(&nullb->lock); - return null_lookup_page(nullb, sector, true, ignore_cache); -} - -static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) -{ - int i; - unsigned int offset; - u64 idx; - struct nullb_page *t_page, *ret; - void *dst, *src; - - idx = c_page->page->index; - - t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); - - __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); - if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { - null_free_page(c_page); - if (t_page && null_page_empty(t_page)) { - ret = radix_tree_delete_item(&nullb->dev->data, - idx, t_page); - null_free_page(t_page); - } - return 0; - } - - if (!t_page) - return -ENOMEM; - - src = kmap_atomic(c_page->page); - dst = kmap_atomic(t_page->page); - - for (i = 0; i < PAGE_SECTORS; - i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { - if (test_bit(i, c_page->bitmap)) { - offset = (i << SECTOR_SHIFT); - memcpy(dst + offset, src + offset, - nullb->dev->blocksize); - __set_bit(i, t_page->bitmap); - } - } - - kunmap_atomic(dst); - kunmap_atomic(src); - - ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); - null_free_page(ret); - nullb->dev->curr_cache -= PAGE_SIZE; - - return 0; -} - -static int null_make_cache_space(struct nullb *nullb, unsigned long n) -{ - int i, err, nr_pages; - struct nullb_page *c_pages[FREE_BATCH]; - unsigned long flushed = 0, one_round; - -again: - if ((nullb->dev->cache_size * 1024 * 1024) > - nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) - return 0; - - nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, - (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); - /* - * nullb_flush_cache_page could unlock before using the c_pages. 
To - * avoid race, we don't allow page free - */ - for (i = 0; i < nr_pages; i++) { - nullb->cache_flush_pos = c_pages[i]->page->index; - /* - * We found the page which is being flushed to disk by other - * threads - */ - if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) - c_pages[i] = NULL; - else - __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); - } - - one_round = 0; - for (i = 0; i < nr_pages; i++) { - if (c_pages[i] == NULL) - continue; - err = null_flush_cache_page(nullb, c_pages[i]); - if (err) - return err; - one_round++; - } - flushed += one_round << PAGE_SHIFT; - - if (n > flushed) { - if (nr_pages == 0) - nullb->cache_flush_pos = 0; - if (one_round == 0) { - /* give other threads a chance */ - spin_unlock_irq(&nullb->lock); - spin_lock_irq(&nullb->lock); - } - goto again; - } - return 0; -} - -static int copy_to_nullb(struct nullb *nullb, struct page *source, - unsigned int off, sector_t sector, size_t n, bool is_fua) -{ - size_t temp, count = 0; - unsigned int offset; - struct nullb_page *t_page; - void *dst, *src; - - while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); - - if (null_cache_active(nullb) && !is_fua) - null_make_cache_space(nullb, PAGE_SIZE); - - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_insert_page(nullb, sector, - !null_cache_active(nullb) || is_fua); - if (!t_page) - return -ENOSPC; - - src = kmap_atomic(source); - dst = kmap_atomic(t_page->page); - memcpy(dst + offset, src + off + count, temp); - kunmap_atomic(dst); - kunmap_atomic(src); - - __set_bit(sector & SECTOR_MASK, t_page->bitmap); - - if (is_fua) - null_free_sector(nullb, sector, true); - - count += temp; - sector += temp >> SECTOR_SHIFT; - } - return 0; -} - -static int copy_from_nullb(struct nullb *nullb, struct page *dest, - unsigned int off, sector_t sector, size_t n) -{ - size_t temp, count = 0; - unsigned int offset; - struct nullb_page *t_page; - void *dst, *src; - - while (count < n) { - temp = min_t(size_t, 
nullb->dev->blocksize, n - count); - - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_lookup_page(nullb, sector, false, - !null_cache_active(nullb)); - - dst = kmap_atomic(dest); - if (!t_page) { - memset(dst + off + count, 0, temp); - goto next; - } - src = kmap_atomic(t_page->page); - memcpy(dst + off + count, src + offset, temp); - kunmap_atomic(src); -next: - kunmap_atomic(dst); - - count += temp; - sector += temp >> SECTOR_SHIFT; - } - return 0; -} - -static void nullb_fill_pattern(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off) -{ - void *dst; - - dst = kmap_atomic(page); - memset(dst + off, 0xFF, len); - kunmap_atomic(dst); -} - -blk_status_t null_handle_discard(struct nullb_device *dev, - sector_t sector, sector_t nr_sectors) -{ - struct nullb *nullb = dev->nullb; - size_t n = nr_sectors << SECTOR_SHIFT; - size_t temp; - - spin_lock_irq(&nullb->lock); - while (n > 0) { - temp = min_t(size_t, n, dev->blocksize); - null_free_sector(nullb, sector, false); - if (null_cache_active(nullb)) - null_free_sector(nullb, sector, true); - sector += temp >> SECTOR_SHIFT; - n -= temp; - } - spin_unlock_irq(&nullb->lock); - - return BLK_STS_OK; -} - -static int null_handle_flush(struct nullb *nullb) -{ - int err; - - if (!null_cache_active(nullb)) - return 0; - - spin_lock_irq(&nullb->lock); - while (true) { - err = null_make_cache_space(nullb, - nullb->dev->cache_size * 1024 * 1024); - if (err || nullb->dev->curr_cache == 0) - break; - } - - WARN_ON(!radix_tree_empty(&nullb->dev->cache)); - spin_unlock_irq(&nullb->lock); - return err; -} - -static int null_transfer(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off, bool is_write, sector_t sector, - bool is_fua) -{ - struct nullb_device *dev = nullb->dev; - unsigned int valid_len = len; - int err = 0; - - if (!is_write) { - if (dev->zoned) - valid_len = null_zone_valid_read_len(nullb, - sector, len); - - if (valid_len) { - err = 
copy_from_nullb(nullb, page, off, - sector, valid_len); - off += valid_len; - len -= valid_len; - } - - if (len) - nullb_fill_pattern(nullb, page, len, off); - flush_dcache_page(page); - } else { - flush_dcache_page(page); - err = copy_to_nullb(nullb, page, off, sector, len, is_fua); - } - - return err; -} - -static int null_handle_rq(struct nullb_cmd *cmd) -{ - struct request *rq = cmd->rq; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector = blk_rq_pos(rq); - struct req_iterator iter; - struct bio_vec bvec; - - spin_lock_irq(&nullb->lock); - rq_for_each_segment(bvec, rq, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(req_op(rq)), sector, - rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - - return 0; -} - -static int null_handle_bio(struct nullb_cmd *cmd) -{ - struct bio *bio = cmd->bio; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector = bio->bi_iter.bi_sector; - struct bio_vec bvec; - struct bvec_iter iter; - - spin_lock_irq(&nullb->lock); - bio_for_each_segment(bvec, bio, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), sector, - bio->bi_opf & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - return 0; -} - -static void null_stop_queue(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_stop_hw_queues(q); -} - -static void null_restart_queue_async(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_start_stopped_hw_queues(q, true); -} - -static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) -{ - 
struct nullb_device *dev = cmd->nq->dev; - struct nullb *nullb = dev->nullb; - blk_status_t sts = BLK_STS_OK; - struct request *rq = cmd->rq; - - if (!hrtimer_active(&nullb->bw_timer)) - hrtimer_restart(&nullb->bw_timer); - - if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); - /* race with timer */ - if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); - /* requeue request */ - sts = BLK_STS_DEV_RESOURCE; - } - return sts; -} - -static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, - sector_t sector, - sector_t nr_sectors) -{ - struct badblocks *bb = &cmd->nq->dev->badblocks; - sector_t first_bad; - int bad_sectors; - - if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) - return BLK_STS_IOERR; - - return BLK_STS_OK; -} - -static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op, - sector_t sector, - sector_t nr_sectors) -{ - struct nullb_device *dev = cmd->nq->dev; - int err; - - if (op == REQ_OP_DISCARD) - return null_handle_discard(dev, sector, nr_sectors); - - if (dev->queue_mode == NULL_Q_BIO) - err = null_handle_bio(cmd); - else - err = null_handle_rq(cmd); - - return errno_to_blk_status(err); -} - -static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) -{ - struct nullb_device *dev = cmd->nq->dev; - struct bio *bio; - - if (dev->memory_backed) - return; - - if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { - zero_fill_bio(cmd->bio); - } else if (req_op(cmd->rq) == REQ_OP_READ) { - __rq_for_each_bio(bio, cmd->rq) - zero_fill_bio(bio); - } -} - -static inline void nullb_complete_cmd(struct nullb_cmd *cmd) -{ - /* - * Since root privileges are required to configure the null_blk - * driver, it is fine that this driver does not initialize the - * data buffers of read commands. 
Zero-initialize these buffers - * anyway if KMSAN is enabled to prevent that KMSAN complains - * about null_blk not initializing read data buffers. - */ - if (IS_ENABLED(CONFIG_KMSAN)) - nullb_zero_read_cmd_buffer(cmd); - - /* Complete IO by inline, softirq or timer */ - switch (cmd->nq->dev->irqmode) { - case NULL_IRQ_SOFTIRQ: - switch (cmd->nq->dev->queue_mode) { - case NULL_Q_MQ: - if (likely(!blk_should_fake_timeout(cmd->rq->q))) - blk_mq_complete_request(cmd->rq); - break; - case NULL_Q_BIO: - /* - * XXX: no proper submitting cpu information available. - */ - end_cmd(cmd); - break; - } - break; - case NULL_IRQ_NONE: - end_cmd(cmd); - break; - case NULL_IRQ_TIMER: - null_cmd_end_timer(cmd); - break; - } -} - -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors) -{ - struct nullb_device *dev = cmd->nq->dev; - blk_status_t ret; - - if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); - if (ret != BLK_STS_OK) - return ret; - } - - if (dev->memory_backed) - return null_handle_memory_backed(cmd, op, sector, nr_sectors); - - return BLK_STS_OK; -} - -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_opf op) -{ - struct nullb_device *dev = cmd->nq->dev; - struct nullb *nullb = dev->nullb; - blk_status_t sts; - - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - sts = null_handle_throttled(cmd); - if (sts != BLK_STS_OK) - return sts; - } - - if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); - goto out; - } - - if (dev->zoned) - cmd->error = null_process_zoned_cmd(cmd, op, - sector, nr_sectors); - else - cmd->error = null_process_cmd(cmd, op, sector, nr_sectors); - -out: - nullb_complete_cmd(cmd); - return BLK_STS_OK; -} - -static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) -{ - struct nullb *nullb = container_of(timer, struct nullb, bw_timer); - ktime_t 
timer_interval = ktime_set(0, TIMER_INTERVAL); - unsigned int mbps = nullb->dev->mbps; - - if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) - return HRTIMER_NORESTART; - - atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); - null_restart_queue_async(nullb); - - hrtimer_forward_now(&nullb->bw_timer, timer_interval); - - return HRTIMER_RESTART; -} - -static void nullb_setup_bwtimer(struct nullb *nullb) -{ - ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - - hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - nullb->bw_timer.function = nullb_bwtimer_fn; - atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); - hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); -} - -static struct nullb_queue *nullb_to_queue(struct nullb *nullb) -{ - int index = 0; - - if (nullb->nr_queues != 1) - index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); - - return &nullb->queues[index]; -} - -static blk_qc_t null_submit_bio(struct bio *bio) -{ - sector_t sector = bio->bi_iter.bi_sector; - sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_disk->private_data; - struct nullb_queue *nq = nullb_to_queue(nullb); - struct nullb_cmd *cmd; - - cmd = alloc_cmd(nq, 1); - cmd->bio = bio; - - null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); - return BLK_QC_T_NONE; -} - -static bool should_timeout_request(struct request *rq) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_timeout_str[0]) - return should_fail(&null_timeout_attr, 1); -#endif - return false; -} - -static bool should_requeue_request(struct request *rq) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_requeue_str[0]) - return should_fail(&null_requeue_attr, 1); -#endif - return false; -} - -static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) -{ - pr_info("rq %p timed out\n", rq); - blk_mq_complete_request(rq); - return BLK_EH_DONE; -} - -static blk_status_t 
null_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - struct nullb_queue *nq = hctx->driver_data; - sector_t nr_sectors = blk_rq_sectors(bd->rq); - sector_t sector = blk_rq_pos(bd->rq); - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - cmd->rq = bd->rq; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - - blk_mq_start_request(bd->rq); - - if (should_requeue_request(bd->rq)) { - /* - * Alternate between hitting the core BUSY path, and the - * driver driven requeue path - */ - nq->requeue_selection++; - if (nq->requeue_selection & 1) - return BLK_STS_RESOURCE; - else { - blk_mq_requeue_request(bd->rq, true); - return BLK_STS_OK; - } - } - if (should_timeout_request(bd->rq)) - return BLK_STS_OK; - - return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); -} - -static void cleanup_queue(struct nullb_queue *nq) -{ - kfree(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - -static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nullb_queue *nq = hctx->driver_data; - struct nullb *nullb = nq->dev->nullb; - - nullb->nr_queues--; -} - -static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) -{ - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; - nq->dev = nullb->dev; -} - -static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, - unsigned int hctx_idx) -{ - struct nullb *nullb = hctx->queue->queuedata; - struct nullb_queue *nq; - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) - return -EFAULT; -#endif - - nq = 
&nullb->queues[hctx_idx]; - hctx->driver_data = nq; - null_init_queue(nullb, nq); - nullb->nr_queues++; - - return 0; -} - -static const struct blk_mq_ops null_mq_ops = { - .queue_rq = null_queue_rq, - .complete = null_complete_rq, - .timeout = null_timeout_rq, - .init_hctx = null_init_hctx, - .exit_hctx = null_exit_hctx, -}; - -static void null_del_dev(struct nullb *nullb) -{ - struct nullb_device *dev; - - if (!nullb) - return; - - dev = nullb->dev; - - ida_simple_remove(&nullb_indexes, nullb->index); - - list_del_init(&nullb->list); - - del_gendisk(nullb->disk); - - if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { - hrtimer_cancel(&nullb->bw_timer); - atomic_long_set(&nullb->cur_bytes, LONG_MAX); - null_restart_queue_async(nullb); - } - - blk_cleanup_queue(nullb->q); - if (dev->queue_mode == NULL_Q_MQ && - nullb->tag_set == &nullb->__tag_set) - blk_mq_free_tag_set(nullb->tag_set); - put_disk(nullb->disk); - cleanup_queues(nullb); - if (null_cache_active(nullb)) - null_free_device_storage(nullb->dev, true); - kfree(nullb); - dev->nullb = NULL; -} - -static void null_config_discard(struct nullb *nullb) -{ - if (nullb->dev->discard == false) - return; - - if (!nullb->dev->memory_backed) { - nullb->dev->discard = false; - pr_info("discard option is ignored without memory backing\n"); - return; - } - - if (nullb->dev->zoned) { - nullb->dev->discard = false; - pr_info("discard option is ignored in zoned mode\n"); - return; - } - - nullb->q->limits.discard_granularity = nullb->dev->blocksize; - nullb->q->limits.discard_alignment = nullb->dev->blocksize; - blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); -} - -static const struct block_device_operations null_bio_ops = { - .owner = THIS_MODULE, - .submit_bio = null_submit_bio, - .report_zones = null_report_zones, -}; - -static const struct block_device_operations null_rq_ops = { - .owner = THIS_MODULE, - .report_zones = null_report_zones, -}; - -static 
int setup_commands(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - int i, tag_size; - - nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); - if (!nq->cmds) - return -ENOMEM; - - tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; - nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); - if (!nq->tag_map) { - kfree(nq->cmds); - return -ENOMEM; - } - - for (i = 0; i < nq->queue_depth; i++) { - cmd = &nq->cmds[i]; - cmd->tag = -1U; - } - - return 0; -} - -static int setup_queues(struct nullb *nullb) -{ - nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), - GFP_KERNEL); - if (!nullb->queues) - return -ENOMEM; - - nullb->queue_depth = nullb->dev->hw_queue_depth; - - return 0; -} - -static int init_driver_queues(struct nullb *nullb) -{ - struct nullb_queue *nq; - int i, ret = 0; - - for (i = 0; i < nullb->dev->submit_queues; i++) { - nq = &nullb->queues[i]; - - null_init_queue(nullb, nq); - - ret = setup_commands(nq); - if (ret) - return ret; - nullb->nr_queues++; - } - return 0; -} - -static int null_gendisk_register(struct nullb *nullb) -{ - sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; - struct gendisk *disk; - - disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); - if (!disk) - return -ENOMEM; - set_capacity(disk, size); - - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->major = null_major; - disk->first_minor = nullb->index; - if (queue_is_mq(nullb->q)) - disk->fops = &null_rq_ops; - else - disk->fops = &null_bio_ops; - disk->private_data = nullb; - disk->queue = nullb->q; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); - - if (nullb->dev->zoned) { - int ret = null_register_zoned_dev(nullb); - - if (ret) - return ret; - } - - add_disk(disk); - return 0; -} - -static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) -{ - set->ops = &null_mq_ops; - set->nr_hw_queues = nullb ? 
nullb->dev->submit_queues : - g_submit_queues; - set->queue_depth = nullb ? nullb->dev->hw_queue_depth : - g_hw_queue_depth; - set->numa_node = nullb ? nullb->dev->home_node : g_home_node; - set->cmd_size = sizeof(struct nullb_cmd); - set->flags = BLK_MQ_F_SHOULD_MERGE; - if (g_no_sched) - set->flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) - set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; - set->driver_data = NULL; - - if ((nullb && nullb->dev->blocking) || g_blocking) - set->flags |= BLK_MQ_F_BLOCKING; - - return blk_mq_alloc_tag_set(set); -} - -static int null_validate_conf(struct nullb_device *dev) -{ - dev->blocksize = round_down(dev->blocksize, 512); - dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); - - if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { - if (dev->submit_queues != nr_online_nodes) - dev->submit_queues = nr_online_nodes; - } else if (dev->submit_queues > nr_cpu_ids) - dev->submit_queues = nr_cpu_ids; - else if (dev->submit_queues == 0) - dev->submit_queues = 1; - - dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); - dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); - - /* Do memory allocation, so set blocking */ - if (dev->memory_backed) - dev->blocking = true; - else /* cache is meaningless */ - dev->cache_size = 0; - dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, - dev->cache_size); - dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); - /* can not stop a queue */ - if (dev->queue_mode == NULL_Q_BIO) - dev->mbps = 0; - - if (dev->zoned && - (!dev->zone_size || !is_power_of_2(dev->zone_size))) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - - return 0; -} - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -static bool __null_setup_fault(struct fault_attr *attr, char *str) -{ - if (!str[0]) - return true; - - if (!setup_fault_attr(attr, str)) - return false; - - attr->verbose = 0; - return true; -} -#endif - -static bool 
null_setup_fault(void) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) - return false; - if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) - return false; - if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) - return false; -#endif - return true; -} - -static int null_add_dev(struct nullb_device *dev) -{ - struct nullb *nullb; - int rv; - - rv = null_validate_conf(dev); - if (rv) - return rv; - - nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); - if (!nullb) { - rv = -ENOMEM; - goto out; - } - nullb->dev = dev; - dev->nullb = nullb; - - spin_lock_init(&nullb->lock); - - rv = setup_queues(nullb); - if (rv) - goto out_free_nullb; - - if (dev->queue_mode == NULL_Q_MQ) { - if (shared_tags) { - nullb->tag_set = &tag_set; - rv = 0; - } else { - nullb->tag_set = &nullb->__tag_set; - rv = null_init_tag_set(nullb, nullb->tag_set); - } - - if (rv) - goto out_cleanup_queues; - - if (!null_setup_fault()) - goto out_cleanup_queues; - - nullb->tag_set->timeout = 5 * HZ; - nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); - if (IS_ERR(nullb->q)) { - rv = -ENOMEM; - goto out_cleanup_tags; - } - } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue(dev->home_node); - if (!nullb->q) { - rv = -ENOMEM; - goto out_cleanup_queues; - } - rv = init_driver_queues(nullb); - if (rv) - goto out_cleanup_blk_queue; - } - - if (dev->mbps) { - set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); - nullb_setup_bwtimer(nullb); - } - - if (dev->cache_size > 0) { - set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); - } - - if (dev->zoned) { - rv = null_init_zoned_dev(dev, nullb->q); - if (rv) - goto out_cleanup_blk_queue; - } - - nullb->q->queuedata = nullb; - blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); - - mutex_lock(&lock); - nullb->index = ida_simple_get(&nullb_indexes, 
0, 0, GFP_KERNEL); - dev->index = nullb->index; - mutex_unlock(&lock); - - blk_queue_logical_block_size(nullb->q, dev->blocksize); - blk_queue_physical_block_size(nullb->q, dev->blocksize); - if (!dev->max_sectors) - dev->max_sectors = queue_max_hw_sectors(nullb->q); - dev->max_sectors = min_t(unsigned int, dev->max_sectors, - BLK_DEF_MAX_SECTORS); - blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); - - null_config_discard(nullb); - - sprintf(nullb->disk_name, "nullb%d", nullb->index); - - rv = null_gendisk_register(nullb); - if (rv) - goto out_cleanup_zone; - - mutex_lock(&lock); - list_add_tail(&nullb->list, &nullb_list); - mutex_unlock(&lock); - - return 0; -out_cleanup_zone: - null_free_zoned_dev(dev); -out_cleanup_blk_queue: - blk_cleanup_queue(nullb->q); -out_cleanup_tags: - if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) - blk_mq_free_tag_set(nullb->tag_set); -out_cleanup_queues: - cleanup_queues(nullb); -out_free_nullb: - kfree(nullb); - dev->nullb = NULL; -out: - return rv; -} - -static int __init null_init(void) -{ - int ret = 0; - unsigned int i; - struct nullb *nullb; - struct nullb_device *dev; - - if (g_bs > PAGE_SIZE) { - pr_warn("invalid block size\n"); - pr_warn("defaults block size to %lu\n", PAGE_SIZE); - g_bs = PAGE_SIZE; - } - - if (g_max_sectors > BLK_DEF_MAX_SECTORS) { - pr_warn("invalid max sectors\n"); - pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS); - g_max_sectors = BLK_DEF_MAX_SECTORS; - } - - if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { - pr_err("invalid home_node value\n"); - g_home_node = NUMA_NO_NODE; - } - - if (g_queue_mode == NULL_Q_RQ) { - pr_err("legacy IO path no longer available\n"); - return -EINVAL; - } - if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { - if (g_submit_queues != nr_online_nodes) { - pr_warn("submit_queues param is set to %u.\n", - nr_online_nodes); - g_submit_queues = nr_online_nodes; - } - } else if (g_submit_queues > nr_cpu_ids) - 
g_submit_queues = nr_cpu_ids; - else if (g_submit_queues <= 0) - g_submit_queues = 1; - - if (g_queue_mode == NULL_Q_MQ && shared_tags) { - ret = null_init_tag_set(NULL, &tag_set); - if (ret) - return ret; - } - - config_group_init(&nullb_subsys.su_group); - mutex_init(&nullb_subsys.su_mutex); - - ret = configfs_register_subsystem(&nullb_subsys); - if (ret) - goto err_tagset; - - mutex_init(&lock); - - null_major = register_blkdev(0, "nullb"); - if (null_major < 0) { - ret = null_major; - goto err_conf; - } - - for (i = 0; i < nr_devices; i++) { - dev = null_alloc_dev(); - if (!dev) { - ret = -ENOMEM; - goto err_dev; - } - ret = null_add_dev(dev); - if (ret) { - null_free_dev(dev); - goto err_dev; - } - } - - pr_info("module loaded\n"); - return 0; - -err_dev: - while (!list_empty(&nullb_list)) { - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); - } - unregister_blkdev(null_major, "nullb"); -err_conf: - configfs_unregister_subsystem(&nullb_subsys); -err_tagset: - if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); - return ret; -} - -static void __exit null_exit(void) -{ - struct nullb *nullb; - - configfs_unregister_subsystem(&nullb_subsys); - - unregister_blkdev(null_major, "nullb"); - - mutex_lock(&lock); - while (!list_empty(&nullb_list)) { - struct nullb_device *dev; - - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); - } - mutex_unlock(&lock); - - if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); -} - -module_init(null_init); -module_exit(null_exit); - -MODULE_AUTHOR("Jens Axboe "); -MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk_trace.c b/drivers/block/null_blk_trace.c deleted file mode 100644 index f246e7bff698..000000000000 --- a/drivers/block/null_blk_trace.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * null_blk 
trace related helpers. - * - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ -#include "null_blk_trace.h" - -/* - * Helper to use for all null_blk traces to extract disk name. - */ -const char *nullb_trace_disk_name(struct trace_seq *p, char *name) -{ - const char *ret = trace_seq_buffer_ptr(p); - - if (name && *name) - trace_seq_printf(p, "disk=%s, ", name); - trace_seq_putc(p, 0); - - return ret; -} diff --git a/drivers/block/null_blk_trace.h b/drivers/block/null_blk_trace.h deleted file mode 100644 index 4f83032eb544..000000000000 --- a/drivers/block/null_blk_trace.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * null_blk device driver tracepoints. - * - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM nullb - -#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_NULLB_H - -#include -#include - -#include "null_blk.h" - -const char *nullb_trace_disk_name(struct trace_seq *p, char *name); - -#define __print_disk_name(name) nullb_trace_disk_name(p, name) - -#ifndef TRACE_HEADER_MULTI_READ -static inline void __assign_disk_name(char *name, struct gendisk *disk) -{ - if (disk) - memcpy(name, disk->disk_name, DISK_NAME_LEN); - else - memset(name, 0, DISK_NAME_LEN); -} -#endif - -TRACE_EVENT(nullb_zone_op, - TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, - unsigned int zone_cond), - TP_ARGS(cmd, zone_no, zone_cond), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(enum req_opf, op) - __field(unsigned int, zone_no) - __field(unsigned int, zone_cond) - ), - TP_fast_assign( - __entry->op = req_op(cmd->rq); - __entry->zone_no = zone_no; - __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); - ), - TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", - __print_disk_name(__entry->disk), - blk_op_str(__entry->op), - __entry->zone_no, - 
blk_zone_cond_str(__entry->zone_cond)) -); - -TRACE_EVENT(nullb_report_zones, - TP_PROTO(struct nullb *nullb, unsigned int nr_zones), - TP_ARGS(nullb, nr_zones), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(unsigned int, nr_zones) - ), - TP_fast_assign( - __entry->nr_zones = nr_zones; - __assign_disk_name(__entry->disk, nullb->disk); - ), - TP_printk("%s nr_zones=%u", - __print_disk_name(__entry->disk), __entry->nr_zones) -); - -#endif /* _TRACE_NULLB_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE null_blk_trace - -/* This part must be outside protection */ -#include diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c deleted file mode 100644 index 65464f7559e0..000000000000 --- a/drivers/block/null_blk_zoned.c +++ /dev/null @@ -1,677 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include "null_blk.h" - -#define CREATE_TRACE_POINTS -#include "null_blk_trace.h" - -#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) - -static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) -{ - return sect >> ilog2(dev->zone_size_sects); -} - -static inline void null_lock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_lock_irq(&dev->zone_res_lock); -} - -static inline void null_unlock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_unlock_irq(&dev->zone_res_lock); -} - -static inline void null_init_zone_lock(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (!dev->memory_backed) - spin_lock_init(&zone->spinlock); - else - mutex_init(&zone->mutex); -} - -static inline void null_lock_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (!dev->memory_backed) - spin_lock_irq(&zone->spinlock); - else - mutex_lock(&zone->mutex); -} - -static inline void null_unlock_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if 
(!dev->memory_backed) - spin_unlock_irq(&zone->spinlock); - else - mutex_unlock(&zone->mutex); -} - -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) -{ - sector_t dev_capacity_sects, zone_capacity_sects; - struct nullb_zone *zone; - sector_t sector = 0; - unsigned int i; - - if (!is_power_of_2(dev->zone_size)) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - if (dev->zone_size > dev->size) { - pr_err("Zone size larger than device capacity\n"); - return -EINVAL; - } - - if (!dev->zone_capacity) - dev->zone_capacity = dev->zone_size; - - if (dev->zone_capacity > dev->zone_size) { - pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", - dev->zone_capacity, dev->zone_size); - return -EINVAL; - } - - zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); - dev_capacity_sects = MB_TO_SECTS(dev->size); - dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); - dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); - if (dev_capacity_sects & (dev->zone_size_sects - 1)) - dev->nr_zones++; - - dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), - GFP_KERNEL | __GFP_ZERO); - if (!dev->zones) - return -ENOMEM; - - spin_lock_init(&dev->zone_res_lock); - - if (dev->zone_nr_conv >= dev->nr_zones) { - dev->zone_nr_conv = dev->nr_zones - 1; - pr_info("changed the number of conventional zones to %u", - dev->zone_nr_conv); - } - - /* Max active zones has to be < nbr of seq zones in order to be enforceable */ - if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { - dev->zone_max_active = 0; - pr_info("zone_max_active limit disabled, limit >= zone count\n"); - } - - /* Max open zones has to be <= max active zones */ - if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { - dev->zone_max_open = dev->zone_max_active; - pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); - } else if (dev->zone_max_open >= dev->nr_zones - 
dev->zone_nr_conv) { - dev->zone_max_open = 0; - pr_info("zone_max_open limit disabled, limit >= zone count\n"); - } - dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; - dev->imp_close_zone_no = dev->zone_nr_conv; - - for (i = 0; i < dev->zone_nr_conv; i++) { - zone = &dev->zones[i]; - - null_init_zone_lock(dev, zone); - zone->start = sector; - zone->len = dev->zone_size_sects; - zone->capacity = zone->len; - zone->wp = zone->start + zone->len; - zone->type = BLK_ZONE_TYPE_CONVENTIONAL; - zone->cond = BLK_ZONE_COND_NOT_WP; - - sector += dev->zone_size_sects; - } - - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - zone = &dev->zones[i]; - - null_init_zone_lock(dev, zone); - zone->start = zone->wp = sector; - if (zone->start + dev->zone_size_sects > dev_capacity_sects) - zone->len = dev_capacity_sects - zone->start; - else - zone->len = dev->zone_size_sects; - zone->capacity = - min_t(sector_t, zone->len, zone_capacity_sects); - zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; - zone->cond = BLK_ZONE_COND_EMPTY; - - sector += dev->zone_size_sects; - } - - q->limits.zoned = BLK_ZONED_HM; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - - return 0; -} - -int null_register_zoned_dev(struct nullb *nullb) -{ - struct nullb_device *dev = nullb->dev; - struct request_queue *q = nullb->q; - - if (queue_is_mq(q)) { - int ret = blk_revalidate_disk_zones(nullb->disk, NULL); - - if (ret) - return ret; - } else { - blk_queue_chunk_sectors(q, dev->zone_size_sects); - q->nr_zones = blkdev_nr_zones(nullb->disk); - } - - blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - blk_queue_max_open_zones(q, dev->zone_max_open); - blk_queue_max_active_zones(q, dev->zone_max_active); - - return 0; -} - -void null_free_zoned_dev(struct nullb_device *dev) -{ - kvfree(dev->zones); -} - -int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, 
void *data) -{ - struct nullb *nullb = disk->private_data; - struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i; - struct nullb_zone *zone; - struct blk_zone blkz; - int error; - - first_zone = null_zone_no(dev, sector); - if (first_zone >= dev->nr_zones) - return 0; - - nr_zones = min(nr_zones, dev->nr_zones - first_zone); - trace_nullb_report_zones(nullb, nr_zones); - - memset(&blkz, 0, sizeof(struct blk_zone)); - zone = &dev->zones[first_zone]; - for (i = 0; i < nr_zones; i++, zone++) { - /* - * Stacked DM target drivers will remap the zone information by - * modifying the zone information passed to the report callback. - * So use a local copy to avoid corruption of the device zone - * array. - */ - null_lock_zone(dev, zone); - blkz.start = zone->start; - blkz.len = zone->len; - blkz.wp = zone->wp; - blkz.type = zone->type; - blkz.cond = zone->cond; - blkz.capacity = zone->capacity; - null_unlock_zone(dev, zone); - - error = cb(&blkz, i, data); - if (error) - return error; - } - - return nr_zones; -} - -/* - * This is called in the case of memory backing from null_process_cmd() - * with the target zone already locked. 
- */ -size_t null_zone_valid_read_len(struct nullb *nullb, - sector_t sector, unsigned int len) -{ - struct nullb_device *dev = nullb->dev; - struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; - unsigned int nr_sectors = len >> SECTOR_SHIFT; - - /* Read must be below the write pointer position */ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || - sector + nr_sectors <= zone->wp) - return len; - - if (sector > zone->wp) - return 0; - - return (zone->wp - sector) << SECTOR_SHIFT; -} - -static blk_status_t __null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - -static void null_close_imp_open_zone(struct nullb_device *dev) -{ - struct nullb_zone *zone; - unsigned int zno, i; - - zno = dev->imp_close_zone_no; - if (zno >= dev->nr_zones) - zno = dev->zone_nr_conv; - - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - zone = &dev->zones[zno]; - zno++; - if (zno >= dev->nr_zones) - zno = dev->zone_nr_conv; - - if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, zone); - dev->imp_close_zone_no = zno; - return; - } - } -} - -static blk_status_t null_check_active(struct nullb_device *dev) -{ - if (!dev->zone_max_active) - return BLK_STS_OK; - - if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + - dev->nr_zones_closed < dev->zone_max_active) - return BLK_STS_OK; - - return BLK_STS_ZONE_ACTIVE_RESOURCE; -} - -static blk_status_t null_check_open(struct nullb_device *dev) -{ - if 
(!dev->zone_max_open) - return BLK_STS_OK; - - if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) - return BLK_STS_OK; - - if (dev->nr_zones_imp_open) { - if (null_check_active(dev) == BLK_STS_OK) { - null_close_imp_open_zone(dev); - return BLK_STS_OK; - } - } - - return BLK_STS_ZONE_OPEN_RESOURCE; -} - -/* - * This function matches the manage open zone resources function in the ZBC standard, - * with the addition of max active zones support (added in the ZNS standard). - * - * The function determines if a zone can transition to implicit open or explicit open, - * while maintaining the max open zone (and max active zone) limit(s). It may close an - * implicit open zone in order to make additional zone resources available. - * - * ZBC states that an implicit open zone shall be closed only if there is not - * room within the open limit. However, with the addition of an active limit, - * it is not certain that closing an implicit open zone will allow a new zone - * to be opened, since we might already be at the active limit capacity. 
- */ -static blk_status_t null_check_zone_resources(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret; - - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - ret = null_check_active(dev); - if (ret != BLK_STS_OK) - return ret; - fallthrough; - case BLK_ZONE_COND_CLOSED: - return null_check_open(dev); - default: - /* Should never be called for other states */ - WARN_ON(1); - return BLK_STS_IOERR; - } -} - -static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors, bool append) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); - struct nullb_zone *zone = &dev->zones[zno]; - blk_status_t ret; - - trace_nullb_zone_op(cmd, zno, zone->cond); - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { - if (append) - return BLK_STS_IOERR; - return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - } - - null_lock_zone(dev, zone); - - if (zone->cond == BLK_ZONE_COND_FULL) { - /* Cannot write to a full zone */ - ret = BLK_STS_IOERR; - goto unlock; - } - - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. 
- */ - if (append) { - sector = zone->wp; - if (cmd->bio) - cmd->bio->bi_iter.bi_sector = sector; - else - cmd->rq->__sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; - } - - if (zone->wp + nr_sectors > zone->start + zone->capacity) { - ret = BLK_STS_IOERR; - goto unlock; - } - - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) { - null_lock_zone_res(dev); - - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) { - null_unlock_zone_res(dev); - goto unlock; - } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } - - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - null_unlock_zone_res(dev); - } - - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (ret != BLK_STS_OK) - goto unlock; - - zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->capacity) { - null_lock_zone_res(dev); - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; - zone->cond = BLK_ZONE_COND_FULL; - null_unlock_zone_res(dev); - } - - ret = BLK_STS_OK; - -unlock: - null_unlock_zone(dev, zone); - - return ret; -} - -static blk_status_t null_open_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret = BLK_STS_OK; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - - switch (zone->cond) { - case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != 
BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - default: - ret = BLK_STS_IOERR; - goto unlock; - } - - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; - -unlock: - null_unlock_zone_res(dev); - - return ret; -} - -static blk_status_t null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - ret = __null_close_zone(dev, zone); - null_unlock_zone_res(dev); - - return ret; -} - -static blk_status_t null_finish_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - blk_status_t ret = BLK_STS_OK; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - default: - ret = BLK_STS_IOERR; - goto unlock; - } - - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = zone->start + zone->len; - -unlock: - null_unlock_zone_res(dev); - - return ret; -} - -static blk_status_t null_reset_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - null_lock_zone_res(dev); - - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error */ - null_unlock_zone_res(dev); - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - 
break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - null_unlock_zone_res(dev); - return BLK_STS_IOERR; - } - - zone->cond = BLK_ZONE_COND_EMPTY; - zone->wp = zone->start; - - null_unlock_zone_res(dev); - - if (dev->memory_backed) - return null_handle_discard(dev, zone->start, zone->len); - - return BLK_STS_OK; -} - -static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zone_no; - struct nullb_zone *zone; - blk_status_t ret; - size_t i; - - if (op == REQ_OP_ZONE_RESET_ALL) { - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - zone = &dev->zones[i]; - null_lock_zone(dev, zone); - if (zone->cond != BLK_ZONE_COND_EMPTY) { - null_reset_zone(dev, zone); - trace_nullb_zone_op(cmd, i, zone->cond); - } - null_unlock_zone(dev, zone); - } - return BLK_STS_OK; - } - - zone_no = null_zone_no(dev, sector); - zone = &dev->zones[zone_no]; - - null_lock_zone(dev, zone); - - switch (op) { - case REQ_OP_ZONE_RESET: - ret = null_reset_zone(dev, zone); - break; - case REQ_OP_ZONE_OPEN: - ret = null_open_zone(dev, zone); - break; - case REQ_OP_ZONE_CLOSE: - ret = null_close_zone(dev, zone); - break; - case REQ_OP_ZONE_FINISH: - ret = null_finish_zone(dev, zone); - break; - default: - ret = BLK_STS_NOTSUPP; - break; - } - - if (ret == BLK_STS_OK) - trace_nullb_zone_op(cmd, zone_no, zone->cond); - - null_unlock_zone(dev, zone); - - return ret; -} - -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector, sector_t nr_sectors) -{ - struct nullb_device *dev; - struct nullb_zone *zone; - blk_status_t sts; - - switch (op) { - case REQ_OP_WRITE: - return null_zone_write(cmd, sector, nr_sectors, false); - case REQ_OP_ZONE_APPEND: - return null_zone_write(cmd, sector, nr_sectors, true); - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_RESET_ALL: - case REQ_OP_ZONE_OPEN: - case 
REQ_OP_ZONE_CLOSE: - case REQ_OP_ZONE_FINISH: - return null_zone_mgmt(cmd, op, sector); - default: - dev = cmd->nq->dev; - zone = &dev->zones[null_zone_no(dev, sector)]; - - null_lock_zone(dev, zone); - sts = null_process_cmd(cmd, op, sector, nr_sectors); - null_unlock_zone(dev, zone); - return sts; - } -} -- cgit v1.2.3 From aeb2b0b1a3da5791d3b216e71ec72db7570f3571 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sat, 12 Dec 2020 06:13:02 +0100 Subject: block: drop dead assignments in loop_init() Commit 8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand") simplified loop_init(); so computing the range of the block region is not required anymore and can be dropped. Drop dead assignments in loop_init(). As compilers will detect these unneeded assignments and optimize this, the resulting object code is identical before and after this change. No functional change. No change in object code. Signed-off-by: Lukas Bulwahn Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 22e59410b971..a45248c6e319 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2316,7 +2316,6 @@ MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { int i, nr; - unsigned long range; struct loop_device *lo; int err; @@ -2353,13 +2352,10 @@ static int __init loop_init(void) * /dev/loop-control interface, or be instantiated by accessing * a 'dead' device node. */ - if (max_loop) { + if (max_loop) nr = max_loop; - range = max_loop << part_shift; - } else { + else nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT; - range = 1UL << MINORBITS; - } err = misc_register(&loop_misc); if (err < 0) -- cgit v1.2.3