From 6a0cb1bc106fc07ce0443303bcdb7f7da5131e5c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:33 +0900 Subject: block: Implement support for zoned block devices Implement zoned block device zone information reporting and reset. Zone information are reported as struct blk_zone. This implementation does not differentiate between host-aware and host-managed device models and is valid for both. Two functions are provided: blkdev_report_zones for discovering the zone configuration of a zoned block device, and blkdev_reset_zones for resetting the write pointer of sequential zones. The helper function blk_queue_zone_size and bdev_zone_size are also provided for, as the name suggest, obtaining the zone size (in 512B sectors) of the zones of the device. Signed-off-by: Hannes Reinecke [Damien: * Removed the zone cache * Implement report zones operation based on earlier proposal by Shaun Tancheff ] Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/blkzoned.h | 103 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 include/uapi/linux/blkzoned.h (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6965d0909554..b2166f283da9 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -70,6 +70,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += blkzoned.h header-y += bpf_common.h header-y += bpf_perf_event.h header-y += bpf.h diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h new file mode 100644 index 000000000000..a3817214b0e0 --- /dev/null +++ b/include/uapi/linux/blkzoned.h @@ -0,0 +1,103 @@ +/* + * Zoned block devices handling. + * + * Copyright (C) 2015 Seagate Technology PLC + * + * Written by: Shaun Tancheff + * + * Modified by: Damien Le Moal + * Copyright (C) 2016 Western Digital + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#ifndef _UAPI_BLKZONED_H +#define _UAPI_BLKZONED_H + +#include + +/** + * enum blk_zone_type - Types of zones allowed in a zoned device. + * + * @BLK_ZONE_TYPE_CONVENTIONAL: The zone has no write pointer and can be writen + * randomly. Zone reset has no effect on the zone. + * @BLK_ZONE_TYPE_SEQWRITE_REQ: The zone must be written sequentially + * @BLK_ZONE_TYPE_SEQWRITE_PREF: The zone can be written non-sequentially + * + * Any other value not defined is reserved and must be considered as invalid. + */ +enum blk_zone_type { + BLK_ZONE_TYPE_CONVENTIONAL = 0x1, + BLK_ZONE_TYPE_SEQWRITE_REQ = 0x2, + BLK_ZONE_TYPE_SEQWRITE_PREF = 0x3, +}; + +/** + * enum blk_zone_cond - Condition [state] of a zone in a zoned device. + * + * @BLK_ZONE_COND_NOT_WP: The zone has no write pointer, it is conventional. + * @BLK_ZONE_COND_EMPTY: The zone is empty. + * @BLK_ZONE_COND_IMP_OPEN: The zone is open, but not explicitly opened. + * @BLK_ZONE_COND_EXP_OPEN: The zones was explicitly opened by an + * OPEN ZONE command. + * @BLK_ZONE_COND_CLOSED: The zone was [explicitly] closed after writing. + * @BLK_ZONE_COND_FULL: The zone is marked as full, possibly by a zone + * FINISH ZONE command. + * @BLK_ZONE_COND_READONLY: The zone is read-only. + * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written). + * + * The Zone Condition state machine in the ZBC/ZAC standards maps the above + * deinitions as: + * - ZC1: Empty | BLK_ZONE_EMPTY + * - ZC2: Implicit Open | BLK_ZONE_COND_IMP_OPEN + * - ZC3: Explicit Open | BLK_ZONE_COND_EXP_OPEN + * - ZC4: Closed | BLK_ZONE_CLOSED + * - ZC5: Full | BLK_ZONE_FULL + * - ZC6: Read Only | BLK_ZONE_READONLY + * - ZC7: Offline | BLK_ZONE_OFFLINE + * + * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should + * be considered invalid. + */ +enum blk_zone_cond { + BLK_ZONE_COND_NOT_WP = 0x0, + BLK_ZONE_COND_EMPTY = 0x1, + BLK_ZONE_COND_IMP_OPEN = 0x2, + BLK_ZONE_COND_EXP_OPEN = 0x3, + BLK_ZONE_COND_CLOSED = 0x4, + BLK_ZONE_COND_READONLY = 0xD, + BLK_ZONE_COND_FULL = 0xE, + BLK_ZONE_COND_OFFLINE = 0xF, +}; + +/** + * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. + * + * @start: Zone start in 512 B sector units + * @len: Zone length in 512 B sector units + * @wp: Zone write pointer location in 512 B sector units + * @type: see enum blk_zone_type for possible values + * @cond: see enum blk_zone_cond for possible values + * @non_seq: Flag indicating that the zone is using non-sequential resources + * (for host-aware zoned block devices only). + * @reset: Flag indicating that a zone reset is recommended. + * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size. + * + * start, len and wp use the regular 512 B sector unit, regardless of the + * device logical block size. The overall structure size is 64 B to match the + * ZBC/ZAC defined zone descriptor and allow support for future additional + * zone information. + */ +struct blk_zone { + __u64 start; /* Zone start sector */ + __u64 len; /* Zone length in number of sectors */ + __u64 wp; /* Zone write pointer position */ + __u8 type; /* Zone type */ + __u8 cond; /* Zone condition */ + __u8 non_seq; /* Non-sequential write resources active */ + __u8 reset; /* Reset write pointer recommended */ + __u8 reserved[36]; +}; + +#endif /* _UAPI_BLKZONED_H */ -- cgit v1.2.3 From 3ed05a987e0f63b21e634101e0b460d32f3581c3 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:35 +0900 Subject: blk-zoned: implement ioctls Adds the new BLKREPORTZONE and BLKRESETZONE ioctls for respectively obtaining the zone configuration of a zoned block device and resetting the write pointer of sequential zones of a zoned block device. The BLKREPORTZONE ioctl maps directly to a single call of the function blkdev_report_zones. The zone information result is passed as an array of struct blk_zone identical to the structure used internally for processing the REQ_OP_ZONE_REPORT operation. The BLKRESETZONE ioctl maps to a single call of the blkdev_reset_zones function. Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-zoned.c | 93 +++++++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 4 ++ include/linux/blkdev.h | 21 ++++++++++ include/uapi/linux/blkzoned.h | 40 +++++++++++++++++++ include/uapi/linux/fs.h | 4 ++ 5 files changed, 162 insertions(+) (limited to 'include/uapi') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1603573f9605..667f95d86695 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -255,3 +255,96 @@ int blkdev_reset_zones(struct block_device *bdev, return 0; } EXPORT_SYMBOL_GPL(blkdev_reset_zones); + +/** + * BLKREPORTZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_report rep; + struct blk_zone *zones; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + + if (!rep.nr_zones) + return -EINVAL; + + zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) + return -ENOMEM; + + ret = blkdev_report_zones(bdev, rep.sector, + zones, &rep.nr_zones, + GFP_KERNEL); + if (ret) + goto out; + + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { + ret = -EFAULT; + goto out; + } + + if (rep.nr_zones) { + if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, + sizeof(struct blk_zone) * rep.nr_zones)) + ret = -EFAULT; + } + + out: + kfree(zones); + + return ret; +} + +/** + * BLKRESETZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_range zrange; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) + return -EFAULT; + + return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); +} diff --git a/block/ioctl.c b/block/ioctl.c index 755119c3c1b9..f856963204f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -519,6 +519,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKREPORTZONE: + return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + case BLKRESETZONE: + return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKRAGET: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 252043f7cd2c..90097dd8b8ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,6 +316,27 @@ extern int blkdev_report_zones(struct block_device *bdev, extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); + +#else /* CONFIG_BLK_DEV_ZONED */ + +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index a3817214b0e0..40d1d7bff537 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -16,6 +16,7 @@ #define _UAPI_BLKZONED_H #include +#include /** * enum blk_zone_type - Types of zones allowed in a zoned device. @@ -100,4 +101,43 @@ struct blk_zone { __u8 reserved[36]; }; +/** + * struct blk_zone_report - BLKREPORTZONE ioctl request/reply + * + * @sector: starting sector of report + * @nr_zones: IN maximum / OUT actual + * @reserved: padding to 16 byte alignment + * @zones: Space to hold @nr_zones @zones entries on reply. + * + * The array of at most @nr_zones must follow this structure in memory. + */ +struct blk_zone_report { + __u64 sector; + __u32 nr_zones; + __u8 reserved[4]; + struct blk_zone zones[0]; +} __packed; + +/** + * struct blk_zone_range - BLKRESETZONE ioctl request + * @sector: starting sector of the first zone to issue reset write pointer + * @nr_sectors: Total number of sectors of 1 or more zones to reset + */ +struct blk_zone_range { + __u64 sector; + __u64 nr_sectors; +}; + +/** + * Zoned block device ioctl's: + * + * @BLKREPORTZONE: Get zone information. Takes a zone report as argument. + * The zone report will start from the zone containing the + * sector specified in the report request structure. + * @BLKRESETZONE: Reset the write pointer of the zones in the specified + * sector range. The sector range must be zone aligned. + */ +#define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) +#define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) + #endif /* _UAPI_BLKZONED_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index acb2b6152ba0..c1d11df07b28 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -225,6 +225,10 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +/* + * A jump here: 130-131 are reserved for zoned block devices + * (see uapi/linux/blkzoned.h) + */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From 9561a7ade0c205bc2ee035a2ac880478dcc1a024 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Nov 2016 14:04:40 -0500 Subject: nbd: add multi-connection support NBD can become contended on its single connection. We have to serialize all writes and we can only process one read response at a time. Fix this by allowing userspace to provide multiple connections to a single nbd device. This coupled with block-mq drastically increases performance in multi-process cases. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 382 +++++++++++++++++++++++++++++------------------ include/uapi/linux/nbd.h | 9 +- 2 files changed, 243 insertions(+), 148 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 807d24a2f1de..e683e2241cbd 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -41,26 +41,34 @@ #include +struct nbd_sock { + struct socket *sock; + struct mutex tx_lock; +}; + #define NBD_TIMEDOUT 0 #define NBD_DISCONNECT_REQUESTED 1 +#define NBD_DISCONNECTED 2 +#define NBD_RUNNING 3 struct nbd_device { u32 flags; unsigned long runtime_flags; - struct socket * sock; /* If == NULL, device is not ready, yet */ + struct nbd_sock **socks; int magic; struct blk_mq_tag_set tag_set; - struct mutex tx_lock; + struct mutex config_lock; struct gendisk *disk; + int num_connections; + atomic_t recv_threads; + wait_queue_head_t recv_wq; int blksize; loff_t bytesize; - /* protects initialization and shutdown of the socket */ - spinlock_t sock_lock; struct task_struct *task_recv; - struct task_struct *task_send; + struct task_struct *task_setup; #if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *dbg_dir; @@ -69,7 +77,7 @@ struct nbd_device { struct nbd_cmd { struct nbd_device *nbd; - struct list_head list; + struct completion send_complete; }; #if IS_ENABLED(CONFIG_DEBUG_FS) @@ -159,22 +167,20 @@ static void nbd_end_request(struct nbd_cmd *cmd) */ static void sock_shutdown(struct nbd_device *nbd) { - struct socket *sock; - - spin_lock(&nbd->sock_lock); + int i; - if (!nbd->sock) { - spin_unlock_irq(&nbd->sock_lock); + if (nbd->num_connections == 0) + return; + if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) return; - } - - sock = nbd->sock; - dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); - nbd->sock = NULL; - spin_unlock(&nbd->sock_lock); - kernel_sock_shutdown(sock, SHUT_RDWR); - sockfd_put(sock); + for (i = 0; i < nbd->num_connections; i++) { + struct nbd_sock *nsock = nbd->socks[i]; + mutex_lock(&nsock->tx_lock); + kernel_sock_shutdown(nsock->sock, SHUT_RDWR); + mutex_unlock(&nsock->tx_lock); + } + dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); } static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, @@ -182,35 +188,31 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_device *nbd = cmd->nbd; - struct socket *sock = NULL; - - spin_lock(&nbd->sock_lock); + dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); - - if (nbd->sock) { - sock = nbd->sock; - get_file(sock->file); - } - - spin_unlock(&nbd->sock_lock); - if (sock) { - kernel_sock_shutdown(sock, SHUT_RDWR); - sockfd_put(sock); - } - req->errors++; - dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); + + /* + * If our disconnect packet times out then we're already holding the + * config_lock and could deadlock here, so just set an error and return, + * we'll handle shutting everything down later. + */ + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + return BLK_EH_HANDLED; + mutex_lock(&nbd->config_lock); + sock_shutdown(nbd); + mutex_unlock(&nbd->config_lock); return BLK_EH_HANDLED; } /* * Send or receive packet. */ -static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, - int msg_flags) +static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf, + int size, int msg_flags) { - struct socket *sock = nbd->sock; + struct socket *sock = nbd->socks[index]->sock; int result; struct msghdr msg; struct kvec iov; @@ -254,19 +256,19 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, return result; } -static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, - int flags) +static inline int sock_send_bvec(struct nbd_device *nbd, int index, + struct bio_vec *bvec, int flags) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset, + result = sock_xmit(nbd, index, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags); kunmap(bvec->bv_page); return result; } /* always call with the tx_lock held */ -static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) +static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); int result, flags; @@ -274,10 +276,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) unsigned long size = blk_rq_bytes(req); struct bio *bio; u32 type; + u32 tag = blk_mq_unique_tag(req); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - type = NBD_CMD_DISC; - else if (req_op(req) == REQ_OP_DISCARD) + if (req_op(req) == REQ_OP_DISCARD) type = NBD_CMD_TRIM; else if (req_op(req) == REQ_OP_FLUSH) type = NBD_CMD_FLUSH; @@ -289,16 +290,16 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) memset(&request, 0, sizeof(request)); request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(type); - if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) { + if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } - memcpy(request.handle, &req->tag, sizeof(req->tag)); + memcpy(request.handle, &tag, sizeof(tag)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", cmd, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); - result = sock_xmit(nbd, 1, &request, sizeof(request), + result = sock_xmit(nbd, index, 1, &request, sizeof(request), (type == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), @@ -323,7 +324,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) flags = MSG_MORE; dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", cmd, bvec.bv_len); - result = sock_send_bvec(nbd, &bvec, flags); + result = sock_send_bvec(nbd, index, &bvec, flags); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", @@ -344,31 +345,34 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) return 0; } -static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) +static inline int sock_recv_bvec(struct nbd_device *nbd, int index, + struct bio_vec *bvec) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len, - MSG_WAITALL); + result = sock_xmit(nbd, index, 0, kaddr + bvec->bv_offset, + bvec->bv_len, MSG_WAITALL); kunmap(bvec->bv_page); return result; } /* NULL returned = something went wrong, inform userspace */ -static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) +static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) { int result; struct nbd_reply reply; struct nbd_cmd *cmd; struct request *req = NULL; u16 hwq; - int tag; + u32 tag; reply.magic = 0; - result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); + result = sock_xmit(nbd, index, 0, &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) { - dev_err(disk_to_dev(nbd->disk), - "Receive control failed (result %d)\n", result); + if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) && + !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) + dev_err(disk_to_dev(nbd->disk), + "Receive control failed (result %d)\n", result); return ERR_PTR(result); } @@ -378,7 +382,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) return ERR_PTR(-EPROTO); } - memcpy(&tag, reply.handle, sizeof(int)); + memcpy(&tag, reply.handle, sizeof(u32)); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) @@ -390,7 +394,6 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) return ERR_PTR(-ENOENT); } cmd = blk_mq_rq_to_pdu(req); - if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); @@ -404,7 +407,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) struct bio_vec bvec; rq_for_each_segment(bvec, req, iter) { - result = sock_recv_bvec(nbd, &bvec); + result = sock_recv_bvec(nbd, index, &bvec); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); @@ -414,6 +417,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", cmd, bvec.bv_len); } + } else { + /* See the comment in nbd_queue_rq. */ + wait_for_completion(&cmd->send_complete); } return cmd; } @@ -432,25 +438,24 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) +struct recv_thread_args { + struct work_struct work; + struct nbd_device *nbd; + int index; +}; + +static void recv_work(struct work_struct *work) { + struct recv_thread_args *args = container_of(work, + struct recv_thread_args, + work); + struct nbd_device *nbd = args->nbd; struct nbd_cmd *cmd; - int ret; + int ret = 0; BUG_ON(nbd->magic != NBD_MAGIC); - - sk_set_memalloc(nbd->sock->sk); - - ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); - if (ret) { - dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - return ret; - } - - nbd_size_update(nbd, bdev); - while (1) { - cmd = nbd_read_stat(nbd); + cmd = nbd_read_stat(nbd, args->index); if (IS_ERR(cmd)) { ret = PTR_ERR(cmd); break; @@ -459,10 +464,14 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) nbd_end_request(cmd); } - nbd_size_clear(nbd, bdev); - - device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - return ret; + /* + * We got an error, shut everybody down if this wasn't the result of a + * disconnect request. + */ + if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) + sock_shutdown(nbd); + atomic_dec(&nbd->recv_threads); + wake_up(&nbd->recv_wq); } static void nbd_clear_req(struct request *req, void *data, bool reserved) @@ -480,26 +489,35 @@ static void nbd_clear_que(struct nbd_device *nbd) { BUG_ON(nbd->magic != NBD_MAGIC); - /* - * Because we have set nbd->sock to NULL under the tx_lock, all - * modifications to the list must have completed by now. - */ - BUG_ON(nbd->sock); - blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } -static void nbd_handle_cmd(struct nbd_cmd *cmd) +static void nbd_handle_cmd(struct nbd_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct nbd_device *nbd = cmd->nbd; + struct nbd_sock *nsock; + + if (index >= nbd->num_connections) { + dev_err(disk_to_dev(nbd->disk), + "Attempted send on invalid socket\n"); + goto error_out; + } + + if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) { + dev_err(disk_to_dev(nbd->disk), + "Attempted send on closed socket\n"); + goto error_out; + } - if (req->cmd_type != REQ_TYPE_FS) + if (req->cmd_type != REQ_TYPE_FS && + req->cmd_type != REQ_TYPE_DRV_PRIV) goto error_out; - if (rq_data_dir(req) == WRITE && + if (req->cmd_type == REQ_TYPE_FS && + rq_data_dir(req) == WRITE && (nbd->flags & NBD_FLAG_READ_ONLY)) { dev_err(disk_to_dev(nbd->disk), "Write on read-only\n"); @@ -508,23 +526,22 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd) req->errors = 0; - mutex_lock(&nbd->tx_lock); - nbd->task_send = current; - if (unlikely(!nbd->sock)) { - mutex_unlock(&nbd->tx_lock); + nsock = nbd->socks[index]; + mutex_lock(&nsock->tx_lock); + if (unlikely(!nsock->sock)) { + mutex_unlock(&nsock->tx_lock); dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); goto error_out; } - if (nbd_send_cmd(nbd, cmd) != 0) { + if (nbd_send_cmd(nbd, cmd, index) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; nbd_end_request(cmd); } - nbd->task_send = NULL; - mutex_unlock(&nbd->tx_lock); + mutex_unlock(&nsock->tx_lock); return; @@ -538,39 +555,70 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + /* + * Since we look at the bio's to send the request over the network we + * need to make sure the completion work doesn't mark this request done + * before we are done doing our send. This keeps us from dereferencing + * freed data if we have particularly fast completions (ie we get the + * completion before we exit sock_xmit on the last bvec) or in the case + * that the server is misbehaving (or there was an error) before we're + * done sending everything over the wire. + */ + init_completion(&cmd->send_complete); blk_mq_start_request(bd->rq); - nbd_handle_cmd(cmd); + nbd_handle_cmd(cmd, hctx->queue_num); + complete(&cmd->send_complete); + return BLK_MQ_RQ_QUEUE_OK; } -static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) +static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock) { - int ret = 0; - - spin_lock_irq(&nbd->sock_lock); + struct nbd_sock **socks; + struct nbd_sock *nsock; - if (nbd->sock) { - ret = -EBUSY; - goto out; + if (!nbd->task_setup) + nbd->task_setup = current; + if (nbd->task_setup != current) { + dev_err(disk_to_dev(nbd->disk), + "Device being setup by another task"); + return -EINVAL; } - nbd->sock = sock; + socks = krealloc(nbd->socks, (nbd->num_connections + 1) * + sizeof(struct nbd_sock *), GFP_KERNEL); + if (!socks) + return -ENOMEM; + nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL); + if (!nsock) + return -ENOMEM; -out: - spin_unlock_irq(&nbd->sock_lock); + nbd->socks = socks; - return ret; + mutex_init(&nsock->tx_lock); + nsock->sock = sock; + socks[nbd->num_connections++] = nsock; + + return 0; } /* Reset all properties of an NBD device */ static void nbd_reset(struct nbd_device *nbd) { + int i; + + for (i = 0; i < nbd->num_connections; i++) + kfree(nbd->socks[i]); + kfree(nbd->socks); + nbd->socks = NULL; nbd->runtime_flags = 0; nbd->blksize = 1024; nbd->bytesize = 0; set_capacity(nbd->disk, 0); nbd->flags = 0; nbd->tag_set.timeout = 0; + nbd->num_connections = 0; + nbd->task_setup = NULL; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); } @@ -596,48 +644,67 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) blk_queue_write_cache(nbd->disk->queue, false, false); } +static void send_disconnects(struct nbd_device *nbd) +{ + struct nbd_request request = {}; + int i, ret; + + request.magic = htonl(NBD_REQUEST_MAGIC); + request.type = htonl(NBD_CMD_DISC); + + for (i = 0; i < nbd->num_connections; i++) { + ret = sock_xmit(nbd, i, 1, &request, sizeof(request), 0); + if (ret <= 0) + dev_err(disk_to_dev(nbd->disk), + "Send disconnect failed %d\n", ret); + } +} + static int nbd_dev_dbg_init(struct nbd_device *nbd); static void nbd_dev_dbg_close(struct nbd_device *nbd); -/* Must be called with tx_lock held */ - +/* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { switch (cmd) { case NBD_DISCONNECT: { - struct request *sreq; - dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - if (!nbd->sock) + if (!nbd->socks) return -EINVAL; - sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0); - if (!sreq) - return -ENOMEM; - - mutex_unlock(&nbd->tx_lock); + mutex_unlock(&nbd->config_lock); fsync_bdev(bdev); - mutex_lock(&nbd->tx_lock); - sreq->cmd_type = REQ_TYPE_DRV_PRIV; + mutex_lock(&nbd->config_lock); /* Check again after getting mutex back. */ - if (!nbd->sock) { - blk_mq_free_request(sreq); + if (!nbd->socks) return -EINVAL; - } - set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags); - - nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); - blk_mq_free_request(sreq); + if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED, + &nbd->runtime_flags)) + send_disconnects(nbd); return 0; } - + case NBD_CLEAR_SOCK: sock_shutdown(nbd); nbd_clear_que(nbd); kill_bdev(bdev); + nbd_bdev_reset(bdev); + /* + * We want to give the run thread a chance to wait for everybody + * to clean up and then do it's own cleanup. + */ + if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) { + int i; + + for (i = 0; i < nbd->num_connections; i++) + kfree(nbd->socks[i]); + kfree(nbd->socks); + nbd->socks = NULL; + nbd->num_connections = 0; + } return 0; case NBD_SET_SOCK: { @@ -647,7 +714,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (!sock) return err; - err = nbd_set_socket(nbd, sock); + err = nbd_add_socket(nbd, sock); if (!err && max_part) bdev->bd_invalidated = 1; @@ -676,26 +743,58 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_DO_IT: { - int error; + struct recv_thread_args *args; + int num_connections = nbd->num_connections; + int error, i; if (nbd->task_recv) return -EBUSY; - if (!nbd->sock) + if (!nbd->socks) return -EINVAL; + if (num_connections > 1 && + !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) { + dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n"); + goto out_err; + } - /* We have to claim the device under the lock */ + set_bit(NBD_RUNNING, &nbd->runtime_flags); + blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections); + args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL); + if (!args) + goto out_err; nbd->task_recv = current; - mutex_unlock(&nbd->tx_lock); + mutex_unlock(&nbd->config_lock); nbd_parse_flags(nbd, bdev); + error = device_create_file(disk_to_dev(nbd->disk), &pid_attr); + if (error) { + dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); + goto out_recv; + } + + nbd_size_update(nbd, bdev); + nbd_dev_dbg_init(nbd); - error = nbd_thread_recv(nbd, bdev); + for (i = 0; i < num_connections; i++) { + sk_set_memalloc(nbd->socks[i]->sock->sk); + atomic_inc(&nbd->recv_threads); + INIT_WORK(&args[i].work, recv_work); + args[i].nbd = nbd; + args[i].index = i; + queue_work(system_long_wq, &args[i].work); + } + wait_event_interruptible(nbd->recv_wq, + atomic_read(&nbd->recv_threads) == 0); + for (i = 0; i < num_connections; i++) + flush_work(&args[i].work); nbd_dev_dbg_close(nbd); - - mutex_lock(&nbd->tx_lock); + nbd_size_clear(nbd, bdev); + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); +out_recv: + mutex_lock(&nbd->config_lock); nbd->task_recv = NULL; - +out_err: sock_shutdown(nbd); nbd_clear_que(nbd); kill_bdev(bdev); @@ -708,7 +807,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, error = -ETIMEDOUT; nbd_reset(nbd); - return error; } @@ -740,9 +838,9 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, BUG_ON(nbd->magic != NBD_MAGIC); - mutex_lock(&nbd->tx_lock); + mutex_lock(&nbd->config_lock); error = __nbd_ioctl(bdev, nbd, cmd, arg); - mutex_unlock(&nbd->tx_lock); + mutex_unlock(&nbd->config_lock); return error; } @@ -762,8 +860,6 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) if (nbd->task_recv) seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); - if (nbd->task_send) - seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send)); return 0; } @@ -887,9 +983,7 @@ static int nbd_init_request(void *data, struct request *rq, unsigned int numa_node) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd->nbd = data; - INIT_LIST_HEAD(&cmd->list); return 0; } @@ -999,13 +1093,13 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = NBD_MAGIC; - spin_lock_init(&nbd_dev[i].sock_lock); - mutex_init(&nbd_dev[i].tx_lock); + mutex_init(&nbd_dev[i].config_lock); disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops = &nbd_fops; disk->private_data = &nbd_dev[i]; sprintf(disk->disk_name, "nbd%d", i); + init_waitqueue_head(&nbd_dev[i].recv_wq); nbd_reset(&nbd_dev[i]); add_disk(disk); } diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index e08e413d5f71..f063cff178c5 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -38,11 +38,12 @@ enum { }; /* values for flags field */ -#define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ -#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ -#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ +#define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ +#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ +#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ /* there is a gap here to match userspace */ -#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_CAN_MULTI_CONN (1 << 7) /* Server supports multiple connections per export. */ /* userspace doesn't need the nbd_device structure */ -- cgit v1.2.3 From 63db89eaf0b9bf856c5e079ed5eab5c3f13165c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Nov 2016 13:09:50 -0700 Subject: nbd: move multi-connection bit to unused value Bit #7 is already used, move to bit #8 which is the first unused one. Fixes: 9561a7ade0c2 ("nbd: add multi-connection support") Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index f063cff178c5..c91c642ea900 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -43,7 +43,7 @@ enum { #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ /* there is a gap here to match userspace */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ -#define NBD_FLAG_CAN_MULTI_CONN (1 << 7) /* Server supports multiple connections per export. */ +#define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ /* userspace doesn't need the nbd_device structure */ -- cgit v1.2.3 From 6ea76f33e9ab99c7888547e1acba2baf8e4b5b17 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 2 Dec 2016 00:28:38 -0800 Subject: Add type 0x28 NVME type code to scsi fc headers Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Jay Freyensee Signed-off-by: Christoph Hellwig --- include/uapi/scsi/fc/fc_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/scsi/fc/fc_fs.h b/include/uapi/scsi/fc/fc_fs.h index 50f28b143451..dcf314dc2a27 100644 --- a/include/uapi/scsi/fc/fc_fs.h +++ b/include/uapi/scsi/fc/fc_fs.h @@ -190,6 +190,7 @@ enum fc_fh_type { FC_TYPE_FCP = 0x08, /* SCSI FCP */ FC_TYPE_CT = 0x20, /* Fibre Channel Services (FC-CT) */ FC_TYPE_ILS = 0x22, /* internal link service */ + FC_TYPE_NVME = 0x28, /* FC-NVME */ }; /* @@ -203,6 +204,7 @@ enum fc_fh_type { [FC_TYPE_FCP] = "FCP", \ [FC_TYPE_CT] = "CT", \ [FC_TYPE_ILS] = "ILS", \ + [FC_TYPE_NVME] = "NVME", \ } /* -- cgit v1.2.3