diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-03 10:34:51 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-03 10:34:51 -0700 |
commit | c6b1e36c8fa04a6680c44fe0321d0370400e90b6 (patch) | |
tree | 5110f0639bfa803baa8d213cb21efe37beeaf742 | |
parent | 81e3e044897b0875a52953b3fb6241a33428e4f9 (diff) | |
parent | a84ebb837b419787c2ece74efa566c998929cead (diff) | |
download | linux-c6b1e36c8fa04a6680c44fe0321d0370400e90b6.tar.bz2 |
Merge branch 'for-4.13/block' of git://git.kernel.dk/linux-block
Pull core block/IO updates from Jens Axboe:
"This is the main pull request for the block layer for 4.13. Not a huge
round in terms of features, but there's a lot of churn related to some
core cleanups.
Note that this depends on the UUID tree pull request that Christoph
already sent out.
This pull request contains:
- A series from Christoph, unifying the error/stats codes in the
block layer. We now use blk_status_t everywhere, instead of using
different schemes for different places.
- Also from Christoph, some cleanups around request allocation and IO
scheduler interactions in blk-mq.
- And yet another series from Christoph, cleaning up how we handle
and do bounce buffering in the block layer.
- A blk-mq debugfs series from Bart, further improving on the support
we have for exporting internal information to aid debugging IO
hangs or stalls.
- Also from Bart, a series that cleans up the request initialization
differences across types of devices.
- A series from Goldwyn Rodrigues, allowing the block layer to return
failure if we will block and the user asked for non-blocking.
- Patch from Hannes for supporting setting loop devices block size to
that of the underlying device.
- Two series of patches from Javier, fixing various issues with
lightnvm, particularly around pblk.
- A series from me, adding support for write hints. This comes with
NVMe support as well, so applications can help guide data placement
on flash to improve performance, latencies, and write
amplification.
- A series from Ming, improving and hardening blk-mq support for
stopping/starting and quiescing hardware queues.
- Two pull requests for NVMe updates. Nothing major on the feature
side, but lots of cleanups and bug fixes. From the usual crew.
- A series from Neil Brown, greatly improving the bio rescue set
support. Most notably, this kills the bio rescue work queues, if we
don't really need them.
- Lots of other little bug fixes that are all over the place"
* 'for-4.13/block' of git://git.kernel.dk/linux-block: (217 commits)
lightnvm: pblk: set line bitmap check under debug
lightnvm: pblk: verify that cache read is still valid
lightnvm: pblk: add initialization check
lightnvm: pblk: remove target using async. I/Os
lightnvm: pblk: use vmalloc for GC data buffer
lightnvm: pblk: use right metadata buffer for recovery
lightnvm: pblk: schedule if data is not ready
lightnvm: pblk: remove unused return variable
lightnvm: pblk: fix double-free on pblk init
lightnvm: pblk: fix bad le64 assignations
nvme: Makefile: remove dead build rule
blk-mq: map all HWQ also in hyperthreaded system
nvmet-rdma: register ib_client to not deadlock in device removal
nvme_fc: fix error recovery on link down.
nvmet_fc: fix crashes on bad opcodes
nvme_fc: Fix crash when nvme controller connection fails.
nvme_fc: replace ioabort msleep loop with completion
nvme_fc: fix double calls to nvme_cleanup_cmd()
nvme-fabrics: verify that a controller returns the correct NQN
nvme: simplify nvme_dev_attrs_are_visible
...
265 files changed, 5912 insertions, 6237 deletions
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01ddeaf64b0f..9490f2845f06 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -632,7 +632,7 @@ to i/o submission, if the bio fields are likely to be accessed after the i/o is issued (since the bio may otherwise get freed in case i/o completion happens in the meantime). -The bio_clone() routine may be used to duplicate a bio, where the clone +The bio_clone_fast() routine may be used to duplicate a bio, where the clone shares the bio_vec_list with the original bio (i.e. both point to the same bio_vec_list). This would typically be used for splitting i/o requests in lvm or md. diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h index 67026300c88e..144809a3f4f6 100644 --- a/arch/s390/include/asm/eadm.h +++ b/arch/s390/include/asm/eadm.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/device.h> +#include <linux/blkdev.h> struct arqb { u64 data; @@ -105,13 +106,14 @@ struct scm_driver { int (*probe) (struct scm_device *scmdev); int (*remove) (struct scm_device *scmdev); void (*notify) (struct scm_device *scmdev, enum scm_event event); - void (*handler) (struct scm_device *scmdev, void *data, int error); + void (*handler) (struct scm_device *scmdev, void *data, + blk_status_t error); }; int scm_driver_register(struct scm_driver *scmdrv); void scm_driver_unregister(struct scm_driver *scmdrv); int eadm_start_aob(struct aob *aob); -void scm_irq_handler(struct aob *aob, int error); +void scm_irq_handler(struct aob *aob, blk_status_t error); #endif /* _ASM_S390_EADM_H */ diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 85410279beab..b55fe9bf5d3e 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -534,7 +534,7 @@ static void ubd_handler(void) for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { blk_end_request( (*irq_req_buffer)[count]->req, - 0, + BLK_STS_OK, 
(*irq_req_buffer)[count]->length ); kfree((*irq_req_buffer)[count]); diff --git a/block/badblocks.c b/block/badblocks.c index 6ebcef282314..43c71166e1e2 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -533,6 +533,7 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, case 3: if (newline != '\n') return -EINVAL; + /* fall through */ case 2: if (length <= 0) return -EINVAL; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ed93da2462ab..12bbc6b8657d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -725,8 +725,12 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } static void -bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) { + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + if (bic->saved_idle_window) bfq_mark_bfqq_idle_window(bfqq); else @@ -754,6 +758,14 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) /* make sure weight will be updated, however we got here */ bfqq->entity.prio_changed = 1; + + if (likely(!busy)) + return; + + if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; + else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) + bfqd->wr_busy_queues--; } static int bfqq_process_refs(struct bfq_queue *bfqq) @@ -4290,10 +4302,16 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_put_rq_private(struct request_queue *q, struct request *rq) +static void bfq_finish_request(struct request *rq) { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_queue *bfqq; + struct bfq_data *bfqd; + + if (!rq->elv.icq) + return; + + bfqq = RQ_BFQQ(rq); + bfqd = bfqq->bfqd; if (rq->rq_flags & RQF_STARTED) bfqg_stats_update_completion(bfqq_group(bfqq), @@ -4324,7 +4342,7 @@ static void 
bfq_put_rq_private(struct request_queue *q, struct request *rq) */ if (!RB_EMPTY_NODE(&rq->rb_node)) - bfq_remove_request(q, rq); + bfq_remove_request(rq->q, rq); bfq_put_rq_priv_body(bfqq); } @@ -4394,20 +4412,21 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, /* * Allocate bfq data structures associated with this request. */ -static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - struct bio *bio) +static void bfq_prepare_request(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; bool new_queue = false; - bool split = false; + bool bfqq_already_existing = false, split = false; - spin_lock_irq(&bfqd->lock); + if (!rq->elv.icq) + return; + bic = icq_to_bic(rq->elv.icq); - if (!bic) - goto queue_fail; + spin_lock_irq(&bfqd->lock); bfq_check_ioprio_change(bic, bio); @@ -4432,6 +4451,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + else + bfqq_already_existing = true; } } @@ -4457,7 +4478,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, * queue: restore the idle window and the * possible weight raising period. 
*/ - bfq_bfqq_resume_state(bfqq, bic); + bfq_bfqq_resume_state(bfqq, bfqd, bic, + bfqq_already_existing); } } @@ -4465,13 +4487,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfq_handle_burst(bfqd, bfqq); spin_unlock_irq(&bfqd->lock); - - return 0; - -queue_fail: - spin_unlock_irq(&bfqd->lock); - - return 1; } static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) @@ -4950,8 +4965,8 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { - .get_rq_priv = bfq_get_rq_private, - .put_rq_priv = bfq_put_rq_private, + .prepare_request = bfq_prepare_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bio-integrity.c b/block/bio-integrity.c index b5009a896a7f..b8a3a65f7364 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -224,7 +224,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, * @bio: bio to generate/verify integrity metadata for * @proc_fn: Pointer to the relevant processing function */ -static int bio_integrity_process(struct bio *bio, +static blk_status_t bio_integrity_process(struct bio *bio, integrity_processing_fn *proc_fn) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -232,7 +232,7 @@ static int bio_integrity_process(struct bio *bio, struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); - unsigned int ret = 0; + blk_status_t ret = BLK_STS_OK; void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; @@ -369,7 +369,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); + bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn); /* Restore 
original bio completion handler */ bio->bi_end_io = bip->bip_end_io; @@ -398,7 +398,7 @@ void bio_integrity_endio(struct bio *bio) * integrity metadata. Restore original bio end_io handler * and run it. */ - if (bio->bi_error) { + if (bio->bi_status) { bio->bi_end_io = bip->bip_end_io; bio_endio(bio); diff --git a/block/bio.c b/block/bio.c index 26b0810fb8ea..1cfcd0df3f30 100644 --- a/block/bio.c +++ b/block/bio.c @@ -315,8 +315,8 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_error) - parent->bi_error = bio->bi_error; + if (!parent->bi_status) + parent->bi_status = bio->bi_status; bio_put(bio); return parent; } @@ -369,6 +369,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs) struct bio_list punt, nopunt; struct bio *bio; + if (WARN_ON_ONCE(!bs->rescue_workqueue)) + return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on @@ -480,7 +482,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (current->bio_list && (!bio_list_empty(&current->bio_list[0]) || - !bio_list_empty(&current->bio_list[1]))) + !bio_list_empty(&current->bio_list[1])) && + bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); @@ -550,7 +553,7 @@ EXPORT_SYMBOL(zero_fill_bio); * * Description: * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. + * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. 
**/ void bio_put(struct bio *bio) { @@ -599,6 +602,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; @@ -682,6 +686,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, return NULL; bio->bi_bdev = bio_src->bi_bdev; bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -924,7 +929,7 @@ static void submit_bio_wait_endio(struct bio *bio) { struct submit_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->error = blk_status_to_errno(bio->bi_status); complete(&ret->event); } @@ -1823,8 +1828,8 @@ again: } if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), - bio, bio->bi_error); + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, + blk_status_to_errno(bio->bi_status)); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1927,9 +1932,29 @@ void bioset_free(struct bio_set *bs) } EXPORT_SYMBOL(bioset_free); -static struct bio_set *__bioset_create(unsigned int pool_size, - unsigned int front_pad, - bool create_bvec_pool) +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS + * and %BIOSET_NEED_RESCUER + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. 
+ * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated + * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to + * dispatch queued requests when the mempool runs out of space. + * + */ +struct bio_set *bioset_create(unsigned int pool_size, + unsigned int front_pad, + int flags) { unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); struct bio_set *bs; @@ -1954,12 +1979,15 @@ static struct bio_set *__bioset_create(unsigned int pool_size, if (!bs->bio_pool) goto bad; - if (create_bvec_pool) { + if (flags & BIOSET_NEED_BVECS) { bs->bvec_pool = biovec_create_pool(pool_size); if (!bs->bvec_pool) goto bad; } + if (!(flags & BIOSET_NEED_RESCUER)) + return bs; + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; @@ -1969,41 +1997,8 @@ bad: bioset_free(bs); return NULL; } - -/** - * bioset_create - Create a bio_set - * @pool_size: Number of bio and bio_vecs to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller - * to ask for a number of bytes to be allocated in front of the bio. - * Front pad allocation is useful for embedding the bio inside - * another structure, to avoid allocating extra data to go with the bio. - * Note that the bio must be embedded at the END of that structure always, - * or things will break badly. 
- */ -struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) -{ - return __bioset_create(pool_size, front_pad, true); -} EXPORT_SYMBOL(bioset_create); -/** - * bioset_create_nobvec - Create a bio_set without bio_vec mempool - * @pool_size: Number of bio to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Same functionality as bioset_create() except that mempool is not - * created for bio_vecs. Saving some memory for bio_clone_fast() users. - */ -struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad) -{ - return __bioset_create(pool_size, front_pad, false); -} -EXPORT_SYMBOL(bioset_create_nobvec); - #ifdef CONFIG_BLK_CGROUP /** @@ -2118,7 +2113,7 @@ static int __init init_bio(void) bio_integrity_init(); biovec_init_slabs(); - fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!fs_bio_set) panic("bio: can't allocate bios\n"); diff --git a/block/blk-core.c b/block/blk-core.c index a7421b772d0e..af393d5a9680 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -129,11 +129,70 @@ void blk_rq_init(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_rq_init); +static const struct { + int errno; + const char *name; +} blk_errors[] = { + [BLK_STS_OK] = { 0, "" }, + [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, + [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, + [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, + [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, + [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, + [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, + [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, + [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + + /* device mapper special case, should not leak out: */ + 
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + + /* everything else not covered above: */ + [BLK_STS_IOERR] = { -EIO, "I/O" }, +}; + +blk_status_t errno_to_blk_status(int errno) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { + if (blk_errors[i].errno == errno) + return (__force blk_status_t)i; + } + + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(errno_to_blk_status); + +int blk_status_to_errno(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return -EIO; + return blk_errors[idx].errno; +} +EXPORT_SYMBOL_GPL(blk_status_to_errno); + +static void print_req_error(struct request *req, blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return; + + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", + __func__, blk_errors[idx].name, req->rq_disk ? + req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req)); +} + static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, int error) + unsigned int nbytes, blk_status_t error) { if (error) - bio->bi_error = error; + bio->bi_status = error; if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); @@ -177,10 +236,13 @@ static void blk_delay_work(struct work_struct *work) * Description: * Sometimes queueing needs to be postponed for a little while, to allow * resources to come back. This function will make sure that queueing is - * restarted around the specified time. Queue lock must be held. + * restarted around the specified time. 
*/ void blk_delay_queue(struct request_queue *q, unsigned long msecs) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (likely(!blk_queue_dead(q))) queue_delayed_work(kblockd_workqueue, &q->delay_work, msecs_to_jiffies(msecs)); @@ -198,6 +260,9 @@ EXPORT_SYMBOL(blk_delay_queue); **/ void blk_start_queue_async(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + queue_flag_clear(QUEUE_FLAG_STOPPED, q); blk_run_queue_async(q); } @@ -210,11 +275,13 @@ EXPORT_SYMBOL(blk_start_queue_async); * Description: * blk_start_queue() will clear the stop flag on the queue, and call * the request_fn for the queue if it was in a stopped state when - * entered. Also see blk_stop_queue(). Queue lock must be held. + * entered. Also see blk_stop_queue(). **/ void blk_start_queue(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); WARN_ON(!irqs_disabled()); + WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); @@ -233,10 +300,13 @@ EXPORT_SYMBOL(blk_start_queue); * or if it simply chooses not to queue more I/O at one point, it can * call this function to prevent the request_fn from being called until * the driver has signalled it's ready to go again. This happens by calling - * blk_start_queue() to restart queue operations. Queue lock must be held. + * blk_start_queue() to restart queue operations. **/ void blk_stop_queue(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } @@ -289,6 +359,9 @@ EXPORT_SYMBOL(blk_sync_queue); */ inline void __blk_run_queue_uncond(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (unlikely(blk_queue_dead(q))) return; @@ -310,11 +383,13 @@ EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); * @q: The queue to run * * Description: - * See @blk_run_queue. 
This variant must be called with the queue lock - * held and interrupts disabled. + * See @blk_run_queue. */ void __blk_run_queue(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (unlikely(blk_queue_stopped(q))) return; @@ -328,10 +403,18 @@ EXPORT_SYMBOL(__blk_run_queue); * * Description: * Tells kblockd to perform the equivalent of @blk_run_queue on behalf - * of us. The caller must hold the queue lock. + * of us. + * + * Note: + * Since it is not allowed to run q->delay_work after blk_cleanup_queue() + * has canceled q->delay_work, callers must hold the queue lock to avoid + * race conditions between blk_cleanup_queue() and blk_run_queue_async(). */ void blk_run_queue_async(struct request_queue *q) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); } @@ -349,6 +432,8 @@ void blk_run_queue(struct request_queue *q) { unsigned long flags; + WARN_ON_ONCE(q->mq_ops); + spin_lock_irqsave(q->queue_lock, flags); __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); @@ -377,6 +462,7 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) int i; lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); while (true) { bool drain = false; @@ -455,6 +541,8 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) */ void blk_queue_bypass_start(struct request_queue *q) { + WARN_ON_ONCE(q->mq_ops); + spin_lock_irq(q->queue_lock); q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); @@ -481,6 +569,9 @@ EXPORT_SYMBOL_GPL(blk_queue_bypass_start); * @q: queue of interest * * Leave bypass mode and restore the normal queueing behavior. + * + * Note: although blk_queue_bypass_start() is only called for blk-sq queues, + * this function is called for both blk-sq and blk-mq queues. 
*/ void blk_queue_bypass_end(struct request_queue *q) { @@ -732,7 +823,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q->id < 0) goto fail_q; - q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!q->bio_split) goto fail_id; @@ -878,6 +969,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); int blk_init_allocated_queue(struct request_queue *q) { + WARN_ON_ONCE(q->mq_ops); + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size); if (!q->fq) return -ENOMEM; @@ -1015,6 +1108,8 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) struct request_list *rl; int on_thresh, off_thresh; + WARN_ON_ONCE(q->mq_ops); + spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); @@ -1077,6 +1172,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, int may_queue; req_flags_t rq_flags = RQF_ALLOCED; + lockdep_assert_held(q->queue_lock); + if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); @@ -1250,12 +1347,20 @@ static struct request *get_request(struct request_queue *q, unsigned int op, struct request_list *rl; struct request *rq; + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; + if (op & REQ_NOWAIT) { + blk_put_rl(rl); + return ERR_PTR(-EAGAIN); + } + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; @@ -1283,16 +1388,18 @@ retry: goto retry; } -static struct request *blk_old_get_request(struct request_queue *q, int rw, - gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, + unsigned int op, gfp_t gfp_mask) { struct request *rq; + WARN_ON_ONCE(q->mq_ops); + /* create ioc upfront */ create_io_context(gfp_mask, q->node); 
spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, NULL, gfp_mask); + rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1305,14 +1412,24 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, return rq; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) { - if (q->mq_ops) - return blk_mq_alloc_request(q, rw, + struct request *req; + + if (q->mq_ops) { + req = blk_mq_alloc_request(q, op, (gfp_mask & __GFP_DIRECT_RECLAIM) ? 0 : BLK_MQ_REQ_NOWAIT); - else - return blk_old_get_request(q, rw, gfp_mask); + if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) + q->mq_ops->initialize_rq_fn(req); + } else { + req = blk_old_get_request(q, op, gfp_mask); + if (!IS_ERR(req) && q->initialize_rq_fn) + q->initialize_rq_fn(req); + } + + return req; } EXPORT_SYMBOL(blk_get_request); @@ -1328,6 +1445,9 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_requeue_request(struct request_queue *q, struct request *rq) { + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); @@ -1402,9 +1522,6 @@ static void blk_pm_put_request(struct request *rq) static inline void blk_pm_put_request(struct request *rq) {} #endif -/* - * queue lock must be held - */ void __blk_put_request(struct request_queue *q, struct request *req) { req_flags_t rq_flags = req->rq_flags; @@ -1417,6 +1534,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) return; } + lockdep_assert_held(q->queue_lock); + blk_pm_put_request(req); elv_completed_request(q, req); @@ -1646,6 +1765,7 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio) req->ioprio = ioc->ioprio; else req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->write_hint = bio->bi_write_hint; blk_rq_bio_prep(req->q, req, bio); } 
EXPORT_SYMBOL_GPL(blk_init_request_from_bio); @@ -1665,10 +1785,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } @@ -1726,7 +1846,10 @@ get_rq: req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { __wbt_done(q->rq_wb, wb_acct); - bio->bi_error = PTR_ERR(req); + if (PTR_ERR(req) == -ENOMEM) + bio->bi_status = BLK_STS_RESOURCE; + else + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); goto out_unlock; } @@ -1881,7 +2004,7 @@ generic_make_request_checks(struct bio *bio) { struct request_queue *q; int nr_sectors = bio_sectors(bio); - int err = -EIO; + blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; struct hd_struct *part; @@ -1900,6 +2023,14 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + /* + * For a REQ_NOWAIT based request, return -EOPNOTSUPP + * if queue is not a request based queue. 
+ */ + + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) + goto not_supported; + part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1924,7 +2055,7 @@ generic_make_request_checks(struct bio *bio) !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { - err = 0; + status = BLK_STS_OK; goto end_io; } } @@ -1976,9 +2107,9 @@ generic_make_request_checks(struct bio *bio) return true; not_supported: - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; end_io: - bio->bi_error = err; + bio->bi_status = status; bio_endio(bio); return false; } @@ -2057,7 +2188,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, false) == 0)) { + if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2082,7 +2213,11 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { - bio_io_error(bio); + if (unlikely(!blk_queue_dying(q) && + (bio->bi_opf & REQ_NOWAIT))) + bio_wouldblock_error(bio); + else + bio_io_error(bio); } bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); @@ -2183,29 +2318,29 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, * @q: the queue to submit the request * @rq: the request being queued */ -int blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; int where = ELEVATOR_INSERT_BACK; if (blk_cloned_rq_check_limits(q, rq)) - return -EIO; + return BLK_STS_IOERR; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) - return -EIO; + return BLK_STS_IOERR; if (q->mq_ops) { 
if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); blk_mq_sched_insert_request(rq, false, true, false, false); - return 0; + return BLK_STS_OK; } spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); - return -ENODEV; + return BLK_STS_IOERR; } /* @@ -2222,7 +2357,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); - return 0; + return BLK_STS_OK; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); @@ -2238,9 +2373,6 @@ EXPORT_SYMBOL_GPL(blk_insert_cloned_request); * * Return: * The number of bytes to fail. - * - * Context: - * queue_lock must be held. */ unsigned int blk_rq_err_bytes(const struct request *rq) { @@ -2380,15 +2512,15 @@ void blk_account_io_start(struct request *rq, bool new_io) * Return: * Pointer to the request at the top of @q if available. Null * otherwise. - * - * Context: - * queue_lock must be held. */ struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + while ((rq = __elv_next_request(q)) != NULL) { rq = blk_pm_peek_request(q, rq); @@ -2456,15 +2588,14 @@ struct request *blk_peek_request(struct request_queue *q) rq = NULL; break; } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { - int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; - rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); - __blk_end_request_all(rq, err); + __blk_end_request_all(rq, ret == BLKPREP_INVALID ? + BLK_STS_TARGET : BLK_STS_IOERR); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; @@ -2505,12 +2636,12 @@ void blk_dequeue_request(struct request *rq) * * Block internal functions which don't want to start timer should * call blk_dequeue_request(). 
- * - * Context: - * queue_lock must be held. */ void blk_start_request(struct request *req) { + lockdep_assert_held(req->q->queue_lock); + WARN_ON_ONCE(req->q->mq_ops); + blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { @@ -2535,14 +2666,14 @@ EXPORT_SYMBOL(blk_start_request); * Return: * Pointer to the request at the top of @q if available. Null * otherwise. - * - * Context: - * queue_lock must be held. */ struct request *blk_fetch_request(struct request_queue *q) { struct request *rq; + lockdep_assert_held(q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + rq = blk_peek_request(q); if (rq) blk_start_request(rq); @@ -2553,7 +2684,7 @@ EXPORT_SYMBOL(blk_fetch_request); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete @req * * Description: @@ -2572,49 +2703,19 @@ EXPORT_SYMBOL(blk_fetch_request); * %false - this request doesn't have any more data * %true - this request has more data **/ -bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) { int total_bytes; - trace_block_rq_complete(req, error, nr_bytes); + trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); if (!req->bio) return false; - if (error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET)) { - char *error_type; - - switch (error) { - case -ENOLINK: - error_type = "recoverable transport"; - break; - case -EREMOTEIO: - error_type = "critical target"; - break; - case -EBADE: - error_type = "critical nexus"; - break; - case -ETIMEDOUT: - error_type = "timeout"; - break; - case -ENOSPC: - error_type = "critical space allocation"; - break; - case -ENODATA: - error_type = "critical medium"; - break; - case -EIO: - default: - error_type = "I/O"; - break; - } - 
printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", - __func__, error_type, req->rq_disk ? - req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); - - } + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET))) + print_req_error(req, error); blk_account_io_completion(req, nr_bytes); @@ -2680,7 +2781,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) } EXPORT_SYMBOL_GPL(blk_update_request); -static bool blk_update_bidi_request(struct request *rq, int error, +static bool blk_update_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { @@ -2718,13 +2819,13 @@ void blk_unprep_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_unprep_request); -/* - * queue lock must be held - */ -void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, blk_status_t error) { struct request_queue *q = req->q; + lockdep_assert_held(req->q->queue_lock); + WARN_ON_ONCE(q->mq_ops); + if (req->rq_flags & RQF_STATS) blk_stat_add(req); @@ -2758,7 +2859,7 @@ EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request * @rq: the request to complete - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * @@ -2772,12 +2873,14 @@ EXPORT_SYMBOL(blk_finish_request); * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool blk_end_bidi_request(struct request *rq, int error, +static bool blk_end_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; + WARN_ON_ONCE(q->mq_ops); + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; @@ -2791,7 +2894,7 @@ static bool blk_end_bidi_request(struct request 
*rq, int error, /** * __blk_end_bidi_request - Complete a bidi request with queue lock held * @rq: the request to complete - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * @@ -2803,9 +2906,12 @@ static bool blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -static bool __blk_end_bidi_request(struct request *rq, int error, +static bool __blk_end_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { + lockdep_assert_held(rq->q->queue_lock); + WARN_ON_ONCE(rq->q->mq_ops); + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; @@ -2817,7 +2923,7 @@ static bool __blk_end_bidi_request(struct request *rq, int error, /** * blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete * * Description: @@ -2828,8 +2934,10 @@ static bool __blk_end_bidi_request(struct request *rq, int error, * %false - we are done with this request * %true - still buffers pending for this request **/ -bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +bool blk_end_request(struct request *rq, blk_status_t error, + unsigned int nr_bytes) { + WARN_ON_ONCE(rq->q->mq_ops); return blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(blk_end_request); @@ -2837,12 +2945,12 @@ EXPORT_SYMBOL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @error: block status code * * Description: * Completely finish @rq. 
*/ -void blk_end_request_all(struct request *rq, int error) +void blk_end_request_all(struct request *rq, blk_status_t error) { bool pending; unsigned int bidi_bytes = 0; @@ -2858,7 +2966,7 @@ EXPORT_SYMBOL(blk_end_request_all); /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed - * @error: %0 for success, < %0 for error + * @error: block status code * @nr_bytes: number of bytes to complete * * Description: @@ -2868,8 +2976,12 @@ EXPORT_SYMBOL(blk_end_request_all); * %false - we are done with this request * %true - still buffers pending for this request **/ -bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +bool __blk_end_request(struct request *rq, blk_status_t error, + unsigned int nr_bytes) { + lockdep_assert_held(rq->q->queue_lock); + WARN_ON_ONCE(rq->q->mq_ops); + return __blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(__blk_end_request); @@ -2877,16 +2989,19 @@ EXPORT_SYMBOL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish - * @error: %0 for success, < %0 for error + * @error: block status code * * Description: * Completely finish @rq. Must be called with queue lock held. */ -void __blk_end_request_all(struct request *rq, int error) +void __blk_end_request_all(struct request *rq, blk_status_t error) { bool pending; unsigned int bidi_bytes = 0; + lockdep_assert_held(rq->q->queue_lock); + WARN_ON_ONCE(rq->q->mq_ops); + if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); @@ -2898,7 +3013,7 @@ EXPORT_SYMBOL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error + * @error: block status code * * Description: * Complete the current consecutively mapped chunk from @rq. 
Must @@ -2908,7 +3023,7 @@ EXPORT_SYMBOL(__blk_end_request_all); * %false - we are done with this request * %true - still buffers pending for this request */ -bool __blk_end_request_cur(struct request *rq, int error) +bool __blk_end_request_cur(struct request *rq, blk_status_t error) { return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } @@ -3151,6 +3266,8 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, bool from_schedule) __releases(q->queue_lock) { + lockdep_assert_held(q->queue_lock); + trace_block_unplug(q, depth, !from_schedule); if (from_schedule) @@ -3249,7 +3366,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) * Short-circuit if @q is dead */ if (unlikely(blk_queue_dying(q))) { - __blk_end_request_all(rq, -ENODEV); + __blk_end_request_all(rq, BLK_STS_IOERR); continue; } diff --git a/block/blk-exec.c b/block/blk-exec.c index a9451e3b8587..5c0f3dc446dc 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -16,7 +16,7 @@ * @rq: request to complete * @error: end I/O status of the request */ -static void blk_end_sync_rq(struct request *rq, int error) +static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data; @@ -69,7 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; - __blk_end_request_all(rq, -ENXIO); + __blk_end_request_all(rq, BLK_STS_IOERR); spin_unlock_irq(q->queue_lock); return; } diff --git a/block/blk-flush.c b/block/blk-flush.c index c4e0880b54bb..ed5fe322abba 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -164,7 +164,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) */ static bool blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, - unsigned int seq, int error) + unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = 
&fq->flush_queue[fq->flush_pending_idx]; @@ -216,7 +216,7 @@ static bool blk_flush_complete_seq(struct request *rq, return kicked | queued; } -static void flush_end_io(struct request *flush_rq, int error) +static void flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -341,11 +341,13 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) return blk_flush_queue_rq(flush_rq, false); } -static void flush_data_end_io(struct request *rq, int error) +static void flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + lockdep_assert_held(q->queue_lock); + /* * Updating q->in_flight[] here for making this tag usable * early. Because in blk_queue_start_tag(), @@ -382,7 +384,7 @@ static void flush_data_end_io(struct request *rq, int error) blk_run_queue_async(q); } -static void mq_flush_data_end_io(struct request *rq, int error) +static void mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx; @@ -411,9 +413,6 @@ static void mq_flush_data_end_io(struct request *rq, int error) * or __blk_mq_run_hw_queue() to dispatch request. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. - * - * CONTEXT: - * spin_lock_irq(q->queue_lock) in !mq case */ void blk_insert_flush(struct request *rq) { @@ -422,6 +421,9 @@ void blk_insert_flush(struct request *rq) unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + if (!q->mq_ops) + lockdep_assert_held(q->queue_lock); + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. 
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 0f891a9aff4d..feb30570eaf5 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -384,9 +384,9 @@ static struct kobj_type integrity_ktype = { .sysfs_ops = &integrity_ops, }; -static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) +static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) { - return 0; + return BLK_STS_OK; } static const struct blk_integrity_profile nop_profile = { diff --git a/block/blk-map.c b/block/blk-map.c index 3b5cb863318f..2547016aa7aa 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -16,6 +16,8 @@ */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { + blk_queue_bounce(rq->q, &bio); + if (!rq->bio) { blk_rq_bio_prep(rq->q, rq, bio); } else { @@ -72,15 +74,13 @@ static int __blk_rq_map_user_iov(struct request *rq, map_data->offset += bio->bi_iter.bi_size; orig_bio = bio; - blk_queue_bounce(q, &bio); /* * We link the bounce buffer in and could have to traverse it * later so we have to get a ref to prevent it from being freed */ - bio_get(bio); - ret = blk_rq_append_bio(rq, bio); + bio_get(bio); if (ret) { bio_endio(bio); __blk_rq_unmap_user(orig_bio); @@ -249,7 +249,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return ret; } - blk_queue_bounce(q, &rq->bio); return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3990ae406341..99038830fb42 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -108,31 +108,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bool do_split = true; struct bio *new = NULL; const unsigned max_sectors = get_max_io_size(q, bio); - unsigned bvecs = 0; bio_for_each_segment(bv, bio, iter) { /* - * With arbitrary bio size, the incoming bio may be very - * big. 
We have to split the bio into small bios so that - * each holds at most BIO_MAX_PAGES bvecs because - * bio_clone() can fail to allocate big bvecs. - * - * It should have been better to apply the limit per - * request queue in which bio_clone() is involved, - * instead of globally. The biggest blocker is the - * bio_clone() in bio bounce. - * - * If bio is splitted by this reason, we should have - * allowed to continue bios merging, but don't do - * that now for making the change simple. - * - * TODO: deal with bio bounce's bio_clone() gracefully - * and convert the global limit into per-queue limit. - */ - if (bvecs++ >= BIO_MAX_PAGES) - goto split; - - /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ @@ -202,8 +180,7 @@ split: return do_split ? new : NULL; } -void blk_queue_split(struct request_queue *q, struct bio **bio, - struct bio_set *bs) +void blk_queue_split(struct request_queue *q, struct bio **bio) { struct bio *split, *res; unsigned nsegs; @@ -211,13 +188,13 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, switch (bio_op(*bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - split = blk_bio_discard_split(q, *bio, bs, &nsegs); + split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs); break; case REQ_OP_WRITE_ZEROES: - split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs); + split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs); break; case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, bs, &nsegs); + split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs); break; default: split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); @@ -671,6 +648,9 @@ static void blk_account_io_merge(struct request *req) static struct request *attempt_merge(struct request_queue *q, struct request *req, struct request *next) { + if (!q->mq_ops) + lockdep_assert_held(q->queue_lock); + if (!rq_mergeable(req) || !rq_mergeable(next)) return NULL; @@ -693,6 
+673,13 @@ static struct request *attempt_merge(struct request_queue *q, return NULL; /* + * Don't allow merge of different write hints, or for a hint with + * non-hint IO. + */ + if (req->write_hint != next->write_hint) + return NULL; + + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn * will have updated segment counts, update sector @@ -811,6 +798,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; + /* + * Don't allow merge of different write hints, or for a hint with + * non-hint IO. + */ + if (rq->write_hint != bio->bi_write_hint) + return false; + return true; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8e61e8640e17..2cca4fc43f45 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -14,10 +14,15 @@ #include "blk.h" #include "blk-mq.h" -static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, - const int cpu) +static int cpu_to_queue_index(unsigned int nr_queues, const int cpu, + const struct cpumask *online_mask) { - return cpu * nr_queues / nr_cpus; + /* + * Non online CPU will be mapped to queue index 0. 
+ */ + if (!cpumask_test_cpu(cpu, online_mask)) + return 0; + return cpu % nr_queues; } static int get_first_sibling(unsigned int cpu) @@ -36,55 +41,26 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set) unsigned int *map = set->mq_map; unsigned int nr_queues = set->nr_hw_queues; const struct cpumask *online_mask = cpu_online_mask; - unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; - cpumask_var_t cpus; - - if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) - return -ENOMEM; - - cpumask_clear(cpus); - nr_cpus = nr_uniq_cpus = 0; - for_each_cpu(i, online_mask) { - nr_cpus++; - first_sibling = get_first_sibling(i); - if (!cpumask_test_cpu(first_sibling, cpus)) - nr_uniq_cpus++; - cpumask_set_cpu(i, cpus); - } - - queue = 0; - for_each_possible_cpu(i) { - if (!cpumask_test_cpu(i, online_mask)) { - map[i] = 0; - continue; - } + unsigned int cpu, first_sibling; + for_each_possible_cpu(cpu) { /* - * Easy case - we have equal or more hardware queues. Or - * there are no thread siblings to take into account. Do - * 1:1 if enough, or sequential mapping if less. + * First do sequential mapping between CPUs and queues. + * In case we still have CPUs to map, and we have some number of + * threads per cores then map sibling threads to the same queue for + * performace optimizations. */ - if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { - map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); - queue++; - continue; + if (cpu < nr_queues) { + map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask); + } else { + first_sibling = get_first_sibling(cpu); + if (first_sibling == cpu) + map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask); + else + map[cpu] = map[first_sibling]; } - - /* - * Less then nr_cpus queues, and we have some number of - * threads per cores. Map sibling threads to the same - * queue. 
- */ - first_sibling = get_first_sibling(i); - if (first_sibling == i) { - map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, - queue); - queue++; - } else - map[i] = map[first_sibling]; } - free_cpumask_var(cpus); return 0; } EXPORT_SYMBOL_GPL(blk_mq_map_queues); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 803aed4d7221..9ebc2945f991 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -114,10 +114,12 @@ static ssize_t queue_state_write(void *data, const char __user *buf, blk_mq_run_hw_queues(q, true); } else if (strcmp(op, "start") == 0) { blk_mq_start_stopped_hw_queues(q, true); + } else if (strcmp(op, "kick") == 0) { + blk_mq_kick_requeue_list(q); } else { pr_err("%s: unsupported operation '%s'\n", __func__, op); inval: - pr_err("%s: use either 'run' or 'start'\n", __func__); + pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); return -EINVAL; } return count; @@ -133,6 +135,29 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) } } +static int queue_write_hint_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + int i; + + for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) + seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]); + + return 0; +} + +static ssize_t queue_write_hint_store(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct request_queue *q = data; + int i; + + for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) + q->write_hints[i] = 0; + + return count; +} + static int queue_poll_stat_show(void *data, struct seq_file *m) { struct request_queue *q = data; @@ -267,6 +292,14 @@ static const char *const rqf_name[] = { }; #undef RQF_NAME +#define RQAF_NAME(name) [REQ_ATOM_##name] = #name +static const char *const rqaf_name[] = { + RQAF_NAME(COMPLETE), + RQAF_NAME(STARTED), + RQAF_NAME(POLL_SLEPT), +}; +#undef RQAF_NAME + int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -283,6 
+316,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); + seq_puts(m, ", .atomic_flags="); + blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) @@ -298,6 +333,37 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); +static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) + __acquires(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_lock_irq(&q->requeue_lock); + return seq_list_start(&q->requeue_list, *pos); +} + +static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + + return seq_list_next(v, &q->requeue_list, pos); +} + +static void queue_requeue_list_stop(struct seq_file *m, void *v) + __releases(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_unlock_irq(&q->requeue_lock); +} + +static const struct seq_operations queue_requeue_list_seq_ops = { + .start = queue_requeue_list_start, + .next = queue_requeue_list_next, + .stop = queue_requeue_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { @@ -329,6 +395,36 @@ static const struct seq_operations hctx_dispatch_seq_ops = { .show = blk_mq_debugfs_rq_show, }; +struct show_busy_params { + struct seq_file *m; + struct blk_mq_hw_ctx *hctx; +}; + +/* + * Note: the state of a request may change while this function is in progress, + * e.g. due to a concurrent blk_mq_finish_request() call. 
+ */ +static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +{ + const struct show_busy_params *params = data; + + if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && + test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + __blk_mq_debugfs_rq_show(params->m, + list_entry_rq(&rq->queuelist)); +} + +static int hctx_busy_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct show_busy_params params = { .m = m, .hctx = hctx }; + + blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, + &params); + + return 0; +} + static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -655,7 +751,9 @@ const struct file_operations blk_mq_debugfs_fops = { static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { {"poll_stat", 0400, queue_poll_stat_show}, + {"requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops}, {"state", 0600, queue_state_show, queue_state_write}, + {"write_hints", 0600, queue_write_hint_show, queue_write_hint_store}, {}, }; @@ -663,6 +761,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, + {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0ded5e846335..7f0dc48ffb40 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -31,11 +31,10 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); -static void __blk_mq_sched_assign_ioc(struct request_queue *q, - struct request *rq, - struct bio *bio, - struct io_context *ioc) +void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; + struct io_context *ioc = rq_ioc(bio); struct 
io_cq *icq; spin_lock_irq(q->queue_lock); @@ -47,25 +46,8 @@ static void __blk_mq_sched_assign_ioc(struct request_queue *q, if (!icq) return; } - + get_io_context(icq->ioc); rq->elv.icq = icq; - if (!blk_mq_sched_get_rq_priv(q, rq, bio)) { - rq->rq_flags |= RQF_ELVPRIV; - get_io_context(icq->ioc); - return; - } - - rq->elv.icq = NULL; -} - -static void blk_mq_sched_assign_ioc(struct request_queue *q, - struct request *rq, struct bio *bio) -{ - struct io_context *ioc; - - ioc = rq_ioc(bio); - if (ioc) - __blk_mq_sched_assign_ioc(q, rq, bio, ioc); } /* @@ -107,71 +89,6 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -struct request *blk_mq_sched_get_request(struct request_queue *q, - struct bio *bio, - unsigned int op, - struct blk_mq_alloc_data *data) -{ - struct elevator_queue *e = q->elevator; - struct request *rq; - - blk_queue_enter_live(q); - data->q = q; - if (likely(!data->ctx)) - data->ctx = blk_mq_get_ctx(q); - if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->ctx->cpu); - - if (e) { - data->flags |= BLK_MQ_REQ_INTERNAL; - - /* - * Flush requests are special and go directly to the - * dispatch list. 
- */ - if (!op_is_flush(op) && e->type->ops.mq.get_request) { - rq = e->type->ops.mq.get_request(q, op, data); - if (rq) - rq->rq_flags |= RQF_QUEUED; - } else - rq = __blk_mq_alloc_request(data, op); - } else { - rq = __blk_mq_alloc_request(data, op); - } - - if (rq) { - if (!op_is_flush(op)) { - rq->elv.icq = NULL; - if (e && e->type->icq_cache) - blk_mq_sched_assign_ioc(q, rq, bio); - } - data->hctx->queued++; - return rq; - } - - blk_queue_exit(q); - return NULL; -} - -void blk_mq_sched_put_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - - if (rq->rq_flags & RQF_ELVPRIV) { - blk_mq_sched_put_rq_priv(rq->q, rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } - - if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) - e->type->ops.mq.put_request(rq); - else - blk_mq_finish_request(rq); -} - void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -180,7 +97,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) bool did_work = false; LIST_HEAD(rq_list); - if (unlikely(blk_mq_hctx_stopped(hctx))) + /* RCU or SRCU read lock is needed before checking quiesced flag */ + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; hctx->run++; @@ -260,19 +178,73 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); +/* + * Reverse check our software queue for entries that we could potentially + * merge with. Currently includes a hand-wavy stop count of 8, to not spend + * too much time checking for merges. 
+ */ +static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_ctx *ctx, struct bio *bio) +{ + struct request *rq; + int checked = 8; + + lockdep_assert_held(&ctx->lock); + + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { + bool merged = false; + + if (!checked--) + break; + + if (!blk_rq_merge_ok(rq, bio)) + continue; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_back_merge(q, rq, bio); + break; + case ELEVATOR_FRONT_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_front_merge(q, rq, bio); + break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + continue; + } + + if (merged) + ctx->rq_merged++; + return merged; + } + + return false; +} + bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + bool ret = false; - if (e->type->ops.mq.bio_merge) { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - + if (e && e->type->ops.mq.bio_merge) { blk_mq_put_ctx(ctx); return e->type->ops.mq.bio_merge(hctx, bio); } - return false; + if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { + /* default per sw-queue merge */ + spin_lock(&ctx->lock); + ret = blk_mq_attempt_merge(q, ctx, bio); + spin_unlock(&ctx->lock); + } + + blk_mq_put_ctx(ctx); + return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5007edece51a..9267d0b7c197 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -7,8 +7,7 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)); -struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio 
*bio, unsigned int op, struct blk_mq_alloc_data *data); -void blk_mq_sched_put_request(struct request *rq); +void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, @@ -38,35 +37,12 @@ int blk_mq_sched_init(struct request_queue *q); static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { - struct elevator_queue *e = q->elevator; - - if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return false; return __blk_mq_sched_bio_merge(q, bio); } -static inline int blk_mq_sched_get_rq_priv(struct request_queue *q, - struct request *rq, - struct bio *bio) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.mq.get_rq_priv) - return e->type->ops.mq.get_rq_priv(q, rq, bio); - - return 0; -} - -static inline void blk_mq_sched_put_rq_priv(struct request_queue *q, - struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.mq.put_rq_priv) - e->type->ops.mq.put_rq_priv(q, rq); -} - static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/block/blk-mq.c b/block/blk-mq.c index 958cedaff8b8..05dfa3f270ae 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -42,7 +42,6 @@ static LIST_HEAD(all_q_list); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); -static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync); static int blk_mq_poll_stats_bkt(const struct request *rq) { @@ -154,13 +153,28 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/* + * FIXME: replace the scsi_internal_device_*block_nowait() calls in the + * mpt3sas driver such that this function can be removed. 
+ */ +void blk_mq_quiesce_queue_nowait(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); + /** - * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished + * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() - * callback function is invoked. Additionally, it is not prevented that - * new queue_rq() calls occur unless the queue has been stopped first. + * callback function is invoked. Once this function is returned, we make + * sure no dispatch can happen until the queue is unquiesced via + * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { @@ -168,11 +182,11 @@ void blk_mq_quiesce_queue(struct request_queue *q) unsigned int i; bool rcu = false; - __blk_mq_stop_hw_queues(q, true); + blk_mq_quiesce_queue_nowait(q); queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(&hctx->queue_rq_srcu); + synchronize_srcu(hctx->queue_rq_srcu); else rcu = true; } @@ -181,6 +195,26 @@ void blk_mq_quiesce_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); +/* + * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() + * @q: request queue. + * + * This function recovers queue into the state before quiescing + * which is done by blk_mq_quiesce_queue. 
+ */ +void blk_mq_unquiesce_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + /* dispatch requests which are inserted during quiescing */ + blk_mq_run_hw_queues(q, true); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -204,15 +238,33 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, unsigned int op) +static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, + unsigned int tag, unsigned int op) { + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct request *rq = tags->static_rqs[tag]; + + rq->rq_flags = 0; + + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + rq->tag = tag; + rq->internal_tag = -1; + data->hctx->tags->rqs[rq->tag] = rq; + } + INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = q; - rq->mq_ctx = ctx; + rq->q = data->q; + rq->mq_ctx = data->ctx; rq->cmd_flags = op; - if (blk_queue_io_stat(q)) + if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; @@ -241,44 +293,60 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[op_is_sync(op)]++; + data->ctx->rq_dispatched[op_is_sync(op)]++; + return rq; } -EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, - unsigned int op) +static struct request *blk_mq_get_request(struct 
request_queue *q, + struct bio *bio, unsigned int op, + struct blk_mq_alloc_data *data) { + struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - tag = blk_mq_get_tag(data); - if (tag != BLK_MQ_TAG_FAIL) { - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + blk_queue_enter_live(q); + data->q = q; + if (likely(!data->ctx)) + data->ctx = blk_mq_get_ctx(q); + if (likely(!data->hctx)) + data->hctx = blk_mq_map_queue(q, data->ctx->cpu); + if (op & REQ_NOWAIT) + data->flags |= BLK_MQ_REQ_NOWAIT; - rq = tags->static_rqs[tag]; + if (e) { + data->flags |= BLK_MQ_REQ_INTERNAL; - if (data->flags & BLK_MQ_REQ_INTERNAL) { - rq->tag = -1; - rq->internal_tag = tag; - } else { - if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } - rq->tag = tag; - rq->internal_tag = -1; - data->hctx->tags->rqs[rq->tag] = rq; - } + /* + * Flush requests are special and go directly to the + * dispatch list. + */ + if (!op_is_flush(op) && e->type->ops.mq.limit_depth) + e->type->ops.mq.limit_depth(op, data); + } - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); - return rq; + tag = blk_mq_get_tag(data); + if (tag == BLK_MQ_TAG_FAIL) { + blk_queue_exit(q); + return NULL; } - return NULL; + rq = blk_mq_rq_ctx_init(data, tag, op); + if (!op_is_flush(op)) { + rq->elv.icq = NULL; + if (e && e->type->ops.mq.prepare_request) { + if (e->type->icq_cache && rq_ioc(bio)) + blk_mq_sched_assign_ioc(rq, bio); + + e->type->ops.mq.prepare_request(rq, bio); + rq->rq_flags |= RQF_ELVPRIV; + } + } + data->hctx->queued++; + return rq; } -EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, unsigned int flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; @@ -289,7 +357,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, if (ret) return ERR_PTR(ret); - rq 
= blk_mq_sched_get_request(q, NULL, rw, &alloc_data); + rq = blk_mq_get_request(q, NULL, op, &alloc_data); blk_mq_put_ctx(alloc_data.ctx); blk_queue_exit(q); @@ -304,8 +372,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, } EXPORT_SYMBOL(blk_mq_alloc_request); -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, - unsigned int flags, unsigned int hctx_idx) +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, + unsigned int op, unsigned int flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -340,7 +408,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, cpu = cpumask_first(alloc_data.hctx->cpumask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); + rq = blk_mq_get_request(q, NULL, op, &alloc_data); blk_queue_exit(q); @@ -351,17 +419,28 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct request *rq) +void blk_mq_free_request(struct request *rq) { - const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + const int sched_tag = rq->internal_tag; + if (rq->rq_flags & RQF_ELVPRIV) { + if (e && e->type->ops.mq.finish_request) + e->type->ops.mq.finish_request(rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); wbt_done(q->rq_wb, &rq->issue_stat); - rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); @@ -372,29 +451,9 @@ void 
__blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_sched_restart(hctx); blk_queue_exit(q); } - -static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - - ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_finish_request(hctx, ctx, rq); -} - -void blk_mq_finish_request(struct request *rq) -{ - blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); -} -EXPORT_SYMBOL_GPL(blk_mq_finish_request); - -void blk_mq_free_request(struct request *rq) -{ - blk_mq_sched_put_request(rq); -} EXPORT_SYMBOL_GPL(blk_mq_free_request); -inline void __blk_mq_end_request(struct request *rq, int error) +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { blk_account_io_done(rq); @@ -409,7 +468,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) } EXPORT_SYMBOL(__blk_mq_end_request); -void blk_mq_end_request(struct request *rq, int error) +void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); @@ -753,50 +812,6 @@ static void blk_mq_timeout_work(struct work_struct *work) blk_queue_exit(q); } -/* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. 
- */ -static bool blk_mq_attempt_merge(struct request_queue *q, - struct blk_mq_ctx *ctx, struct bio *bio) -{ - struct request *rq; - int checked = 8; - - list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { - bool merged = false; - - if (!checked--) - break; - - if (!blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(q, rq, bio); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(q, rq, bio); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - continue; - } - - if (merged) - ctx->rq_merged++; - return merged; - } - - return false; -} - struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; @@ -968,7 +983,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) { struct blk_mq_hw_ctx *hctx; struct request *rq; - int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; + int errors, queued; if (list_empty(list)) return false; @@ -979,6 +994,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) errors = queued = 0; do { struct blk_mq_queue_data bd; + blk_status_t ret; rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { @@ -1019,25 +1035,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) } ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_MQ_RQ_QUEUE_OK: - queued++; - break; - case BLK_MQ_RQ_QUEUE_BUSY: + if (ret == BLK_STS_RESOURCE) { blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; - default: - pr_err("blk-mq: bad return on queue: %d\n", ret); - case BLK_MQ_RQ_QUEUE_ERROR: + } + + if (unlikely(ret != BLK_STS_OK)) { errors++; - blk_mq_end_request(rq, -EIO); - break; + 
blk_mq_end_request(rq, BLK_STS_IOERR); + continue; } - if (ret == BLK_MQ_RQ_QUEUE_BUSY) - break; + queued++; } while (!list_empty(list)); hctx->dispatched[queued_to_index(queued)]++; @@ -1075,7 +1086,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before - * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq + * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. */ if (!blk_mq_sched_needs_restart(hctx) && @@ -1100,9 +1111,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } else { might_sleep(); - srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); blk_mq_sched_dispatch_requests(hctx); - srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } } @@ -1134,8 +1145,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { - if (unlikely(blk_mq_hctx_stopped(hctx) || - !blk_mq_hw_queue_mapped(hctx))) + if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) + return; + + if (unlikely(blk_mq_hctx_stopped(hctx))) return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { @@ -1201,34 +1214,39 @@ bool blk_mq_queue_stopped(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_queue_stopped); -static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync) +/* + * This function is often used for pausing .queue_rq() by driver when + * there isn't enough resource or some conditions aren't satisfied, and + * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * + * We do not guarantee that dispatch can be drained or blocked + * after blk_mq_stop_hw_queue() returns. Please use + * blk_mq_quiesce_queue() for that requirement. 
+ */ +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - if (sync) - cancel_delayed_work_sync(&hctx->run_work); - else - cancel_delayed_work(&hctx->run_work); + cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } - -void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) -{ - __blk_mq_stop_hw_queue(hctx, false); -} EXPORT_SYMBOL(blk_mq_stop_hw_queue); -static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync) +/* + * This function is often used for pausing .queue_rq() by driver when + * there isn't enough resource or some conditions aren't satisfied, and + * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * + * We do not guarantee that dispatch can be drained or blocked + * after blk_mq_stop_hw_queues() returns. Please use + * blk_mq_quiesce_queue() for that requirement. + */ +void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) - __blk_mq_stop_hw_queue(hctx, sync); -} - -void blk_mq_stop_hw_queues(struct request_queue *q) -{ - __blk_mq_stop_hw_queues(q, false); + blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); @@ -1295,7 +1313,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { - if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) return; /* @@ -1317,6 +1335,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, { struct blk_mq_ctx *ctx = rq->mq_ctx; + lockdep_assert_held(&ctx->lock); + trace_block_rq_insert(hctx->queue, rq); if (at_head) @@ -1330,6 +1350,8 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct blk_mq_ctx *ctx = rq->mq_ctx; + lockdep_assert_held(&ctx->lock); + __blk_mq_insert_req_list(hctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1427,30 +1449,13 @@ static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 
!blk_queue_nomerges(hctx->queue); } -static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct request *rq, struct bio *bio) +static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq) { - if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { - blk_mq_bio_to_request(rq, bio); - spin_lock(&ctx->lock); -insert_rq: - __blk_mq_insert_request(hctx, rq, false); - spin_unlock(&ctx->lock); - return false; - } else { - struct request_queue *q = hctx->queue; - - spin_lock(&ctx->lock); - if (!blk_mq_attempt_merge(q, ctx, bio)) { - blk_mq_bio_to_request(rq, bio); - goto insert_rq; - } - - spin_unlock(&ctx->lock); - __blk_mq_finish_request(hctx, ctx, rq); - return true; - } + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); } static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) @@ -1471,10 +1476,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, .last = true, }; blk_qc_t new_cookie; - int ret; + blk_status_t ret; bool run_queue = true; - if (blk_mq_hctx_stopped(hctx)) { + /* RCU or SRCU read lock is needed before checking quiesced flag */ + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; goto insert; } @@ -1493,18 +1499,19 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, * would have done */ ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) { + switch (ret) { + case BLK_STS_OK: *cookie = new_cookie; return; - } - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + case BLK_STS_RESOURCE: + __blk_mq_requeue_request(rq); + goto insert; + default: *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, -EIO); + blk_mq_end_request(rq, ret); return; } - __blk_mq_requeue_request(rq); insert: blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); } @@ -1521,9 +1528,9 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx 
*hctx, might_sleep(); - srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); __blk_mq_try_issue_directly(hctx, rq, cookie, true); - srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } } @@ -1541,7 +1548,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); @@ -1559,9 +1566,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_getrq(q, bio, bio->bi_opf); - rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); + rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); return BLK_QC_T_NONE; } @@ -1639,11 +1648,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_sched_insert_request(rq, false, true, true, true); - } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + } else { blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_queue_io(data.hctx, data.ctx, rq); blk_mq_run_hw_queue(data.hctx, true); - } else - blk_mq_put_ctx(data.ctx); + } return cookie; } @@ -1866,7 +1876,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(&hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->queue_rq_srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -1900,7 +1910,6 @@ static int blk_mq_init_hctx(struct request_queue *q, spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; - hctx->queue_num = hctx_idx; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); @@ -1939,7 +1948,7 @@ static int blk_mq_init_hctx(struct request_queue *q, goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(&hctx->queue_rq_srcu); + init_srcu_struct(hctx->queue_rq_srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2224,6 +2233,20 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) +{ + int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); + + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + __alignof__(struct blk_mq_hw_ctx)) != + sizeof(struct blk_mq_hw_ctx)); + + if (tag_set->flags & BLK_MQ_F_BLOCKING) + hw_ctx_size += sizeof(struct srcu_struct); + + return hw_ctx_size; +} + static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -2238,7 +2261,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, continue; node = blk_mq_hw_queue_to_node(q->mq_map, i); - hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), + hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), GFP_KERNEL, node); if (!hctxs[i]) break; diff --git a/block/blk-mq.h b/block/blk-mq.h index cc67b48e3551..1a06fdf9fd4d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,17 +128,6 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data return data->hctx->tags; } -/* - * Internal helpers for request allocation/init/free - */ -void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, unsigned int op); -void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct request *rq); -void blk_mq_finish_request(struct request *rq); -struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, - unsigned int op); - static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_STOPPED, 
&hctx->state); diff --git a/block/blk-settings.c b/block/blk-settings.c index 4fa81ed383ca..be1f115b538b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -172,11 +172,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->nr_batching = BLK_BATCH_REQ; blk_set_default_limits(&q->limits); - - /* - * by default assume old behaviour and bounce for any highmem page - */ - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); } EXPORT_SYMBOL(blk_queue_make_request); diff --git a/block/blk-tag.c b/block/blk-tag.c index 07cc329fa4b0..2290f65b9d73 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -258,15 +258,14 @@ EXPORT_SYMBOL(blk_queue_resize_tags); * all transfers have been done for a request. It's important to call * this function before end_that_request_last(), as that will put the * request back on the free list thus corrupting the internal tag list. - * - * Notes: - * queue lock must be held. **/ void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; unsigned tag = rq->tag; /* negative tags invalid */ + lockdep_assert_held(q->queue_lock); + BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); @@ -307,9 +306,6 @@ EXPORT_SYMBOL(blk_queue_end_tag); * calling this function. The request will also be removed from * the request queue, so it's the drivers responsibility to readd * it if it should need to be restarted for some reason. - * - * Notes: - * queue lock must be held. **/ int blk_queue_start_tag(struct request_queue *q, struct request *rq) { @@ -317,6 +313,8 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) unsigned max_depth; int tag; + lockdep_assert_held(q->queue_lock); + if (unlikely((rq->rq_flags & RQF_QUEUED))) { printk(KERN_ERR "%s: request %p for device [%s] already tagged %d", @@ -389,14 +387,13 @@ EXPORT_SYMBOL(blk_queue_start_tag); * Hardware conditions may dictate a need to stop all pending requests. 
* In this case, we will safely clear the block side of the tag queue and * readd all requests to the request queue in the right order. - * - * Notes: - * queue lock must be held. **/ void blk_queue_invalidate_tags(struct request_queue *q) { struct list_head *tmp, *n; + lockdep_assert_held(q->queue_lock); + list_for_each_safe(tmp, n, &q->tag_busy_list) blk_requeue_request(q, list_entry_rq(tmp)); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index cbff183f3d9f..17ec83bb0900 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -189,13 +189,15 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. - * Queue lock must be held for the non-mq case, mq case doesn't care. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; + if (!q->mq_ops) + lockdep_assert_held(q->queue_lock); + /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/block/blk.h b/block/blk.h index 83c8e1100525..01ebb8185f6b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -143,6 +143,8 @@ static inline struct request *__elv_next_request(struct request_queue *q) struct request *rq; struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + WARN_ON_ONCE(q->mq_ops); + while (1) { if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); @@ -334,4 +336,17 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif +#ifdef CONFIG_BOUNCE +extern int init_emergency_isa_pool(void); +extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); +#else +static inline int init_emergency_isa_pool(void) +{ + return 0; +} +static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) +{ +} +#endif /* 
CONFIG_BOUNCE */ + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index 1cb5dd3a5da1..5793c2dc1a15 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -22,10 +22,12 @@ #include <asm/tlbflush.h> #include <trace/events/block.h> +#include "blk.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 +static struct bio_set *bounce_bio_set, *bounce_bio_split; static mempool_t *page_pool, *isa_page_pool; #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) @@ -40,6 +42,14 @@ static __init int init_emergency_pool(void) BUG_ON(!page_pool); pr_info("pool size: %d pages\n", POOL_SIZE); + bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + BUG_ON(!bounce_bio_set); + if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE)) + BUG_ON(1); + + bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); + BUG_ON(!bounce_bio_split); + return 0; } @@ -143,7 +153,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) mempool_free(bvec->bv_page, pool); } - bio_orig->bi_error = bio->bi_error; + bio_orig->bi_status = bio->bi_status; bio_endio(bio_orig); bio_put(bio); } @@ -163,7 +173,7 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); bounce_end_io(bio, pool); @@ -186,20 +196,31 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, int rw = bio_data_dir(*bio_orig); struct bio_vec *to, from; struct bvec_iter iter; - unsigned i; - - bio_for_each_segment(from, *bio_orig, iter) - if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) - goto bounce; + unsigned i = 0; + bool bounce = false; + int sectors = 0; + + bio_for_each_segment(from, *bio_orig, iter) { + if (i++ < BIO_MAX_PAGES) + sectors += from.bv_len >> 9; + if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) + bounce = true; + } + if (!bounce) + return; - return; -bounce: - bio = bio_clone_bioset(*bio_orig, 
GFP_NOIO, fs_bio_set); + if (sectors < bio_sectors(*bio_orig)) { + bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split); + bio_chain(bio, *bio_orig); + generic_make_request(*bio_orig); + *bio_orig = bio; + } + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= queue_bounce_pfn(q)) + if (page_to_pfn(page) <= q->limits.bounce_pfn) continue; to->bv_page = mempool_alloc(pool, q->bounce_gfp); @@ -251,7 +272,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn) + if (q->limits.bounce_pfn >= blk_max_pfn) return; pool = page_pool; } else { @@ -264,5 +285,3 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) */ __blk_queue_bounce(q, bio_orig, pool); } - -EXPORT_SYMBOL(blk_queue_bounce); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 0a23dbba2d30..c4513b23f57a 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref) struct bsg_job *job = container_of(kref, struct bsg_job, kref); struct request *rq = job->req; - blk_end_request_all(rq, scsi_req(rq)->result); + blk_end_request_all(rq, BLK_STS_OK); put_device(job->dev); /* release reference for the request */ @@ -202,7 +202,7 @@ static void bsg_request_fn(struct request_queue *q) ret = bsg_create_job(dev, req); if (ret) { scsi_req(req)->result = ret; - blk_end_request_all(req, ret); + blk_end_request_all(req, BLK_STS_OK); spin_lock_irq(q->queue_lock); continue; } @@ -246,6 +246,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, 
BLK_DEFAULT_SG_TIMEOUT); diff --git a/block/bsg.c b/block/bsg.c index 6fd08544d77e..37663b664666 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -236,7 +236,6 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) rq = blk_get_request(q, op, GFP_KERNEL); if (IS_ERR(rq)) return rq; - scsi_req_init(rq); ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) @@ -294,14 +293,14 @@ out: * async completion call-back from the block layer, when scsi/ide/whatever * calls end_that_request_last() on a request */ -static void bsg_rq_end_io(struct request *rq, int uptodate) +static void bsg_rq_end_io(struct request *rq, blk_status_t status) { struct bsg_command *bc = rq->end_io_data; struct bsg_device *bd = bc->bd; unsigned long flags; - dprintk("%s: finished rq %p bc %p, bio %p stat %d\n", - bd->name, rq, bc, bc->bio, uptodate); + dprintk("%s: finished rq %p bc %p, bio %p\n", + bd->name, rq, bc, bc->bio); bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); @@ -750,6 +749,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode, #ifdef BSG_DEBUG unsigned char buf[32]; #endif + + if (!blk_queue_scsi_passthrough(rq)) { + WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); + return ERR_PTR(-EINVAL); + } + if (!blk_get_queue(rq)) return ERR_PTR(-ENXIO); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b7e9c7feeab2..3d5c28945719 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -982,15 +982,6 @@ static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) return min_vdisktime; } -static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) -{ - s64 delta = (s64)(vdisktime - min_vdisktime); - if (delta < 0) - min_vdisktime = vdisktime; - - return min_vdisktime; -} - static void update_min_vdisktime(struct cfq_rb_root *st) { struct cfq_group *cfqg; diff --git a/block/elevator.c b/block/elevator.c index dac99fbfc273..4bb2f0c93fa6 100644 --- a/block/elevator.c +++ b/block/elevator.c 
@@ -681,6 +681,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) */ if (elv_attempt_insert_merge(q, rq)) break; + /* fall through */ case ELEVATOR_INSERT_SORT: BUG_ON(blk_rq_is_passthrough(rq)); rq->rq_flags |= RQF_SORTED; diff --git a/block/genhd.c b/block/genhd.c index d252d29fe837..7f520fa25d16 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -36,7 +36,7 @@ struct kobject *block_depr; static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static struct device_type disk_type; +static const struct device_type disk_type; static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); @@ -1183,7 +1183,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static struct device_type disk_type = { +static const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, diff --git a/block/ioprio.c b/block/ioprio.c index 4b120c9cf7e8..6f5d0b6625e3 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -75,7 +75,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) case IOPRIO_CLASS_RT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* fall through, rt has prio field too */ + /* fall through */ + /* rt has prio field too */ case IOPRIO_CLASS_BE: if (data >= IOPRIO_BE_NR || data < 0) return -EINVAL; diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index b9faabc75fdb..a9f6fd3fab8e 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -426,33 +426,29 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, } } -static struct request *kyber_get_request(struct request_queue *q, - unsigned int op, - struct blk_mq_alloc_data *data) +static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { - struct kyber_queue_data *kqd = q->elevator->elevator_data; - struct request *rq; - /* * We use the scheduler tags as per-hardware queue queueing tokens. 
* Async requests can be limited at this stage. */ - if (!op_is_sync(op)) + if (!op_is_sync(op)) { + struct kyber_queue_data *kqd = data->q->elevator->elevator_data; + data->shallow_depth = kqd->async_depth; + } +} - rq = __blk_mq_alloc_request(data, op); - if (rq) - rq_set_domain_token(rq, -1); - return rq; +static void kyber_prepare_request(struct request *rq, struct bio *bio) +{ + rq_set_domain_token(rq, -1); } -static void kyber_put_request(struct request *rq) +static void kyber_finish_request(struct request *rq) { - struct request_queue *q = rq->q; - struct kyber_queue_data *kqd = q->elevator->elevator_data; + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; rq_clear_domain_token(kqd, rq); - blk_mq_finish_request(rq); } static void kyber_completed_request(struct request *rq) @@ -815,8 +811,9 @@ static struct elevator_type kyber_sched = { .exit_sched = kyber_exit_sched, .init_hctx = kyber_init_hctx, .exit_hctx = kyber_exit_hctx, - .get_request = kyber_get_request, - .put_request = kyber_put_request, + .limit_depth = kyber_limit_depth, + .prepare_request = kyber_prepare_request, + .finish_request = kyber_finish_request, .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 4a294a5f7fab..7440de44dd85 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -326,7 +326,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (IS_ERR(rq)) return PTR_ERR(rq); req = scsi_req(rq); - scsi_req_init(rq); if (hdr->cmd_len > BLK_MAX_CDB) { req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); @@ -456,7 +455,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error_free_buffer; } req = scsi_req(rq); - scsi_req_init(rq); cmdlen = COMMAND_SIZE(opcode); @@ -542,7 +540,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, REQ_OP_SCSI_OUT, 
__GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - scsi_req_init(rq); rq->timeout = BLK_DEFAULT_SG_TIMEOUT; scsi_req(rq)->cmd[0] = cmd; scsi_req(rq)->cmd[4] = data; @@ -744,10 +741,14 @@ int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, } EXPORT_SYMBOL(scsi_cmd_blk_ioctl); -void scsi_req_init(struct request *rq) +/** + * scsi_req_init - initialize certain fields of a scsi_request structure + * @req: Pointer to a scsi_request structure. + * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members + * of struct scsi_request. + */ +void scsi_req_init(struct scsi_request *req) { - struct scsi_request *req = scsi_req(rq); - memset(req->__cmd, 0, sizeof(req->__cmd)); req->cmd = req->__cmd; req->cmd_len = BLK_MAX_CDB; diff --git a/block/t10-pi.c b/block/t10-pi.c index 680c6d636298..3416dadf7b15 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -46,8 +46,8 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. 
*/ -static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, - unsigned int type) +static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, + csum_fn *fn, unsigned int type) { unsigned int i; @@ -67,11 +67,11 @@ static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, iter->seed++; } - return 0; + return BLK_STS_OK; } -static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, - unsigned int type) +static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, + csum_fn *fn, unsigned int type) { unsigned int i; @@ -91,7 +91,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, "(rcvd %u)\n", iter->disk_name, (unsigned long long) iter->seed, be32_to_cpu(pi->ref_tag)); - return -EILSEQ; + return BLK_STS_PROTECTION; } break; case 3: @@ -108,7 +108,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, "(rcvd %04x, want %04x)\n", iter->disk_name, (unsigned long long)iter->seed, be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); - return -EILSEQ; + return BLK_STS_PROTECTION; } next: @@ -117,45 +117,45 @@ next: iter->seed++; } - return 0; + return BLK_STS_OK; } -static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_crc_fn, 1); } -static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_ip_fn, 1); } -static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_crc_fn, 1); } -static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_ip_fn, 1); } -static int 
t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_crc_fn, 3); } -static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_ip_fn, 3); } -static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_crc_fn, 3); } -static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_ip_fn, 3); } diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 26a51be77227..245a879b036e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -3464,7 +3464,7 @@ static inline bool DAC960_ProcessCompletedRequest(DAC960_Command_T *Command, bool SuccessfulIO) { struct request *Request = Command->Request; - int Error = SuccessfulIO ? 0 : -EIO; + blk_status_t Error = SuccessfulIO ? 
BLK_STS_OK : BLK_STS_IOERR; pci_unmap_sg(Command->Controller->PCIDevice, Command->cmd_sglist, Command->SegmentCount, Command->DmaDirection); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index a328f673adfe..49908c74bfcb 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1378,7 +1378,7 @@ static void redo_fd_request(void) struct amiga_floppy_struct *floppy; char *data; unsigned long flags; - int err; + blk_status_t err; next_req: rq = set_next_request(); @@ -1392,7 +1392,7 @@ next_req: next_segment: /* Here someone could investigate to be more efficient */ - for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) { + for (cnt = 0, err = BLK_STS_OK; cnt < blk_rq_cur_sectors(rq); cnt++) { #ifdef DEBUG printk("fd: sector %ld + %d requested for %s\n", blk_rq_pos(rq), cnt, @@ -1400,7 +1400,7 @@ next_segment: #endif block = blk_rq_pos(rq) + cnt; if ((int)block > floppy->blocks) { - err = -EIO; + err = BLK_STS_IOERR; break; } @@ -1413,7 +1413,7 @@ next_segment: #endif if (get_track(drive, track) == -1) { - err = -EIO; + err = BLK_STS_IOERR; break; } @@ -1424,7 +1424,7 @@ next_segment: /* keep the drive spinning while writes are scheduled */ if (!fd_motor_on(drive)) { - err = -EIO; + err = BLK_STS_IOERR; break; } /* diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 027b876370bc..6797e6c23c8a 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -388,6 +388,7 @@ aoeblk_gdalloc(void *vp) d->aoemajor, d->aoeminor); goto err_mempool; } + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); spin_lock_irqsave(&d->lock, flags); WARN_ON(!(d->flags & DEVFL_GD_NOW)); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 3c606c09fd5a..dc43254e05a4 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1070,8 +1070,8 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) d->ip.rq = NULL; do { bio = rq->bio; - bok = !fastfail && !bio->bi_error; - } 
while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size)); + bok = !fastfail && !bio->bi_status; + } while (__blk_end_request(rq, bok ? BLK_STS_OK : BLK_STS_IOERR, bio->bi_iter.bi_size)); /* cf. http://lkml.org/lkml/2006/10/31/28 */ if (!fastfail) @@ -1131,7 +1131,7 @@ ktiocomplete(struct frame *f) ahout->cmdstat, ahin->cmdstat, d->aoemajor, d->aoeminor); noskb: if (buf) - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; goto out; } @@ -1144,7 +1144,7 @@ noskb: if (buf) "aoe: runt data size in read from", (long) d->aoemajor, d->aoeminor, skb->len, n); - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; break; } if (n > f->iter.bi_size) { @@ -1152,7 +1152,7 @@ noskb: if (buf) "aoe: too-large data size in read from", (long) d->aoemajor, d->aoeminor, n, f->iter.bi_size); - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; break; } bvcpy(skb, f->buf->bio, f->iter, n); @@ -1654,7 +1654,7 @@ aoe_failbuf(struct aoedev *d, struct buf *buf) if (buf == NULL) return; buf->iter.bi_size = 0; - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; if (buf->nframesout == 0) aoe_end_buf(d, buf); } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index ffd1947500c6..b28fefb90391 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -170,7 +170,7 @@ aoe_failip(struct aoedev *d) if (rq == NULL) return; while ((bio = d->ip.nxbio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; d->ip.nxbio = bio->bi_next; n = (unsigned long) rq->special; rq->special = (void *) --n; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index fa69ecd52cb5..92da886180aa 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -378,7 +378,7 @@ static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0); static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0); static DEFINE_TIMER(fd_timer, check_change, 0, 0); -static void fd_end_request_cur(int err) +static 
void fd_end_request_cur(blk_status_t err) { if (!__blk_end_request_cur(fd_request, err)) fd_request = NULL; @@ -620,7 +620,7 @@ static void fd_error( void ) fd_request->error_count++; if (fd_request->error_count >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); } else if (fd_request->error_count == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); @@ -739,7 +739,7 @@ static void do_fd_action( int drive ) } else { /* all sectors finished */ - fd_end_request_cur(0); + fd_end_request_cur(BLK_STS_OK); redo_fd_request(); return; } @@ -1144,7 +1144,7 @@ static void fd_rwsec_done1(int status) } else { /* all sectors finished */ - fd_end_request_cur(0); + fd_end_request_cur(BLK_STS_OK); redo_fd_request(); } return; @@ -1445,7 +1445,7 @@ repeat: if (!UD.connected) { /* drive not connected */ printk(KERN_ERR "Unknown Device: fd%d\n", drive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } @@ -1461,12 +1461,12 @@ repeat: /* user supplied disk type */ if (--type >= NUM_DISK_MINORS) { printk(KERN_WARNING "fd%d: invalid disk format", drive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } if (minor2disktype[type].drive_types > DriveType) { printk(KERN_WARNING "fd%d: unsupported disk format", drive ); - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } type = minor2disktype[type].index; @@ -1476,7 +1476,7 @@ repeat: } if (blk_rq_pos(fd_request) + 1 > UDT->blocks) { - fd_end_request_cur(-EIO); + fd_end_request_cur(BLK_STS_IOERR); goto repeat; } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 57b574f2f66a..6112e99bedf7 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -418,7 +418,6 @@ static struct brd_device *brd_alloc(int i) blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_max_hw_sectors(brd->brd_queue, 1024); - 
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index cd375503f7b0..02a611993bb4 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1864,7 +1864,8 @@ static void cciss_softirq_done(struct request *rq) /* set the residual count for pc requests */ if (blk_rq_is_passthrough(rq)) scsi_req(rq)->resid_len = c->err_info->ResidualCnt; - blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0); + blk_end_request_all(rq, scsi_req(rq)->result ? + BLK_STS_IOERR : BLK_STS_OK); spin_lock_irqsave(&h->lock, flags); cmd_free(h, c); @@ -1956,6 +1957,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, disk->queue->cmd_size = sizeof(struct scsi_request); disk->queue->request_fn = do_cciss_request; disk->queue->queue_lock = &h->lock; + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, disk->queue); if (blk_init_allocated_queue(disk->queue) < 0) goto cleanup_queue; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 8d7bcfa49c12..e02c45cd3c5a 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -178,7 +178,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, else submit_bio(bio); wait_until_done_or_force_detached(device, bdev, &device->md_io.done); - if (!bio->bi_error) + if (!bio->bi_status) err = device->md_io.error; out: diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index a804a4107fbc..809fd245c3dc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -959,16 +959,16 @@ static void drbd_bm_endio(struct bio *bio) !bm_test_page_unchanged(b->bm_pages[idx])) drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); - if (bio->bi_error) { + if (bio->bi_status) { /* ctx error will hold the completed-last non-zero 
error code, * in case error codes differ. */ - ctx->error = bio->bi_error; + ctx->error = blk_status_to_errno(bio->bi_status); bm_set_page_io_err(b->bm_pages[idx]); /* Not identical to on disk version of it. * Is BM_PAGE_IO_ERROR enough? */ if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", - bio->bi_error, idx); + bio->bi_status, idx); } else { bm_clear_page_io_err(b->bm_pages[idx]); dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d5da45bb03a6..d17b6e6393c7 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1441,6 +1441,9 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); +/* And a bio_set for cloning */ +extern struct bio_set *drbd_io_bio_set; + extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); @@ -1627,7 +1630,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device, __release(local); if (!bio->bi_bdev) { drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); - bio->bi_error = -ENODEV; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 84455c365f57..5fb99e06ebe4 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -128,6 +128,7 @@ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; mempool_t *drbd_md_io_page_pool; struct bio_set *drbd_md_io_bio_set; +struct bio_set *drbd_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. 
@@ -2098,6 +2099,8 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_io_bio_set) + bioset_free(drbd_io_bio_set); if (drbd_md_io_bio_set) bioset_free(drbd_md_io_bio_set); if (drbd_md_io_page_pool) @@ -2115,6 +2118,7 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_io_bio_set = NULL; drbd_md_io_bio_set = NULL; drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; @@ -2142,6 +2146,7 @@ static int drbd_create_mempools(void) drbd_pp_pool = NULL; drbd_md_io_page_pool = NULL; drbd_md_io_bio_set = NULL; + drbd_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -2165,7 +2170,13 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ - drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_RESCUER); + if (drbd_io_bio_set == NULL) + goto Enomem; + + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0, + BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER); if (drbd_md_io_bio_set == NULL) goto Enomem; @@ -2839,7 +2850,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); q->queue_lock = &resource->req_lock; device->md_io.page = alloc_page(GFP_KERNEL); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 02255a0d68b9..ad0fcb43e45c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2294,7 +2294,7 @@ _check_net_options(struct drbd_connection *connection, struct net_conf *old_net_ static enum drbd_ret_code check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf) { - static enum drbd_ret_code rv; + enum drbd_ret_code 
rv; struct drbd_peer_device *peer_device; int i; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1b0a2be24f39..c7e95e6380fb 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1229,9 +1229,9 @@ void one_flush_endio(struct bio *bio) struct drbd_device *device = octx->device; struct issue_flush_context *ctx = octx->ctx; - if (bio->bi_error) { - ctx->error = bio->bi_error; - drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error); + if (bio->bi_status) { + ctx->error = blk_status_to_errno(bio->bi_status); + drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status); } kfree(octx); bio_put(bio); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 656624314f0d..f6e865b2d543 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -203,7 +203,7 @@ void start_new_tl_epoch(struct drbd_connection *connection) void complete_master_bio(struct drbd_device *device, struct bio_and_error *m) { - m->bio->bi_error = m->error; + m->bio->bi_status = errno_to_blk_status(m->error); bio_endio(m->bio); dec_ap_bio(device); } @@ -1157,7 +1157,7 @@ static void drbd_process_discard_req(struct drbd_request *req) if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9, GFP_NOIO, 0)) - req->private_bio->bi_error = -EIO; + req->private_bio->bi_status = BLK_STS_IOERR; bio_endio(req->private_bio); } @@ -1225,7 +1225,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long /* only pass the error to the upper layers. * if user cannot handle io errors, that's not our business. 
*/ drbd_err(device, "could not kmalloc() req\n"); - bio->bi_error = -ENOMEM; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return ERR_PTR(-ENOMEM); } @@ -1560,7 +1560,7 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) struct drbd_device *device = (struct drbd_device *) q->queuedata; unsigned long start_jif; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); start_jif = jiffies; diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index eb49e7f2da91..9e1866ab238f 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -263,7 +263,7 @@ enum drbd_req_state_bits { static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) { struct bio *bio; - bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set); req->private_bio = bio; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1afcb4e02d8d..1d8726a8df34 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -63,7 +63,7 @@ void drbd_md_endio(struct bio *bio) struct drbd_device *device; device = bio->bi_private; - device->md_io.error = bio->bi_error; + device->md_io.error = blk_status_to_errno(bio->bi_status); /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able * to timeout on the lower level device, and eventually detach from it. @@ -177,13 +177,13 @@ void drbd_peer_request_endio(struct bio *bio) bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES || bio_op(bio) == REQ_OP_DISCARD; - if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) + if (bio->bi_status && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", is_write ? (is_discard ? 
"discard" : "write") - : "read", bio->bi_error, + : "read", bio->bi_status, (unsigned long long)peer_req->i.sector); - if (bio->bi_error) + if (bio->bi_status) set_bit(__EE_WAS_ERROR, &peer_req->flags); bio_put(bio); /* no need for the bio anymore */ @@ -243,16 +243,16 @@ void drbd_request_endio(struct bio *bio) if (__ratelimit(&drbd_ratelimit_state)) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); - if (!bio->bi_error) + if (!bio->bi_status) drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { switch (bio_op(bio)) { case REQ_OP_WRITE_ZEROES: case REQ_OP_DISCARD: - if (bio->bi_error == -EOPNOTSUPP) + if (bio->bi_status == BLK_STS_NOTSUPP) what = DISCARD_COMPLETED_NOTSUPP; else what = DISCARD_COMPLETED_WITH_ERROR; @@ -272,7 +272,7 @@ void drbd_request_endio(struct bio *bio) } bio_put(req->private_bio); - req->private_bio = ERR_PTR(bio->bi_error); + req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status)); /* not req_mod(), we need irqsave here! 
*/ spin_lock_irqsave(&device->resource->req_lock, flags); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 60d4c7653178..ce823647a9c4 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2202,7 +2202,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) * ============================= */ -static void floppy_end_request(struct request *req, int error) +static void floppy_end_request(struct request *req, blk_status_t error) { unsigned int nr_sectors = current_count_sectors; unsigned int drive = (unsigned long)req->rq_disk->private_data; @@ -2263,7 +2263,7 @@ static void request_done(int uptodate) DRWE->last_error_generation = DRS->generation; } spin_lock_irqsave(q->queue_lock, flags); - floppy_end_request(req, -EIO); + floppy_end_request(req, BLK_STS_IOERR); spin_unlock_irqrestore(q->queue_lock, flags); } } @@ -3780,9 +3780,9 @@ static void floppy_rb0_cb(struct bio *bio) struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; int drive = cbdata->drive; - if (bio->bi_error) { + if (bio->bi_status) { pr_info("floppy: error %d while reading block 0\n", - bio->bi_error); + bio->bi_status); set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); } complete(&cbdata->complete); @@ -4203,6 +4203,7 @@ static int __init do_floppy_init(void) goto out_put_disk; } + blk_queue_bounce_limit(disks[drive]->queue, BLK_BOUNCE_HIGH); blk_queue_max_hw_sectors(disks[drive]->queue, 64); disks[drive]->major = FLOPPY_MAJOR; disks[drive]->first_minor = TOMINOR(drive); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ebbd0c3fe0ed..0de11444e317 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -221,7 +221,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) } static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, + loff_t logical_blocksize) { loff_t size = get_size(offset, sizelimit, 
lo->lo_backing_file); sector_t x = (sector_t)size; @@ -233,6 +234,12 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) lo->lo_offset = offset; if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; + if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) { + lo->lo_logical_blocksize = logical_blocksize; + blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize); + blk_queue_logical_block_size(lo->lo_queue, + lo->lo_logical_blocksize); + } set_capacity(lo->lo_disk, x); bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); /* let user-space know about the new size */ @@ -457,7 +464,7 @@ static void lo_complete_rq(struct request *rq) zero_fill_bio(bio); } - blk_mq_end_request(rq, cmd->ret < 0 ? -EIO : 0); + blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK); } static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) @@ -813,6 +820,7 @@ static void loop_config_discard(struct loop_device *lo) struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; + int lo_bits = 9; /* * We use punch hole to reclaim the free space used by the @@ -832,8 +840,11 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); + if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) + lo_bits = blksize_bits(lo->lo_logical_blocksize); + + blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits); + blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } @@ -843,10 +854,16 @@ static void loop_unprepare_queue(struct loop_device *lo) kthread_stop(lo->worker_task); } +static int loop_kthread_worker_fn(void *worker_ptr) +{ + current->flags |= PF_LESS_THROTTLE; + return kthread_worker_fn(worker_ptr); +} + static int loop_prepare_queue(struct 
loop_device *lo) { kthread_init_worker(&lo->worker); - lo->worker_task = kthread_run(kthread_worker_fn, + lo->worker_task = kthread_run(loop_kthread_worker_fn, &lo->worker, "loop%d", lo->lo_number); if (IS_ERR(lo->worker_task)) return -ENOMEM; @@ -921,6 +938,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->use_dio = false; lo->lo_blocksize = lo_blocksize; + lo->lo_logical_blocksize = 512; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -1086,6 +1104,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); + int lo_flags = lo->lo_flags; if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && @@ -1118,12 +1137,30 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (err) goto exit; + if (info->lo_flags & LO_FLAGS_BLOCKSIZE) { + if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE)) + lo->lo_logical_blocksize = 512; + lo->lo_flags |= LO_FLAGS_BLOCKSIZE; + if (LO_INFO_BLOCKSIZE(info) != 512 && + LO_INFO_BLOCKSIZE(info) != 1024 && + LO_INFO_BLOCKSIZE(info) != 2048 && + LO_INFO_BLOCKSIZE(info) != 4096) + return -EINVAL; + if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize) + return -EINVAL; + } + if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { + lo->lo_sizelimit != info->lo_sizelimit || + lo->lo_flags != lo_flags || + ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) && + lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) { + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit, + LO_INFO_BLOCKSIZE(info))) { err = -EFBIG; goto exit; } + } loop_config_discard(lo); @@ -1306,12 +1343,13 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { return err; } -static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) +static int loop_set_capacity(struct loop_device *lo) { if 
(unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit, + lo->lo_logical_blocksize); } static int loop_set_dio(struct loop_device *lo, unsigned long arg) @@ -1369,7 +1407,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_CAPACITY: err = -EPERM; if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_capacity(lo, bdev); + err = loop_set_capacity(lo); break; case LOOP_SET_DIRECT_IO: err = -EPERM; @@ -1645,7 +1683,7 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); @@ -1654,7 +1692,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); if (lo->lo_state != Lo_bound) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; switch (req_op(cmd->rq)) { case REQ_OP_FLUSH: @@ -1669,7 +1707,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, kthread_queue_work(&lo->worker, &cmd->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void loop_handle_cmd(struct loop_cmd *cmd) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index fecd3f97ef8c..2c096b9a17b8 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -49,6 +49,7 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; unsigned lo_blocksize; + unsigned lo_logical_blocksize; void *key_data; gfp_t old_gfp_mask; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3a779a4f5653..61b046f256ca 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -532,7 +532,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 
page, u16 *buffer, static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, struct smart_attr *attrib); -static void mtip_complete_command(struct mtip_cmd *cmd, int status) +static void mtip_complete_command(struct mtip_cmd *cmd, blk_status_t status) { struct request *req = blk_mq_rq_from_pdu(cmd); @@ -568,7 +568,7 @@ static void mtip_handle_tfe(struct driver_data *dd) if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); - mtip_complete_command(cmd, -EIO); + mtip_complete_command(cmd, BLK_STS_IOERR); return; } @@ -667,7 +667,7 @@ static void mtip_handle_tfe(struct driver_data *dd) tag, fail_reason != NULL ? fail_reason : "unknown"); - mtip_complete_command(cmd, -ENODATA); + mtip_complete_command(cmd, BLK_STS_MEDIUM); continue; } } @@ -690,7 +690,7 @@ static void mtip_handle_tfe(struct driver_data *dd) dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - mtip_complete_command(cmd, -EIO); + mtip_complete_command(cmd, BLK_STS_IOERR); } } print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); @@ -1063,23 +1063,10 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* insert request and run queue */ blk_execute_rq(rq->q, NULL, rq, true); - rv = int_cmd->status; - if (rv < 0) { - if (rv == -ERESTARTSYS) { /* interrupted */ - dev_err(&dd->pdev->dev, - "Internal command [%02X] was interrupted after %u ms\n", - fis->command, - jiffies_to_msecs(jiffies - start)); - rv = -EINTR; - goto exec_ic_exit; - } else if (rv == 0) /* timeout */ - dev_err(&dd->pdev->dev, - "Internal command did not complete [%02X] within timeout of %lu ms\n", - fis->command, timeout); - else - dev_err(&dd->pdev->dev, - "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n", - fis->command, rv, timeout); + if (int_cmd->status) { + dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n", + fis->command, int_cmd->status); + rv = -EIO; if 
(mtip_check_surprise_removal(dd->pdev) || test_bit(MTIP_DDF_REMOVE_PENDING_BIT, @@ -2753,7 +2740,7 @@ static void mtip_abort_cmd(struct request *req, void *data, dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag); clear_bit(req->tag, dd->port->cmds_to_issue); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; mtip_softirq_done_fn(req); } @@ -3597,7 +3584,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) int err; err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); - blk_mq_end_request(rq, err); + blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK); return 0; } @@ -3633,8 +3620,8 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, - struct request *rq) +static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct driver_data *dd = hctx->queue->queuedata; struct mtip_int_cmd *icmd = rq->special; @@ -3642,7 +3629,7 @@ static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, struct mtip_cmd_sg *command_sg; if (mtip_commands_active(dd->port)) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; /* Populate the SG list */ cmd->command_header->opts = @@ -3666,10 +3653,10 @@ static int mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); mtip_issue_non_ncq_command(dd->port, rq->tag); - return BLK_MQ_RQ_QUEUE_OK; + return 0; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; @@ -3681,15 +3668,14 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, return mtip_issue_reserved_cmd(hctx, rq); if (unlikely(mtip_check_unal_depth(hctx, rq))) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; blk_mq_start_request(rq); ret = mtip_submit_request(hctx, rq); if (likely(!ret)) - return BLK_MQ_RQ_QUEUE_OK; - - 
return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_OK; + return BLK_STS_IOERR; } static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, @@ -3730,7 +3716,7 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, if (reserved) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); - cmd->status = -ETIME; + cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } @@ -3961,7 +3947,7 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd->status = -ENODEV; + cmd->status = BLK_STS_IOERR; blk_mq_complete_request(rq); } diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 37b8e3e0bb78..e8286af50e16 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -342,7 +342,7 @@ struct mtip_cmd { int retries; /* The number of retries left for this command. */ int direction; /* Data transfer direction */ - int status; + blk_status_t status; }; /* Structure used to describe a port. 
*/ diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f3f191ba8ca4..977ec960dd2f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -116,7 +116,7 @@ struct nbd_cmd { int index; int cookie; struct completion send_complete; - int status; + blk_status_t status; }; #if IS_ENABLED(CONFIG_DEBUG_FS) @@ -286,7 +286,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, struct nbd_config *config; if (!refcount_inc_not_zero(&nbd->config_refs)) { - cmd->status = -EIO; + cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } @@ -331,7 +331,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, "Connection timed out\n"); } set_bit(NBD_TIMEDOUT, &config->runtime_flags); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; sock_shutdown(nbd); nbd_config_put(nbd); @@ -400,6 +400,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) unsigned long size = blk_rq_bytes(req); struct bio *bio; u32 type; + u32 nbd_cmd_flags = 0; u32 tag = blk_mq_unique_tag(req); int sent = nsock->sent, skip = 0; @@ -429,6 +430,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return -EIO; } + if (req->cmd_flags & REQ_FUA) + nbd_cmd_flags |= NBD_CMD_FLAG_FUA; + /* We did a partial send previously, and we at least sent the whole * request struct, so just go and send the rest of the pages in the * request. 
@@ -442,7 +446,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) } cmd->index = index; cmd->cookie = nsock->cookie; - request.type = htonl(type); + request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); @@ -465,7 +469,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) nsock->pending = req; nsock->sent = sent; } - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } dev_err_ratelimited(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); @@ -506,7 +510,7 @@ send_pages: */ nsock->pending = req; nsock->sent = sent; - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", @@ -574,7 +578,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; return cmd; } @@ -599,7 +603,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) */ if (nbd_disconnected(config) || config->num_connections <= 1) { - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; return cmd; } return ERR_PTR(-EIO); @@ -651,7 +655,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved) if (!blk_mq_request_started(req)) return; cmd = blk_mq_rq_to_pdu(req); - cmd->status = -EIO; + cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); } @@ -740,7 +744,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) nbd_config_put(nbd); return -EINVAL; } - cmd->status = 0; + cmd->status = BLK_STS_OK; again: nsock = config->socks[index]; mutex_lock(&nsock->tx_lock); @@ -794,7 +798,7 @@ out: return ret; } -static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, const 
struct blk_mq_queue_data *bd) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); @@ -818,13 +822,9 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * appropriate. */ ret = nbd_handle_cmd(cmd, hctx->queue_num); - if (ret < 0) - ret = BLK_MQ_RQ_QUEUE_ERROR; - if (!ret) - ret = BLK_MQ_RQ_QUEUE_OK; complete(&cmd->send_complete); - return ret; + return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK; } static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, @@ -910,6 +910,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) continue; } sk_set_memalloc(sock->sk); + sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); old = nsock->sock; @@ -957,8 +958,12 @@ static void nbd_parse_flags(struct nbd_device *nbd) set_disk_ro(nbd->disk, false); if (config->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - if (config->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_write_cache(nbd->disk->queue, true, false); + if (config->flags & NBD_FLAG_SEND_FLUSH) { + if (config->flags & NBD_FLAG_SEND_FUA) + blk_queue_write_cache(nbd->disk->queue, true, true); + else + blk_queue_write_cache(nbd->disk->queue, true, false); + } else blk_queue_write_cache(nbd->disk->queue, false, false); } @@ -1071,6 +1076,7 @@ static int nbd_start_device(struct nbd_device *nbd) return -ENOMEM; } sk_set_memalloc(config->socks[i]->sock->sk); + config->socks[i]->sock->sk->sk_sndtimeo = nbd->tag_set.timeout; atomic_inc(&config->recv_threads); refcount_inc(&nbd->config_refs); INIT_WORK(&args->work, recv_work); @@ -1305,6 +1311,8 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused) seq_puts(s, "NBD_FLAG_READ_ONLY\n"); if (flags & NBD_FLAG_SEND_FLUSH) seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); + if (flags & NBD_FLAG_SEND_FUA) + seq_puts(s, "NBD_FLAG_SEND_FUA\n"); if (flags & NBD_FLAG_SEND_TRIM) seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); diff --git a/drivers/block/null_blk.c 
b/drivers/block/null_blk.c index d946e1eeac8e..71f4422eba81 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -35,7 +35,8 @@ struct nullb { struct request_queue *q; struct gendisk *disk; struct nvm_dev *ndev; - struct blk_mq_tag_set tag_set; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; struct hrtimer timer; unsigned int queue_depth; spinlock_t lock; @@ -50,6 +51,7 @@ static struct mutex lock; static int null_major; static int nullb_indexes; static struct kmem_cache *ppa_cache; +static struct blk_mq_tag_set tag_set; enum { NULL_IRQ_NONE = 0, @@ -109,7 +111,7 @@ static int bs = 512; module_param(bs, int, S_IRUGO); MODULE_PARM_DESC(bs, "Block size (in bytes)"); -static int nr_devices = 2; +static int nr_devices = 1; module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -121,6 +123,10 @@ static bool blocking; module_param(blocking, bool, S_IRUGO); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); +static bool shared_tags; +module_param(shared_tags, bool, S_IRUGO); +MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); + static int irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) @@ -229,11 +235,11 @@ static void end_cmd(struct nullb_cmd *cmd) switch (queue_mode) { case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, 0); + blk_mq_end_request(cmd->rq, BLK_STS_OK); return; case NULL_Q_RQ: INIT_LIST_HEAD(&cmd->rq->queuelist); - blk_end_request_all(cmd->rq, 0); + blk_end_request_all(cmd->rq, BLK_STS_OK); break; case NULL_Q_BIO: bio_endio(cmd->bio); @@ -356,7 +362,7 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); @@ -373,34 +379,11 @@ static int null_queue_rq(struct 
blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); null_handle_cmd(cmd); - return BLK_MQ_RQ_QUEUE_OK; -} - -static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) -{ - BUG_ON(!nullb); - BUG_ON(!nq); - - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; -} - -static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int index) -{ - struct nullb *nullb = data; - struct nullb_queue *nq = &nullb->queues[index]; - - hctx->driver_data = nq; - null_init_queue(nullb, nq); - nullb->nr_queues++; - - return 0; + return BLK_STS_OK; } static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, - .init_hctx = null_init_hctx, .complete = null_softirq_done_fn, }; @@ -422,11 +405,12 @@ static void cleanup_queues(struct nullb *nullb) #ifdef CONFIG_NVM -static void null_lnvm_end_io(struct request *rq, int error) +static void null_lnvm_end_io(struct request *rq, blk_status_t status) { struct nvm_rq *rqd = rq->end_io_data; - rqd->error = error; + /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */ + rqd->error = status ? 
-EIO : 0; nvm_end_io(rqd); blk_put_request(rq); @@ -591,8 +575,8 @@ static void null_del_dev(struct nullb *nullb) else del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); + if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); if (!use_lightnvm) put_disk(nullb->disk); cleanup_queues(nullb); @@ -614,6 +598,32 @@ static const struct block_device_operations null_fops = { .release = null_release, }; +static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +{ + BUG_ON(!nullb); + BUG_ON(!nq); + + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; +} + +static void null_init_queues(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + struct blk_mq_hw_ctx *hctx; + struct nullb_queue *nq; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_ctx || !hctx->tags) + continue; + nq = &nullb->queues[i]; + hctx->driver_data = nq; + null_init_queue(nullb, nq); + nullb->nr_queues++; + } +} + static int setup_commands(struct nullb_queue *nq) { struct nullb_cmd *cmd; @@ -694,6 +704,22 @@ static int null_gendisk_register(struct nullb *nullb) return 0; } +static int null_init_tag_set(struct blk_mq_tag_set *set) +{ + set->ops = &null_mq_ops; + set->nr_hw_queues = submit_queues; + set->queue_depth = hw_queue_depth; + set->numa_node = home_node; + set->cmd_size = sizeof(struct nullb_cmd); + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->driver_data = NULL; + + if (blocking) + set->flags |= BLK_MQ_F_BLOCKING; + + return blk_mq_alloc_tag_set(set); +} + static int null_add_dev(void) { struct nullb *nullb; @@ -715,26 +741,23 @@ static int null_add_dev(void) goto out_free_nullb; if (queue_mode == NULL_Q_MQ) { - nullb->tag_set.ops = &null_mq_ops; - nullb->tag_set.nr_hw_queues = submit_queues; - nullb->tag_set.queue_depth = hw_queue_depth; - nullb->tag_set.numa_node = home_node; - nullb->tag_set.cmd_size = 
sizeof(struct nullb_cmd); - nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - nullb->tag_set.driver_data = nullb; - - if (blocking) - nullb->tag_set.flags |= BLK_MQ_F_BLOCKING; - - rv = blk_mq_alloc_tag_set(&nullb->tag_set); + if (shared_tags) { + nullb->tag_set = &tag_set; + rv = 0; + } else { + nullb->tag_set = &nullb->__tag_set; + rv = null_init_tag_set(nullb->tag_set); + } + if (rv) goto out_cleanup_queues; - nullb->q = blk_mq_init_queue(&nullb->tag_set); + nullb->q = blk_mq_init_queue(nullb->tag_set); if (IS_ERR(nullb->q)) { rv = -ENOMEM; goto out_cleanup_tags; } + null_init_queues(nullb); } else if (queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); if (!nullb->q) { @@ -787,8 +810,8 @@ static int null_add_dev(void) out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); + if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: cleanup_queues(nullb); out_free_nullb: @@ -821,6 +844,9 @@ static int __init null_init(void) queue_mode = NULL_Q_MQ; } + if (queue_mode == NULL_Q_MQ && shared_tags) + null_init_tag_set(&tag_set); + if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { if (submit_queues < nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.", @@ -881,6 +907,9 @@ static void __exit null_exit(void) } mutex_unlock(&lock); + if (queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); + kmem_cache_destroy(ppa_cache); } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index b1267ef34d5a..7b8c6368beb7 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -305,6 +305,7 @@ static void pcd_init_units(void) put_disk(disk); continue; } + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); cd->disk = disk; cd->pi = &cd->pia; cd->present = 0; @@ -783,7 +784,7 @@ static void pcd_request(void) 
ps_set_intr(do_pcd_read, NULL, 0, nice); return; } else { - __blk_end_request_all(pcd_req, -EIO); + __blk_end_request_all(pcd_req, BLK_STS_IOERR); pcd_req = NULL; } } @@ -794,7 +795,7 @@ static void do_pcd_request(struct request_queue *q) pcd_request(); } -static inline void next_request(int err) +static inline void next_request(blk_status_t err) { unsigned long saved_flags; @@ -837,7 +838,7 @@ static void pcd_start(void) if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) { pcd_bufblk = -1; - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } @@ -871,7 +872,7 @@ static void do_pcd_read_drq(void) return; } pcd_bufblk = -1; - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 7d2402f90978..27a44b97393a 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -438,7 +438,7 @@ static void run_fsm(void) phase = NULL; spin_lock_irqsave(&pd_lock, saved_flags); if (!__blk_end_request_cur(pd_req, - res == Ok ? 0 : -EIO)) { + res == Ok ? 
0 : BLK_STS_IOERR)) { if (!set_next_request()) stop = 1; } @@ -863,6 +863,7 @@ static void pd_probe_drive(struct pd_unit *disk) return; } blk_queue_max_hw_sectors(p->queue, cluster); + blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH); if (disk->drive == -1) { for (disk->drive = 0; disk->drive <= 1; disk->drive++) diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f24ca7315ddc..eef7a91f667d 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -293,6 +293,7 @@ static void __init pf_init_units(void) return; } blk_queue_max_segments(disk->queue, cluster); + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); pf->disk = disk; pf->pi = &pf->pia; pf->media_status = PF_NM; @@ -801,7 +802,7 @@ static int set_next_request(void) return pf_req != NULL; } -static void pf_end_request(int err) +static void pf_end_request(blk_status_t err) { if (pf_req && !__blk_end_request_cur(pf_req, err)) pf_req = NULL; @@ -821,7 +822,7 @@ repeat: pf_count = blk_rq_cur_sectors(pf_req); if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { - pf_end_request(-EIO); + pf_end_request(BLK_STS_IOERR); goto repeat; } @@ -836,7 +837,7 @@ repeat: pi_do_claimed(pf_current->pi, do_pf_write); else { pf_busy = 0; - pf_end_request(-EIO); + pf_end_request(BLK_STS_IOERR); goto repeat; } } @@ -868,7 +869,7 @@ static int pf_next_buf(void) return 0; } -static inline void next_request(int err) +static inline void next_request(blk_status_t err) { unsigned long saved_flags; @@ -896,7 +897,7 @@ static void do_pf_read_start(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pf_mask = STAT_DRQ; @@ -915,7 +916,7 @@ static void do_pf_read_drq(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pi_read_block(pf_current->pi, pf_buf, 512); @@ -942,7 +943,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, 
do_pf_write_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } @@ -955,7 +956,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pi_write_block(pf_current->pi, pf_buf, 512); @@ -975,7 +976,7 @@ static void do_pf_write_done(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(-EIO); + next_request(BLK_STS_IOERR); return; } pi_disconnect(pf_current->pi); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 205b865ebeb9..467beca397a2 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -98,6 +98,7 @@ static int write_congestion_on = PKT_WRITE_CONGESTION_ON; static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ static mempool_t *psd_pool; +static struct bio_set *pkt_bio_set; static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ @@ -707,7 +708,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - scsi_req_init(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, @@ -952,9 +952,9 @@ static void pkt_end_io_read(struct bio *bio) pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_iter.bi_sector, bio->bi_error); + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&pkt->io_errors); if (atomic_dec_and_test(&pkt->io_wait)) { atomic_inc(&pkt->run_sm); @@ -969,7 +969,7 @@ static void pkt_end_io_packet_write(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_error); + 
pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); pd->stats.pkt_ended++; @@ -1305,16 +1305,16 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_queue_bio(pd, pkt->w_bio); } -static void pkt_finish_packet(struct packet_data *pkt, int error) +static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) { struct bio *bio; - if (error) + if (status) pkt->cache_valid = 0; /* Finish all bios corresponding to this packet */ while ((bio = bio_list_pop(&pkt->orig_bios))) { - bio->bi_error = error; + bio->bi_status = status; bio_endio(bio); } } @@ -1349,7 +1349,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (atomic_read(&pkt->io_wait) > 0) return; - if (!pkt->w_bio->bi_error) { + if (!pkt->w_bio->bi_status) { pkt_set_state(pkt, PACKET_FINISHED_STATE); } else { pkt_set_state(pkt, PACKET_RECOVERY_STATE); @@ -1366,7 +1366,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data break; case PACKET_FINISHED_STATE: - pkt_finish_packet(pkt, pkt->w_bio->bi_error); + pkt_finish_packet(pkt, pkt->w_bio->bi_status); return; default: @@ -2301,7 +2301,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) struct packet_stacked_data *psd = bio->bi_private; struct pktcdvd_device *pd = psd->pd; - psd->bio->bi_error = bio->bi_error; + psd->bio->bi_status = bio->bi_status; bio_put(bio); bio_endio(psd->bio); mempool_free(psd, psd_pool); @@ -2310,7 +2310,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) { - struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); + struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set); struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); psd->pd = pd; @@ -2412,9 +2412,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) char b[BDEVNAME_SIZE]; struct bio *split; - blk_queue_bounce(q, &bio); - - 
blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); pd = q->queuedata; if (!pd) { @@ -2455,7 +2453,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) split = bio_split(bio, last_zone - bio->bi_iter.bi_sector, - GFP_NOIO, fs_bio_set); + GFP_NOIO, pkt_bio_set); bio_chain(split, bio); } else { split = bio; @@ -2583,6 +2581,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = bdget(dev); if (!bdev) return -ENOMEM; + if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { + WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); + bdput(bdev); + return -EINVAL; + } ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); if (ret) return ret; @@ -2919,6 +2922,11 @@ static int __init pkt_init(void) sizeof(struct packet_stacked_data)); if (!psd_pool) return -ENOMEM; + pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!pkt_bio_set) { + mempool_destroy(psd_pool); + return -ENOMEM; + } ret = register_blkdev(pktdev_major, DRIVER_NAME); if (ret < 0) { @@ -2951,6 +2959,7 @@ out: unregister_blkdev(pktdev_major, DRIVER_NAME); out2: mempool_destroy(psd_pool); + bioset_free(pkt_bio_set); return ret; } @@ -2964,6 +2973,7 @@ static void __exit pkt_exit(void) unregister_blkdev(pktdev_major, DRIVER_NAME); mempool_destroy(psd_pool); + bioset_free(pkt_bio_set); } MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index a809e3e9feb8..075662f2cf46 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -158,7 +158,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, __LINE__, op, res); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); return 0; } @@ -180,7 +180,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", 
__func__, __LINE__, res); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); return 0; } @@ -208,7 +208,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, break; default: blk_dump_rq_flags(req, DEVICE_NAME " bad request"); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } } } @@ -231,7 +231,8 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) struct ps3_storage_device *dev = data; struct ps3disk_private *priv; struct request *req; - int res, read, error; + int res, read; + blk_status_t error; u64 tag, status; const char *op; @@ -269,7 +270,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) if (status) { dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__, __LINE__, op, status); - error = -EIO; + error = BLK_STS_IOERR; } else { dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__, __LINE__, op); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 456b4fe21559..e0e81cacd781 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -428,7 +428,7 @@ static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev) kfree(priv->cache.tags); } -static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, +static blk_status_t ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -438,7 +438,7 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, (unsigned int)from, len); if (from >= priv->size) - return -EIO; + return BLK_STS_IOERR; if (len > priv->size - from) len = priv->size - from; @@ -472,14 +472,14 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, return 0; } -static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, +static blk_status_t ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, size_t len, size_t *retlen, const u_char 
*buf) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); unsigned int cached, count; if (to >= priv->size) - return -EIO; + return BLK_STS_IOERR; if (len > priv->size - to) len = priv->size - to; @@ -554,7 +554,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, int write = bio_data_dir(bio) == WRITE; const char *op = write ? "write" : "read"; loff_t offset = bio->bi_iter.bi_sector << 9; - int error = 0; + blk_status_t error = 0; struct bio_vec bvec; struct bvec_iter iter; struct bio *next; @@ -578,7 +578,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, if (retlen != len) { dev_err(&dev->core, "Short %s\n", op); - error = -EIO; + error = BLK_STS_IOERR; goto out; } @@ -593,7 +593,7 @@ out: next = bio_list_peek(&priv->list); spin_unlock_irq(&priv->lock); - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); return next; } @@ -606,7 +606,7 @@ static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c16f74547804..b008b6a98098 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -442,6 +442,8 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; +static struct bio_set *rbd_bio_clone; + static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -1363,7 +1365,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, { struct bio *bio; - bio = bio_clone(bio_src, gfpmask); + bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone); if (!bio) return NULL; /* ENOMEM */ @@ -2293,11 +2295,13 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) rbd_assert(img_request->obj_request != NULL); more = obj_request->which < 
img_request->obj_request_count - 1; } else { + blk_status_t status = errno_to_blk_status(result); + rbd_assert(img_request->rq != NULL); - more = blk_update_request(img_request->rq, result, xferred); + more = blk_update_request(img_request->rq, status, xferred); if (!more) - __blk_mq_end_request(img_request->rq, result); + __blk_mq_end_request(img_request->rq, status); } return more; @@ -4150,17 +4154,17 @@ err_rq: obj_op_name(op_type), length, offset, result); ceph_put_snap_context(snapc); err: - blk_mq_end_request(rq, result); + blk_mq_end_request(rq, errno_to_blk_status(result)); } -static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct work_struct *work = blk_mq_rq_to_pdu(rq); queue_work(rbd_wq, work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void rbd_free_disk(struct rbd_device *rbd_dev) @@ -6414,8 +6418,16 @@ static int rbd_slab_init(void) if (!rbd_obj_request_cache) goto out_err; + rbd_assert(!rbd_bio_clone); + rbd_bio_clone = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!rbd_bio_clone) + goto out_err_clone; + return 0; +out_err_clone: + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; out_err: kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; @@ -6431,6 +6443,10 @@ static void rbd_slab_exit(void) rbd_assert(rbd_img_request_cache); kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; + + rbd_assert(rbd_bio_clone); + bioset_free(rbd_bio_clone); + rbd_bio_clone = NULL; } static int __init rbd_init(void) diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 9c566364ac9c..7f4acebf4657 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -149,9 +149,9 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) { struct rsxx_cardinfo *card = q->queuedata; struct rsxx_bio_meta *bio_meta; - int st = 
-EINVAL; + blk_status_t st = BLK_STS_IOERR; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); might_sleep(); @@ -161,15 +161,11 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) if (bio_end_sector(bio) > get_capacity(card->gendisk)) goto req_err; - if (unlikely(card->halt)) { - st = -EFAULT; + if (unlikely(card->halt)) goto req_err; - } - if (unlikely(card->dma_fault)) { - st = (-EFAULT); + if (unlikely(card->dma_fault)) goto req_err; - } if (bio->bi_iter.bi_size == 0) { dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); @@ -178,7 +174,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); if (!bio_meta) { - st = -ENOMEM; + st = BLK_STS_RESOURCE; goto req_err; } @@ -205,7 +201,7 @@ queue_err: kmem_cache_free(bio_meta_pool, bio_meta); req_err: if (st) - bio->bi_error = st; + bio->bi_status = st; bio_endio(bio); return BLK_QC_T_NONE; } @@ -288,7 +284,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) } blk_queue_make_request(card->queue, rsxx_make_request); - blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 5a20385f87d0..6a1b2177951c 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -611,7 +611,7 @@ static void rsxx_schedule_done(struct work_struct *work) mutex_unlock(&ctrl->work_lock); } -static int rsxx_queue_discard(struct rsxx_cardinfo *card, +static blk_status_t rsxx_queue_discard(struct rsxx_cardinfo *card, struct list_head *q, unsigned int laddr, rsxx_dma_cb cb, @@ -621,7 +621,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card, dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); if (!dma) - return -ENOMEM; + return BLK_STS_RESOURCE; dma->cmd = HW_CMD_BLK_DISCARD; dma->laddr = laddr; @@ -640,7 +640,7 @@ 
static int rsxx_queue_discard(struct rsxx_cardinfo *card, return 0; } -static int rsxx_queue_dma(struct rsxx_cardinfo *card, +static blk_status_t rsxx_queue_dma(struct rsxx_cardinfo *card, struct list_head *q, int dir, unsigned int dma_off, @@ -655,7 +655,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); if (!dma) - return -ENOMEM; + return BLK_STS_RESOURCE; dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; dma->laddr = laddr; @@ -677,7 +677,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, return 0; } -int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, +blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, @@ -694,7 +694,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, unsigned int dma_len; int dma_cnt[RSXX_MAX_TARGETS]; int tgt; - int st; + blk_status_t st; int i; addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ @@ -769,7 +769,6 @@ bvec_err: for (i = 0; i < card->n_targets; i++) rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], FREE_DMA); - return st; } diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 6bbc64d0f690..277f27e673a2 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -391,7 +391,7 @@ int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); void rsxx_dma_cleanup(void); void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); int rsxx_dma_configure(struct rsxx_cardinfo *card); -int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, +blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 27833e4dae2a..d0368682bd43 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -451,8 +451,8 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, struct skd_special_context *skspcl); static void 
skd_request_fn(struct request_queue *rq); static void skd_end_request(struct skd_device *skdev, - struct skd_request_context *skreq, int error); -static int skd_preop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq, blk_status_t status); +static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq); static void skd_postop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq); @@ -491,7 +491,7 @@ static void skd_fail_all_pending(struct skd_device *skdev) if (req == NULL) break; blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } } @@ -545,7 +545,6 @@ static void skd_request_fn(struct request_queue *q) struct request *req = NULL; struct skd_scsi_request *scsi_req; unsigned long io_flags; - int error; u32 lba; u32 count; int data_dir; @@ -716,9 +715,7 @@ static void skd_request_fn(struct request_queue *q) if (!req->bio) goto skip_sg; - error = skd_preop_sg_list(skdev, skreq); - - if (error != 0) { + if (!skd_preop_sg_list(skdev, skreq)) { /* * Complete the native request with error. 
* Note that the request context is still at the @@ -730,7 +727,7 @@ static void skd_request_fn(struct request_queue *q) */ pr_debug("%s:%s:%d error Out\n", skdev->name, __func__, __LINE__); - skd_end_request(skdev, skreq, error); + skd_end_request(skdev, skreq, BLK_STS_RESOURCE); continue; } @@ -805,7 +802,7 @@ skip_sg: } static void skd_end_request(struct skd_device *skdev, - struct skd_request_context *skreq, int error) + struct skd_request_context *skreq, blk_status_t error) { if (unlikely(error)) { struct request *req = skreq->req; @@ -822,7 +819,7 @@ static void skd_end_request(struct skd_device *skdev, __blk_end_request_all(skreq->req, error); } -static int skd_preop_sg_list(struct skd_device *skdev, +static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { struct request *req = skreq->req; @@ -839,7 +836,7 @@ static int skd_preop_sg_list(struct skd_device *skdev, n_sg = blk_rq_map_sg(skdev->queue, req, sg); if (n_sg <= 0) - return -EINVAL; + return false; /* * Map scatterlist to PCI bus addresses. 
@@ -847,7 +844,7 @@ static int skd_preop_sg_list(struct skd_device *skdev, */ n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir); if (n_sg <= 0) - return -EINVAL; + return false; SKD_ASSERT(n_sg <= skdev->sgs_per_request); @@ -882,7 +879,7 @@ static int skd_preop_sg_list(struct skd_device *skdev, } } - return 0; + return true; } static void skd_postop_sg_list(struct skd_device *skdev, @@ -2333,7 +2330,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { case SKD_CHECK_STATUS_REPORT_GOOD: case SKD_CHECK_STATUS_REPORT_SMART_ALERT: - skd_end_request(skdev, skreq, 0); + skd_end_request(skdev, skreq, BLK_STS_OK); break; case SKD_CHECK_STATUS_BUSY_IMMINENT: @@ -2355,7 +2352,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, case SKD_CHECK_STATUS_REPORT_ERROR: default: - skd_end_request(skdev, skreq, -EIO); + skd_end_request(skdev, skreq, BLK_STS_IOERR); break; } } @@ -2748,7 +2745,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, * native request. 
*/ if (likely(cmp_status == SAM_STAT_GOOD)) - skd_end_request(skdev, skreq, 0); + skd_end_request(skdev, skreq, BLK_STS_OK); else skd_resolve_req_exception(skdev, skreq); } @@ -3190,7 +3187,7 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue) SKD_MAX_RETRIES) blk_requeue_request(skdev->queue, skreq->req); else - skd_end_request(skdev, skreq, -EIO); + skd_end_request(skdev, skreq, BLK_STS_IOERR); skreq->req = NULL; @@ -4276,6 +4273,7 @@ static int skd_cons_disk(struct skd_device *skdev) rc = -ENOMEM; goto err_out; } + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); skdev->queue = q; disk->queue = q; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 3f3a3ab3d50a..6b16ead1da58 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -316,7 +316,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, rqe->req = NULL; - __blk_end_request(req, (desc->status ? -EIO : 0), desc->size); + __blk_end_request(req, (desc->status ? 
BLK_STS_IOERR : 0), desc->size); vdc_blk_queue_start(port); } @@ -1023,7 +1023,7 @@ static void vdc_queue_drain(struct vdc_port *port) struct request *req; while ((req = blk_fetch_request(port->disk->queue)) != NULL) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } static void vdc_ldc_reset_timer(unsigned long _arg) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 3064be6cf375..84434d3ea19b 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -493,7 +493,7 @@ static inline int swim_read_sector(struct floppy_state *fs, return ret; } -static int floppy_read_sectors(struct floppy_state *fs, +static blk_status_t floppy_read_sectors(struct floppy_state *fs, int req_sector, int sectors_nb, unsigned char *buffer) { @@ -516,7 +516,7 @@ static int floppy_read_sectors(struct floppy_state *fs, ret = swim_read_sector(fs, side, track, sector, buffer); if (try-- == 0) - return -EIO; + return BLK_STS_IOERR; } while (ret != 512); buffer += ret; @@ -553,7 +553,7 @@ static void do_fd_request(struct request_queue *q) req = swim_next_request(swd); while (req) { - int err = -EIO; + blk_status_t err = BLK_STS_IOERR; fs = req->rq_disk->private_data; if (blk_rq_pos(req) >= fs->total_secs) @@ -864,6 +864,8 @@ static int swim_floppy_init(struct swim_priv *swd) put_disk(swd->unit[drive].disk); goto exit_put_disks; } + blk_queue_bounce_limit(swd->unit[drive].disk->queue, + BLK_BOUNCE_HIGH); swd->unit[drive].disk->queue->queuedata = swd; swd->unit[drive].swd = swd; } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index ba4809c9bdba..9f931f8f6b4c 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -257,7 +257,7 @@ static unsigned int floppy_check_events(struct gendisk *disk, unsigned int clearing); static int floppy_revalidate(struct gendisk *disk); -static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes) +static bool swim3_end_request(struct floppy_state *fs, blk_status_t err, 
unsigned int nr_bytes) { struct request *req = fs->cur_req; int rc; @@ -334,7 +334,7 @@ static void start_request(struct floppy_state *fs) if (fs->mdev->media_bay && check_media_bay(fs->mdev->media_bay) != MB_FD) { swim3_dbg("%s", " media bay absent, dropping req\n"); - swim3_end_request(fs, -ENODEV, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } @@ -350,12 +350,12 @@ static void start_request(struct floppy_state *fs) if (blk_rq_pos(req) >= fs->total_secs) { swim3_dbg(" pos out of bounds (%ld, max is %ld)\n", (long)blk_rq_pos(req), (long)fs->total_secs); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } if (fs->ejected) { swim3_dbg("%s", " disk ejected\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } @@ -364,7 +364,7 @@ static void start_request(struct floppy_state *fs) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) { swim3_dbg("%s", " try to write, disk write protected\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); continue; } } @@ -548,7 +548,7 @@ static void act(struct floppy_state *fs) if (fs->retries > 5) { swim3_err("Wrong cylinder in transfer, want: %d got %d\n", fs->req_cyl, fs->cur_cyl); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; return; } @@ -584,7 +584,7 @@ static void scan_timeout(unsigned long data) out_8(&sw->intr_enable, 0); fs->cur_cyl = -1; if (fs->retries > 5) { - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); } else { @@ -608,7 +608,7 @@ static void seek_timeout(unsigned long data) out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); swim3_err("%s", "Seek timeout\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); spin_unlock_irqrestore(&swim3_lock, flags); @@ -637,7 +637,7 @@ static void settle_timeout(unsigned 
long data) goto unlock; } swim3_err("%s", "Seek settle timeout\n"); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); unlock: @@ -666,7 +666,7 @@ static void xfer_timeout(unsigned long data) swim3_err("Timeout %sing sector %ld\n", (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"), (long)blk_rq_pos(fs->cur_req)); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); spin_unlock_irqrestore(&swim3_lock, flags); @@ -703,7 +703,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) swim3_err("%s", "Seen sector but cyl=ff?\n"); fs->cur_cyl = -1; if (fs->retries > 5) { - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); } else { @@ -786,7 +786,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) swim3_err("Error %sing block %ld (err=%x)\n", rq_data_dir(req) == WRITE? "writ": "read", (long)blk_rq_pos(req), err); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; } } else { @@ -795,7 +795,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid); swim3_err(" state=%d, dir=%x, intr=%x, err=%x\n", fs->state, rq_data_dir(req), intr, err); - swim3_end_request(fs, -EIO, 0); + swim3_end_request(fs, BLK_STS_IOERR, 0); fs->state = idle; start_request(fs); break; @@ -1223,6 +1223,7 @@ static int swim3_attach(struct macio_dev *mdev, put_disk(disk); return -ENOMEM; } + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); disk->queue->queuedata = &floppy_states[index]; if (index == 0) { @@ -1245,7 +1246,7 @@ static int swim3_attach(struct macio_dev *mdev, return 0; } -static struct of_device_id swim3_match[] = +static const struct of_device_id swim3_match[] = { { .name = "swim3", diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index c8e072caf56f..08586dc14e85 100644 --- 
a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -745,7 +745,7 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host, static inline void carm_end_request_queued(struct carm_host *host, struct carm_request *crq, - int error) + blk_status_t error) { struct request *req = crq->rq; int rc; @@ -791,7 +791,7 @@ static inline void carm_round_robin(struct carm_host *host) } static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq, - int error) + blk_status_t error) { carm_end_request_queued(host, crq, error); if (max_queue == 1) @@ -869,14 +869,14 @@ queue_one_request: sg = &crq->sg[0]; n_elem = blk_rq_map_sg(q, rq, sg); if (n_elem <= 0) { - carm_end_rq(host, crq, -EIO); + carm_end_rq(host, crq, BLK_STS_IOERR); return; /* request with no s/g entries? */ } /* map scatterlist to PCI bus addresses */ n_elem = pci_map_sg(host->pdev, sg, n_elem, pci_dir); if (n_elem <= 0) { - carm_end_rq(host, crq, -EIO); + carm_end_rq(host, crq, BLK_STS_IOERR); return; /* request with no s/g entries? 
*/ } crq->n_elem = n_elem; @@ -937,7 +937,7 @@ queue_one_request: static void carm_handle_array_info(struct carm_host *host, struct carm_request *crq, u8 *mem, - int error) + blk_status_t error) { struct carm_port *port; u8 *msg_data = mem + sizeof(struct carm_array_info); @@ -997,7 +997,7 @@ out: static void carm_handle_scan_chan(struct carm_host *host, struct carm_request *crq, u8 *mem, - int error) + blk_status_t error) { u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET; unsigned int i, dev_count = 0; @@ -1029,7 +1029,7 @@ out: } static void carm_handle_generic(struct carm_host *host, - struct carm_request *crq, int error, + struct carm_request *crq, blk_status_t error, int cur_state, int next_state) { DPRINTK("ENTER\n"); @@ -1045,7 +1045,7 @@ static void carm_handle_generic(struct carm_host *host, } static inline void carm_handle_rw(struct carm_host *host, - struct carm_request *crq, int error) + struct carm_request *crq, blk_status_t error) { int pci_dir; @@ -1067,7 +1067,7 @@ static inline void carm_handle_resp(struct carm_host *host, u32 handle = le32_to_cpu(ret_handle_le); unsigned int msg_idx; struct carm_request *crq; - int error = (status == RMSG_OK) ? 0 : -EIO; + blk_status_t error = (status == RMSG_OK) ? 
0 : BLK_STS_IOERR; u8 *mem; VPRINTK("ENTER, handle == 0x%x\n", handle); @@ -1155,7 +1155,7 @@ static inline void carm_handle_resp(struct carm_host *host, err_out: printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", pci_name(host->pdev), crq->msg_type, crq->msg_subtype); - carm_end_rq(host, crq, -EIO); + carm_end_rq(host, crq, BLK_STS_IOERR); } static inline void carm_handle_responses(struct carm_host *host) diff --git a/drivers/block/umem.c b/drivers/block/umem.c index c141cc3be22b..0677d2514665 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -454,7 +454,7 @@ static void process_page(unsigned long data) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (control & DMASCR_HARD_ERROR) { /* error */ - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; dev_printk(KERN_WARNING, &card->dev->dev, "I/O error on sector %d/%d\n", le32_to_cpu(desc->local_addr)>>9, @@ -529,7 +529,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); spin_lock_irq(&card->lock); *card->biotail = bio; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 553cc4c542b4..0297ad7c1452 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -64,15 +64,15 @@ struct virtblk_req { struct scatterlist sg[]; }; -static inline int virtblk_result(struct virtblk_req *vbr) +static inline blk_status_t virtblk_result(struct virtblk_req *vbr) { switch (vbr->status) { case VIRTIO_BLK_S_OK: - return 0; + return BLK_STS_OK; case VIRTIO_BLK_S_UNSUPP: - return -ENOTTY; + return BLK_STS_NOTSUPP; default: - return -EIO; + return BLK_STS_IOERR; } } @@ -214,7 +214,7 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, 
const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; @@ -246,7 +246,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, break; default: WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); @@ -276,8 +276,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) - return BLK_MQ_RQ_QUEUE_BUSY; - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) @@ -286,7 +286,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, if (notify) virtqueue_notify(vblk->vqs[qid].vq); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } /* return id (s/n) string for *disk to *id_str @@ -307,7 +307,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) goto out; blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); - err = virtblk_result(blk_mq_rq_to_pdu(req)); + err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req))); out: blk_put_request(req); return err; @@ -720,9 +720,6 @@ static int virtblk_probe(struct virtio_device *vdev) /* We can handle whatever the host told us to handle. */ blk_queue_max_segments(q, vblk->sg_elems-2); - /* No need to bounce any requests */ - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); - /* No real sector limit. */ blk_queue_max_hw_sectors(q, -1U); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 0e824091a12f..fe7cd58c43d0 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1066,20 +1066,17 @@ static void xen_blk_drain_io(struct xen_blkif_ring *ring) atomic_set(&blkif->drain, 0); } -/* - * Completion callback on the bio's. 
Called as bh->b_end_io() - */ - -static void __end_block_io_op(struct pending_req *pending_req, int error) +static void __end_block_io_op(struct pending_req *pending_req, + blk_status_t error) { /* An error fails the entire request. */ - if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && - (error == -EOPNOTSUPP)) { + if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE && + error == BLK_STS_NOTSUPP) { pr_debug("flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; - } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && - (error == -EOPNOTSUPP)) { + } else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER && + error == BLK_STS_NOTSUPP) { pr_debug("write barrier op failed, not supported\n"); xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; @@ -1103,7 +1100,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) */ static void end_block_io_op(struct bio *bio) { - __end_block_io_op(bio->bi_private, bio->bi_error); + __end_block_io_op(bio->bi_private, bio->bi_status); bio_put(bio); } @@ -1420,7 +1417,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, for (i = 0; i < nbio; i++) bio_put(biolist[i]); atomic_set(&pending_req->pendcnt, 1); - __end_block_io_op(pending_req, -EINVAL); + __end_block_io_op(pending_req, BLK_STS_RESOURCE); msleep(1); /* back off a bit */ return -EIO; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 39459631667c..c852ed3c01d5 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -110,11 +110,6 @@ struct blk_shadow { unsigned long associated_id; }; -struct split_bio { - struct bio *bio; - atomic_t pending; -}; - struct blkif_req { int error; }; @@ -881,7 +876,7 @@ static inline bool blkif_request_flush_invalid(struct request *req, !info->feature_fua)); } -static int 
blkif_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { unsigned long flags; @@ -904,16 +899,16 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, flush_requests(rinfo); spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_err: spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; out_busy: spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } static void blkif_complete_rq(struct request *rq) @@ -958,9 +953,6 @@ static void blkif_set_queue_limits(struct blkfront_info *info) /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); - - /* Make sure we don't use bounce buffers. */ - blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); } static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -1601,14 +1593,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) continue; } - blkif_req(req)->error = (bret->status == BLKIF_RSP_OKAY) ? 
0 : -EIO; + if (bret->status == BLKIF_RSP_OKAY) + blkif_req(req)->error = BLK_STS_OK; + else + blkif_req(req)->error = BLK_STS_IOERR; + switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - blkif_req(req)->error = -EOPNOTSUPP; + blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); @@ -1626,11 +1622,11 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - blkif_req(req)->error = -EOPNOTSUPP; + blkif_req(req)->error = BLK_STS_NOTSUPP; } if (unlikely(blkif_req(req)->error)) { - if (blkif_req(req)->error == -EOPNOTSUPP) - blkif_req(req)->error = 0; + if (blkif_req(req)->error == BLK_STS_NOTSUPP) + blkif_req(req)->error = BLK_STS_OK; info->feature_fua = 0; info->feature_flush = 0; xlvbd_flush(info); @@ -1996,28 +1992,13 @@ static int blkfront_probe(struct xenbus_device *dev, return 0; } -static void split_bio_end(struct bio *bio) -{ - struct split_bio *split_bio = bio->bi_private; - - if (atomic_dec_and_test(&split_bio->pending)) { - split_bio->bio->bi_phys_segments = 0; - split_bio->bio->bi_error = bio->bi_error; - bio_endio(split_bio->bio); - kfree(split_bio); - } - bio_put(bio); -} - static int blkif_recover(struct blkfront_info *info) { - unsigned int i, r_index; + unsigned int r_index; struct request *req, *n; int rc; - struct bio *bio, *cloned_bio; - unsigned int segs, offset; - int pending, size; - struct split_bio *split_bio; + struct bio *bio; + unsigned int segs; blkfront_gather_backend_features(info); /* Reset limits changed by blk_mq_update_nr_hw_queues(). 
*/ @@ -2056,34 +2037,6 @@ static int blkif_recover(struct blkfront_info *info) while ((bio = bio_list_pop(&info->bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ - if (bio_segments(bio) > segs) { - /* - * This bio has more segments than what we can - * handle, we have to split it. - */ - pending = (bio_segments(bio) + segs - 1) / segs; - split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); - BUG_ON(split_bio == NULL); - atomic_set(&split_bio->pending, pending); - split_bio->bio = bio; - for (i = 0; i < pending; i++) { - offset = (i * segs * XEN_PAGE_SIZE) >> 9; - size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9, - (unsigned int)bio_sectors(bio) - offset); - cloned_bio = bio_clone(bio, GFP_NOIO); - BUG_ON(cloned_bio == NULL); - bio_trim(cloned_bio, offset, size); - cloned_bio->bi_private = split_bio; - cloned_bio->bi_end_io = split_bio_end; - submit_bio(cloned_bio); - } - /* - * Now we have to wait for all those smaller bios to - * end, so we can also end the "parent" bio. 
- */ - continue; - } - /* We don't need to split this bio */ submit_bio(bio); } @@ -2137,7 +2090,7 @@ static int blkfront_resume(struct xenbus_device *dev) merge_bio.tail = shadow[j].request->biotail; bio_list_merge(&info->bio_list, &merge_bio); shadow[j].request->bio = NULL; - blk_mq_end_request(shadow[j].request, 0); + blk_mq_end_request(shadow[j].request, BLK_STS_OK); } } diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 757dce2147e0..14459d66ef0c 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -471,7 +471,7 @@ static struct request *ace_get_next_request(struct request_queue *q) if (!blk_rq_is_passthrough(req)) break; blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } return req; } @@ -499,11 +499,11 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Drop all in-flight and pending requests */ if (ace->req) { - __blk_end_request_all(ace->req, -EIO); + __blk_end_request_all(ace->req, BLK_STS_IOERR); ace->req = NULL; } while ((req = blk_fetch_request(ace->queue)) != NULL) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; @@ -728,7 +728,7 @@ static void ace_fsm_dostate(struct ace_device *ace) } /* bio finished; is there another one? 
*/ - if (__blk_end_request_cur(ace->req, 0)) { + if (__blk_end_request_cur(ace->req, BLK_STS_OK)) { /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", * blk_rq_sectors(ace->req), * blk_rq_cur_sectors(ace->req)); @@ -993,6 +993,7 @@ static int ace_setup(struct ace_device *ace) if (ace->queue == NULL) goto err_blk_initq; blk_queue_logical_block_size(ace->queue, 512); + blk_queue_bounce_limit(ace->queue, BLK_BOUNCE_HIGH); /* * Allocate and initialize GD structure diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 968f9e52effa..41c95c9b2ab4 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -74,14 +74,14 @@ static void do_z2_request(struct request_queue *q) while (req) { unsigned long start = blk_rq_pos(req) << 9; unsigned long len = blk_rq_cur_bytes(req); - int err = 0; + blk_status_t err = BLK_STS_OK; if (start + len > z2ram_size) { pr_err(DEVICE_NAME ": bad access: block=%llu, " "count=%u\n", (unsigned long long)blk_rq_pos(req), blk_rq_cur_sectors(req)); - err = -EIO; + err = BLK_STS_IOERR; goto done; } while (len) { diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 76c952fd9ab9..e36d160c458f 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2178,6 +2178,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, if (!q) return -ENXIO; + if (!blk_queue_scsi_passthrough(q)) { + WARN_ONCE(true, + "Attempt read CDDA info through a non-SCSI queue\n"); + return -EINVAL; + } + cdi->last_sense = 0; while (nframes) { @@ -2195,7 +2201,6 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, break; } req = scsi_req(rq); - scsi_req_init(rq); ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL); if (ret) { diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 1372763a948f..6495b03f576c 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -583,7 +583,8 @@ static int gdrom_set_interrupt_handlers(void) */ static void 
gdrom_readdisk_dma(struct work_struct *work) { - int err, block, block_cnt; + int block, block_cnt; + blk_status_t err; struct packet_command *read_command; struct list_head *elem, *next; struct request *req; @@ -641,7 +642,7 @@ static void gdrom_readdisk_dma(struct work_struct *work) __raw_writeb(1, GDROM_DMA_STATUS_REG); wait_event_interruptible_timeout(request_queue, gd.transfer == 0, GDROM_DEFAULT_TIMEOUT); - err = gd.transfer ? -EIO : 0; + err = gd.transfer ? BLK_STS_IOERR : BLK_STS_OK; gd.transfer = 0; gd.pending = 0; /* now seek to take the request spinlock @@ -670,11 +671,11 @@ static void gdrom_request(struct request_queue *rq) break; case REQ_OP_WRITE: pr_notice("Read only device - write request ignored\n"); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); break; default: printk(KERN_DEBUG "gdrom: Non-fs request ignored\n"); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); break; } } @@ -812,6 +813,7 @@ static int probe_gdrom(struct platform_device *devptr) err = -ENOMEM; goto probe_fail_requestq; } + blk_queue_bounce_limit(gd.gdrom_rq, BLK_BOUNCE_HIGH); err = probe_gdrom_setupqueue(); if (err) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 5901937284e7..14d1e7d9a1d6 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -93,7 +93,6 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, int error; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; rq->special = (char *)pc; @@ -200,7 +199,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) memset(sense, 0, sizeof(*sense)); blk_rq_init(rq->q, sense_rq); - scsi_req_init(sense_rq); + scsi_req_init(req); err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, GFP_NOIO); @@ -273,7 +272,7 @@ void ide_retry_pc(ide_drive_t *drive) ide_requeue_and_plug(drive, failed_rq); if (ide_queue_sense_rq(drive, pc)) { 
blk_start_request(failed_rq); - ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq)); } } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -437,7 +436,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* No more interrupts */ if ((stat & ATA_DRQ) == 0) { - int uptodate, error; + int uptodate; + blk_status_t error; debug_log("Packet command completed, %d bytes transferred\n", blk_rq_bytes(rq)); @@ -490,7 +490,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (ata_misc_request(rq)) { scsi_req(rq)->result = 0; - error = 0; + error = BLK_STS_OK; } else { if (blk_rq_is_passthrough(rq) && uptodate <= 0) { @@ -498,7 +498,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) scsi_req(rq)->result = -EIO; } - error = uptodate ? 0 : -EIO; + error = uptodate ? BLK_STS_OK : BLK_STS_IOERR; } ide_complete_rq(drive, error, blk_rq_bytes(rq)); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 07e5ff3a64c3..81e18f9628d0 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -228,7 +228,7 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) scsi_req(failed)->sense_len = scsi_req(rq)->sense_len; cdrom_analyze_sense_data(drive, failed); - if (ide_end_rq(drive, failed, -EIO, blk_rq_bytes(failed))) + if (ide_end_rq(drive, failed, BLK_STS_IOERR, blk_rq_bytes(failed))) BUG(); } else cdrom_analyze_sense_data(drive, NULL); @@ -438,7 +438,6 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, rq = blk_get_request(drive->queue, write ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB); ide_req(rq)->type = ATA_PRIV_PC; rq->rq_flags |= rq_flags; @@ -508,7 +507,7 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) nr_bytes -= cmd->last_xfer_len; if (nr_bytes > 0) { - ide_complete_rq(drive, 0, nr_bytes); + ide_complete_rq(drive, BLK_STS_OK, nr_bytes); return true; } @@ -674,7 +673,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_rq_is_scsi(rq) && rc == 0) { scsi_req(rq)->resid_len = 0; - blk_end_request_all(rq, 0); + blk_end_request_all(rq, BLK_STS_OK); hwif->rq = NULL; } else { if (sense && uptodate) @@ -699,7 +698,7 @@ out_end: scsi_req(rq)->resid_len += cmd->last_xfer_len; } - ide_complete_rq(drive, uptodate ? 0 : -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, uptodate ? BLK_STS_OK : BLK_STS_IOERR, blk_rq_bytes(rq)); if (sense && rc == 2) ide_error(drive, "request sense failure", stat); @@ -844,7 +843,7 @@ out_end: if (nsectors == 0) nsectors = 1; - ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); + ide_complete_rq(drive, uptodate ? 
BLK_STS_OK : BLK_STS_IOERR, nsectors << 9); return ide_stopped; } diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 55cd736c39c6..9d26c9737e21 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -304,7 +304,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) int ret; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; blk_execute_rq(drive->queue, cd->disk, rq, 0); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 9b69c32ee560..ef7c8c43a380 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -166,7 +166,6 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, return setting->set(drive, arg); rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 5; scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 7c06237f3479..241983da5fc4 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -478,7 +478,6 @@ static int set_multcount(ide_drive_t *drive, int arg) return -EBUSY; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; drive->mult_req = arg; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 51c81223e56d..54d4d78ca46a 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -104,7 +104,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_complete_rq(drive, 0, + ide_complete_rq(drive, BLK_STS_OK, blk_rq_sectors(cmd->rq) << 9); return ide_stopped; } diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 4b7ffd7d158d..47d5f3379748 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -135,7 +135,7 @@ 
ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) return ide_stopped; } scsi_req(rq)->result = err; - ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, err ? BLK_STS_IOERR : BLK_STS_OK, blk_rq_bytes(rq)); return ide_stopped; } @@ -143,7 +143,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) } EXPORT_SYMBOL_GPL(ide_error); -static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) +static inline void ide_complete_drive_reset(ide_drive_t *drive, blk_status_t err) { struct request *rq = drive->hwif->rq; @@ -151,7 +151,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) { if (err <= 0 && scsi_req(rq)->result == 0) scsi_req(rq)->result = -EIO; - ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, err, blk_rq_bytes(rq)); } } @@ -191,7 +191,7 @@ static ide_startstop_t atapi_reset_pollfunc(ide_drive_t *drive) } /* done polling */ hwif->polling = 0; - ide_complete_drive_reset(drive, 0); + ide_complete_drive_reset(drive, BLK_STS_OK); return ide_stopped; } @@ -225,7 +225,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; const struct ide_port_ops *port_ops = hwif->port_ops; u8 tmp; - int err = 0; + blk_status_t err = BLK_STS_OK; if (port_ops && port_ops->reset_poll) { err = port_ops->reset_poll(drive); @@ -247,7 +247,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) printk(KERN_ERR "%s: reset timed-out, status=0x%02x\n", hwif->name, tmp); drive->failures++; - err = -EIO; + err = BLK_STS_IOERR; } else { tmp = ide_read_error(drive); @@ -257,7 +257,7 @@ static ide_startstop_t reset_pollfunc(ide_drive_t *drive) } else { ide_reset_report_error(hwif, tmp); drive->failures++; - err = -EIO; + err = BLK_STS_IOERR; } } out: @@ -392,7 +392,7 @@ static ide_startstop_t do_reset1(ide_drive_t *drive, int do_not_try_atapi) if (io_ports->ctl_addr 
== 0) { spin_unlock_irqrestore(&hwif->lock, flags); - ide_complete_drive_reset(drive, -ENXIO); + ide_complete_drive_reset(drive, BLK_STS_IOERR); return ide_stopped; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 8ac6048cd2df..627b1f62a749 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -143,7 +143,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, done); + ide_complete_rq(drive, BLK_STS_IOERR, done); return ide_stopped; } @@ -248,7 +248,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, if (ata_misc_request(rq)) { scsi_req(rq)->result = 0; - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_OK, blk_rq_bytes(rq)); return ide_stopped; } else goto out_end; @@ -303,7 +303,7 @@ out_end: drive->failed_pc = NULL; if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0) scsi_req(rq)->result = -EIO; - ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 323af721f8cb..3a234701d92c 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -54,7 +54,7 @@ #include <linux/uaccess.h> #include <asm/io.h> -int ide_end_rq(ide_drive_t *drive, struct request *rq, int error, +int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error, unsigned int nr_bytes) { /* @@ -112,7 +112,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) } } -int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) +int ide_complete_rq(ide_drive_t *drive, blk_status_t error, unsigned int nr_bytes) { ide_hwif_t *hwif = drive->hwif; struct request *rq = hwif->rq; @@ -122,7 +122,7 @@ int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) * if failfast is set on a request, override number of 
sectors * and complete the whole request right now */ - if (blk_noretry_request(rq) && error <= 0) + if (blk_noretry_request(rq) && error) nr_bytes = blk_rq_sectors(rq) << 9; rc = ide_end_rq(drive, rq, error, nr_bytes); @@ -149,7 +149,7 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq) scsi_req(rq)->result = -EIO; } - ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq)); } static void ide_tf_set_specify_cmd(ide_drive_t *drive, struct ide_taskfile *tf) @@ -272,7 +272,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, printk("%s: DRIVE_CMD (null)\n", drive->name); #endif scsi_req(rq)->result = 0; - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_OK, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 8c0d17297a7a..3661abb16a5f 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -126,7 +126,6 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) struct request *rq; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; blk_execute_rq(drive->queue, NULL, rq, 0); err = scsi_req(rq)->result ? 
-EIO : 0; @@ -224,7 +223,6 @@ static int generic_drive_reset(ide_drive_t *drive) int ret = 0; rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 94e3107f59b9..1f264d5d3f3f 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -32,7 +32,6 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) spin_unlock_irq(&hwif->lock); rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); scsi_req(rq)->cmd[0] = REQ_PARK_HEADS; scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; @@ -48,7 +47,6 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) * timeout has expired, so power management will be reenabled. */ rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT); - scsi_req_init(rq); if (IS_ERR(rq)) goto out; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 0977fc1f40ce..544f02d673ca 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -19,7 +19,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; @@ -40,7 +39,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } -static void ide_end_sync_rq(struct request *rq, int error) +static void ide_end_sync_rq(struct request *rq, blk_status_t error) { complete(rq->end_io_data); } @@ -57,7 +56,7 @@ static int ide_pm_execute_rq(struct request *rq) if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; scsi_req(rq)->result = -ENXIO; - __blk_end_request_all(rq, 0); + __blk_end_request_all(rq, BLK_STS_OK); spin_unlock_irq(q->queue_lock); return -ENXIO; } @@ -91,7 +90,6 @@ int 
generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_PM_RESUME; rq->rq_flags |= RQF_PREEMPT; rq->special = &rqpm; @@ -235,7 +233,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) drive->hwif->rq = NULL; - if (blk_end_request(rq, 0, 0)) + if (blk_end_request(rq, BLK_STS_OK, 0)) BUG(); } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 023562565d11..01b2adfd8226 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -741,12 +741,12 @@ static void ide_port_tune_devices(ide_hwif_t *hwif) } } -static int ide_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) +static void ide_initialize_rq(struct request *rq) { struct ide_request *req = blk_mq_rq_to_pdu(rq); + scsi_req_init(&req->sreq); req->sreq.sense = req->sense; - return 0; } /* @@ -771,8 +771,9 @@ static int ide_init_queue(ide_drive_t *drive) return 1; q->request_fn = do_ide_request; - q->init_rq_fn = ide_init_rq; + q->initialize_rq_fn = ide_initialize_rq; q->cmd_size = sizeof(struct ide_request); + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); if (blk_init_allocated_queue(q) < 0) { blk_cleanup_queue(q); return 1; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index a0651f948b76..fd57e8ccc47a 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -474,7 +474,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); + ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(rq)); return ide_stopped; } ide_debug_log(IDE_DBG_SENSE, "retry #%d, cmd: 0x%02x", pc->retries, @@ -855,7 +855,6 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(size < 0 || size % tape->blk_size); rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); - 
scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd[13] = cmd; rq->rq_disk = tape->disk; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index d71199d23c9e..4efe4c6e956c 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -318,7 +318,7 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) } if (nr_bytes > 0) - ide_complete_rq(drive, 0, nr_bytes); + ide_complete_rq(drive, BLK_STS_OK, nr_bytes); } } @@ -336,7 +336,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) ide_driveid_update(drive); } - ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); + ide_complete_rq(drive, err ? BLK_STS_IOERR : BLK_STS_OK, blk_rq_bytes(rq)); } /* @@ -394,7 +394,7 @@ out_end: if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_complete_rq(drive, 0, blk_rq_sectors(cmd->rq) << 9); + ide_complete_rq(drive, BLK_STS_OK, blk_rq_sectors(cmd->rq) << 9); return ide_stopped; out_err: ide_error_cmd(drive, cmd); @@ -433,7 +433,6 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, rq = blk_get_request(drive->queue, (cmd->tf_flags & IDE_TFLAG_WRITE) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM); - scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; /* diff --git a/drivers/ide/siimage.c b/drivers/ide/siimage.c index 6a1849bb476c..57eea5a9047f 100644 --- a/drivers/ide/siimage.c +++ b/drivers/ide/siimage.c @@ -406,7 +406,7 @@ static int siimage_dma_test_irq(ide_drive_t *drive) * yet. 
*/ -static int sil_sata_reset_poll(ide_drive_t *drive) +static blk_status_t sil_sata_reset_poll(ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; void __iomem *sata_status_addr @@ -419,11 +419,11 @@ static int sil_sata_reset_poll(ide_drive_t *drive) if ((sata_stat & 0x03) != 0x03) { printk(KERN_WARNING "%s: reset phy dead, status=0x%08x\n", hwif->name, sata_stat); - return -ENXIO; + return BLK_STS_IOERR; } } - return 0; + return BLK_STS_OK; } /** diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 6a4aa608ad95..ddae430b6eae 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -252,8 +252,9 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) } mutex_unlock(&dev->mlock); - if (nvm_reserve_luns(dev, s->lun_begin, s->lun_end)) - return -ENOMEM; + ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end); + if (ret) + return ret; t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); if (!t) { @@ -640,6 +641,7 @@ EXPORT_SYMBOL(nvm_max_phys_sects); int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { struct nvm_dev *dev = tgt_dev->parent; + int ret; if (!dev->ops->submit_io) return -ENODEV; @@ -647,7 +649,12 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) nvm_rq_tgt_to_dev(tgt_dev, rqd); rqd->dev = tgt_dev; - return dev->ops->submit_io(dev, rqd); + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io(dev, rqd); + if (ret) + nvm_rq_dev_to_tgt(tgt_dev, rqd); + return ret; } EXPORT_SYMBOL(nvm_submit_io); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 59bcea88db84..024a8fc93069 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -31,9 +31,13 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) */ retry: ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); - if (ret == NVM_IO_REQUEUE) { + switch (ret) { + case NVM_IO_REQUEUE: io_schedule(); 
goto retry; + case NVM_IO_ERR: + pblk_pipeline_stop(pblk); + goto out; } if (unlikely(!bio_has_data(bio))) @@ -58,6 +62,8 @@ retry: atomic_long_add(nr_entries, &pblk->req_writes); #endif + pblk_rl_inserted(&pblk->rl, nr_entries); + out: pblk_write_should_kick(pblk); return ret; diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 5e44768ccffa..11fe0c5b2a9c 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -17,7 +17,6 @@ */ #include "pblk.h" -#include <linux/time.h> static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa) @@ -34,7 +33,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", line->id, pos); - pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb); + pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); } static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) @@ -54,6 +53,8 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) *ppa = rqd->ppa_addr; pblk_mark_bb(pblk, line, ppa); } + + atomic_dec(&pblk->inflight_io); } /* Erase completion assumes that only one block is erased at the time */ @@ -61,13 +62,12 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - up(&pblk->erase_sem); __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, pblk->r_rq_pool); + mempool_free(rqd, pblk->g_rq_pool); } -static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) +void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list = NULL; @@ -88,7 +88,7 @@ static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, spin_unlock(&line->lock); return; } - line->vsc--; + le32_add_cpu(line->vsc, -1); if (line->state == PBLK_LINESTATE_CLOSED) move_list = pblk_line_gc_list(pblk, line); @@ -130,18 
+130,6 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) __pblk_map_invalidate(pblk, line, paddr); } -void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - __pblk_map_invalidate(pblk, line, paddr); - - pblk_rb_sync_init(&pblk->rwb, NULL); - line->left_ssecs--; - if (!line->left_ssecs) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws); - pblk_rb_sync_end(&pblk->rwb, NULL); -} - static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, unsigned int nr_secs) { @@ -172,8 +160,8 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { - pool = pblk->r_rq_pool; - rq_size = pblk_r_rq_size; + pool = pblk->g_rq_pool; + rq_size = pblk_g_rq_size; } rqd = mempool_alloc(pool, GFP_KERNEL); @@ -189,7 +177,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) if (rw == WRITE) pool = pblk->w_rq_pool; else - pool = pblk->r_rq_pool; + pool = pblk->g_rq_pool; mempool_free(rqd, pool); } @@ -271,35 +259,26 @@ void pblk_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -void pblk_flush_writer(struct pblk *pblk) +void pblk_wait_for_meta(struct pblk *pblk) { - struct bio *bio; - int ret; - DECLARE_COMPLETION_ONSTACK(wait); - - bio = bio_alloc(GFP_KERNEL, 1); - if (!bio) - return; - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_OP_FLUSH); - bio->bi_private = &wait; - bio->bi_end_io = pblk_end_bio_sync; + do { + if (!atomic_read(&pblk->inflight_io)) + break; - ret = pblk_write_to_cache(pblk, bio, 0); - if (ret == NVM_IO_OK) { - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: flush cache timed out\n"); - } - } else if (ret != NVM_IO_DONE) { - pr_err("pblk: tear down bio failed\n"); - } + schedule(); + } while (1); +} - if (bio->bi_error) - pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error); +static void pblk_flush_writer(struct 
pblk *pblk) +{ + pblk_rb_flush(&pblk->rwb); + do { + if (!pblk_rb_sync_count(&pblk->rwb)) + break; - bio_put(bio); + pblk_write_kick(pblk); + schedule(); + } while (1); } struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) @@ -307,28 +286,31 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list = NULL; + int vsc = le32_to_cpu(*line->vsc); - if (!line->vsc) { + lockdep_assert_held(&line->lock); + + if (!vsc) { if (line->gc_group != PBLK_LINEGC_FULL) { line->gc_group = PBLK_LINEGC_FULL; move_list = &l_mg->gc_full_list; } - } else if (line->vsc < lm->mid_thrs) { + } else if (vsc < lm->high_thrs) { if (line->gc_group != PBLK_LINEGC_HIGH) { line->gc_group = PBLK_LINEGC_HIGH; move_list = &l_mg->gc_high_list; } - } else if (line->vsc < lm->high_thrs) { + } else if (vsc < lm->mid_thrs) { if (line->gc_group != PBLK_LINEGC_MID) { line->gc_group = PBLK_LINEGC_MID; move_list = &l_mg->gc_mid_list; } - } else if (line->vsc < line->sec_in_line) { + } else if (vsc < line->sec_in_line) { if (line->gc_group != PBLK_LINEGC_LOW) { line->gc_group = PBLK_LINEGC_LOW; move_list = &l_mg->gc_low_list; } - } else if (line->vsc == line->sec_in_line) { + } else if (vsc == line->sec_in_line) { if (line->gc_group != PBLK_LINEGC_EMPTY) { line->gc_group = PBLK_LINEGC_EMPTY; move_list = &l_mg->gc_empty_list; @@ -338,7 +320,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) line->gc_group = PBLK_LINEGC_NONE; move_list = &l_mg->corrupt_list; pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", - line->id, line->vsc, + line->id, vsc, line->sec_in_line, lm->high_thrs, lm->mid_thrs); } @@ -397,6 +379,11 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) #endif } +void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) +{ + pblk->sec_per_write = sec_per_write; +} + int 
pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) { struct nvm_tgt_dev *dev = pblk->dev; @@ -431,21 +418,23 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) } } #endif + + atomic_inc(&pblk->inflight_io); + return nvm_submit_io(dev, rqd); } struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, - gfp_t gfp_mask) + int alloc_type, gfp_t gfp_mask) { struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; void *kaddr = data; struct page *page; struct bio *bio; int i, ret; - if (l_mg->emeta_alloc_type == PBLK_KMALLOC_META) + if (alloc_type == PBLK_KMALLOC_META) return bio_map_kern(dev->q, kaddr, len, gfp_mask); bio = bio_kmalloc(gfp_mask, nr_secs); @@ -478,7 +467,7 @@ out: int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, unsigned long secs_to_flush) { - int max = pblk->max_write_pgs; + int max = pblk->sec_per_write; int min = pblk->min_write_pgs; int secs_to_sync = 0; @@ -492,12 +481,26 @@ int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, return secs_to_sync; } -static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, - int nr_secs) +void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) +{ + u64 addr; + int i; + + addr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + line->cur_sec = addr - nr_secs; + + for (i = 0; i < nr_secs; i++, line->cur_sec--) + WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); +} + +u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) { u64 addr; int i; + lockdep_assert_held(&line->lock); + /* logic error: ppa out-of-bounds. 
Prevent generating bad address */ if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { WARN(1, "pblk: page allocation out of bounds\n"); @@ -528,27 +531,38 @@ u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) return addr; } +u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) +{ + u64 paddr; + + spin_lock(&line->lock); + paddr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + spin_unlock(&line->lock); + + return paddr; +} + /* * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when * taking the per LUN semaphore. */ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, - u64 paddr, int dir) + void *emeta_buf, u64 paddr, int dir) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; + void *ppa_list, *meta_list; struct bio *bio; struct nvm_rq rqd; - struct ppa_addr *ppa_list; - dma_addr_t dma_ppa_list; - void *emeta = line->emeta; + dma_addr_t dma_ppa_list, dma_meta_list; int min = pblk->min_write_pgs; - int left_ppas = lm->emeta_sec; + int left_ppas = lm->emeta_sec[0]; int id = line->id; int rq_ppas, rq_len; int cmd_op, bio_op; - int flags; int i, j; int ret; DECLARE_COMPLETION_ONSTACK(wait); @@ -556,25 +570,28 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, if (dir == WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - flags = pblk_set_progr_mode(pblk, WRITE); } else if (dir == READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; - flags = pblk_set_read_mode(pblk); } else return -EINVAL; - ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list); - if (!ppa_list) + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &dma_meta_list); + if (!meta_list) return -ENOMEM; + ppa_list = meta_list + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + next_rq: memset(&rqd, 
0, sizeof(struct nvm_rq)); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); rq_len = rq_ppas * geo->sec_size; - bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL); + bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, + l_mg->emeta_alloc_type, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); goto free_rqd_dma; @@ -584,27 +601,38 @@ next_rq: bio_set_op_attrs(bio, bio_op, 0); rqd.bio = bio; - rqd.opcode = cmd_op; - rqd.flags = flags; - rqd.nr_ppas = rq_ppas; + rqd.meta_list = meta_list; rqd.ppa_list = ppa_list; + rqd.dma_meta_list = dma_meta_list; rqd.dma_ppa_list = dma_ppa_list; + rqd.opcode = cmd_op; + rqd.nr_ppas = rq_ppas; rqd.end_io = pblk_end_io_sync; rqd.private = &wait; if (dir == WRITE) { + struct pblk_sec_meta *meta_list = rqd.meta_list; + + rqd.flags = pblk_set_progr_mode(pblk, WRITE); for (i = 0; i < rqd.nr_ppas; ) { spin_lock(&line->lock); paddr = __pblk_alloc_page(pblk, line, min); spin_unlock(&line->lock); - for (j = 0; j < min; j++, i++, paddr++) + for (j = 0; j < min; j++, i++, paddr++) { + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); + } } } else { for (i = 0; i < rqd.nr_ppas; ) { struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); int pos = pblk_dev_ppa_to_pos(geo, ppa); + int read_type = PBLK_READ_RANDOM; + + if (pblk_io_aligned(pblk, rq_ppas)) + read_type = PBLK_READ_SEQUENTIAL; + rqd.flags = pblk_set_read_mode(pblk, read_type); while (test_bit(pos, line->blk_bitmap)) { paddr += min; @@ -645,9 +673,11 @@ next_rq: msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: emeta I/O timed out\n"); } + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); - bio_put(bio); + if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META)) + bio_put(bio); if (rqd.error) { if (dir == WRITE) @@ -656,12 +686,12 @@ next_rq: pblk_log_read_err(pblk, &rqd); } - emeta += rq_len; + emeta_buf += rq_len; left_ppas -= rq_ppas; if (left_ppas) goto next_rq; free_rqd_dma: - 
nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; } @@ -697,21 +727,24 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; flags = pblk_set_progr_mode(pblk, WRITE); - lba_list = pblk_line_emeta_to_lbas(line->emeta); + lba_list = emeta_to_lbas(pblk, line->emeta->buf); } else if (dir == READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; - flags = pblk_set_read_mode(pblk); + flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); } else return -EINVAL; memset(&rqd, 0, sizeof(struct nvm_rq)); - rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) + rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_meta_list); + if (!rqd.meta_list) return -ENOMEM; + rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; + rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; + bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); @@ -729,9 +762,15 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.private = &wait; for (i = 0; i < lm->smeta_sec; i++, paddr++) { + struct pblk_sec_meta *meta_list = rqd.meta_list; + rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - if (dir == WRITE) - lba_list[paddr] = cpu_to_le64(ADDR_EMPTY); + + if (dir == WRITE) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + meta_list[i].lba = lba_list[paddr] = addr_empty; + } } /* @@ -750,6 +789,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: smeta I/O timed out\n"); } + atomic_dec(&pblk->inflight_io); if (rqd.error) { if (dir == WRITE) @@ -759,7 +799,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, } free_ppa_list: - nvm_dev_dma_free(dev->parent, 
rqd.ppa_list, rqd.dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; } @@ -771,9 +811,11 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); } -int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line) +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf) { - return pblk_line_submit_emeta_io(pblk, line, line->emeta_ssec, READ); + return pblk_line_submit_emeta_io(pblk, line, emeta_buf, + line->emeta_ssec, READ); } static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -789,7 +831,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq rqd; - int ret; + int ret = 0; DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -824,14 +866,14 @@ out: rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); - return 0; + return ret; } int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_meta *lm = &pblk->lm; struct ppa_addr ppa; - int bit = -1; + int ret, bit = -1; /* Erase only good blocks, one at a time */ do { @@ -850,27 +892,59 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); spin_unlock(&line->lock); - if (pblk_blk_erase_sync(pblk, ppa)) { + ret = pblk_blk_erase_sync(pblk, ppa); + if (ret) { pr_err("pblk: failed to erase line %d\n", line->id); - return -ENOMEM; + return ret; } } while (1); return 0; } +static void pblk_line_setup_metadata(struct pblk_line *line, + struct pblk_line_mgmt *l_mg, + struct pblk_line_meta *lm) +{ + int meta_line; + + lockdep_assert_held(&l_mg->free_lock); + +retry_meta: + meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); + if (meta_line == PBLK_DATA_LINES) { + spin_unlock(&l_mg->free_lock); + io_schedule(); + 
spin_lock(&l_mg->free_lock); + goto retry_meta; + } + + set_bit(meta_line, &l_mg->meta_bitmap); + line->meta_line = meta_line; + + line->smeta = l_mg->sline_meta[meta_line]; + line->emeta = l_mg->eline_meta[meta_line]; + + memset(line->smeta, 0, lm->smeta_len); + memset(line->emeta->buf, 0, lm->emeta_len[0]); + + line->emeta->mem = 0; + atomic_set(&line->emeta->sync, 0); +} + /* For now lines are always assumed full lines. Thus, smeta former and current * lun bitmaps are omitted. */ -static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line, +static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, struct pblk_line *cur) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct line_smeta *smeta = line->smeta; - struct line_emeta *emeta = line->emeta; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; int nr_blk_line; /* After erasing the line, new bad blocks might appear and we risk @@ -893,42 +967,44 @@ static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line, } /* Run-time metadata */ - line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta); + line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); /* Mark LUNs allocated in this line (all for now) */ bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); - smeta->header.identifier = cpu_to_le32(PBLK_MAGIC); - memcpy(smeta->header.uuid, pblk->instance_uuid, 16); - smeta->header.id = cpu_to_le32(line->id); - smeta->header.type = cpu_to_le16(line->type); - smeta->header.version = cpu_to_le16(1); + smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); + memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); + smeta_buf->header.id = cpu_to_le32(line->id); + smeta_buf->header.type = cpu_to_le16(line->type); + 
smeta_buf->header.version = cpu_to_le16(1); /* Start metadata */ - smeta->seq_nr = cpu_to_le64(line->seq_nr); - smeta->window_wr_lun = cpu_to_le32(geo->nr_luns); + smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); + smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns); /* Fill metadata among lines */ if (cur) { memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); - smeta->prev_id = cpu_to_le32(cur->id); - cur->emeta->next_id = cpu_to_le32(line->id); + smeta_buf->prev_id = cpu_to_le32(cur->id); + cur->emeta->buf->next_id = cpu_to_le32(line->id); } else { - smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); + smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); } /* All smeta must be set at this point */ - smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta)); - smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta)); + smeta_buf->header.crc = cpu_to_le32( + pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); + smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); /* End metadata */ - memcpy(&emeta->header, &smeta->header, sizeof(struct line_header)); - emeta->seq_nr = cpu_to_le64(line->seq_nr); - emeta->nr_lbas = cpu_to_le64(line->sec_in_line); - emeta->nr_valid_lbas = cpu_to_le64(0); - emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY); - emeta->crc = cpu_to_le32(0); - emeta->prev_id = smeta->prev_id; + memcpy(&emeta_buf->header, &smeta_buf->header, + sizeof(struct line_header)); + emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); + emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); + emeta_buf->nr_valid_lbas = cpu_to_le64(0); + emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); + emeta_buf->crc = cpu_to_le32(0); + emeta_buf->prev_id = smeta_buf->prev_id; return 1; } @@ -965,7 +1041,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, /* Mark smeta metadata sectors as bad sectors */ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); off = bit * geo->sec_per_pl; -retry_smeta: 
bitmap_set(line->map_bitmap, off, lm->smeta_sec); line->sec_in_line -= lm->smeta_sec; line->smeta_ssec = off; @@ -973,8 +1048,7 @@ retry_smeta: if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { pr_debug("pblk: line smeta I/O failed. Retry\n"); - off += geo->sec_per_pl; - goto retry_smeta; + return 1; } bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); @@ -983,8 +1057,8 @@ retry_smeta: * blocks to make sure that there are enough sectors to store emeta */ bit = lm->sec_per_line; - off = lm->sec_per_line - lm->emeta_sec; - bitmap_set(line->invalid_bitmap, off, lm->emeta_sec); + off = lm->sec_per_line - lm->emeta_sec[0]; + bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); while (nr_bb) { off -= geo->sec_per_pl; if (!test_bit(off, line->invalid_bitmap)) { @@ -993,9 +1067,11 @@ retry_smeta: } } - line->sec_in_line -= lm->emeta_sec; + line->sec_in_line -= lm->emeta_sec[0]; line->emeta_ssec = off; - line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line; + line->nr_valid_lbas = 0; + line->left_msecs = line->sec_in_line; + *line->vsc = cpu_to_le32(line->sec_in_line); if (lm->sec_per_line - line->sec_in_line != bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { @@ -1034,14 +1110,20 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_FREE) { + mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + mempool_free(line->map_bitmap, pblk->line_meta_pool); spin_unlock(&line->lock); - WARN(1, "pblk: corrupted line state\n"); - return -EINTR; + WARN(1, "pblk: corrupted line %d, state %d\n", + line->id, line->state); + return -EAGAIN; } + line->state = PBLK_LINESTATE_OPEN; atomic_set(&line->left_eblks, blk_in_line); atomic_set(&line->left_seblks, blk_in_line); + + line->meta_distance = lm->meta_distance; spin_unlock(&line->lock); /* Bad blocks do not need to be erased */ @@ -1091,15 +1173,15 @@ struct pblk_line *pblk_line_get(struct pblk 
*pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line = NULL; - int bit; + struct pblk_line *line; + int ret, bit; lockdep_assert_held(&l_mg->free_lock); -retry_get: +retry: if (list_empty(&l_mg->free_list)) { pr_err("pblk: no free lines\n"); - goto out; + return NULL; } line = list_first_entry(&l_mg->free_list, struct pblk_line, list); @@ -1115,16 +1197,22 @@ retry_get: list_add_tail(&line->list, &l_mg->bad_list); pr_debug("pblk: line %d is bad\n", line->id); - goto retry_get; + goto retry; } - if (pblk_line_prepare(pblk, line)) { - pr_err("pblk: failed to prepare line %d\n", line->id); - list_add(&line->list, &l_mg->free_list); - return NULL; + ret = pblk_line_prepare(pblk, line); + if (ret) { + if (ret == -EAGAIN) { + list_add(&line->list, &l_mg->corrupt_list); + goto retry; + } else { + pr_err("pblk: failed to prepare line %d\n", line->id); + list_add(&line->list, &l_mg->free_list); + l_mg->nr_free_lines++; + return NULL; + } } -out: return line; } @@ -1134,6 +1222,7 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk, struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *retry_line; +retry: spin_lock(&l_mg->free_lock); retry_line = pblk_line_get(pblk); if (!retry_line) { @@ -1150,23 +1239,25 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk, l_mg->data_line = retry_line; spin_unlock(&l_mg->free_lock); - if (pblk_line_erase(pblk, retry_line)) { - spin_lock(&l_mg->free_lock); - l_mg->data_line = NULL; - spin_unlock(&l_mg->free_lock); - return NULL; - } - pblk_rl_free_lines_dec(&pblk->rl, retry_line); + if (pblk_line_erase(pblk, retry_line)) + goto retry; + return retry_line; } +static void pblk_set_space_limit(struct pblk *pblk) +{ + struct pblk_rl *rl = &pblk->rl; + + atomic_set(&rl->rb_space, 0); +} + struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line; - int meta_line; int is_next = 0; 
spin_lock(&l_mg->free_lock); @@ -1180,30 +1271,37 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) line->type = PBLK_LINETYPE_DATA; l_mg->data_line = line; - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - set_bit(meta_line, &l_mg->meta_bitmap); - line->smeta = l_mg->sline_meta[meta_line].meta; - line->emeta = l_mg->eline_meta[meta_line].meta; - line->meta_line = meta_line; + pblk_line_setup_metadata(line, l_mg, &pblk->lm); /* Allocate next line for preparation */ l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { + if (!l_mg->data_next) { + /* If we cannot get a new line, we need to stop the pipeline. + * Only allow as many writes in as we can store safely and then + * fail gracefully + */ + pblk_set_space_limit(pblk); + + l_mg->data_next = NULL; + } else { l_mg->data_next->seq_nr = l_mg->d_seq_nr++; l_mg->data_next->type = PBLK_LINETYPE_DATA; is_next = 1; } spin_unlock(&l_mg->free_lock); + if (pblk_line_erase(pblk, line)) { + line = pblk_line_retry(pblk, line); + if (!line) + return NULL; + } + pblk_rl_free_lines_dec(&pblk->rl, line); if (is_next) pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - if (pblk_line_erase(pblk, line)) - return NULL; - retry_setup: - if (!pblk_line_set_metadata(pblk, line, NULL)) { + if (!pblk_line_init_metadata(pblk, line, NULL)) { line = pblk_line_retry(pblk, line); if (!line) return NULL; @@ -1222,69 +1320,89 @@ retry_setup: return line; } -struct pblk_line *pblk_line_replace_data(struct pblk *pblk) +static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) +{ + lockdep_assert_held(&pblk->l_mg.free_lock); + + pblk_set_space_limit(pblk); + pblk->state = PBLK_STATE_STOPPING; +} + +void pblk_pipeline_stop(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int ret; + + spin_lock(&l_mg->free_lock); + if (pblk->state == PBLK_STATE_RECOVERING || + pblk->state == PBLK_STATE_STOPPED) { + spin_unlock(&l_mg->free_lock); + return; + } + pblk->state = 
PBLK_STATE_RECOVERING; + spin_unlock(&l_mg->free_lock); + + pblk_flush_writer(pblk); + pblk_wait_for_meta(pblk); + + ret = pblk_recov_pad(pblk); + if (ret) { + pr_err("pblk: could not close data on teardown(%d)\n", ret); + return; + } + + flush_workqueue(pblk->bb_wq); + pblk_line_close_meta_sync(pblk); + + spin_lock(&l_mg->free_lock); + pblk->state = PBLK_STATE_STOPPED; + l_mg->data_line = NULL; + l_mg->data_next = NULL; + spin_unlock(&l_mg->free_lock); +} + +void pblk_line_replace_data(struct pblk *pblk) { - struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *cur, *new; unsigned int left_seblks; - int meta_line; int is_next = 0; cur = l_mg->data_line; new = l_mg->data_next; if (!new) - return NULL; + return; l_mg->data_line = new; -retry_line: + spin_lock(&l_mg->free_lock); + if (pblk->state != PBLK_STATE_RUNNING) { + l_mg->data_line = NULL; + l_mg->data_next = NULL; + spin_unlock(&l_mg->free_lock); + return; + } + + pblk_line_setup_metadata(new, l_mg, &pblk->lm); + spin_unlock(&l_mg->free_lock); + +retry_erase: left_seblks = atomic_read(&new->left_seblks); if (left_seblks) { /* If line is not fully erased, erase it */ if (atomic_read(&new->left_eblks)) { if (pblk_line_erase(pblk, new)) - return NULL; + return; } else { io_schedule(); } - goto retry_line; + goto retry_erase; } - spin_lock(&l_mg->free_lock); - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; - } - -retry_meta: - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - if (meta_line == PBLK_DATA_LINES) { - spin_unlock(&l_mg->free_lock); - io_schedule(); - spin_lock(&l_mg->free_lock); - goto retry_meta; - } - - set_bit(meta_line, &l_mg->meta_bitmap); - new->smeta = l_mg->sline_meta[meta_line].meta; - new->emeta = l_mg->eline_meta[meta_line].meta; - new->meta_line = meta_line; 
- - memset(new->smeta, 0, lm->smeta_len); - memset(new->emeta, 0, lm->emeta_len); - spin_unlock(&l_mg->free_lock); - - if (is_next) - pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); - retry_setup: - if (!pblk_line_set_metadata(pblk, new, cur)) { + if (!pblk_line_init_metadata(pblk, new, cur)) { new = pblk_line_retry(pblk, new); if (!new) - return NULL; + return; goto retry_setup; } @@ -1292,12 +1410,30 @@ retry_setup: if (!pblk_line_init_bb(pblk, new, 1)) { new = pblk_line_retry(pblk, new); if (!new) - return NULL; + return; goto retry_setup; } - return new; + /* Allocate next line for preparation */ + spin_lock(&l_mg->free_lock); + l_mg->data_next = pblk_line_get(pblk); + if (!l_mg->data_next) { + /* If we cannot get a new line, we need to stop the pipeline. + * Only allow as many writes in as we can store safely and then + * fail gracefully + */ + pblk_stop_writes(pblk, new); + l_mg->data_next = NULL; + } else { + l_mg->data_next->seq_nr = l_mg->d_seq_nr++; + l_mg->data_next->type = PBLK_LINETYPE_DATA; + is_next = 1; + } + spin_unlock(&l_mg->free_lock); + + if (is_next) + pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); } void pblk_line_free(struct pblk *pblk, struct pblk_line *line) @@ -1307,6 +1443,8 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) if (line->invalid_bitmap) mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + *line->vsc = cpu_to_le32(EMPTY_ENTRY); + line->map_bitmap = NULL; line->invalid_bitmap = NULL; line->smeta = NULL; @@ -1339,8 +1477,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->r_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_r_rq_size); + rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); + memset(rqd, 0, pblk_g_rq_size); pblk_setup_e_rq(pblk, rqd, ppa); @@ -1368,7 +1506,8 @@ struct pblk_line *pblk_line_get_data(struct pblk *pblk) return pblk->l_mg.data_line; } -struct pblk_line *pblk_line_get_data_next(struct pblk *pblk) +/* For 
now, always erase next line */ +struct pblk_line *pblk_line_get_erase(struct pblk *pblk) { return pblk->l_mg.data_next; } @@ -1378,18 +1517,58 @@ int pblk_line_is_full(struct pblk_line *line) return (line->left_msecs == 0); } +void pblk_line_close_meta_sync(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line, *tline; + LIST_HEAD(list); + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return; + } + + list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); + spin_unlock(&l_mg->close_lock); + + list_for_each_entry_safe(line, tline, &list, list) { + struct pblk_emeta *emeta = line->emeta; + + while (emeta->mem < lm->emeta_len[0]) { + int ret; + + ret = pblk_submit_meta_io(pblk, line); + if (ret) { + pr_err("pblk: sync meta line %d failed (%d)\n", + line->id, ret); + return; + } + } + } + + pblk_wait_for_meta(pblk); + flush_workqueue(pblk->close_wq); +} + +static void pblk_line_should_sync_meta(struct pblk *pblk) +{ + if (pblk_rl_is_limit(&pblk->rl)) + pblk_line_close_meta_sync(pblk); +} + void pblk_line_close(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct list_head *move_list; - line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta)); - - if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE)) - pr_err("pblk: line %d close I/O failed\n", line->id); +#ifdef CONFIG_NVM_DEBUG + struct pblk_line_meta *lm = &pblk->lm; - WARN(!bitmap_full(line->map_bitmap, line->sec_in_line), + WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), "pblk: corrupt closed line %d\n", line->id); +#endif spin_lock(&l_mg->free_lock); WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); @@ -1410,6 +1589,31 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); + + pblk_gc_should_kick(pblk); +} + 
+void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + + /* No need for exact vsc value; avoid a big line lock and take aprox. */ + memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); + memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); + + emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); + emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); + + spin_lock(&l_mg->close_lock); + spin_lock(&line->lock); + list_add_tail(&line->list, &l_mg->emeta_list); + spin_unlock(&line->lock); + spin_unlock(&l_mg->close_lock); + + pblk_line_should_sync_meta(pblk); } void pblk_line_close_ws(struct work_struct *work) @@ -1449,7 +1653,8 @@ void pblk_line_mark_bb(struct work_struct *work) } void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *)) + void (*work)(struct work_struct *), + struct workqueue_struct *wq) { struct pblk_line_ws *line_ws; @@ -1462,7 +1667,7 @@ void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, line_ws->priv = priv; INIT_WORK(&line_ws->ws, work); - queue_work(pblk->kw_wq, &line_ws->ws); + queue_work(wq, &line_ws->ws); } void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, @@ -1471,7 +1676,7 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_lun *rlun; - int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun; + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); int ret; /* @@ -1488,10 +1693,10 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, /* If the LUN has been locked for this same request, do no attempt to * lock it again */ - if 
(test_and_set_bit(lun_id, lun_bitmap)) + if (test_and_set_bit(pos, lun_bitmap)) return; - rlun = &pblk->luns[lun_id]; + rlun = &pblk->luns[pos]; ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000)); if (ret) { switch (ret) { diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index eaf479c6b63c..6090d28f7995 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -20,8 +20,7 @@ static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) { - kfree(gc_rq->data); - kfree(gc_rq->lba_list); + vfree(gc_rq->data); kfree(gc_rq); } @@ -37,10 +36,8 @@ static int pblk_gc_write(struct pblk *pblk) return 1; } - list_for_each_entry_safe(gc_rq, tgc_rq, &gc->w_list, list) { - list_move_tail(&gc_rq->list, &w_list); - gc->w_entries--; - } + list_cut_position(&w_list, &gc->w_list, gc->w_list.prev); + gc->w_entries = 0; spin_unlock(&gc->w_lock); list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { @@ -48,9 +45,8 @@ static int pblk_gc_write(struct pblk *pblk) gc_rq->nr_secs, gc_rq->secs_to_gc, gc_rq->line, PBLK_IOTYPE_GC); - kref_put(&gc_rq->line->ref, pblk_line_put); - list_del(&gc_rq->list); + kref_put(&gc_rq->line->ref, pblk_line_put); pblk_gc_free_gc_rq(gc_rq); } @@ -66,52 +62,41 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc) * Responsible for managing all memory related to a gc request. 
Also in case of * failure */ -static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line, - u64 *lba_list, unsigned int nr_secs) +static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_gc *gc = &pblk->gc; - struct pblk_gc_rq *gc_rq; + struct pblk_line *line = gc_rq->line; void *data; unsigned int secs_to_gc; - int ret = NVM_IO_OK; + int ret = 0; - data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL); + data = vmalloc(gc_rq->nr_secs * geo->sec_size); if (!data) { - ret = NVM_IO_ERR; - goto free_lba_list; + ret = -ENOMEM; + goto out; } /* Read from GC victim block */ - if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs, + if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, &secs_to_gc, line)) { - ret = NVM_IO_ERR; + ret = -EFAULT; goto free_data; } if (!secs_to_gc) - goto free_data; - - gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); - if (!gc_rq) { - ret = NVM_IO_ERR; - goto free_data; - } + goto free_rq; - gc_rq->line = line; gc_rq->data = data; - gc_rq->lba_list = lba_list; - gc_rq->nr_secs = nr_secs; gc_rq->secs_to_gc = secs_to_gc; - kref_get(&line->ref); - retry: spin_lock(&gc->w_lock); - if (gc->w_entries > 256) { + if (gc->w_entries >= PBLK_GC_W_QD) { spin_unlock(&gc->w_lock); - usleep_range(256, 1024); + pblk_gc_writer_kick(&pblk->gc); + usleep_range(128, 256); goto retry; } gc->w_entries++; @@ -120,13 +105,14 @@ retry: pblk_gc_writer_kick(&pblk->gc); - return NVM_IO_OK; + return 0; +free_rq: + kfree(gc_rq); free_data: - kfree(data); -free_lba_list: - kfree(lba_list); - + vfree(data); +out: + kref_put(&line->ref, pblk_line_put); return ret; } @@ -150,140 +136,206 @@ static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) static void pblk_gc_line_ws(struct work_struct *work) { + struct pblk_line_ws *line_rq_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = 
line_rq_ws->pblk; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line = line_rq_ws->line; + struct pblk_gc_rq *gc_rq = line_rq_ws->priv; + + up(&gc->gc_sem); + + if (pblk_gc_move_valid_secs(pblk, gc_rq)) { + pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n", + line->id, *line->vsc, + gc_rq->nr_secs); + } + + mempool_free(line_rq_ws, pblk->line_ws_pool); +} + +static void pblk_gc_line_prepare_ws(struct work_struct *work) +{ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, ws); struct pblk *pblk = line_ws->pblk; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line = line_ws->line; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; - __le64 *lba_list = line_ws->priv; - u64 *gc_list; - int sec_left; - int nr_ppas, bit; - int put_line = 1; + struct pblk_gc *gc = &pblk->gc; + struct line_emeta *emeta_buf; + struct pblk_line_ws *line_rq_ws; + struct pblk_gc_rq *gc_rq; + __le64 *lba_list; + int sec_left, nr_secs, bit; + int ret; - pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); + emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, + GFP_KERNEL); + if (!emeta_buf) { + pr_err("pblk: cannot use GC emeta\n"); + return; + } - spin_lock(&line->lock); - sec_left = line->vsc; - if (!sec_left) { - /* Lines are erased before being used (l_mg->data_/log_next) */ - spin_unlock(&line->lock); - goto out; + ret = pblk_line_read_emeta(pblk, line, emeta_buf); + if (ret) { + pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret); + goto fail_free_emeta; + } + + /* If this read fails, it means that emeta is corrupted. For now, leave + * the line untouched. TODO: Implement a recovery routine that scans and + * moves all sectors on the line. 
+ */ + lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); + if (!lba_list) { + pr_err("pblk: could not interpret emeta (line %d)\n", line->id); + goto fail_free_emeta; } - spin_unlock(&line->lock); + sec_left = pblk_line_vsc(line); if (sec_left < 0) { pr_err("pblk: corrupted GC line (%d)\n", line->id); - put_line = 0; - pblk_put_line_back(pblk, line); - goto out; + goto fail_free_emeta; } bit = -1; next_rq: - gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL); - if (!gc_list) { - put_line = 0; - pblk_put_line_back(pblk, line); - goto out; - } + gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); + if (!gc_rq) + goto fail_free_emeta; - nr_ppas = 0; + nr_secs = 0; do { bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, bit + 1); if (bit > line->emeta_ssec) break; - gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]); - } while (nr_ppas < pblk->max_write_pgs); + gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); + } while (nr_secs < pblk->max_write_pgs); - if (unlikely(!nr_ppas)) { - kfree(gc_list); + if (unlikely(!nr_secs)) { + kfree(gc_rq); goto out; } - if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) { - pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n", - line->id, line->vsc, - nr_ppas, nr_ppas); - put_line = 0; - pblk_put_line_back(pblk, line); - goto out; - } + gc_rq->nr_secs = nr_secs; + gc_rq->line = line; + + line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + if (!line_rq_ws) + goto fail_free_gc_rq; - sec_left -= nr_ppas; + line_rq_ws->pblk = pblk; + line_rq_ws->line = line; + line_rq_ws->priv = gc_rq; + + down(&gc->gc_sem); + kref_get(&line->ref); + + INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); + queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); + + sec_left -= nr_secs; if (sec_left > 0) goto next_rq; out: - pblk_mfree(line->emeta, l_mg->emeta_alloc_type); + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); mempool_free(line_ws, pblk->line_ws_pool); - 
atomic_dec(&pblk->gc.inflight_gc); - if (put_line) - kref_put(&line->ref, pblk_line_put); + + kref_put(&line->ref, pblk_line_put); + atomic_dec(&gc->inflight_gc); + + return; + +fail_free_gc_rq: + kfree(gc_rq); +fail_free_emeta: + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + pblk_put_line_back(pblk, line); + kref_put(&line->ref, pblk_line_put); + mempool_free(line_ws, pblk->line_ws_pool); + atomic_dec(&gc->inflight_gc); + + pr_err("pblk: Failed to GC line %d\n", line->id); } static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) { - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; + struct pblk_gc *gc = &pblk->gc; struct pblk_line_ws *line_ws; - __le64 *lba_list; - int ret; - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); - line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type, - GFP_KERNEL); - if (!line->emeta) { - pr_err("pblk: cannot use GC emeta\n"); - goto fail_free_ws; - } - - ret = pblk_line_read_emeta(pblk, line); - if (ret) { - pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret); - goto fail_free_emeta; - } + pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); - /* If this read fails, it means that emeta is corrupted. For now, leave - * the line untouched. TODO: Implement a recovery routine that scans and - * moves all sectors on the line. 
- */ - lba_list = pblk_recov_get_lba_list(pblk, line->emeta); - if (!lba_list) { - pr_err("pblk: could not interpret emeta (line %d)\n", line->id); - goto fail_free_emeta; - } + line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + if (!line_ws) + return -ENOMEM; line_ws->pblk = pblk; line_ws->line = line; - line_ws->priv = lba_list; - INIT_WORK(&line_ws->ws, pblk_gc_line_ws); - queue_work(pblk->gc.gc_reader_wq, &line_ws->ws); + INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); + queue_work(gc->gc_reader_wq, &line_ws->ws); return 0; +} -fail_free_emeta: - pblk_mfree(line->emeta, l_mg->emeta_alloc_type); -fail_free_ws: - mempool_free(line_ws, pblk->line_ws_pool); - pblk_put_line_back(pblk, line); +static int pblk_gc_read(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + + spin_lock(&gc->r_lock); + if (list_empty(&gc->r_list)) { + spin_unlock(&gc->r_lock); + return 1; + } + + line = list_first_entry(&gc->r_list, struct pblk_line, list); + list_del(&line->list); + spin_unlock(&gc->r_lock); + + pblk_gc_kick(pblk); - return 1; + if (pblk_gc_line(pblk, line)) + pr_err("pblk: failed to GC line %d\n", line->id); + + return 0; } -static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list) +static void pblk_gc_reader_kick(struct pblk_gc *gc) { - struct pblk_line *line, *tline; + wake_up_process(gc->gc_reader_ts); +} - list_for_each_entry_safe(line, tline, gc_list, list) { - if (pblk_gc_line(pblk, line)) - pr_err("pblk: failed to GC line %d\n", line->id); - list_del(&line->list); +static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, + struct list_head *group_list) +{ + struct pblk_line *line, *victim; + int line_vsc, victim_vsc; + + victim = list_first_entry(group_list, struct pblk_line, list); + list_for_each_entry(line, group_list, list) { + line_vsc = le32_to_cpu(*line->vsc); + victim_vsc = le32_to_cpu(*victim->vsc); + if (line_vsc < victim_vsc) + victim = line; } + + return victim; +} + +static bool 
pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) +{ + unsigned int nr_blocks_free, nr_blocks_need; + + nr_blocks_need = pblk_rl_high_thrs(rl); + nr_blocks_free = pblk_rl_nr_free_blks(rl); + + /* This is not critical, no need to take lock here */ + return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); } /* @@ -296,71 +348,83 @@ static void pblk_gc_run(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line, *tline; - unsigned int nr_blocks_free, nr_blocks_need; + struct pblk_line *line; struct list_head *group_list; - int run_gc, gc_group = 0; - int prev_gc = 0; - int inflight_gc = atomic_read(&gc->inflight_gc); - LIST_HEAD(gc_list); + bool run_gc; + int inflight_gc, gc_group = 0, prev_group = 0; + + do { + spin_lock(&l_mg->gc_lock); + if (list_empty(&l_mg->gc_full_list)) { + spin_unlock(&l_mg->gc_lock); + break; + } + + line = list_first_entry(&l_mg->gc_full_list, + struct pblk_line, list); - spin_lock(&l_mg->gc_lock); - list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) { spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_CLOSED); line->state = PBLK_LINESTATE_GC; spin_unlock(&line->lock); list_del(&line->list); + spin_unlock(&l_mg->gc_lock); + kref_put(&line->ref, pblk_line_put); - } - spin_unlock(&l_mg->gc_lock); + } while (1); - nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl); - nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl); - run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced); + run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); + if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) + return; next_gc_group: group_list = l_mg->gc_lists[gc_group++]; - spin_lock(&l_mg->gc_lock); - while (run_gc && !list_empty(group_list)) { - /* No need to queue up more GC lines than we can handle */ - if (!run_gc || inflight_gc > gc->gc_jobs_active) { + + do { + spin_lock(&l_mg->gc_lock); + if (list_empty(group_list)) { spin_unlock(&l_mg->gc_lock); - 
pblk_gc_lines(pblk, &gc_list); - return; + break; } - line = list_first_entry(group_list, struct pblk_line, list); - nr_blocks_free += atomic_read(&line->blk_in_line); + line = pblk_gc_get_victim_line(pblk, group_list); spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_CLOSED); line->state = PBLK_LINESTATE_GC; - list_move_tail(&line->list, &gc_list); - atomic_inc(&gc->inflight_gc); - inflight_gc++; spin_unlock(&line->lock); - prev_gc = 1; - run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced); - } - spin_unlock(&l_mg->gc_lock); + list_del(&line->list); + spin_unlock(&l_mg->gc_lock); + + spin_lock(&gc->r_lock); + list_add_tail(&line->list, &gc->r_list); + spin_unlock(&gc->r_lock); - pblk_gc_lines(pblk, &gc_list); + inflight_gc = atomic_inc_return(&gc->inflight_gc); + pblk_gc_reader_kick(gc); - if (!prev_gc && pblk->rl.rb_state > gc_group && - gc_group < PBLK_NR_GC_LISTS) + prev_group = 1; + + /* No need to queue up more GC lines than we can handle */ + run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); + if (!run_gc || inflight_gc >= PBLK_GC_L_QD) + break; + } while (1); + + if (!prev_group && pblk->rl.rb_state > gc_group && + gc_group < PBLK_GC_NR_LISTS) goto next_gc_group; } - -static void pblk_gc_kick(struct pblk *pblk) +void pblk_gc_kick(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; wake_up_process(gc->gc_ts); pblk_gc_writer_kick(gc); + pblk_gc_reader_kick(gc); mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); } @@ -398,42 +462,34 @@ static int pblk_gc_writer_ts(void *data) return 0; } -static void pblk_gc_start(struct pblk *pblk) +static int pblk_gc_reader_ts(void *data) { - pblk->gc.gc_active = 1; + struct pblk *pblk = data; - pr_debug("pblk: gc start\n"); + while (!kthread_should_stop()) { + if (!pblk_gc_read(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; } -int pblk_gc_status(struct pblk *pblk) +static void pblk_gc_start(struct pblk *pblk) { - struct pblk_gc *gc = 
&pblk->gc; - int ret; - - spin_lock(&gc->lock); - ret = gc->gc_active; - spin_unlock(&gc->lock); - - return ret; + pblk->gc.gc_active = 1; + pr_debug("pblk: gc start\n"); } -static void __pblk_gc_should_start(struct pblk *pblk) +void pblk_gc_should_start(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - lockdep_assert_held(&gc->lock); - if (gc->gc_enabled && !gc->gc_active) pblk_gc_start(pblk); -} -void pblk_gc_should_start(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&gc->lock); - __pblk_gc_should_start(pblk); - spin_unlock(&gc->lock); + pblk_gc_kick(pblk); } /* @@ -442,10 +498,7 @@ void pblk_gc_should_start(struct pblk *pblk) */ static void pblk_gc_stop(struct pblk *pblk, int flush_wq) { - spin_lock(&pblk->gc.lock); pblk->gc.gc_active = 0; - spin_unlock(&pblk->gc.lock); - pr_debug("pblk: gc stop\n"); } @@ -468,20 +521,25 @@ void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, spin_unlock(&gc->lock); } -void pblk_gc_sysfs_force(struct pblk *pblk, int force) +int pblk_gc_sysfs_force(struct pblk *pblk, int force) { struct pblk_gc *gc = &pblk->gc; - int rsv = 0; + + if (force < 0 || force > 1) + return -EINVAL; spin_lock(&gc->lock); - if (force) { - gc->gc_enabled = 1; - rsv = 64; - } - pblk_rl_set_gc_rsc(&pblk->rl, rsv); gc->gc_forced = force; - __pblk_gc_should_start(pblk); + + if (force) + gc->gc_enabled = 1; + else + gc->gc_enabled = 0; spin_unlock(&gc->lock); + + pblk_gc_should_start(pblk); + + return 0; } int pblk_gc_init(struct pblk *pblk) @@ -503,30 +561,58 @@ int pblk_gc_init(struct pblk *pblk) goto fail_free_main_kthread; } + gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, + "pblk-gc-reader-ts"); + if (IS_ERR(gc->gc_reader_ts)) { + pr_err("pblk: could not allocate GC reader kthread\n"); + ret = PTR_ERR(gc->gc_reader_ts); + goto fail_free_writer_kthread; + } + setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk); mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); gc->gc_active 
= 0; gc->gc_forced = 0; gc->gc_enabled = 1; - gc->gc_jobs_active = 8; gc->w_entries = 0; atomic_set(&gc->inflight_gc, 0); - gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active); + /* Workqueue that reads valid sectors from a line and submit them to the + * GC writer to be recycled. + */ + gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); + if (!gc->gc_line_reader_wq) { + pr_err("pblk: could not allocate GC line reader workqueue\n"); + ret = -ENOMEM; + goto fail_free_reader_kthread; + } + + /* Workqueue that prepare lines for GC */ + gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!gc->gc_reader_wq) { pr_err("pblk: could not allocate GC reader workqueue\n"); ret = -ENOMEM; - goto fail_free_writer_kthread; + goto fail_free_reader_line_wq; } spin_lock_init(&gc->lock); spin_lock_init(&gc->w_lock); + spin_lock_init(&gc->r_lock); + + sema_init(&gc->gc_sem, 128); + INIT_LIST_HEAD(&gc->w_list); + INIT_LIST_HEAD(&gc->r_list); return 0; +fail_free_reader_line_wq: + destroy_workqueue(gc->gc_line_reader_wq); +fail_free_reader_kthread: + kthread_stop(gc->gc_reader_ts); fail_free_writer_kthread: kthread_stop(gc->gc_writer_ts); fail_free_main_kthread: @@ -540,6 +626,7 @@ void pblk_gc_exit(struct pblk *pblk) struct pblk_gc *gc = &pblk->gc; flush_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); del_timer(&gc->gc_timer); pblk_gc_stop(pblk, 1); @@ -547,9 +634,15 @@ void pblk_gc_exit(struct pblk *pblk) if (gc->gc_ts) kthread_stop(gc->gc_ts); - if (pblk->gc.gc_reader_wq) - destroy_workqueue(pblk->gc.gc_reader_wq); + if (gc->gc_reader_wq) + destroy_workqueue(gc->gc_reader_wq); + + if (gc->gc_line_reader_wq) + destroy_workqueue(gc->gc_line_reader_wq); if (gc->gc_writer_ts) kthread_stop(gc->gc_writer_ts); + + if (gc->gc_reader_ts) + kthread_stop(gc->gc_reader_ts); } diff --git 
a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ae8cd6d5af8b..1b0f61233c21 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -20,9 +20,10 @@ #include "pblk.h" -static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache, - *pblk_w_rq_cache, *pblk_line_meta_cache; +static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, + *pblk_w_rq_cache, *pblk_line_meta_cache; static DECLARE_RWSEM(pblk_lock); +struct bio_set *pblk_bio_set; static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, struct bio *bio) @@ -33,7 +34,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * constraint. Writes can be of arbitrary size. */ if (bio_data_dir(bio) == READ) { - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); ret = pblk_submit_read(pblk, bio); if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED)) bio_put(bio); @@ -46,7 +47,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * available for user I/O. 
*/ if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); } @@ -199,9 +200,9 @@ static int pblk_init_global_caches(struct pblk *pblk) return -ENOMEM; } - pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size, + pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, 0, 0, NULL); - if (!pblk_r_rq_cache) { + if (!pblk_g_rq_cache) { kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); up_write(&pblk_lock); @@ -213,7 +214,7 @@ static int pblk_init_global_caches(struct pblk *pblk) if (!pblk_w_rq_cache) { kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_g_rq_cache); up_write(&pblk_lock); return -ENOMEM; } @@ -225,7 +226,7 @@ static int pblk_init_global_caches(struct pblk *pblk) if (!pblk_line_meta_cache) { kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); up_write(&pblk_lock); return -ENOMEM; @@ -239,27 +240,10 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - int max_write_ppas; - int mod; - pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); - max_write_ppas = pblk->min_write_pgs * geo->nr_luns; - pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? 
- max_write_ppas : nvm_max_phys_sects(dev); pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * geo->nr_planes * geo->nr_luns; - if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { - pr_err("pblk: cannot support device max_phys_sect\n"); - return -EINVAL; - } - - div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); - if (mod) { - pr_err("pblk: bad configuration of sectors/pages\n"); - return -EINVAL; - } - if (pblk_init_global_caches(pblk)) return -ENOMEM; @@ -267,7 +251,7 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->page_pool) return -ENOMEM; - pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns, + pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, pblk_blk_ws_cache); if (!pblk->line_ws_pool) goto free_page_pool; @@ -276,41 +260,51 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->rec_pool) goto free_blk_ws_pool; - pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache); - if (!pblk->r_rq_pool) + pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, + pblk_g_rq_cache); + if (!pblk->g_rq_pool) goto free_rec_pool; - pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache); + pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, + pblk_w_rq_cache); if (!pblk->w_rq_pool) - goto free_r_rq_pool; + goto free_g_rq_pool; pblk->line_meta_pool = - mempool_create_slab_pool(16, pblk_line_meta_cache); + mempool_create_slab_pool(PBLK_META_POOL_SIZE, + pblk_line_meta_cache); if (!pblk->line_meta_pool) goto free_w_rq_pool; - pblk->kw_wq = alloc_workqueue("pblk-aux-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!pblk->kw_wq) + pblk->close_wq = alloc_workqueue("pblk-close-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); + if (!pblk->close_wq) goto free_line_meta_pool; + pblk->bb_wq = alloc_workqueue("pblk-bb-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->bb_wq) + goto free_close_wq; + if (pblk_set_ppaf(pblk)) - goto free_kw_wq; + goto free_bb_wq; if (pblk_rwb_init(pblk)) - goto 
free_kw_wq; + goto free_bb_wq; INIT_LIST_HEAD(&pblk->compl_list); return 0; -free_kw_wq: - destroy_workqueue(pblk->kw_wq); +free_bb_wq: + destroy_workqueue(pblk->bb_wq); +free_close_wq: + destroy_workqueue(pblk->close_wq); free_line_meta_pool: mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); -free_r_rq_pool: - mempool_destroy(pblk->r_rq_pool); +free_g_rq_pool: + mempool_destroy(pblk->g_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); free_blk_ws_pool: @@ -322,19 +316,22 @@ free_page_pool: static void pblk_core_free(struct pblk *pblk) { - if (pblk->kw_wq) - destroy_workqueue(pblk->kw_wq); + if (pblk->close_wq) + destroy_workqueue(pblk->close_wq); + + if (pblk->bb_wq) + destroy_workqueue(pblk->bb_wq); mempool_destroy(pblk->page_pool); mempool_destroy(pblk->line_ws_pool); mempool_destroy(pblk->rec_pool); - mempool_destroy(pblk->r_rq_pool); + mempool_destroy(pblk->g_rq_pool); mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); kmem_cache_destroy(pblk_blk_ws_cache); kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_r_rq_cache); + kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); kmem_cache_destroy(pblk_line_meta_cache); } @@ -344,6 +341,12 @@ static void pblk_luns_free(struct pblk *pblk) kfree(pblk->luns); } +static void pblk_free_line_bitmaps(struct pblk_line *line) +{ + kfree(line->blk_bitmap); + kfree(line->erase_bitmap); +} + static void pblk_lines_free(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -355,8 +358,7 @@ static void pblk_lines_free(struct pblk *pblk) line = &pblk->lines[i]; pblk_line_free(pblk, line); - kfree(line->blk_bitmap); - kfree(line->erase_bitmap); + pblk_free_line_bitmaps(line); } spin_unlock(&l_mg->free_lock); } @@ -368,11 +370,15 @@ static void pblk_line_meta_free(struct pblk *pblk) kfree(l_mg->bb_template); kfree(l_mg->bb_aux); + kfree(l_mg->vsc_list); + spin_lock(&l_mg->free_lock); for (i = 0; i < PBLK_DATA_LINES; 
i++) { - pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type); - pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type); + kfree(l_mg->sline_meta[i]); + pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); + kfree(l_mg->eline_meta[i]); } + spin_unlock(&l_mg->free_lock); kfree(pblk->lines); } @@ -411,13 +417,31 @@ out: return ret; } -static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line) +static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line, + int blk_per_line) { - struct pblk_line_meta *lm = &pblk->lm; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; struct pblk_lun *rlun; int bb_cnt = 0; int i; + for (i = 0; i < blk_per_line; i++) { + rlun = &pblk->luns[i]; + if (rlun->bb_list[line->id] == NVM_BLK_T_FREE) + continue; + + set_bit(pblk_ppa_to_pos(geo, rlun->bppa), line->blk_bitmap); + bb_cnt++; + } + + return bb_cnt; +} + +static int pblk_alloc_line_bitmaps(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); if (!line->blk_bitmap) return -ENOMEM; @@ -428,16 +452,7 @@ static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line) return -ENOMEM; } - for (i = 0; i < lm->blk_per_line; i++) { - rlun = &pblk->luns[i]; - if (rlun->bb_list[line->id] == NVM_BLK_T_FREE) - continue; - - set_bit(i, line->blk_bitmap); - bb_cnt++; - } - - return bb_cnt; + return 0; } static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns) @@ -505,12 +520,32 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } /* See comment over struct line_emeta definition */ -static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm) +static unsigned int calc_emeta_len(struct pblk *pblk) { - return (sizeof(struct line_emeta) + - ((lm->sec_per_line - lm->emeta_sec) * sizeof(u64)) + - (pblk->l_mg.nr_lines * sizeof(u32)) + - lm->blk_bitmap_len); + struct pblk_line_meta *lm = &pblk->lm; + struct 
pblk_line_mgmt *l_mg = &pblk->l_mg; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + /* Round to sector size so that lba_list starts on its own sector */ + lm->emeta_sec[1] = DIV_ROUND_UP( + sizeof(struct line_emeta) + lm->blk_bitmap_len, + geo->sec_size); + lm->emeta_len[1] = lm->emeta_sec[1] * geo->sec_size; + + /* Round to sector size so that vsc_list starts on its own sector */ + lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; + lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), + geo->sec_size); + lm->emeta_len[2] = lm->emeta_sec[2] * geo->sec_size; + + lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), + geo->sec_size); + lm->emeta_len[3] = lm->emeta_sec[3] * geo->sec_size; + + lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); + + return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); } static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) @@ -534,6 +569,78 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) atomic_set(&pblk->rl.free_blocks, nr_free_blks); } +static int pblk_lines_alloc_metadata(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + int i; + + /* smeta is always small enough to fit on a kmalloc memory allocation, + * emeta depends on the number of LUNs allocated to the pblk instance + */ + for (i = 0; i < PBLK_DATA_LINES; i++) { + l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); + if (!l_mg->sline_meta[i]) + goto fail_free_smeta; + } + + /* emeta allocates three different buffers for managing metadata with + * in-memory and in-media layouts + */ + for (i = 0; i < PBLK_DATA_LINES; i++) { + struct pblk_emeta *emeta; + + emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL); + if (!emeta) + goto fail_free_emeta; + + if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) { + l_mg->emeta_alloc_type = PBLK_VMALLOC_META; + + emeta->buf = vmalloc(lm->emeta_len[0]); + if (!emeta->buf) { + 
kfree(emeta); + goto fail_free_emeta; + } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; + } else { + l_mg->emeta_alloc_type = PBLK_KMALLOC_META; + + emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL); + if (!emeta->buf) { + kfree(emeta); + goto fail_free_emeta; + } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; + } + } + + l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); + if (!l_mg->vsc_list) + goto fail_free_emeta; + + for (i = 0; i < l_mg->nr_lines; i++) + l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); + + return 0; + +fail_free_emeta: + while (--i >= 0) { + vfree(l_mg->eline_meta[i]->buf); + kfree(l_mg->eline_meta[i]); + } + +fail_free_smeta: + for (i = 0; i < PBLK_DATA_LINES; i++) + kfree(l_mg->sline_meta[i]); + + return -ENOMEM; +} + static int pblk_lines_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; @@ -542,10 +649,32 @@ static int pblk_lines_init(struct pblk *pblk) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line *line; unsigned int smeta_len, emeta_len; - long nr_bad_blks, nr_meta_blks, nr_free_blks; - int bb_distance; - int i; - int ret; + long nr_bad_blks, nr_free_blks; + int bb_distance, max_write_ppas, mod; + int i, ret; + + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->nr_luns; + pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ? 
+ max_write_ppas : nvm_max_phys_sects(dev); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: cannot support device max_phys_sect\n"); + return -EINVAL; + } + + div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod); + if (mod) { + pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } + + l_mg->nr_lines = geo->blks_per_lun; + l_mg->log_line = l_mg->data_line = NULL; + l_mg->l_seq_nr = l_mg->d_seq_nr = 0; + l_mg->nr_free_lines = 0; + bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); lm->sec_per_line = geo->sec_per_blk * geo->nr_luns; lm->blk_per_line = geo->nr_luns; @@ -554,20 +683,17 @@ static int pblk_lines_init(struct pblk *pblk) lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->high_thrs = lm->sec_per_line / 2; lm->mid_thrs = lm->sec_per_line / 4; + lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct * line_smeta definition */ - lm->smeta_len = sizeof(struct line_smeta) + - PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len; - i = 1; add_smeta_page: lm->smeta_sec = i * geo->sec_per_pl; lm->smeta_len = lm->smeta_sec * geo->sec_size; - smeta_len = sizeof(struct line_smeta) + - PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len; + smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; if (smeta_len > lm->smeta_len) { i++; goto add_smeta_page; @@ -578,66 +704,28 @@ add_smeta_page: */ i = 1; add_emeta_page: - lm->emeta_sec = i * geo->sec_per_pl; - lm->emeta_len = lm->emeta_sec * geo->sec_size; + lm->emeta_sec[0] = i * geo->sec_per_pl; + lm->emeta_len[0] = lm->emeta_sec[0] * geo->sec_size; - emeta_len = calc_emeta_len(pblk, lm); - if (emeta_len > lm->emeta_len) { + emeta_len = calc_emeta_len(pblk); + if (emeta_len > lm->emeta_len[0]) { i++; goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns - i; - - nr_meta_blks = (lm->smeta_sec + lm->emeta_sec + - (geo->sec_per_blk / 2)) / 
geo->sec_per_blk; - lm->min_blk_line = nr_meta_blks + 1; - - l_mg->nr_lines = geo->blks_per_lun; - l_mg->log_line = l_mg->data_line = NULL; - l_mg->l_seq_nr = l_mg->d_seq_nr = 0; - l_mg->nr_free_lines = 0; - bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - /* smeta is always small enough to fit on a kmalloc memory allocation, - * emeta depends on the number of LUNs allocated to the pblk instance - */ - l_mg->smeta_alloc_type = PBLK_KMALLOC_META; - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL); - if (!l_mg->sline_meta[i].meta) - while (--i >= 0) { - kfree(l_mg->sline_meta[i].meta); - ret = -ENOMEM; - goto fail; - } + lm->emeta_bb = geo->nr_luns - i; + lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], + geo->sec_per_blk); + if (lm->min_blk_line > lm->blk_per_line) { + pr_err("pblk: config. not supported. Min. LUN in line:%d\n", + lm->blk_per_line); + ret = -EINVAL; + goto fail; } - if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE) { - l_mg->emeta_alloc_type = PBLK_VMALLOC_META; - - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len); - if (!l_mg->eline_meta[i].meta) - while (--i >= 0) { - vfree(l_mg->eline_meta[i].meta); - ret = -ENOMEM; - goto fail; - } - } - } else { - l_mg->emeta_alloc_type = PBLK_KMALLOC_META; - - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->eline_meta[i].meta = - kmalloc(lm->emeta_len, GFP_KERNEL); - if (!l_mg->eline_meta[i].meta) - while (--i >= 0) { - kfree(l_mg->eline_meta[i].meta); - ret = -ENOMEM; - goto fail; - } - } - } + ret = pblk_lines_alloc_metadata(pblk); + if (ret) + goto fail; l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); if (!l_mg->bb_template) { @@ -664,11 +752,14 @@ add_emeta_page: INIT_LIST_HEAD(&l_mg->gc_low_list); INIT_LIST_HEAD(&l_mg->gc_empty_list); + INIT_LIST_HEAD(&l_mg->emeta_list); + l_mg->gc_lists[0] = &l_mg->gc_high_list; l_mg->gc_lists[1] = &l_mg->gc_mid_list; l_mg->gc_lists[2] = 
&l_mg->gc_low_list; spin_lock_init(&l_mg->free_lock); + spin_lock_init(&l_mg->close_lock); spin_lock_init(&l_mg->gc_lock); pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), @@ -689,10 +780,16 @@ add_emeta_page: line->type = PBLK_LINETYPE_FREE; line->state = PBLK_LINESTATE_FREE; line->gc_group = PBLK_LINEGC_NONE; + line->vsc = &l_mg->vsc_list[i]; spin_lock_init(&line->lock); - nr_bad_blks = pblk_bb_line(pblk, line); + ret = pblk_alloc_line_bitmaps(pblk, line); + if (ret) + goto fail_free_lines; + + nr_bad_blks = pblk_bb_line(pblk, line, lm->blk_per_line); if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) { + pblk_free_line_bitmaps(line); ret = -EINVAL; goto fail_free_lines; } @@ -713,24 +810,20 @@ add_emeta_page: pblk_set_provision(pblk, nr_free_blks); - sema_init(&pblk->erase_sem, 1); - /* Cleanup per-LUN bad block lists - managed within lines on run-time */ for (i = 0; i < geo->nr_luns; i++) kfree(pblk->luns[i].bb_list); return 0; fail_free_lines: - kfree(pblk->lines); + while (--i >= 0) + pblk_free_line_bitmaps(&pblk->lines[i]); fail_free_bb_aux: kfree(l_mg->bb_aux); fail_free_bb_template: kfree(l_mg->bb_template); fail_free_meta: - for (i = 0; i < PBLK_DATA_LINES; i++) { - pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type); - pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type); - } + pblk_line_meta_free(pblk); fail: for (i = 0; i < geo->nr_luns; i++) kfree(pblk->luns[i].bb_list); @@ -754,6 +847,15 @@ static int pblk_writer_init(struct pblk *pblk) static void pblk_writer_stop(struct pblk *pblk) { + /* The pipeline must be stopped and the write buffer emptied before the + * write thread is stopped + */ + WARN(pblk_rb_read_count(&pblk->rwb), + "Stopping not fully persisted write buffer\n"); + + WARN(pblk_rb_sync_count(&pblk->rwb), + "Stopping not fully synced write buffer\n"); + if (pblk->writer_ts) kthread_stop(pblk->writer_ts); del_timer(&pblk->wtimer); @@ -772,10 +874,9 @@ static void pblk_free(struct pblk *pblk) static void 
pblk_tear_down(struct pblk *pblk) { - pblk_flush_writer(pblk); + pblk_pipeline_stop(pblk); pblk_writer_stop(pblk); pblk_rb_sync_l2p(&pblk->rwb); - pblk_recov_pad(pblk); pblk_rwb_free(pblk); pblk_rl_free(&pblk->rl); @@ -821,6 +922,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->dev = dev; pblk->disk = tdisk; + pblk->state = PBLK_STATE_RUNNING; spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); @@ -836,8 +938,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, atomic_long_set(&pblk->req_writes, 0); atomic_long_set(&pblk->sub_writes, 0); atomic_long_set(&pblk->sync_writes, 0); - atomic_long_set(&pblk->compl_writes, 0); atomic_long_set(&pblk->inflight_reads, 0); + atomic_long_set(&pblk->cache_reads, 0); atomic_long_set(&pblk->sync_reads, 0); atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_writes, 0); @@ -946,11 +1048,20 @@ static struct nvm_tgt_type tt_pblk = { static int __init pblk_module_init(void) { - return nvm_register_tgt_type(&tt_pblk); + int ret; + + pblk_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!pblk_bio_set) + return -ENOMEM; + ret = nvm_register_tgt_type(&tt_pblk); + if (ret) + bioset_free(pblk_bio_set); + return ret; } static void pblk_module_exit(void) { + bioset_free(pblk_bio_set); nvm_unregister_tgt_type(&tt_pblk); } diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 17c16955284d..fddb924f6dde 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -25,9 +25,9 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); - struct line_emeta *emeta = line->emeta; + struct pblk_emeta *emeta = line->emeta; struct pblk_w_ctx *w_ctx; - __le64 *lba_list = pblk_line_emeta_to_lbas(emeta); + __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); u64 paddr; int nr_secs = pblk->min_write_pgs; int i; @@ -51,18 +51,20 @@ static 
void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, w_ctx->ppa = ppa_list[i]; meta_list[i].lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); - le64_add_cpu(&line->emeta->nr_valid_lbas, 1); + line->nr_valid_lbas++; } else { - meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); - lba_list[paddr] = cpu_to_le64(ADDR_EMPTY); - pblk_map_pad_invalidate(pblk, line, paddr); + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + lba_list[paddr] = meta_list[i].lba = addr_empty; + __pblk_map_invalidate(pblk, line, paddr); } } if (pblk_line_is_full(line)) { - line = pblk_line_replace_data(pblk); - if (!line) - return; + struct pblk_line *prev_line = line; + + pblk_line_replace_data(pblk); + pblk_line_close_meta(pblk, prev_line); } pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); @@ -91,8 +93,9 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_line *e_line = pblk_line_get_data_next(pblk); + struct pblk_line_meta *lm = &pblk->lm; struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_line *e_line, *d_line; unsigned int map_secs; int min = pblk->min_write_pgs; int i, erase_lun; @@ -102,35 +105,63 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], lun_bitmap, &meta_list[i], map_secs); - erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls + - rqd->ppa_list[i].g.ch; + erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]); - if (!test_bit(erase_lun, e_line->erase_bitmap)) { - if (down_trylock(&pblk->erase_sem)) - continue; + /* line can change after page map. We might also be writing the + * last line. 
+ */ + e_line = pblk_line_get_erase(pblk); + if (!e_line) + return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, + valid_secs, i + min); + spin_lock(&e_line->lock); + if (!test_bit(erase_lun, e_line->erase_bitmap)) { set_bit(erase_lun, e_line->erase_bitmap); atomic_dec(&e_line->left_eblks); + *erase_ppa = rqd->ppa_list[i]; erase_ppa->g.blk = e_line->id; + spin_unlock(&e_line->lock); + /* Avoid evaluating e_line->left_eblks */ return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, valid_secs, i + min); } + spin_unlock(&e_line->lock); } - /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(ppa_empty(*erase_ppa))) { - struct pblk_line_meta *lm = &pblk->lm; + d_line = pblk_line_get_data(pblk); + + /* line can change after page map. We might also be writing the + * last line. + */ + e_line = pblk_line_get_erase(pblk); + if (!e_line) + return; - i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line); - if (i == lm->blk_per_line) + /* Erase blocks that are bad in this line but might not be in next */ + if (unlikely(ppa_empty(*erase_ppa)) && + bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { + int bit = -1; + +retry: + bit = find_next_bit(d_line->blk_bitmap, + lm->blk_per_line, bit + 1); + if (bit >= lm->blk_per_line) return; - set_bit(i, e_line->erase_bitmap); + spin_lock(&e_line->lock); + if (test_bit(bit, e_line->erase_bitmap)) { + spin_unlock(&e_line->lock); + goto retry; + } + spin_unlock(&e_line->lock); + + set_bit(bit, e_line->erase_bitmap); atomic_dec(&e_line->left_eblks); - *erase_ppa = pblk->luns[i].bppa; /* set ch and lun */ + *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ erase_ppa->g.blk = e_line->id; } } diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 045384ddc1f9..5ecc154f6831 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -150,6 +150,7 @@ try: /* Release flags on context. 
Protect from writes and reads */ smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); pblk_ppa_set_empty(&w_ctx->ppa); + w_ctx->lba = ADDR_EMPTY; } #define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) @@ -180,6 +181,14 @@ unsigned int pblk_rb_read_count(struct pblk_rb *rb) return pblk_rb_ring_count(mem, subm, rb->nr_entries); } +unsigned int pblk_rb_sync_count(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int sync = READ_ONCE(rb->sync); + + return pblk_rb_ring_count(mem, sync, rb->nr_entries); +} + unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) { unsigned int subm; @@ -199,12 +208,22 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, struct pblk_line *line; struct pblk_rb_entry *entry; struct pblk_w_ctx *w_ctx; + unsigned int user_io = 0, gc_io = 0; unsigned int i; + int flags; for (i = 0; i < to_update; i++) { entry = &rb->entries[*l2p_upd]; w_ctx = &entry->w_ctx; + flags = READ_ONCE(entry->w_ctx.flags); + if (flags & PBLK_IOTYPE_USER) + user_io++; + else if (flags & PBLK_IOTYPE_GC) + gc_io++; + else + WARN(1, "pblk: unknown IO type\n"); + pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, entry->cacheline); @@ -214,6 +233,8 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); } + pblk_rl_out(&pblk->rl, user_io, gc_io); + return 0; } @@ -357,6 +378,9 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, /* Protect syncs */ smp_store_release(&rb->sync_point, sync_point); + if (!bio) + return 0; + spin_lock_irq(&rb->s_lock); bio_list_add(&entry->w_ctx.bios, bio); spin_unlock_irq(&rb->s_lock); @@ -395,6 +419,17 @@ static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, return 1; } +void pblk_rb_flush(struct pblk_rb *rb) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + unsigned int mem = READ_ONCE(rb->mem); + + if 
(pblk_rb_sync_point_set(rb, NULL, mem)) + return; + + pblk_write_should_kick(pblk); +} + static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, unsigned int *pos, struct bio *bio, int *io_ret) @@ -431,15 +466,16 @@ int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, unsigned int nr_entries, unsigned int *pos) { struct pblk *pblk = container_of(rb, struct pblk, rwb); - int flush_done; + int io_ret; spin_lock(&rb->w_lock); - if (!pblk_rl_user_may_insert(&pblk->rl, nr_entries)) { + io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); + if (io_ret) { spin_unlock(&rb->w_lock); - return NVM_IO_REQUEUE; + return io_ret; } - if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done)) { + if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { spin_unlock(&rb->w_lock); return NVM_IO_REQUEUE; } @@ -447,7 +483,7 @@ int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, pblk_rl_user_in(&pblk->rl, nr_entries); spin_unlock(&rb->w_lock); - return flush_done; + return io_ret; } /* @@ -521,20 +557,18 @@ out: * This function is used by the write thread to form the write bio that will * persist data on the write buffer to the media. 
*/ -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, - struct pblk_c_ctx *c_ctx, - unsigned int pos, - unsigned int nr_entries, - unsigned int count) +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, + struct bio *bio, unsigned int pos, + unsigned int nr_entries, unsigned int count) { struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct request_queue *q = pblk->dev->q; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); struct pblk_rb_entry *entry; struct page *page; - unsigned int pad = 0, read = 0, to_read = nr_entries; - unsigned int user_io = 0, gc_io = 0; + unsigned int pad = 0, to_read = nr_entries; unsigned int i; int flags; - int ret; if (count < nr_entries) { pad = nr_entries - count; @@ -553,15 +587,10 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, */ try: flags = READ_ONCE(entry->w_ctx.flags); - if (!(flags & PBLK_WRITTEN_DATA)) + if (!(flags & PBLK_WRITTEN_DATA)) { + io_schedule(); goto try; - - if (flags & PBLK_IOTYPE_USER) - user_io++; - else if (flags & PBLK_IOTYPE_GC) - gc_io++; - else - WARN(1, "pblk: unknown IO type\n"); + } page = virt_to_page(entry->data); if (!page) { @@ -570,17 +599,17 @@ try: flags |= PBLK_SUBMITTED_ENTRY; /* Release flags on context. Protect from writes */ smp_store_release(&entry->w_ctx.flags, flags); - goto out; + return NVM_IO_ERR; } - ret = bio_add_page(bio, page, rb->seg_size, 0); - if (ret != rb->seg_size) { + if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != + rb->seg_size) { pr_err("pblk: could not add page to write bio\n"); flags &= ~PBLK_WRITTEN_DATA; flags |= PBLK_SUBMITTED_ENTRY; /* Release flags on context. 
Protect from writes */ smp_store_release(&entry->w_ctx.flags, flags); - goto out; + return NVM_IO_ERR; } if (flags & PBLK_FLUSH_ENTRY) { @@ -607,14 +636,19 @@ try: pos = (pos + 1) & (rb->nr_entries - 1); } - read = to_read; - pblk_rl_out(&pblk->rl, user_io, gc_io); + if (pad) { + if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { + pr_err("pblk: could not pad page in write bio\n"); + return NVM_IO_ERR; + } + } + #ifdef CONFIG_NVM_DEBUG atomic_long_add(pad, &((struct pblk *) (container_of(rb, struct pblk, rwb)))->padded_writes); #endif -out: - return read; + + return NVM_IO_OK; } /* @@ -623,15 +657,17 @@ out: * be directed to disk. */ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - u64 pos, int bio_iter) + struct ppa_addr ppa, int bio_iter) { + struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_rb_entry *entry; struct pblk_w_ctx *w_ctx; + struct ppa_addr l2p_ppa; + u64 pos = pblk_addr_to_cacheline(ppa); void *data; int flags; int ret = 1; - spin_lock(&rb->w_lock); #ifdef CONFIG_NVM_DEBUG /* Caller must ensure that the access will not cause an overflow */ @@ -641,8 +677,14 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, w_ctx = &entry->w_ctx; flags = READ_ONCE(w_ctx->flags); + spin_lock(&rb->w_lock); + spin_lock(&pblk->trans_lock); + l2p_ppa = pblk_trans_map_get(pblk, lba); + spin_unlock(&pblk->trans_lock); + /* Check if the entry has been overwritten or is scheduled to be */ - if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) { + if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || + flags & PBLK_WRITABLE_ENTRY) { ret = 0; goto out; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4a12f14d78c6..4e5c48f3de62 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -34,8 +34,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, BUG_ON(!pblk_addr_in_cache(ppa)); #endif - return pblk_rb_copy_to_bio(&pblk->rwb, 
bio, lba, - pblk_addr_to_cacheline(ppa), bio_iter); + return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa, bio_iter); } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -76,6 +75,9 @@ retry: } WARN_ON(test_and_set_bit(i, read_bitmap)); advanced_bio = 1; +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->cache_reads); +#endif } else { /* Read from media non-cached sectors */ rqd->ppa_list[j++] = p; @@ -85,6 +87,11 @@ retry: bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); } + if (pblk_io_aligned(pblk, nr_secs)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + #ifdef CONFIG_NVM_DEBUG atomic_long_add(nr_secs, &pblk->inflight_reads); #endif @@ -94,8 +101,6 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) { int err; - rqd->flags = pblk_set_read_mode(pblk); - err = pblk_submit_io(pblk, rqd); if (err) return NVM_IO_ERR; @@ -107,27 +112,27 @@ static void pblk_end_io_read(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; if (rqd->error) pblk_log_read_err(pblk, rqd); #ifdef CONFIG_NVM_DEBUG else - WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n"); + WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif - if (rqd->nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); bio_put(bio); - if (r_ctx->orig_bio) { + if (r_ctx->private) { + struct bio *orig_bio = r_ctx->private; + #ifdef CONFIG_NVM_DEBUG - WARN_ONCE(r_ctx->orig_bio->bi_error, - "pblk: corrupted read bio\n"); + WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); #endif - bio_endio(r_ctx->orig_bio); - bio_put(r_ctx->orig_bio); + bio_endio(orig_bio); + bio_put(orig_bio); } #ifdef CONFIG_NVM_DEBUG @@ -136,6 
+141,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) #endif pblk_free_rqd(pblk, rqd, READ); + atomic_dec(&pblk->inflight_io); } static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, @@ -173,6 +179,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd->end_io = NULL; if (unlikely(nr_secs > 1 && nr_holes == 1)) { @@ -280,9 +287,14 @@ retry: goto retry; } WARN_ON(test_and_set_bit(0, read_bitmap)); +#ifdef CONFIG_NVM_DEBUG + atomic_long_inc(&pblk->cache_reads); +#endif } else { rqd->ppa_addr = ppa; } + + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); } int pblk_submit_read(struct pblk *pblk, struct bio *bio) @@ -316,13 +328,16 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) */ bio_init_idx = pblk_get_bi_idx(bio); + rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd->dma_meta_list); + if (!rqd->meta_list) { + pr_err("pblk: not able to allocate ppa list\n"); + goto fail_rqd_free; + } + if (nr_secs > 1) { - rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("pblk: not able to allocate ppa list\n"); - goto fail_rqd_free; - } + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); } else { @@ -332,6 +347,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bio_get(bio); if (bitmap_full(&read_bitmap, nr_secs)) { bio_endio(bio); + atomic_inc(&pblk->inflight_io); pblk_end_io_read(rqd); return NVM_IO_OK; } @@ -339,17 +355,17 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* All sectors are to be read from the device */ if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; - struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_g_ctx *r_ctx = 
nvm_rq_to_pdu(rqd); /* Clone read bio to deal with read errors internally */ - int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set); + int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); if (!int_bio) { pr_err("pblk: could not clone read bio\n"); return NVM_IO_ERR; } rqd->bio = int_bio; - r_ctx->orig_bio = bio; + r_ctx->private = bio; ret = pblk_submit_read_io(pblk, rqd); if (ret) { @@ -445,7 +461,6 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct request_queue *q = dev->q; struct bio *bio; struct nvm_rq rqd; int ret, data_len; @@ -453,22 +468,19 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, memset(&rqd, 0, sizeof(struct nvm_rq)); + rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_meta_list); + if (!rqd.meta_list) + return NVM_IO_ERR; + if (nr_secs > 1) { - rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) - return NVM_IO_ERR; + rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; + rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, nr_secs); - if (*secs_to_gc == 1) { - struct ppa_addr ppa; - - ppa = rqd.ppa_list[0]; - nvm_dev_dma_free(dev->parent, rqd.ppa_list, - rqd.dma_ppa_list); - rqd.ppa_addr = ppa; - } + if (*secs_to_gc == 1) + rqd.ppa_addr = rqd.ppa_list[0]; } else { *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); } @@ -477,7 +489,8 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, goto out; data_len = (*secs_to_gc) * geo->sec_size; - bio = bio_map_kern(q, data, data_len, GFP_KERNEL); + bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, + PBLK_KMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); goto err_free_dma; @@ -490,6 +503,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 
*lba_list, void *data, rqd.end_io = pblk_end_io_sync; rqd.private = &wait; rqd.nr_ppas = *secs_to_gc; + rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; ret = pblk_submit_read_io(pblk, &rqd); @@ -503,6 +517,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: GC read I/O timed out\n"); } + atomic_dec(&pblk->inflight_io); if (rqd.error) { atomic_long_inc(&pblk->read_failed_gc); @@ -518,12 +533,10 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, #endif out: - if (rqd.nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_OK; err_free_dma: - if (rqd.nr_ppas > 1) - nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list); + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_ERR; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index f8f85087cd3c..0e48d3e4e143 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -120,18 +120,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, return 0; } -__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta) +__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf) { u32 crc; - crc = pblk_calc_emeta_crc(pblk, emeta); - if (le32_to_cpu(emeta->crc) != crc) + crc = pblk_calc_emeta_crc(pblk, emeta_buf); + if (le32_to_cpu(emeta_buf->crc) != crc) return NULL; - if (le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC) + if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) return NULL; - return pblk_line_emeta_to_lbas(emeta); + return emeta_to_lbas(pblk, emeta_buf); } static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) @@ -139,19 +139,20 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line 
*line) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct pblk_line_meta *lm = &pblk->lm; - struct line_emeta *emeta = line->emeta; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; int data_start; int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; int i; - lba_list = pblk_recov_get_lba_list(pblk, emeta); + lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); if (!lba_list) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - nr_data_lbas = lm->sec_per_line - lm->emeta_sec; - nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas); + nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; + nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { struct ppa_addr ppa; @@ -169,7 +170,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) if (test_and_set_bit(i, line->invalid_bitmap)) WARN_ONCE(1, "pblk: rec. 
double invalidate:\n"); else - line->vsc--; + le32_add_cpu(line->vsc, -1); spin_unlock(&line->lock); continue; @@ -181,7 +182,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) if (nr_valid_lbas != nr_lbas) pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", - line->id, line->emeta->nr_valid_lbas, nr_lbas); + line->id, emeta_buf->nr_valid_lbas, nr_lbas); line->left_msecs = 0; @@ -195,7 +196,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec - + return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - nr_bb * geo->sec_per_blk; } @@ -240,7 +241,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, r_ptr_int = r_ptr; next_read_rq: - memset(rqd, 0, pblk_r_rq_size); + memset(rqd, 0, pblk_g_rq_size); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (!rq_ppas) @@ -256,7 +257,6 @@ next_read_rq: rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; - rqd->flags = pblk_set_read_mode(pblk); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -265,6 +265,11 @@ next_read_rq: rqd->end_io = pblk_end_io_sync; rqd->private = &wait; + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -295,7 +300,7 @@ next_read_rq: pr_err("pblk: L2P recovery read timed out\n"); return -EINTR; } - + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); /* At this point, the read should not fail. 
If it does, it is a problem @@ -322,47 +327,94 @@ next_read_rq: return 0; } +static void pblk_recov_complete(struct kref *ref) +{ + struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); + + complete(&pad_rq->wait); +} + +static void pblk_end_io_recov(struct nvm_rq *rqd) +{ + struct pblk_pad_rq *pad_rq = rqd->private; + struct pblk *pblk = pad_rq->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + + kref_put(&pad_rq->ref, pblk_recov_complete); + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); + pblk_free_rqd(pblk, rqd, WRITE); +} + static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, - struct pblk_recov_alloc p, int left_ppas) + int left_ppas) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct ppa_addr *ppa_list; struct pblk_sec_meta *meta_list; + struct pblk_pad_rq *pad_rq; struct nvm_rq *rqd; struct bio *bio; void *data; dma_addr_t dma_ppa_list, dma_meta_list; - __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta); + __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); u64 w_ptr = line->cur_sec; - int left_line_ppas = line->left_msecs; - int rq_ppas, rq_len; + int left_line_ppas, rq_ppas, rq_len; int i, j; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); - ppa_list = p.ppa_list; - meta_list = p.meta_list; - rqd = p.rqd; - data = p.data; - dma_ppa_list = p.dma_ppa_list; - dma_meta_list = p.dma_meta_list; + spin_lock(&line->lock); + left_line_ppas = line->left_msecs; + spin_unlock(&line->lock); + + pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); + if (!pad_rq) + return -ENOMEM; + + data = vzalloc(pblk->max_write_pgs * geo->sec_size); + if (!data) { + ret = -ENOMEM; + goto free_rq; + } + + pad_rq->pblk = pblk; + init_completion(&pad_rq->wait); + kref_init(&pad_rq->ref); next_pad_rq: rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); - if (!rq_ppas) - rq_ppas = pblk->min_write_pgs; + if (rq_ppas < pblk->min_write_pgs) { + pr_err("pblk: corrupted pad line %d\n", line->id); + 
goto free_rq; + } + rq_len = rq_ppas * geo->sec_size; + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); + if (!meta_list) { + ret = -ENOMEM; + goto free_data; + } + + ppa_list = (void *)(meta_list) + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + + rqd = pblk_alloc_rqd(pblk, WRITE); + if (IS_ERR(rqd)) { + ret = PTR_ERR(rqd); + goto fail_free_meta; + } + memset(rqd, 0, pblk_w_rq_size); + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto fail_free_rqd; + } bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - memset(rqd, 0, pblk_r_rq_size); - rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; rqd->flags = pblk_set_progr_mode(pblk, WRITE); @@ -371,8 +423,8 @@ next_pad_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; + rqd->end_io = pblk_end_io_recov; + rqd->private = pad_rq; for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; @@ -390,34 +442,51 @@ next_pad_rq: for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { struct ppa_addr dev_ppa; + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); pblk_map_invalidate(pblk, dev_ppa); - meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); - lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY); + lba_list[w_ptr] = meta_list[i].lba = addr_empty; rqd->ppa_list[i] = dev_ppa; } } + kref_get(&pad_rq->ref); + ret = pblk_submit_io(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); - return ret; + goto free_data; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery write timed out\n"); - } - reinit_completion(&wait); + atomic_dec(&pblk->inflight_io); left_line_ppas -= rq_ppas; left_ppas -= rq_ppas; - if (left_ppas > 0 && 
left_line_ppas) + if (left_ppas && left_line_ppas) goto next_pad_rq; - return 0; + kref_put(&pad_rq->ref, pblk_recov_complete); + + if (!wait_for_completion_io_timeout(&pad_rq->wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pr_err("pblk: pad write timed out\n"); + ret = -ETIME; + } + +free_rq: + kfree(pad_rq); +free_data: + vfree(data); + return ret; + +fail_free_rqd: + pblk_free_rqd(pblk, rqd, WRITE); +fail_free_meta: + nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); + kfree(pad_rq); + return ret; } /* When this function is called, it means that not all upper pages have been @@ -456,7 +525,7 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, rec_round = 0; next_rq: - memset(rqd, 0, pblk_r_rq_size); + memset(rqd, 0, pblk_g_rq_size); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (!rq_ppas) @@ -472,7 +541,6 @@ next_rq: rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; - rqd->flags = pblk_set_read_mode(pblk); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -481,6 +549,11 @@ next_rq: rqd->end_io = pblk_end_io_sync; rqd->private = &wait; + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -510,6 +583,7 @@ next_rq: msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: L2P recovery read timed out\n"); } + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); /* This should not happen since the read failed during normal recovery, @@ -544,7 +618,7 @@ next_rq: if (pad_secs > line->left_msecs) pad_secs = line->left_msecs; - ret = pblk_recov_pad_oob(pblk, line, p, pad_secs); + ret = pblk_recov_pad_oob(pblk, line, pad_secs); if (ret) pr_err("pblk: OOB padding failed (err:%d)\n", ret); @@ -552,7 +626,6 @@ next_rq: if (ret) pr_err("pblk: OOB read failed (err:%d)\n", ret); - line->left_ssecs = line->left_msecs; 
left_ppas = 0; } @@ -591,7 +664,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, *done = 1; next_rq: - memset(rqd, 0, pblk_r_rq_size); + memset(rqd, 0, pblk_g_rq_size); rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (!rq_ppas) @@ -607,7 +680,6 @@ next_rq: rqd->bio = bio; rqd->opcode = NVM_OP_PREAD; - rqd->flags = pblk_set_read_mode(pblk); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -616,6 +688,11 @@ next_rq: rqd->end_io = pblk_end_io_sync; rqd->private = &wait; + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + for (i = 0; i < rqd->nr_ppas; ) { struct ppa_addr ppa; int pos; @@ -646,6 +723,7 @@ next_rq: msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { pr_err("pblk: L2P recovery read timed out\n"); } + atomic_dec(&pblk->inflight_io); reinit_completion(&wait); /* Reached the end of the written line */ @@ -658,7 +736,6 @@ next_rq: /* Roll back failed sectors */ line->cur_sec -= nr_error_bits; line->left_msecs += nr_error_bits; - line->left_ssecs = line->left_msecs; bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits); left_ppas = 0; @@ -770,8 +847,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line, *tline, *data_line = NULL; - struct line_smeta *smeta; - struct line_emeta *emeta; + struct pblk_smeta *smeta; + struct pblk_emeta *emeta; + struct line_smeta *smeta_buf; int found_lines = 0, recovered_lines = 0, open_lines = 0; int is_next = 0; int meta_line; @@ -784,8 +862,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) spin_lock(&l_mg->free_lock); meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); set_bit(meta_line, &l_mg->meta_bitmap); - smeta = l_mg->sline_meta[meta_line].meta; - emeta = l_mg->eline_meta[meta_line].meta; + smeta = 
l_mg->sline_meta[meta_line]; + emeta = l_mg->eline_meta[meta_line]; + smeta_buf = (struct line_smeta *)smeta; spin_unlock(&l_mg->free_lock); /* Order data lines using their sequence number */ @@ -796,33 +875,33 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) memset(smeta, 0, lm->smeta_len); line->smeta = smeta; - line->lun_bitmap = ((void *)(smeta)) + + line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); /* Lines that cannot be read are assumed as not written here */ if (pblk_line_read_smeta(pblk, line)) continue; - crc = pblk_calc_smeta_crc(pblk, smeta); - if (le32_to_cpu(smeta->crc) != crc) + crc = pblk_calc_smeta_crc(pblk, smeta_buf); + if (le32_to_cpu(smeta_buf->crc) != crc) continue; - if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC) + if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) continue; - if (le16_to_cpu(smeta->header.version) != 1) { + if (le16_to_cpu(smeta_buf->header.version) != 1) { pr_err("pblk: found incompatible line version %u\n", - smeta->header.version); + smeta_buf->header.version); return ERR_PTR(-EINVAL); } /* The first valid instance uuid is used for initialization */ if (!valid_uuid) { - memcpy(pblk->instance_uuid, smeta->header.uuid, 16); + memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16); valid_uuid = 1; } - if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) { + if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { pr_debug("pblk: ignore line %u due to uuid mismatch\n", i); continue; @@ -830,9 +909,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) /* Update line metadata */ spin_lock(&line->lock); - line->id = le32_to_cpu(line->smeta->header.id); - line->type = le16_to_cpu(line->smeta->header.type); - line->seq_nr = le64_to_cpu(line->smeta->seq_nr); + line->id = le32_to_cpu(smeta_buf->header.id); + line->type = le16_to_cpu(smeta_buf->header.type); + line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); spin_unlock(&line->lock); /* Update general metadata */ @@ -848,7 
+927,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) pblk_recov_line_add_ordered(&recov_list, line); found_lines++; pr_debug("pblk: recovering data line %d, seq:%llu\n", - line->id, smeta->seq_nr); + line->id, smeta_buf->seq_nr); } if (!found_lines) { @@ -868,15 +947,15 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) recovered_lines++; /* Calculate where emeta starts based on the line bb */ - off = lm->sec_per_line - lm->emeta_sec; + off = lm->sec_per_line - lm->emeta_sec[0]; nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); off -= nr_bb * geo->sec_per_pl; - memset(emeta, 0, lm->emeta_len); - line->emeta = emeta; line->emeta_ssec = off; + line->emeta = emeta; + memset(line->emeta->buf, 0, lm->emeta_len[0]); - if (pblk_line_read_emeta(pblk, line)) { + if (pblk_line_read_emeta(pblk, line, line->emeta->buf)) { pblk_recov_l2p_from_oob(pblk, line); goto next; } @@ -941,58 +1020,26 @@ out: } /* - * Pad until smeta can be read on current data line + * Pad current line */ -void pblk_recov_pad(struct pblk *pblk) +int pblk_recov_pad(struct pblk *pblk) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; struct pblk_line *line; struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct nvm_rq *rqd; - struct pblk_recov_alloc p; - struct ppa_addr *ppa_list; - struct pblk_sec_meta *meta_list; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; + int left_msecs; + int ret = 0; spin_lock(&l_mg->free_lock); line = l_mg->data_line; + left_msecs = line->left_msecs; spin_unlock(&l_mg->free_lock); - rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) - return; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) - goto free_rqd; - - ppa_list = (void *)(meta_list) + pblk_dma_meta_size; - dma_ppa_list = dma_meta_list + pblk_dma_meta_size; - - data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL); - if (!data) - goto free_meta_list; - - p.ppa_list = ppa_list; - p.meta_list = meta_list; - p.rqd 
= rqd; - p.data = data; - p.dma_ppa_list = dma_ppa_list; - p.dma_meta_list = dma_meta_list; - - if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs)) { - pr_err("pblk: Tear down padding failed\n"); - goto free_data; + ret = pblk_recov_pad_oob(pblk, line, left_msecs); + if (ret) { + pr_err("pblk: Tear down padding failed (%d)\n", ret); + return ret; } - pblk_line_close(pblk, line); - -free_data: - kfree(data); -free_meta_list: - nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); -free_rqd: - pblk_free_rqd(pblk, rqd, READ); + pblk_line_close_meta(pblk, line); + return ret; } diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index ab7cbb144f3f..2e6a5361baf0 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -23,11 +23,35 @@ static void pblk_rl_kick_u_timer(struct pblk_rl *rl) mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); } +int pblk_rl_is_limit(struct pblk_rl *rl) +{ + int rb_space; + + rb_space = atomic_read(&rl->rb_space); + + return (rb_space == 0); +} + int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) { int rb_user_cnt = atomic_read(&rl->rb_user_cnt); + int rb_space = atomic_read(&rl->rb_space); - return (!(rb_user_cnt + nr_entries > rl->rb_user_max)); + if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) + return NVM_IO_ERR; + + if (rb_user_cnt >= rl->rb_user_max) + return NVM_IO_REQUEUE; + + return NVM_IO_OK; +} + +void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) +{ + int rb_space = atomic_read(&rl->rb_space); + + if (unlikely(rb_space >= 0)) + atomic_sub(nr_entries, &rl->rb_space); } int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) @@ -37,7 +61,7 @@ int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) /* If there is no user I/O let GC take over space on the write buffer */ rb_user_active = READ_ONCE(rl->rb_user_active); - return (!(rb_gc_cnt + nr_entries > rl->rb_gc_max && rb_user_active)); + return (!(rb_gc_cnt >= rl->rb_gc_max && 
rb_user_active)); } void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) @@ -77,33 +101,32 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) unsigned long free_blocks = pblk_rl_nr_free_blks(rl); if (free_blocks >= rl->high) { - rl->rb_user_max = max - rl->rb_gc_rsv; - rl->rb_gc_max = rl->rb_gc_rsv; + rl->rb_user_max = max; + rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; } else if (free_blocks < rl->high) { int shift = rl->high_pw - rl->rb_windows_pw; int user_windows = free_blocks >> shift; int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW; - int gc_max; rl->rb_user_max = user_max; - gc_max = max - rl->rb_user_max; - rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv); - - if (free_blocks > rl->low) - rl->rb_state = PBLK_RL_MID; - else - rl->rb_state = PBLK_RL_LOW; + rl->rb_gc_max = max - user_max; + + if (free_blocks <= rl->rsv_blocks) { + rl->rb_user_max = 0; + rl->rb_gc_max = max; + } + + /* In the worst case, we will need to GC lines in the low list + * (high valid sector count). 
If there are lines to GC on high + * or mid lists, these will be prioritized + */ + rl->rb_state = PBLK_RL_LOW; } return rl->rb_state; } -void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv) -{ - rl->rb_gc_rsv = rl->rb_gc_max = rsv; -} - void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) { struct pblk *pblk = container_of(rl, struct pblk, rl); @@ -122,11 +145,15 @@ void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) { - struct pblk *pblk = container_of(rl, struct pblk, rl); int blk_in_line = atomic_read(&line->blk_in_line); - int ret; atomic_sub(blk_in_line, &rl->free_blocks); +} + +void pblk_gc_should_kick(struct pblk *pblk) +{ + struct pblk_rl *rl = &pblk->rl; + int ret; /* Rates will not change that often - no need to lock update */ ret = pblk_rl_update_rates(rl, rl->rb_budget); @@ -136,11 +163,16 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) pblk_gc_should_stop(pblk); } -int pblk_rl_gc_thrs(struct pblk_rl *rl) +int pblk_rl_high_thrs(struct pblk_rl *rl) { return rl->high; } +int pblk_rl_low_thrs(struct pblk_rl *rl) +{ + return rl->low; +} + int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) { return rl->rb_user_max; @@ -161,24 +193,36 @@ void pblk_rl_free(struct pblk_rl *rl) void pblk_rl_init(struct pblk_rl *rl, int budget) { + struct pblk *pblk = container_of(rl, struct pblk, rl); + struct pblk_line_meta *lm = &pblk->lm; + int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; unsigned int rb_windows; rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS; - rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; rl->high_pw = get_count_order(rl->high); + rl->low = rl->total_blocks / PBLK_USER_LOW_THRS; + if (rl->low < min_blocks) + rl->low = min_blocks; + + rl->rsv_blocks = min_blocks; + /* This will always be a power-of-2 */ rb_windows = budget / PBLK_MAX_REQ_ADDRS; - rl->rb_windows_pw = get_count_order(rb_windows) + 1; + 
rl->rb_windows_pw = get_count_order(rb_windows); /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; - atomic_set(&rl->rb_user_cnt, 0); rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; + + atomic_set(&rl->rb_user_cnt, 0); atomic_set(&rl->rb_gc_cnt, 0); + atomic_set(&rl->rb_space, -1); setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl); + rl->rb_user_active = 0; + rl->rb_gc_active = 0; } diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index f0af1d1ceeff..95fb434e2f01 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -49,30 +49,26 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; int free_blocks, total_blocks; int rb_user_max, rb_user_cnt; - int rb_gc_max, rb_gc_rsv, rb_gc_cnt, rb_budget, rb_state; + int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; free_blocks = atomic_read(&pblk->rl.free_blocks); rb_user_max = pblk->rl.rb_user_max; rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); rb_gc_max = pblk->rl.rb_gc_max; - rb_gc_rsv = pblk->rl.rb_gc_rsv; rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); rb_budget = pblk->rl.rb_budget; rb_state = pblk->rl.rb_state; - total_blocks = geo->blks_per_lun * geo->nr_luns; + total_blocks = pblk->rl.total_blocks; return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", + "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n", rb_user_cnt, rb_user_max, rb_gc_cnt, rb_gc_max, - rb_gc_rsv, rb_state, rb_budget, pblk->rl.low, @@ -150,11 +146,11 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) ssize_t sz = 0; int nr_free_lines; int cur_data, cur_log; - int free_line_cnt = 0, closed_line_cnt = 0; + int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; int d_line_cnt = 0, l_line_cnt 
= 0; int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; - int free = 0, bad = 0, cor = 0; - int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; + int bad = 0, cor = 0; + int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; int map_weight = 0, meta_weight = 0; spin_lock(&l_mg->free_lock); @@ -166,6 +162,11 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) free_line_cnt++; spin_unlock(&l_mg->free_lock); + spin_lock(&l_mg->close_lock); + list_for_each_entry(line, &l_mg->emeta_list, list) + emeta_line_cnt++; + spin_unlock(&l_mg->close_lock); + spin_lock(&l_mg->gc_lock); list_for_each_entry(line, &l_mg->gc_full_list, list) { if (line->type == PBLK_LINETYPE_DATA) @@ -212,8 +213,6 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) gc_empty++; } - list_for_each_entry(line, &l_mg->free_list, list) - free++; list_for_each_entry(line, &l_mg->bad_list, list) bad++; list_for_each_entry(line, &l_mg->corrupt_list, list) @@ -224,8 +223,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) if (l_mg->data_line) { cur_sec = l_mg->data_line->cur_sec; msecs = l_mg->data_line->left_msecs; - ssecs = l_mg->data_line->left_ssecs; - vsc = l_mg->data_line->vsc; + vsc = le32_to_cpu(*l_mg->data_line->vsc); sec_in_line = l_mg->data_line->sec_in_line; meta_weight = bitmap_weight(&l_mg->meta_bitmap, PBLK_DATA_LINES); @@ -235,17 +233,20 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) spin_unlock(&l_mg->free_lock); if (nr_free_lines != free_line_cnt) - pr_err("pblk: corrupted free line list\n"); + pr_err("pblk: corrupted free line list:%d/%d\n", + nr_free_lines, free_line_cnt); sz = snprintf(page, PAGE_SIZE - sz, "line: nluns:%d, nblks:%d, nsecs:%d\n", geo->nr_luns, lm->blk_per_line, lm->sec_per_line); sz += snprintf(page + sz, PAGE_SIZE - sz, - "lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n", + "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", cur_data, cur_log, - free, 
nr_free_lines, bad, cor, + nr_free_lines, + emeta_line_cnt, meta_weight, closed_line_cnt, + bad, cor, d_line_cnt, l_line_cnt, l_mg->nr_lines); @@ -255,9 +256,10 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) atomic_read(&pblk->gc.inflight_gc)); sz += snprintf(page + sz, PAGE_SIZE - sz, - "data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n", - cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line, - map_weight, lm->sec_per_line, meta_weight); + "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", + cur_data, cur_sec, msecs, vsc, sec_in_line, + map_weight, lm->sec_per_line, + atomic_read(&pblk->inflight_io)); return sz; } @@ -274,7 +276,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) lm->smeta_len, lm->smeta_sec); sz += snprintf(page + sz, PAGE_SIZE - sz, "emeta - len:%d, sec:%d, bb_start:%d\n", - lm->emeta_len, lm->emeta_sec, + lm->emeta_len[0], lm->emeta_sec[0], lm->emeta_bb); sz += snprintf(page + sz, PAGE_SIZE - sz, "bitmap lengths: sec:%d, blk:%d, lun:%d\n", @@ -290,6 +292,11 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) return sz; } +static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); +} + #ifdef CONFIG_NVM_DEBUG static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) { @@ -303,52 +310,51 @@ static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) atomic_long_read(&pblk->padded_wb), atomic_long_read(&pblk->sub_writes), atomic_long_read(&pblk->sync_writes), - atomic_long_read(&pblk->compl_writes), atomic_long_read(&pblk->recov_writes), atomic_long_read(&pblk->recov_gc_writes), atomic_long_read(&pblk->recov_gc_reads), + atomic_long_read(&pblk->cache_reads), atomic_long_read(&pblk->sync_reads)); } #endif -static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page, - size_t len) +static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, + 
size_t len) { - struct pblk_gc *gc = &pblk->gc; size_t c_len; - int value; + int force; c_len = strcspn(page, "\n"); if (c_len >= len) return -EINVAL; - if (kstrtouint(page, 0, &value)) + if (kstrtouint(page, 0, &force)) return -EINVAL; - spin_lock(&gc->lock); - pblk_rl_set_gc_rsc(&pblk->rl, value); - spin_unlock(&gc->lock); + pblk_gc_sysfs_force(pblk, force); return len; } -static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, - size_t len) +static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, + const char *page, size_t len) { size_t c_len; - int force; + int sec_per_write; c_len = strcspn(page, "\n"); if (c_len >= len) return -EINVAL; - if (kstrtouint(page, 0, &force)) + if (kstrtouint(page, 0, &sec_per_write)) return -EINVAL; - if (force < 0 || force > 1) + if (sec_per_write < pblk->min_write_pgs + || sec_per_write > pblk->max_write_pgs + || sec_per_write % pblk->min_write_pgs != 0) return -EINVAL; - pblk_gc_sysfs_force(pblk, force); + pblk_set_sec_per_write(pblk, sec_per_write); return len; } @@ -398,9 +404,9 @@ static struct attribute sys_gc_force = { .mode = 0200, }; -static struct attribute sys_gc_rl_max = { - .name = "gc_rl_max", - .mode = 0200, +static struct attribute sys_max_sec_per_write = { + .name = "max_sec_per_write", + .mode = 0644, }; #ifdef CONFIG_NVM_DEBUG @@ -416,7 +422,7 @@ static struct attribute *pblk_attrs[] = { &sys_errors_attr, &sys_gc_state, &sys_gc_force, - &sys_gc_rl_max, + &sys_max_sec_per_write, &sys_rb_attr, &sys_stats_ppaf_attr, &sys_lines_attr, @@ -448,6 +454,8 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, return pblk_sysfs_lines(pblk, buf); else if (strcmp(attr->name, "lines_info") == 0) return pblk_sysfs_lines_info(pblk, buf); + else if (strcmp(attr->name, "max_sec_per_write") == 0) + return pblk_sysfs_get_sec_per_write(pblk, buf); #ifdef CONFIG_NVM_DEBUG else if (strcmp(attr->name, "stats") == 0) return pblk_sysfs_stats_debug(pblk, buf); @@ -460,10 +468,10 @@ static 
ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, { struct pblk *pblk = container_of(kobj, struct pblk, kobj); - if (strcmp(attr->name, "gc_rl_max") == 0) - return pblk_sysfs_rate_store(pblk, buf, len); - else if (strcmp(attr->name, "gc_force") == 0) + if (strcmp(attr->name, "gc_force") == 0) return pblk_sysfs_gc_force(pblk, buf, len); + else if (strcmp(attr->name, "max_sec_per_write") == 0) + return pblk_sysfs_set_sec_per_write(pblk, buf, len); return 0; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index aef6fd7c4a0c..d62a8f4faaf4 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -17,18 +17,6 @@ #include "pblk.h" -static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line) -{ -#ifdef CONFIG_NVM_DEBUG - atomic_long_inc(&pblk->sync_writes); -#endif - - /* Counter protected by rb sync lock */ - line->left_ssecs--; - if (!line->left_ssecs) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws); -} - static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx) { @@ -39,21 +27,14 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, for (i = 0; i < c_ctx->nr_valid; i++) { struct pblk_w_ctx *w_ctx; - struct ppa_addr p; - struct pblk_line *line; w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i); - - p = rqd->ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(p)]; - pblk_sync_line(pblk, line); - while ((original_bio = bio_list_pop(&w_ctx->bios))) bio_endio(original_bio); } #ifdef CONFIG_NVM_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes); + atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); #endif ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); @@ -169,7 +150,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) } INIT_WORK(&recovery->ws_rec, pblk_submit_rec); - queue_work(pblk->kw_wq, &recovery->ws_rec); + queue_work(pblk->close_wq, &recovery->ws_rec); out: 
pblk_complete_write(pblk, rqd, c_ctx); @@ -186,14 +167,50 @@ static void pblk_end_io_write(struct nvm_rq *rqd) } #ifdef CONFIG_NVM_DEBUG else - WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n"); + WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); #endif pblk_complete_write(pblk, rqd, c_ctx); + atomic_dec(&pblk->inflight_io); +} + +static void pblk_end_io_write_meta(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); + struct pblk_line *line = m_ctx->private; + struct pblk_emeta *emeta = line->emeta; + int pos = pblk_ppa_to_pos(geo, rqd->ppa_list[0]); + struct pblk_lun *rlun = &pblk->luns[pos]; + int sync; + + up(&rlun->wr_sem); + + if (rqd->error) { + pblk_log_write_err(pblk, rqd); + pr_err("pblk: metadata I/O failed. Line %d\n", line->id); + } +#ifdef CONFIG_NVM_DEBUG + else + WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); +#endif + + sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); + if (sync == emeta->nr_entries) + pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, + pblk->close_wq); + + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, READ); + + atomic_dec(&pblk->inflight_io); } static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int nr_secs) + unsigned int nr_secs, + nvm_end_io_fn(*end_io)) { struct nvm_tgt_dev *dev = pblk->dev; @@ -202,7 +219,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, rqd->nr_ppas = nr_secs; rqd->flags = pblk_set_progr_mode(pblk, WRITE); rqd->private = pblk; - rqd->end_io = pblk_end_io_write; + rqd->end_io = end_io; rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &rqd->dma_meta_list); @@ -219,11 +236,10 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) + struct pblk_c_ctx *c_ctx, struct 
ppa_addr *erase_ppa) { struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *e_line = pblk_line_get_data_next(pblk); - struct ppa_addr erase_ppa; + struct pblk_line *e_line = pblk_line_get_erase(pblk); unsigned int valid = c_ctx->nr_valid; unsigned int padded = c_ctx->nr_padded; unsigned int nr_secs = valid + padded; @@ -231,40 +247,23 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, int ret = 0; lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) { - ret = -ENOMEM; - goto out; - } + if (!lun_bitmap) + return -ENOMEM; c_ctx->lun_bitmap = lun_bitmap; - ret = pblk_alloc_w_rq(pblk, rqd, nr_secs); + ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); if (ret) { kfree(lun_bitmap); - goto out; + return ret; } - ppa_set_empty(&erase_ppa); if (likely(!e_line || !atomic_read(&e_line->left_eblks))) pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); else pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, &erase_ppa); - -out: - if (unlikely(e_line && !ppa_empty(erase_ppa))) { - if (pblk_blk_erase_async(pblk, erase_ppa)) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int bit; - - atomic_inc(&e_line->left_eblks); - bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch; - WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); - up(&pblk->erase_sem); - } - } + valid, erase_ppa); - return ret; + return 0; } int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -280,7 +279,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, c_ctx->lun_bitmap = lun_bitmap; - ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas); + ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write); if (ret) return ret; @@ -311,16 +310,237 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, return secs_to_sync; } +static inline int pblk_valid_meta_ppa(struct pblk *pblk, + struct pblk_line *meta_line, + struct ppa_addr *ppa_list, int nr_ppas) 
+{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line *data_line; + struct ppa_addr ppa, ppa_opt; + u64 paddr; + int i; + + data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])]; + paddr = pblk_lookup_page(pblk, meta_line); + ppa = addr_to_gen_ppa(pblk, paddr, 0); + + if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap)) + return 1; + + /* Schedule a metadata I/O that is half the distance from the data I/O + * with regards to the number of LUNs forming the pblk instance. This + * balances LUN conflicts across every I/O. + * + * When the LUN configuration changes (e.g., due to GC), this distance + * can align, which would result on a LUN deadlock. In this case, modify + * the distance to not be optimal, but allow metadata I/Os to succeed. + */ + ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); + if (unlikely(ppa_opt.ppa == ppa.ppa)) { + data_line->meta_distance--; + return 0; + } + + for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) + if (ppa_list[i].g.ch == ppa_opt.g.ch && + ppa_list[i].g.lun == ppa_opt.g.lun) + return 1; + + if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) { + for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) + if (ppa_list[i].g.ch == ppa.g.ch && + ppa_list[i].g.lun == ppa.g.lun) + return 0; + + return 1; + } + + return 0; +} + +int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = meta_line->emeta; + struct pblk_g_ctx *m_ctx; + struct pblk_lun *rlun; + struct bio *bio; + struct nvm_rq *rqd; + void *data; + u64 paddr; + int rq_ppas = pblk->min_write_pgs; + int id = meta_line->id; + int rq_len; + int i, j; + int ret; + + rqd = pblk_alloc_rqd(pblk, READ); + if (IS_ERR(rqd)) { + pr_err("pblk: cannot allocate write req.\n"); + return PTR_ERR(rqd); 
+ } + m_ctx = nvm_rq_to_pdu(rqd); + m_ctx->private = meta_line; + + rq_len = rq_ppas * geo->sec_size; + data = ((void *)emeta->buf) + emeta->mem; + + bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, + l_mg->emeta_alloc_type, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto fail_free_rqd; + } + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd->bio = bio; + + ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); + if (ret) + goto fail_free_bio; + + for (i = 0; i < rqd->nr_ppas; ) { + spin_lock(&meta_line->lock); + paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); + spin_unlock(&meta_line->lock); + for (j = 0; j < rq_ppas; j++, i++, paddr++) + rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); + } + + rlun = &pblk->luns[pblk_ppa_to_pos(geo, rqd->ppa_list[0])]; + ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000)); + if (ret) { + pr_err("pblk: lun semaphore timed out (%d)\n", ret); + goto fail_free_bio; + } + + emeta->mem += rq_len; + if (emeta->mem >= lm->emeta_len[0]) { + spin_lock(&l_mg->close_lock); + list_del(&meta_line->list); + WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line), + "pblk: corrupt meta line %d\n", meta_line->id); + spin_unlock(&l_mg->close_lock); + } + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pr_err("pblk: emeta I/O submission failed: %d\n", ret); + goto fail_rollback; + } + + return NVM_IO_OK; + +fail_rollback: + spin_lock(&l_mg->close_lock); + pblk_dealloc_page(pblk, meta_line, rq_ppas); + list_add(&meta_line->list, &meta_line->list); + spin_unlock(&l_mg->close_lock); +fail_free_bio: + if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) + bio_put(bio); +fail_free_rqd: + pblk_free_rqd(pblk, rqd, READ); + return ret; +} + +static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, + int prev_n) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *meta_line; + + 
spin_lock(&l_mg->close_lock); +retry: + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return 0; + } + meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); + if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) + goto retry; + spin_unlock(&l_mg->close_lock); + + if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) + return 0; + + return pblk_submit_meta_io(pblk, meta_line); +} + +static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct ppa_addr erase_ppa; + int err; + + ppa_set_empty(&erase_ppa); + + /* Assign lbas to ppas and populate request structure */ + err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); + if (err) { + pr_err("pblk: could not setup write request: %d\n", err); + return NVM_IO_ERR; + } + + if (likely(ppa_empty(erase_ppa))) { + /* Submit metadata write for previous data line */ + err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas); + if (err) { + pr_err("pblk: metadata I/O submission failed: %d", err); + return NVM_IO_ERR; + } + + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } + } else { + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } + + /* Submit available erase for next data line */ + if (pblk_blk_erase_async(pblk, erase_ppa)) { + struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int bit; + + atomic_inc(&e_line->left_eblks); + bit = pblk_ppa_to_pos(geo, erase_ppa); + WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); + } + } + + return NVM_IO_OK; +} + +static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_c_ctx *c_ctx = 
nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; + + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); +} + static int pblk_submit_write(struct pblk *pblk) { struct bio *bio; struct nvm_rq *rqd; - struct pblk_c_ctx *c_ctx; - unsigned int pgs_read; unsigned int secs_avail, secs_to_sync, secs_to_com; unsigned int secs_to_flush; unsigned long pos; - int err; /* If there are no sectors in the cache, flushes (bios without data) * will be cleared on the cache threads @@ -338,7 +558,6 @@ static int pblk_submit_write(struct pblk *pblk) pr_err("pblk: cannot allocate write req.\n"); return 1; } - c_ctx = nvm_rq_to_pdu(rqd); bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); if (!bio) { @@ -358,29 +577,14 @@ static int pblk_submit_write(struct pblk *pblk) secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos, - secs_to_sync, secs_avail); - if (!pgs_read) { + if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, + secs_avail)) { pr_err("pblk: corrupted write bio\n"); goto fail_put_bio; } - if (c_ctx->nr_padded) - if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded)) - goto fail_put_bio; - - /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, c_ctx); - if (err) { - pr_err("pblk: could not setup write request\n"); - goto fail_free_bio; - } - - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: I/O submission failed: %d\n", err); + if (pblk_submit_io_set(pblk, rqd)) goto fail_free_bio; - } #ifdef CONFIG_NVM_DEBUG atomic_long_add(secs_to_sync, &pblk->sub_writes); @@ -389,8 +593,7 @@ static int pblk_submit_write(struct pblk *pblk) return 0; fail_free_bio: - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded); + pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); fail_free_rqd: diff --git 
a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 99f3186b5288..15931381348c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,6 +40,12 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) +#define PBLK_WS_POOL_SIZE (128) +#define PBLK_META_POOL_SIZE (128) +#define PBLK_READ_REQ_POOL_SIZE (1024) + +#define PBLK_NR_CLOSE_JOBS (4) + #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) #define PBLK_COMMAND_TIMEOUT_MS 30000 @@ -72,11 +78,15 @@ enum { PBLK_BLK_ST_CLOSED = 0x2, }; +struct pblk_sec_meta { + u64 reserved; + __le64 lba; +}; + /* The number of GC lists and the rate-limiter states go together. This way the * rate-limiter can dictate how much GC is needed based on resource utilization. */ -#define PBLK_NR_GC_LISTS 3 -#define PBLK_MAX_GC_JOBS 32 +#define PBLK_GC_NR_LISTS 3 enum { PBLK_RL_HIGH = 1, @@ -84,14 +94,9 @@ enum { PBLK_RL_LOW = 3, }; -struct pblk_sec_meta { - u64 reserved; - __le64 lba; -}; - #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) -/* write completion context */ +/* write buffer completion context */ struct pblk_c_ctx { struct list_head list; /* Head for out-of-order completion */ @@ -101,9 +106,16 @@ struct pblk_c_ctx { unsigned int nr_padded; }; -/* Read context */ -struct pblk_r_ctx { - struct bio *orig_bio; +/* generic context */ +struct pblk_g_ctx { + void *private; +}; + +/* Pad context */ +struct pblk_pad_rq { + struct pblk *pblk; + struct completion wait; + struct kref ref; }; /* Recovery context */ @@ -195,29 +207,39 @@ struct pblk_lun { struct pblk_gc_rq { struct pblk_line *line; void *data; - u64 *lba_list; + u64 lba_list[PBLK_MAX_REQ_ADDRS]; int nr_secs; int secs_to_gc; struct list_head list; }; struct pblk_gc { + /* These states are not protected by a lock since (i) they are in the + * fast path, and (ii) they are not critical. 
+ */ int gc_active; int gc_enabled; int gc_forced; - int gc_jobs_active; - atomic_t inflight_gc; struct task_struct *gc_ts; struct task_struct *gc_writer_ts; + struct task_struct *gc_reader_ts; + + struct workqueue_struct *gc_line_reader_wq; struct workqueue_struct *gc_reader_wq; + struct timer_list gc_timer; + struct semaphore gc_sem; + atomic_t inflight_gc; int w_entries; + struct list_head w_list; + struct list_head r_list; spinlock_t lock; spinlock_t w_lock; + spinlock_t r_lock; }; struct pblk_rl { @@ -229,10 +251,8 @@ struct pblk_rl { */ unsigned int high_pw; /* High rounded up as a power of 2 */ -#define PBLK_USER_HIGH_THRS 2 /* Begin write limit at 50 percent - * available blks - */ -#define PBLK_USER_LOW_THRS 20 /* Aggressive GC at 5% available blocks */ +#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ +#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ int rb_windows_pw; /* Number of rate windows in the write buffer * given as a power-of-2. 
This guarantees that @@ -244,13 +264,19 @@ struct pblk_rl { */ int rb_budget; /* Total number of entries available for I/O */ int rb_user_max; /* Max buffer entries available for user I/O */ - atomic_t rb_user_cnt; /* User I/O buffer counter */ int rb_gc_max; /* Max buffer entries available for GC I/O */ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ int rb_state; /* Rate-limiter current state */ + + atomic_t rb_user_cnt; /* User I/O buffer counter */ atomic_t rb_gc_cnt; /* GC I/O buffer counter */ + atomic_t rb_space; /* Space limit in case of reaching capacity */ + + int rsv_blocks; /* Reserved blocks for GC */ int rb_user_active; + int rb_gc_active; + struct timer_list u_timer; unsigned long long nr_secs; @@ -258,8 +284,6 @@ struct pblk_rl { atomic_t free_blocks; }; -#define PBLK_LINE_NR_LUN_BITMAP 2 -#define PBLK_LINE_NR_SEC_BITMAP 2 #define PBLK_LINE_EMPTY (~0U) enum { @@ -310,16 +334,19 @@ struct line_smeta { __le32 window_wr_lun; /* Number of parallel LUNs to write */ __le32 rsvd[2]; + + __le64 lun_bitmap[]; }; /* - * Metadata Layout: - * 1. struct pblk_emeta - * 2. nr_lbas u64 forming lba list - * 3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line) - * 4. nr_luns bits (u64 format) forming line bad block bitmap - * - * 3. and 4. will be part of FTL log + * Metadata layout in media: + * First sector: + * 1. struct line_emeta + * 2. bad block bitmap (u64 * window_wr_lun) + * Mid sectors (start at lbas_sector): + * 3. nr_lbas (u64) forming lba list + * Last sectors (start at vsc_sector): + * 4. 
u32 valid sector count (vsc) for all lines (~0U: free line) */ struct line_emeta { struct line_header header; @@ -339,6 +366,23 @@ struct line_emeta { __le32 next_id; /* Line id for next line */ __le64 nr_lbas; /* Number of lbas mapped in line */ __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ + __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ +}; + +struct pblk_emeta { + struct line_emeta *buf; /* emeta buffer in media format */ + int mem; /* Write offset - points to next + * writable entry in memory + */ + atomic_t sync; /* Synced - backpointer that signals the + * last entry that has been successfully + * persisted to media + */ + unsigned int nr_entries; /* Number of emeta entries */ +}; + +struct pblk_smeta { + struct line_smeta *buf; /* smeta buffer in persistent format */ }; struct pblk_line { @@ -355,9 +399,12 @@ struct pblk_line { unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ - struct line_smeta *smeta; /* Start metadata */ - struct line_emeta *emeta; /* End metadata */ + struct pblk_smeta *smeta; /* Start metadata */ + struct pblk_emeta *emeta; /* End medatada */ + int meta_line; /* Metadata line id */ + int meta_distance; /* Distance between data and metadata */ + u64 smeta_ssec; /* Sector where smeta starts */ u64 emeta_ssec; /* Sector where emeta starts */ @@ -374,9 +421,10 @@ struct pblk_line { atomic_t left_seblks; /* Blocks left for sync erasing */ int left_msecs; /* Sectors left for mapping */ - int left_ssecs; /* Sectors left to sync */ unsigned int cur_sec; /* Sector map pointer */ - unsigned int vsc; /* Valid sector count in line */ + unsigned int nr_valid_lbas; /* Number of valid lbas in line */ + + __le32 *vsc; /* Valid sector count in line */ struct kref ref; /* Write buffer L2P references */ @@ -385,13 +433,15 @@ struct pblk_line { #define PBLK_DATA_LINES 4 -enum{ +enum { PBLK_KMALLOC_META = 1, PBLK_VMALLOC_META = 2, }; -struct pblk_line_metadata { - void *meta; +enum { + PBLK_EMETA_TYPE_HEADER = 
1, /* struct line_emeta first sector */ + PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ + PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ }; struct pblk_line_mgmt { @@ -404,7 +454,7 @@ struct pblk_line_mgmt { struct list_head bad_list; /* Full lines bad */ /* GC lists - use gc_lock */ - struct list_head *gc_lists[PBLK_NR_GC_LISTS]; + struct list_head *gc_lists[PBLK_GC_NR_LISTS]; struct list_head gc_high_list; /* Full lines ready to GC, high isc */ struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ struct list_head gc_low_list; /* Full lines ready to GC, low isc */ @@ -417,13 +467,16 @@ struct pblk_line_mgmt { struct pblk_line *log_next; /* Next FTL log line */ struct pblk_line *data_next; /* Next data line */ + struct list_head emeta_list; /* Lines queued to schedule emeta */ + + __le32 *vsc_list; /* Valid sector counts for all lines */ + /* Metadata allocation type: VMALLOC | KMALLOC */ - int smeta_alloc_type; int emeta_alloc_type; /* Pre-allocated metadata for data lines */ - struct pblk_line_metadata sline_meta[PBLK_DATA_LINES]; - struct pblk_line_metadata eline_meta[PBLK_DATA_LINES]; + struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; + struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; unsigned long meta_bitmap; /* Helpers for fast bitmap calculations */ @@ -434,25 +487,40 @@ struct pblk_line_mgmt { unsigned long l_seq_nr; /* Log line unique sequence number */ spinlock_t free_lock; + spinlock_t close_lock; spinlock_t gc_lock; }; struct pblk_line_meta { unsigned int smeta_len; /* Total length for smeta */ - unsigned int smeta_sec; /* Sectors needed for smeta*/ - unsigned int emeta_len; /* Total length for emeta */ - unsigned int emeta_sec; /* Sectors needed for emeta*/ + unsigned int smeta_sec; /* Sectors needed for smeta */ + + unsigned int emeta_len[4]; /* Lengths for emeta: + * [0]: Total length + * [1]: struct line_emeta length + * [2]: L2P portion length + * [3]: vsc list length + */ + unsigned int emeta_sec[4]; /* Sectors 
needed for emeta. Same layout + * as emeta_len + */ + unsigned int emeta_bb; /* Boundary for bb that affects emeta */ + + unsigned int vsc_list_len; /* Length for vsc list */ unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ unsigned int blk_bitmap_len; /* Length for block bitmap in line */ unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ unsigned int blk_per_line; /* Number of blocks in a full line */ unsigned int sec_per_line; /* Number of sectors in a line */ + unsigned int dsec_per_line; /* Number of data sectors in a line */ unsigned int min_blk_line; /* Min. number of good blocks in line */ unsigned int mid_thrs; /* Threshold for GC mid list */ unsigned int high_thrs; /* Threshold for GC high list */ + + unsigned int meta_distance; /* Distance between data and metadata */ }; struct pblk_addr_format { @@ -470,6 +538,13 @@ struct pblk_addr_format { u8 sec_offset; }; +enum { + PBLK_STATE_RUNNING = 0, + PBLK_STATE_STOPPING = 1, + PBLK_STATE_RECOVERING = 2, + PBLK_STATE_STOPPED = 3, +}; + struct pblk { struct nvm_tgt_dev *dev; struct gendisk *disk; @@ -487,6 +562,8 @@ struct pblk { struct pblk_rb rwb; + int state; /* pblk line state */ + int min_write_pgs; /* Minimum amount of pages required by controller */ int max_write_pgs; /* Maximum amount of pages supported by controller */ int pgs_in_buffer; /* Number of pages that need to be held in buffer to @@ -499,7 +576,7 @@ struct pblk { /* pblk provisioning values. 
Used by rate limiter */ struct pblk_rl rl; - struct semaphore erase_sem; + int sec_per_write; unsigned char instance_uuid[16]; #ifdef CONFIG_NVM_DEBUG @@ -511,8 +588,8 @@ struct pblk { atomic_long_t req_writes; /* Sectors stored on write buffer */ atomic_long_t sub_writes; /* Sectors submitted from buffer */ atomic_long_t sync_writes; /* Sectors synced to media */ - atomic_long_t compl_writes; /* Sectors completed in write bio */ atomic_long_t inflight_reads; /* Inflight sector read requests */ + atomic_long_t cache_reads; /* Read requests that hit the cache */ atomic_long_t sync_reads; /* Completed sector read requests */ atomic_long_t recov_writes; /* Sectors submitted from recovery */ atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ @@ -528,6 +605,8 @@ struct pblk { atomic_long_t write_failed; atomic_long_t erase_failed; + atomic_t inflight_io; /* General inflight I/O counter */ + struct task_struct *writer_ts; /* Simple translation map of logical addresses to physical addresses. 
@@ -542,11 +621,13 @@ struct pblk { mempool_t *page_pool; mempool_t *line_ws_pool; mempool_t *rec_pool; - mempool_t *r_rq_pool; + mempool_t *g_rq_pool; mempool_t *w_rq_pool; mempool_t *line_meta_pool; - struct workqueue_struct *kw_wq; + struct workqueue_struct *close_wq; + struct workqueue_struct *bb_wq; + struct timer_list wtimer; struct pblk_gc gc; @@ -559,7 +640,7 @@ struct pblk_line_ws { struct work_struct ws; }; -#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx)) +#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) /* @@ -579,18 +660,17 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, unsigned int pos); struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); +void pblk_rb_flush(struct pblk_rb *rb); void pblk_rb_sync_l2p(struct pblk_rb *rb); -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio, - struct pblk_c_ctx *c_ctx, - unsigned int pos, - unsigned int nr_entries, - unsigned int count); +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, + struct bio *bio, unsigned int pos, + unsigned int nr_entries, unsigned int count); unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, struct list_head *list, unsigned int max); int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - u64 pos, int bio_iter); + struct ppa_addr ppa, int bio_iter); unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); @@ -601,6 +681,7 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb); unsigned int pblk_rb_read_count(struct pblk_rb *rb); +unsigned int pblk_rb_sync_count(struct pblk_rb *rb); unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, 
unsigned int pos); int pblk_rb_tear_down_check(struct pblk_rb *rb); @@ -612,40 +693,50 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); * pblk core */ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); +void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); -void pblk_flush_writer(struct pblk *pblk); +void pblk_wait_for_meta(struct pblk *pblk); struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, - gfp_t gfp_mask); + int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -struct pblk_line *pblk_line_replace_data(struct pblk *pblk); +void pblk_line_replace_data(struct pblk *pblk); int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); struct pblk_line *pblk_line_get_data(struct pblk *pblk); -struct pblk_line *pblk_line_get_data_next(struct pblk *pblk); +struct pblk_line *pblk_line_get_erase(struct pblk *pblk); int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); int pblk_line_is_full(struct pblk_line *line); void pblk_line_free(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_ws(struct work_struct *work); +void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); void pblk_line_close(struct pblk *pblk, struct pblk_line *line); +void 
pblk_line_close_meta_sync(struct pblk *pblk); +void pblk_line_close_ws(struct work_struct *work); +void pblk_pipeline_stop(struct pblk *pblk); void pblk_line_mark_bb(struct work_struct *work); void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *)); + void (*work)(struct work_struct *), + struct workqueue_struct *wq); u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); -int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line); +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf); int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); void pblk_line_put(struct kref *ref); struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); +u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); +void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); +u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, unsigned long secs_to_flush); void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, @@ -656,11 +747,11 @@ void pblk_end_bio_sync(struct bio *bio); void pblk_end_io_sync(struct nvm_rq *rqd); int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int nr_pages); -void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr); void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, int nr_pages); void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); +void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr); void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); void pblk_update_map_cache(struct pblk *pblk, sector_t lba, 
struct ppa_addr ppa); @@ -702,6 +793,7 @@ void pblk_write_should_kick(struct pblk *pblk); /* * pblk read path */ +extern struct bio_set *pblk_bio_set; int pblk_submit_read(struct pblk *pblk, struct bio *bio); int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, unsigned int nr_secs, unsigned int *secs_to_gc, @@ -711,7 +803,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, */ void pblk_submit_rec(struct work_struct *work); struct pblk_line *pblk_recov_l2p(struct pblk *pblk); -void pblk_recov_pad(struct pblk *pblk); +int pblk_recov_pad(struct pblk *pblk); __le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta); int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, struct pblk_rec_ctx *recovery, u64 *comp_bits, @@ -720,33 +812,40 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, /* * pblk gc */ -#define PBLK_GC_TRIES 3 +#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ +#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ +#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ +#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); -int pblk_gc_status(struct pblk *pblk); +void pblk_gc_should_kick(struct pblk *pblk); +void pblk_gc_kick(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); -void pblk_gc_sysfs_force(struct pblk *pblk, int force); +int pblk_gc_sysfs_force(struct pblk *pblk, int force); /* * pblk rate limiter */ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); -int pblk_rl_gc_thrs(struct pblk_rl *rl); +int pblk_rl_high_thrs(struct pblk_rl *rl); +int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); int 
pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); +void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv); int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); +void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); +int pblk_rl_is_limit(struct pblk_rl *rl); /* * pblk sysfs @@ -774,9 +873,30 @@ static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) return c_ctx - sizeof(struct nvm_rq); } -static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta) +static inline void *emeta_to_bb(struct line_emeta *emeta) +{ + return emeta->bb_bitmap; +} + +static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta) +{ + return ((void *)emeta + pblk->lm.emeta_len[1]); +} + +static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) { - return (emeta) + 1; + return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); +} + +static inline int pblk_line_vsc(struct pblk_line *line) +{ + int vsc; + + spin_lock(&line->lock); + vsc = le32_to_cpu(*line->vsc); + spin_unlock(&line->lock); + + return vsc; } #define NVM_MEM_PAGE_WRITE (8) @@ -917,6 +1037,14 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) ppa_addr->ppa = ADDR_EMPTY; } +static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) +{ + if (lppa.ppa == rppa.ppa) + return true; + + return false; +} + static inline int pblk_addr_in_cache(struct ppa_addr ppa) { return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); @@ -964,11 +1092,11 @@ static inline struct ppa_addr addr_to_pblk_ppa(struct pblk 
*pblk, u64 paddr, } static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, - struct line_smeta *smeta) + struct line_header *header) { u32 crc = ~(u32)0; - crc = crc32_le(crc, (unsigned char *)smeta + sizeof(crc), + crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), sizeof(struct line_header) - sizeof(crc)); return crc; @@ -996,7 +1124,7 @@ static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, crc = crc32_le(crc, (unsigned char *)emeta + sizeof(struct line_header) + sizeof(crc), - lm->emeta_len - + lm->emeta_len[0] - sizeof(struct line_header) - sizeof(crc)); return crc; @@ -1016,9 +1144,27 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type) return flags; } -static inline int pblk_set_read_mode(struct pblk *pblk) +enum { + PBLK_READ_RANDOM = 0, + PBLK_READ_SEQUENTIAL = 1, +}; + +static inline int pblk_set_read_mode(struct pblk *pblk, int type) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int flags; + + flags = NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE; + if (type == PBLK_READ_SEQUENTIAL) + flags |= geo->plane_mode >> 1; + + return flags; +} + +static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) { - return NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE; + return !(nr_secs % pblk->min_write_pgs); } #ifdef CONFIG_NVM_DEBUG diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf0e28a0ff61..267f01ae87e4 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -279,8 +279,8 @@ static void rrpc_end_sync_bio(struct bio *bio) { struct completion *waiting = bio->bi_private; - if (bio->bi_error) - pr_err("nvm: gc request failed (%u).\n", bio->bi_error); + if (bio->bi_status) + pr_err("nvm: gc request failed (%u).\n", bio->bi_status); complete(waiting); } @@ -359,7 +359,7 @@ try: goto finished; } wait_for_completion_io(&wait); - if (bio->bi_error) { + if (bio->bi_status) { rrpc_inflight_laddr_release(rrpc, rqd); goto finished; } @@ -385,7 +385,7 @@ try: 
wait_for_completion_io(&wait); rrpc_inflight_laddr_release(rrpc, rqd); - if (bio->bi_error) + if (bio->bi_status) goto finished; bio_reset(bio); @@ -994,7 +994,7 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio) struct nvm_rq *rqd; int err; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (bio_op(bio) == REQ_OP_DISCARD) { rrpc_discard(rrpc, bio); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c3ea03c9a1a8..dee542fff68e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -849,10 +849,11 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_count_io_errors(struct cache *, int, const char *); +void bch_count_io_errors(struct cache *, blk_status_t, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - int, const char *); -void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); + blk_status_t, const char *); +void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, + const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 450d0e848ae4..866dcf78ff8e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -307,7 +307,7 @@ static void bch_btree_node_read(struct btree *b) bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); - if (bio->bi_error) + if (bio->bi_status) set_btree_node_io_error(b); bch_bbio_free(bio, b->c); @@ -374,10 +374,10 @@ static void btree_node_write_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io); - if (bio->bi_error) + if (bio->bi_status) set_btree_node_io_error(b); - bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree"); + bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); closure_put(cl); } diff --git 
a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 06f55056aaae..35a5a7210e51 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -110,7 +110,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_clone(bio, GFP_NOIO); + check = bio_clone_kmalloc(bio, GFP_NOIO); if (!check) return; check->bi_opf = REQ_OP_READ; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index db45a88c0ce9..6a9b85095e7b 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -50,7 +50,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ -void bch_count_io_errors(struct cache *ca, int error, const char *m) +void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) { /* * The halflife of an error is: @@ -103,7 +103,7 @@ void bch_count_io_errors(struct cache *ca, int error, const char *m) } void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - int error, const char *m) + blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); struct cache *ca = PTR_CACHE(c, &b->key, 0); @@ -132,7 +132,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, } void bch_bbio_endio(struct cache_set *c, struct bio *bio, - int error, const char *m) + blk_status_t error, const char *m) { struct closure *cl = bio->bi_private; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1198e53d5670..0352d05e495c 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -549,7 +549,7 @@ static void journal_write_endio(struct bio *bio) { struct journal_write *w = bio->bi_private; - cache_set_err_on(bio->bi_error, w->c, "journal io error"); + cache_set_err_on(bio->bi_status, w->c, "journal io error"); closure_put(&w->c->journal.io); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 
13b8a907006d..f633b30c962e 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -63,14 +63,14 @@ static void read_moving_endio(struct bio *bio) struct moving_io *io = container_of(bio->bi_private, struct moving_io, cl); - if (bio->bi_error) - io->op.error = bio->bi_error; + if (bio->bi_status) + io->op.status = bio->bi_status; else if (!KEY_DIRTY(&b->key) && ptr_stale(io->op.c, &b->key, 0)) { - io->op.error = -EINTR; + io->op.status = BLK_STS_IOERR; } - bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -92,7 +92,7 @@ static void write_moving(struct closure *cl) struct moving_io *io = container_of(cl, struct moving_io, cl); struct data_insert_op *op = &io->op; - if (!op->error) { + if (!op->status) { moving_init(io); io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 709c9cc34369..019b3df9f1c6 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -81,7 +81,7 @@ static void bch_data_insert_keys(struct closure *cl) if (ret == -ESRCH) { op->replace_collision = true; } else if (ret) { - op->error = -ENOMEM; + op->status = BLK_STS_RESOURCE; op->insert_data_done = true; } @@ -178,17 +178,17 @@ static void bch_data_insert_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - if (bio->bi_error) { + if (bio->bi_status) { /* TODO: We could try to recover from this. 
*/ if (op->writeback) - op->error = bio->bi_error; + op->status = bio->bi_status; else if (!op->replace) set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache"); + bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) @@ -488,15 +488,15 @@ static void bch_cache_read_endio(struct bio *bio) * from the backing device. */ - if (bio->bi_error) - s->iop.error = bio->bi_error; + if (bio->bi_status) + s->iop.status = bio->bi_status; else if (!KEY_DIRTY(&b->key) && ptr_stale(s->iop.c, &b->key, 0)) { atomic_long_inc(&s->iop.c->cache_read_races); - s->iop.error = -EINTR; + s->iop.status = BLK_STS_IOERR; } - bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache"); } /* @@ -593,9 +593,9 @@ static void request_endio(struct bio *bio) { struct closure *cl = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); - s->iop.error = bio->bi_error; + s->iop.status = bio->bi_status; /* Only cache read errors are recoverable */ s->recoverable = false; } @@ -611,7 +611,7 @@ static void bio_complete(struct search *s) &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); - s->orig_bio->bi_error = s->iop.error; + s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); s->orig_bio = NULL; } @@ -664,7 +664,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.inode = d->id; s->iop.write_point = hash_long((unsigned long) current, 16); s->iop.write_prio = 0; - s->iop.error = 0; + s->iop.status = 0; s->iop.flags = 0; s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; @@ -707,7 +707,7 @@ static void cached_dev_read_error(struct closure *cl) /* Retry from the backing device: */ 
trace_bcache_read_retry(s->orig_bio); - s->iop.error = 0; + s->iop.status = 0; do_bio_hook(s, s->orig_bio); /* XXX: invalidate cache */ @@ -767,7 +767,7 @@ static void cached_dev_read_done_bh(struct closure *cl) !s->cache_miss, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->iop.error) + if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->iop.bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 1ff36875c2b3..7689176951ce 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -10,7 +10,7 @@ struct data_insert_op { unsigned inode; uint16_t write_point; uint16_t write_prio; - short error; + blk_status_t status; union { uint16_t flags; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e57353e39168..8352fad765f6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -271,7 +271,7 @@ static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, bio->bi_error, "writing superblock"); + bch_count_io_errors(ca, bio->bi_status, "writing superblock"); closure_put(&ca->set->sb_write); } @@ -321,7 +321,7 @@ static void uuid_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct cache_set *c = container_of(cl, struct cache_set, uuid_write); - cache_set_err_on(bio->bi_error, c, "accessing uuids"); + cache_set_err_on(bio->bi_status, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } @@ -494,7 +494,7 @@ static void prio_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - cache_set_err_on(bio->bi_error, ca->set, "accessing priorities"); + cache_set_err_on(bio->bi_status, ca->set, "accessing priorities"); bch_bbio_free(bio, ca->set); closure_put(&ca->prio); } @@ -782,7 +782,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned 
block_size, minor *= BCACHE_MINORS; - if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { ida_simple_remove(&bcache_minor, minor); return -ENOMEM; @@ -1516,7 +1518,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sizeof(struct bbio) + sizeof(struct bio_vec) * bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || - !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ac2e48b9235..42c66e76f05e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -167,7 +167,7 @@ static void dirty_endio(struct bio *bio) struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; - if (bio->bi_error) + if (bio->bi_status) SET_KEY_DIRTY(&w->key, false); closure_put(&io->cl); @@ -195,7 +195,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - bio->bi_error, "reading dirty data from cache"); + bio->bi_status, "reading dirty data from cache"); dirty_endio(bio); } diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index ae7da2c30a57..82d27384d31f 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -229,7 +229,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell, int error) + struct dm_bio_prison_cell *cell, blk_status_t 
error) { struct bio_list bios; struct bio *bio; @@ -238,7 +238,7 @@ void dm_cell_error(struct dm_bio_prison *prison, dm_cell_release(prison, cell, &bios); while ((bio = bio_list_pop(&bios))) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } } diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h index cddd4ac07e2c..cec52ac5e1ae 100644 --- a/drivers/md/dm-bio-prison-v1.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -91,7 +91,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell, int error); + struct dm_bio_prison_cell *cell, blk_status_t error); /* * Visits the cell and then releases. Guarantees no new inmates are diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 840c1496b2b1..850ff6c67994 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -145,8 +145,8 @@ struct dm_buffer { enum data_mode data_mode; unsigned char list_mode; /* LIST_* */ unsigned hold_count; - int read_error; - int write_error; + blk_status_t read_error; + blk_status_t write_error; unsigned long state; unsigned long last_accessed; struct dm_bufio_client *c; @@ -555,7 +555,7 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_error = error ? -EIO : 0; + b->bio.bi_status = error ? 
BLK_STS_IOERR : 0; b->bio.bi_end_io(&b->bio); } @@ -588,7 +588,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, r = dm_io(&io_req, 1, &region, NULL); if (r) { - b->bio.bi_error = r; + b->bio.bi_status = errno_to_blk_status(r); end_io(&b->bio); } } @@ -596,7 +596,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, static void inline_endio(struct bio *bio) { bio_end_io_t *end_fn = bio->bi_private; - int error = bio->bi_error; + blk_status_t status = bio->bi_status; /* * Reset the bio to free any attached resources @@ -604,7 +604,7 @@ static void inline_endio(struct bio *bio) */ bio_reset(bio); - bio->bi_error = error; + bio->bi_status = status; end_fn(bio); } @@ -685,11 +685,12 @@ static void write_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->write_error = bio->bi_error; - if (unlikely(bio->bi_error)) { + b->write_error = bio->bi_status; + if (unlikely(bio->bi_status)) { struct dm_bufio_client *c = b->c; - int error = bio->bi_error; - (void)cmpxchg(&c->async_write_error, 0, error); + + (void)cmpxchg(&c->async_write_error, 0, + blk_status_to_errno(bio->bi_status)); } BUG_ON(!test_bit(B_WRITING, &b->state)); @@ -1063,7 +1064,7 @@ static void read_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->read_error = bio->bi_error; + b->read_error = bio->bi_status; BUG_ON(!test_bit(B_READING, &b->state)); @@ -1107,7 +1108,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { - int error = b->read_error; + int error = blk_status_to_errno(b->read_error); dm_bufio_release(b); @@ -1257,7 +1258,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); */ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { - int a, f; + blk_status_t a; + int f; unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; diff --git
a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d682a0511381..c5ea03fc7ee1 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -119,7 +119,7 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct continuation { struct work_struct ws; - int input; + blk_status_t input; }; static inline void init_continuation(struct continuation *k, @@ -145,7 +145,7 @@ struct batcher { /* * The operation that everyone is waiting for. */ - int (*commit_op)(void *context); + blk_status_t (*commit_op)(void *context); void *commit_context; /* @@ -171,8 +171,7 @@ struct batcher { static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); - - int r; + blk_status_t r; unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; @@ -205,7 +204,7 @@ static void __commit(struct work_struct *_ws) while ((bio = bio_list_pop(&bios))) { if (r) { - bio->bi_error = r; + bio->bi_status = r; bio_endio(bio); } else b->issue_op(bio, b->issue_context); @@ -213,7 +212,7 @@ static void __commit(struct work_struct *_ws) } static void batcher_init(struct batcher *b, - int (*commit_op)(void *), + blk_status_t (*commit_op)(void *), void *commit_context, void (*issue_op)(struct bio *bio, void *), void *issue_context, @@ -955,7 +954,7 @@ static void writethrough_endio(struct bio *bio) dm_unhook_bio(&pb->hook_info, bio); - if (bio->bi_error) { + if (bio->bi_status) { bio_endio(bio); return; } @@ -1220,7 +1219,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); if (read_err || write_err) - mg->k.input = -EIO; + mg->k.input = BLK_STS_IOERR; queue_continuation(mg->cache->wq, &mg->k); } @@ -1266,8 +1265,8 @@ static void overwrite_endio(struct bio *bio) dm_unhook_bio(&pb->hook_info, bio); - if (bio->bi_error) - mg->k.input = bio->bi_error; + if (bio->bi_status) 
+ mg->k.input = bio->bi_status; queue_continuation(mg->cache->wq, &mg->k); } @@ -1323,8 +1322,10 @@ static void mg_complete(struct dm_cache_migration *mg, bool success) if (mg->overwrite_bio) { if (success) force_set_dirty(cache, cblock); + else if (mg->k.input) + mg->overwrite_bio->bi_status = mg->k.input; else - mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + mg->overwrite_bio->bi_status = BLK_STS_IOERR; bio_endio(mg->overwrite_bio); } else { if (success) @@ -1504,7 +1505,7 @@ static void mg_copy(struct work_struct *ws) r = copy(mg, is_policy_promote); if (r) { DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = -EIO; + mg->k.input = BLK_STS_IOERR; mg_complete(mg, false); } } @@ -1907,12 +1908,12 @@ static int commit(struct cache *cache, bool clean_shutdown) /* * Used by the batcher. */ -static int commit_op(void *context) +static blk_status_t commit_op(void *context) { struct cache *cache = context; if (dm_cache_changed_this_transaction(cache->cmd)) - return commit(cache, false); + return errno_to_blk_status(commit(cache, false)); return 0; } @@ -2018,7 +2019,7 @@ static void requeue_deferred_bios(struct cache *cache) bio_list_init(&cache->deferred_bios); while ((bio = bio_list_pop(&bios))) { - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); } } @@ -2820,7 +2821,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } -static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) +static int cache_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct cache *cache = ti->private; unsigned long flags; @@ -2838,7 +2840,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) bio_drop_shared_lock(cache, bio); accounted_complete(cache, bio); - return 0; + return DM_ENDIO_DONE; } static int write_dirty_bitset(struct cache *cache) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 
ebf9e72d479b..9e1b72e8f7ef 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -71,7 +71,7 @@ struct dm_crypt_io { struct convert_context ctx; atomic_t io_pending; - int error; + blk_status_t error; sector_t sector; struct rb_node rb_node; @@ -1292,7 +1292,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ -static int crypt_convert(struct crypt_config *cc, +static blk_status_t crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { unsigned int tag_offset = 0; @@ -1343,13 +1343,13 @@ static int crypt_convert(struct crypt_config *cc, */ case -EBADMSG: atomic_dec(&ctx->cc_pending); - return -EILSEQ; + return BLK_STS_PROTECTION; /* * There was an error while processing the request. */ default: atomic_dec(&ctx->cc_pending); - return -EIO; + return BLK_STS_IOERR; } } @@ -1463,7 +1463,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *base_bio = io->base_bio; - int error = io->error; + blk_status_t error = io->error; if (!atomic_dec_and_test(&io->io_pending)) return; @@ -1476,7 +1476,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) else kfree(io->integrity_metadata); - base_bio->bi_error = error; + base_bio->bi_status = error; bio_endio(base_bio); } @@ -1502,7 +1502,7 @@ static void crypt_endio(struct bio *clone) struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; unsigned rw = bio_data_dir(clone); - int error; + blk_status_t error; /* * free the processed pages @@ -1510,7 +1510,7 @@ static void crypt_endio(struct bio *clone) if (rw == WRITE) crypt_free_buffer_pages(cc, clone); - error = clone->bi_error; + error = clone->bi_status; bio_put(clone); if (rw == READ && !error) { @@ -1570,7 +1570,7 @@ static void kcryptd_io_read_work(struct work_struct *work) crypt_inc_pending(io); if (kcryptd_io_read(io, GFP_NOIO)) - io->error = -ENOMEM; + 
io->error = BLK_STS_RESOURCE; crypt_dec_pending(io); } @@ -1656,7 +1656,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) sector_t sector; struct rb_node **rbp, *parent; - if (unlikely(io->error < 0)) { + if (unlikely(io->error)) { crypt_free_buffer_pages(cc, clone); bio_put(clone); crypt_dec_pending(io); @@ -1697,7 +1697,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) struct bio *clone; int crypt_finished; sector_t sector = io->sector; - int r; + blk_status_t r; /* * Prevent io from disappearing until this function completes. @@ -1707,7 +1707,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { - io->error = -EIO; + io->error = BLK_STS_IOERR; goto dec; } @@ -1718,7 +1718,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); - if (r < 0) + if (r) io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); @@ -1740,7 +1740,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io) static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; - int r = 0; + blk_status_t r; crypt_inc_pending(io); @@ -1748,7 +1748,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) io->sector); r = crypt_convert(cc, &io->ctx); - if (r < 0) + if (r) io->error = r; if (atomic_dec_and_test(&io->ctx.cc_pending)) @@ -1781,9 +1781,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (error == -EBADMSG) { DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); - io->error = -EILSEQ; + io->error = BLK_STS_PROTECTION; } else if (error < 0) - io->error = -EIO; + io->error = BLK_STS_IOERR; crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); @@ -2677,7 +2677,8 @@ static int crypt_ctr(struct dm_target *ti, 
unsigned int argc, char **argv) goto bad; } - cc->bs = bioset_create(MIN_IOS, 0); + cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad; @@ -2795,10 +2796,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) * and is aligned to this size as defined in IO hints. */ if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0)) - return -EIO; + return DM_MAPIO_KILL; if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1))) - return -EIO; + return DM_MAPIO_KILL; io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 13305a182611..3d04d5ce19d9 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -321,7 +321,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == READ) { if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags)) - return -EIO; + return DM_MAPIO_KILL; goto map_bio; } @@ -349,7 +349,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) /* * By default, error all I/O. 
*/ - return -EIO; + return DM_MAPIO_KILL; } map_bio: @@ -358,12 +358,13 @@ map_bio: return DM_MAPIO_REMAPPED; } -static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - if (!error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) { /* @@ -377,11 +378,11 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) * Error read during the down_interval if drop_writes * and error_writes were not configured. */ - return -EIO; + *error = BLK_STS_IOERR; } } - return error; + return DM_ENDIO_DONE; } static void flakey_status(struct dm_target *ti, status_type_t type, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 93b181088168..1b224aa9cf15 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -246,7 +246,7 @@ struct dm_integrity_io { unsigned metadata_offset; atomic_t in_flight; - int bi_error; + blk_status_t bi_status; struct completion *completion; @@ -1118,8 +1118,8 @@ static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io * static void do_endio(struct dm_integrity_c *ic, struct bio *bio) { int r = dm_integrity_failed(ic); - if (unlikely(r) && !bio->bi_error) - bio->bi_error = r; + if (unlikely(r) && !bio->bi_status) + bio->bi_status = errno_to_blk_status(r); bio_endio(bio); } @@ -1127,7 +1127,7 @@ static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *di { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); - if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + if (unlikely(dio->fua) && likely(!bio->bi_status) 
&& likely(!dm_integrity_failed(ic))) submit_flush_bio(ic, dio); else do_endio(ic, bio); @@ -1146,9 +1146,9 @@ static void dec_in_flight(struct dm_integrity_io *dio) bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); - if (unlikely(dio->bi_error) && !bio->bi_error) - bio->bi_error = dio->bi_error; - if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + if (unlikely(dio->bi_status) && !bio->bi_status) + bio->bi_status = dio->bi_status; + if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { dio->range.logical_sector += dio->range.n_sectors; bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); INIT_WORK(&dio->work, integrity_bio_wait); @@ -1322,7 +1322,7 @@ skip_io: dec_in_flight(dio); return; error: - dio->bi_error = r; + dio->bi_status = errno_to_blk_status(r); dec_in_flight(dio); } @@ -1335,7 +1335,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) sector_t area, offset; dio->ic = ic; - dio->bi_error = 0; + dio->bi_status = 0; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { submit_flush_bio(ic, dio); @@ -1356,13 +1356,13 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", (unsigned long long)dio->range.logical_sector, bio_sectors(bio), (unsigned long long)ic->provided_data_sectors); - return -EIO; + return DM_MAPIO_KILL; } if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", ic->sectors_per_block, (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); - return -EIO; + return DM_MAPIO_KILL; } if (ic->sectors_per_block > 1) { @@ -1372,7 +1372,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) if (unlikely((bv.bv_offset | bv.bv_len) & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) { DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary", bv.bv_offset, 
bv.bv_len, ic->sectors_per_block); - return -EIO; + return DM_MAPIO_KILL; } } } @@ -1387,18 +1387,18 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) wanted_tag_size *= ic->tag_size; if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); - return -EIO; + return DM_MAPIO_KILL; } } } else { if (unlikely(bip != NULL)) { DMERR("Unexpected integrity data when using internal hash"); - return -EIO; + return DM_MAPIO_KILL; } } if (unlikely(ic->mode == 'R') && unlikely(dio->write)) - return -EIO; + return DM_MAPIO_KILL; get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 8d5ca30f6551..25039607f3cb 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -58,7 +58,8 @@ struct dm_io_client *dm_io_client_create(void) if (!client->pool) goto bad; - client->bios = bioset_create(min_ios, 0); + client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS | + BIOSET_NEED_RESCUER)); if (!client->bios) goto bad; @@ -124,7 +125,7 @@ static void complete_io(struct io *io) fn(error_bits, context); } -static void dec_count(struct io *io, unsigned int region, int error) +static void dec_count(struct io *io, unsigned int region, blk_status_t error) { if (error) set_bit(region, &io->error_bits); @@ -137,9 +138,9 @@ static void endio(struct bio *bio) { struct io *io; unsigned region; - int error; + blk_status_t error; - if (bio->bi_error && bio_data_dir(bio) == READ) + if (bio->bi_status && bio_data_dir(bio) == READ) zero_fill_bio(bio); /* @@ -147,7 +148,7 @@ static void endio(struct bio *bio) */ retrieve_io_and_region_from_bio(bio, &io, &region); - error = bio->bi_error; + error = bio->bi_status; bio_put(bio); dec_count(io, region, error); @@ -319,7 +320,7 @@ static void do_region(int op, int op_flags, 
unsigned region, if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { atomic_inc(&io->count); - dec_count(io, region, -EOPNOTSUPP); + dec_count(io, region, BLK_STS_NOTSUPP); return; } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 4dfe38655a49..a1da0eb58a93 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -150,10 +150,10 @@ static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { unsigned long flags; - DMERR("Error writing log block, error=%d", bio->bi_error); + DMERR("Error writing log block, error=%d", bio->bi_status); spin_lock_irqsave(&lc->blocks_lock, flags); lc->logging_enabled = false; spin_unlock_irqrestore(&lc->blocks_lock, flags); @@ -586,7 +586,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) spin_lock_irq(&lc->blocks_lock); lc->logging_enabled = false; spin_unlock_irq(&lc->blocks_lock); - return -ENOMEM; + return DM_MAPIO_KILL; } INIT_LIST_HEAD(&block->list); pb->block = block; @@ -639,7 +639,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) spin_lock_irq(&lc->blocks_lock); lc->logging_enabled = false; spin_unlock_irq(&lc->blocks_lock); - return -ENOMEM; + return DM_MAPIO_KILL; } src = kmap_atomic(bv.bv_page); @@ -664,7 +664,8 @@ map_bio: return DM_MAPIO_REMAPPED; } -static int normal_end_io(struct dm_target *ti, struct bio *bio, int error) +static int normal_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct log_writes_c *lc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); @@ -686,7 +687,7 @@ static int normal_end_io(struct dm_target *ti, struct bio *bio, int error) spin_unlock_irqrestore(&lc->blocks_lock, flags); } - return error; + return DM_ENDIO_DONE; } /* diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 3df056b73b66..0e8ab5bb3575 
100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -559,13 +559,13 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_REQUEUE; dm_report_EIO(m); - return -EIO; + return DM_MAPIO_KILL; } mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; - bio->bi_error = 0; + bio->bi_status = 0; bio->bi_bdev = pgpath->path.dev->bdev; bio->bi_opf |= REQ_FAILFAST_TRANSPORT; @@ -621,11 +621,19 @@ static void process_queued_bios(struct work_struct *work) blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); - if (r < 0 || r == DM_MAPIO_REQUEUE) { - bio->bi_error = r; + switch (r) { + case DM_MAPIO_KILL: + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + case DM_MAPIO_REQUEUE: + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); - } else if (r == DM_MAPIO_REMAPPED) + break; + case DM_MAPIO_REMAPPED: generic_make_request(bio); + break; + } } blk_finish_plug(&plug); } @@ -1442,22 +1450,15 @@ static void activate_path_work(struct work_struct *work) activate_or_offline_path(pgpath); } -static int noretry_error(int error) +static int noretry_error(blk_status_t error) { switch (error) { - case -EBADE: - /* - * EBADE signals an reservation conflict. - * We shouldn't fail the path here as we can communicate with - * the target. We should failover to the next path, but in - * doing so we might be causing a ping-pong between paths. - * So just return the reservation conflict error. 
- */ - case -EOPNOTSUPP: - case -EREMOTEIO: - case -EILSEQ: - case -ENODATA: - case -ENOSPC: + case BLK_STS_NOTSUPP: + case BLK_STS_NOSPC: + case BLK_STS_TARGET: + case BLK_STS_NEXUS: + case BLK_STS_MEDIUM: + case BLK_STS_RESOURCE: return 1; } @@ -1466,7 +1467,7 @@ static int noretry_error(int error) } static int multipath_end_io(struct dm_target *ti, struct request *clone, - int error, union map_info *map_context) + blk_status_t error, union map_info *map_context) { struct dm_mpath_io *mpio = get_mpio(map_context); struct pgpath *pgpath = mpio->pgpath; @@ -1493,7 +1494,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (error == -EIO) + if (error == BLK_STS_IOERR) dm_report_EIO(m); /* complete with the original error */ r = DM_ENDIO_DONE; @@ -1510,24 +1511,26 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, return r; } -static int do_end_io_bio(struct multipath *m, struct bio *clone, - int error, struct dm_mpath_io *mpio) +static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, + blk_status_t *error) { + struct multipath *m = ti->private; + struct dm_mpath_io *mpio = get_mpio_from_bio(clone); + struct pgpath *pgpath = mpio->pgpath; unsigned long flags; + int r = DM_ENDIO_DONE; - if (!error) - return 0; /* I/O complete */ - - if (noretry_error(error)) - return error; + if (!*error || noretry_error(*error)) + goto done; - if (mpio->pgpath) - fail_path(mpio->pgpath); + if (pgpath) + fail_path(pgpath); if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { dm_report_EIO(m); - return -EIO; + *error = BLK_STS_IOERR; + goto done; } /* Queue for the daemon to resubmit */ @@ -1539,23 +1542,11 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) queue_work(kmultipathd, &m->process_queued_bios); - return 
DM_ENDIO_INCOMPLETE; -} - -static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error) -{ - struct multipath *m = ti->private; - struct dm_mpath_io *mpio = get_mpio_from_bio(clone); - struct pgpath *pgpath; - struct path_selector *ps; - int r; - - BUG_ON(!mpio); - - r = do_end_io_bio(m, clone, error, mpio); - pgpath = mpio->pgpath; + r = DM_ENDIO_INCOMPLETE; +done: if (pgpath) { - ps = &pgpath->pg->ps; + struct path_selector *ps = &pgpath->pg->ps; + if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4da8858856fb..a4fbd911d566 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -491,9 +491,9 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio) * If device is suspended, complete the bio. */ if (dm_noflush_suspending(ms->ti)) - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; else - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; @@ -627,7 +627,7 @@ static void write_callback(unsigned long error, void *context) * degrade the array. */ if (bio_op(bio) == REQ_OP_DISCARD) { - bio->bi_error = -EOPNOTSUPP; + bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); return; } @@ -1210,14 +1210,14 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) - return r; + return DM_MAPIO_KILL; /* * If region is not in-sync queue the bio. 
*/ if (!r || (r == -EWOULDBLOCK)) { if (bio->bi_opf & REQ_RAHEAD) - return -EWOULDBLOCK; + return DM_MAPIO_KILL; queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; @@ -1229,7 +1229,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) */ m = choose_mirror(ms, bio->bi_iter.bi_sector); if (unlikely(!m)) - return -EIO; + return DM_MAPIO_KILL; dm_bio_record(&bio_record->details, bio); bio_record->m = m; @@ -1239,7 +1239,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } -static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) +static int mirror_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { int rw = bio_data_dir(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; @@ -1255,16 +1256,16 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) if (!(bio->bi_opf & REQ_PREFLUSH) && bio_op(bio) != REQ_OP_DISCARD) dm_rh_dec(ms->rh, bio_record->write_region); - return error; + return DM_ENDIO_DONE; } - if (error == -EOPNOTSUPP) + if (*error == BLK_STS_NOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) + if (bio->bi_opf & REQ_RAHEAD) goto out; - if (unlikely(error)) { + if (unlikely(*error)) { if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary @@ -1272,7 +1273,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) * mirror in-sync. 
*/ DMERR_LIMIT("Mirror read failed."); - return -EIO; + return DM_ENDIO_DONE; } m = bio_record->m; @@ -1291,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; - bio->bi_error = 0; + bio->bi_status = 0; queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; @@ -1302,7 +1303,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) out: bio_record->details.bi_bdev = NULL; - return error; + return DM_ENDIO_DONE; } static void mirror_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b639fa7246ee..c6ebc5b1e00e 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -71,7 +71,7 @@ static void dm_old_start_queue(struct request_queue *q) static void dm_mq_start_queue(struct request_queue *q) { - blk_mq_start_stopped_hw_queues(q, true); + blk_mq_unquiesce_queue(q); blk_mq_kick_requeue_list(q); } @@ -119,7 +119,7 @@ static void end_clone_bio(struct bio *clone) struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; unsigned int nr_bytes = info->orig->bi_iter.bi_size; - int error = clone->bi_error; + blk_status_t error = clone->bi_status; bio_put(clone); @@ -158,7 +158,7 @@ static void end_clone_bio(struct bio *clone) * Do not use blk_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ - blk_update_request(tio->orig, 0, nr_bytes); + blk_update_request(tio->orig, BLK_STS_OK, nr_bytes); } static struct dm_rq_target_io *tio_from_request(struct request *rq) @@ -216,7 +216,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * Must be called without clone's queue lock held, * see end_clone_request() for more details. 
*/ -static void dm_end_request(struct request *clone, int error) +static void dm_end_request(struct request *clone, blk_status_t error) { int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; @@ -285,7 +285,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ rq_completed(md, rw, false); } -static void dm_done(struct request *clone, int error, bool mapped) +static void dm_done(struct request *clone, blk_status_t error, bool mapped) { int r = DM_ENDIO_DONE; struct dm_rq_target_io *tio = clone->end_io_data; @@ -298,7 +298,7 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } - if (unlikely(error == -EREMOTEIO)) { + if (unlikely(error == BLK_STS_TARGET)) { if (req_op(clone) == REQ_OP_WRITE_SAME && !clone->q->limits.max_write_same_sectors) disable_write_same(tio->md); @@ -358,7 +358,7 @@ static void dm_softirq_done(struct request *rq) * Complete the clone and the original request with the error status * through softirq context. */ -static void dm_complete_request(struct request *rq, int error) +static void dm_complete_request(struct request *rq, blk_status_t error) { struct dm_rq_target_io *tio = tio_from_request(rq); @@ -375,7 +375,7 @@ static void dm_complete_request(struct request *rq, int error) * Target's rq_end_io() function isn't called. * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 
*/ -static void dm_kill_unmapped_request(struct request *rq, int error) +static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) { rq->rq_flags |= RQF_FAILED; dm_complete_request(rq, error); @@ -384,7 +384,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) /* * Called with the clone's queue lock held (in the case of .request_fn) */ -static void end_clone_request(struct request *clone, int error) +static void end_clone_request(struct request *clone, blk_status_t error) { struct dm_rq_target_io *tio = clone->end_io_data; @@ -401,7 +401,7 @@ static void end_clone_request(struct request *clone, int error) static void dm_dispatch_clone_request(struct request *clone, struct request *rq) { - int r; + blk_status_t r; if (blk_queue_io_stat(clone->q)) clone->rq_flags |= RQF_IO_STAT; @@ -506,7 +506,7 @@ static int map_request(struct dm_rq_target_io *tio) break; case DM_MAPIO_KILL: /* The target wants to complete the I/O */ - dm_kill_unmapped_request(rq, -EIO); + dm_kill_unmapped_request(rq, BLK_STS_IOERR); break; default: DMWARN("unimplemented target map return value: %d", r); @@ -727,7 +727,7 @@ static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return __dm_rq_init_rq(set->driver_data, rq); } -static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; @@ -744,7 +744,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, } if (ti->type->busy && ti->type->busy(ti)) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; dm_start_request(md, rq); @@ -762,10 +762,10 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, rq_end_stats(md, rq); rq_completed(md, rq_data_dir(rq), false); blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static const struct blk_mq_ops dm_mq_ops = 
{ diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index f0020d21b95f..9813922e4fe5 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -24,7 +24,7 @@ struct dm_rq_target_io { struct dm_target *ti; struct request *orig, *clone; struct kthread_work work; - int error; + blk_status_t error; union map_info info; struct dm_stats_aux stats_aux; unsigned long duration_jiffies; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index e152d9817c81..1ba41048b438 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1590,7 +1590,7 @@ static void full_bio_end_io(struct bio *bio) { void *callback_data = bio->bi_private; - dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0); + dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0); } static void start_full_bio(struct dm_snap_pending_exception *pe, @@ -1690,7 +1690,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. 
*/ if (!s->valid) - return -EIO; + return DM_MAPIO_KILL; /* FIXME: should only take write lock if we need * to copy an exception */ @@ -1698,7 +1698,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { - r = -EIO; + r = DM_MAPIO_KILL; goto out_unlock; } @@ -1723,7 +1723,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); - r = -EIO; + r = DM_MAPIO_KILL; goto out_unlock; } @@ -1741,7 +1741,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) DMERR("Snapshot overflowed: Unable to allocate exception."); } else __invalidate_snapshot(s, -ENOMEM); - r = -EIO; + r = DM_MAPIO_KILL; goto out_unlock; } } @@ -1851,14 +1851,15 @@ out_unlock: return r; } -static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error) +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct dm_snapshot *s = ti->private; if (is_bio_tracked(bio)) stop_tracking_chunk(s, bio); - return 0; + return DM_ENDIO_DONE; } static void snapshot_merge_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 75152482f3ad..11621a0af887 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -375,20 +375,21 @@ static void stripe_status(struct dm_target *ti, status_type_t type, } } -static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { unsigned i; char major_minor[16]; struct stripe_c *sc = ti->private; - if (!error) - return 0; /* I/O complete */ + if (!*error) + return DM_ENDIO_DONE; /* I/O complete */ - if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) - return error; + if (bio->bi_opf & REQ_RAHEAD) + return DM_ENDIO_DONE; - if (error == -EOPNOTSUPP) - return error; + if (*error == 
BLK_STS_NOTSUPP) + return DM_ENDIO_DONE; memset(major_minor, 0, sizeof(major_minor)); sprintf(major_minor, "%d:%d", @@ -409,7 +410,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) schedule_work(&sc->trigger_event); } - return error; + return DM_ENDIO_DONE; } static int stripe_iterate_devices(struct dm_target *ti, diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index b242b750542f..c0d7e60820c4 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -128,7 +128,7 @@ static void io_err_dtr(struct dm_target *tt) static int io_err_map(struct dm_target *tt, struct bio *bio) { - return -EIO; + return DM_MAPIO_KILL; } static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 28808e5ec0fd..9dec2f8cc739 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -383,8 +383,8 @@ static void end_discard(struct discard_op *op, int r) * Even if r is set, there could be sub discards in flight that we * need to wait for. */ - if (r && !op->parent_bio->bi_error) - op->parent_bio->bi_error = r; + if (r && !op->parent_bio->bi_status) + op->parent_bio->bi_status = errno_to_blk_status(r); bio_endio(op->parent_bio); } @@ -450,22 +450,20 @@ static void cell_release_no_holder(struct pool *pool, } static void cell_error_with_code(struct pool *pool, - struct dm_bio_prison_cell *cell, int error_code) + struct dm_bio_prison_cell *cell, blk_status_t error_code) { dm_cell_error(pool->prison, cell, error_code); dm_bio_prison_free_cell(pool->prison, cell); } -static int get_pool_io_error_code(struct pool *pool) +static blk_status_t get_pool_io_error_code(struct pool *pool) { - return pool->out_of_data_space ? -ENOSPC : -EIO; + return pool->out_of_data_space ? 
BLK_STS_NOSPC : BLK_STS_IOERR; } static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) { - int error = get_pool_io_error_code(pool); - - cell_error_with_code(pool, cell, error); + cell_error_with_code(pool, cell, get_pool_io_error_code(pool)); } static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -475,7 +473,7 @@ static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) { - cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); + cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE); } /*----------------------------------------------------------------*/ @@ -555,17 +553,18 @@ static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) bio_list_init(master); } -static void error_bio_list(struct bio_list *bios, int error) +static void error_bio_list(struct bio_list *bios, blk_status_t error) { struct bio *bio; while ((bio = bio_list_pop(bios))) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } } -static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, + blk_status_t error) { struct bio_list bios; unsigned long flags; @@ -608,11 +607,11 @@ static void requeue_io(struct thin_c *tc) __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - error_bio_list(&bios, DM_ENDIO_REQUEUE); + error_bio_list(&bios, BLK_STS_DM_REQUEUE); requeue_deferred_cells(tc); } -static void error_retry_list_with_code(struct pool *pool, int error) +static void error_retry_list_with_code(struct pool *pool, blk_status_t error) { struct thin_c *tc; @@ -624,9 +623,7 @@ static void error_retry_list_with_code(struct pool *pool, int error) static void error_retry_list(struct pool *pool) { - int error = get_pool_io_error_code(pool); - - error_retry_list_with_code(pool, error); + 
error_retry_list_with_code(pool, get_pool_io_error_code(pool)); } /* @@ -774,7 +771,7 @@ struct dm_thin_new_mapping { */ atomic_t prepare_actions; - int err; + blk_status_t status; struct thin_c *tc; dm_block_t virt_begin, virt_end; dm_block_t data_block; @@ -814,7 +811,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) { struct dm_thin_new_mapping *m = context; - m->err = read_err || write_err ? -EIO : 0; + m->status = read_err || write_err ? BLK_STS_IOERR : 0; complete_mapping_preparation(m); } @@ -825,7 +822,7 @@ static void overwrite_endio(struct bio *bio) bio->bi_end_io = m->saved_bi_end_io; - m->err = bio->bi_error; + m->status = bio->bi_status; complete_mapping_preparation(m); } @@ -925,7 +922,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) struct bio *bio = m->bio; int r; - if (m->err) { + if (m->status) { cell_error(pool, m->cell); goto out; } @@ -1495,7 +1492,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&tc->lock, flags); } -static int should_error_unserviceable_bio(struct pool *pool) +static blk_status_t should_error_unserviceable_bio(struct pool *pool) { enum pool_mode m = get_pool_mode(pool); @@ -1503,27 +1500,27 @@ static int should_error_unserviceable_bio(struct pool *pool) case PM_WRITE: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); - return -EIO; + return BLK_STS_IOERR; case PM_OUT_OF_DATA_SPACE: - return pool->pf.error_if_no_space ? -ENOSPC : 0; + return pool->pf.error_if_no_space ? 
BLK_STS_NOSPC : 0; case PM_READ_ONLY: case PM_FAIL: - return -EIO; + return BLK_STS_IOERR; default: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); - return -EIO; + return BLK_STS_IOERR; } } static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { - int error = should_error_unserviceable_bio(pool); + blk_status_t error = should_error_unserviceable_bio(pool); if (error) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } else retry_on_resume(bio); @@ -1533,7 +1530,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c { struct bio *bio; struct bio_list bios; - int error; + blk_status_t error; error = should_error_unserviceable_bio(pool); if (error) { @@ -2071,7 +2068,8 @@ static void process_thin_deferred_bios(struct thin_c *tc) unsigned count = 0; if (tc->requeue_mode) { - error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); + error_thin_bio_list(tc, &tc->deferred_bio_list, + BLK_STS_DM_REQUEUE); return; } @@ -2322,7 +2320,7 @@ static void do_no_space_timeout(struct work_struct *ws) if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { pool->pf.error_if_no_space = true; notify_of_pool_mode_change_to_oods(pool); - error_retry_list_with_code(pool, -ENOSPC); + error_retry_list_with_code(pool, BLK_STS_NOSPC); } } @@ -2624,7 +2622,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) thin_hook_bio(tc, bio); if (tc->requeue_mode) { - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -4177,7 +4175,8 @@ static int thin_map(struct dm_target *ti, struct bio *bio) return thin_bio_map(ti, bio); } -static int thin_endio(struct dm_target *ti, struct bio *bio, int err) +static int thin_endio(struct dm_target *ti, struct bio *bio, + blk_status_t *err) { unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct 
dm_thin_endio_hook)); @@ -4212,7 +4211,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) if (h->cell) cell_defer_no_holder(h->tc, h->cell); - return 0; + return DM_ENDIO_DONE; } static void thin_presuspend(struct dm_target *ti) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 1ec9b2c51c07..b46705ebf01f 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -538,13 +538,13 @@ static int verity_verify_io(struct dm_verity_io *io) /* * End one "io" structure with a given error. */ -static void verity_finish_io(struct dm_verity_io *io, int error) +static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) { struct dm_verity *v = io->v; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_error = error; + bio->bi_status = status; verity_fec_finish_io(io); @@ -555,15 +555,15 @@ static void verity_work(struct work_struct *w) { struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); - verity_finish_io(io, verity_verify_io(io)); + verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error && !verity_fec_is_enabled(io->v)) { - verity_finish_io(io, bio->bi_error); + if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + verity_finish_io(io, bio->bi_status); return; } @@ -643,17 +643,17 @@ static int verity_map(struct dm_target *ti, struct bio *bio) if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { DMERR_LIMIT("unaligned io"); - return -EIO; + return DM_MAPIO_KILL; } if (bio_end_sector(bio) >> (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { DMERR_LIMIT("io out of range"); - return -EIO; + return DM_MAPIO_KILL; } if (bio_data_dir(bio) == WRITE) - return -EIO; + return DM_MAPIO_KILL; io = dm_per_bio_data(bio, 
ti->per_io_data_size); io->v = v; diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index b616f11d8473..b65ca8dcfbdc 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -39,7 +39,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) case REQ_OP_READ: if (bio->bi_opf & REQ_RAHEAD) { /* readahead of null bytes only wastes buffer cache */ - return -EIO; + return DM_MAPIO_KILL; } zero_fill_bio(bio); break; @@ -47,7 +47,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) /* writes get silently dropped */ break; default: - return -EIO; + return DM_MAPIO_KILL; } bio_endio(bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 37ccd73c79ec..402946035308 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -63,7 +63,7 @@ static struct workqueue_struct *deferred_remove_workqueue; */ struct dm_io { struct mapped_device *md; - int error; + blk_status_t status; atomic_t io_count; struct bio *bio; unsigned long start_time; @@ -768,23 +768,24 @@ static int __noflush_suspending(struct mapped_device *md) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -static void dec_pending(struct dm_io *io, int error) +static void dec_pending(struct dm_io *io, blk_status_t error) { unsigned long flags; - int io_error; + blk_status_t io_error; struct bio *bio; struct mapped_device *md = io->md; /* Push-back supersedes any I/O errors */ if (unlikely(error)) { spin_lock_irqsave(&io->endio_lock, flags); - if (!(io->error > 0 && __noflush_suspending(md))) - io->error = error; + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(md))) + io->status = error; spin_unlock_irqrestore(&io->endio_lock, flags); } if (atomic_dec_and_test(&io->io_count)) { - if (io->error == DM_ENDIO_REQUEUE) { + if (io->status == BLK_STS_DM_REQUEUE) { /* * Target requested pushing back the I/O. 
*/ @@ -793,16 +794,16 @@ static void dec_pending(struct dm_io *io, int error) bio_list_add_head(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ - io->error = -EIO; + io->status = BLK_STS_IOERR; spin_unlock_irqrestore(&md->deferred_lock, flags); } - io_error = io->error; + io_error = io->status; bio = io->bio; end_io_acct(io); free_io(md, io); - if (io_error == DM_ENDIO_REQUEUE) + if (io_error == BLK_STS_DM_REQUEUE) return; if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { @@ -814,7 +815,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - bio->bi_error = io_error; + bio->bi_status = io_error; bio_endio(bio); } } @@ -838,31 +839,13 @@ void disable_write_zeroes(struct mapped_device *md) static void clone_endio(struct bio *bio) { - int error = bio->bi_error; - int r = error; + blk_status_t error = bio->bi_status; struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (endio) { - r = endio(tio->ti, bio, error); - if (r < 0 || r == DM_ENDIO_REQUEUE) - /* - * error and requeue request are handled - * in dec_pending(). 
- */ - error = r; - else if (r == DM_ENDIO_INCOMPLETE) - /* The target will handle the io */ - return; - else if (r) { - DMWARN("unimplemented target endio return value: %d", r); - BUG(); - } - } - - if (unlikely(r == -EREMOTEIO)) { + if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) disable_write_same(md); @@ -871,6 +854,23 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } + if (endio) { + int r = endio(tio->ti, bio, &error); + switch (r) { + case DM_ENDIO_REQUEUE: + error = BLK_STS_DM_REQUEUE; + /*FALLTHRU*/ + case DM_ENDIO_DONE: + break; + case DM_ENDIO_INCOMPLETE: + /* The target will handle the io */ + return; + default: + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } + } + free_tio(tio); dec_pending(io, error); } @@ -1036,7 +1036,8 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) while ((bio = bio_list_pop(&list))) { struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { + if (unlikely(!bs) || bs == fs_bio_set || + !bs->rescue_workqueue) { bio_list_add(&current->bio_list[i], bio); continue; } @@ -1084,18 +1085,24 @@ static void __map_bio(struct dm_target_io *tio) r = ti->type->map(ti, clone); dm_offload_end(&o); - if (r == DM_MAPIO_REMAPPED) { + switch (r) { + case DM_MAPIO_SUBMITTED: + break; + case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, tio->io->bio->bi_bdev->bd_dev, sector); - generic_make_request(clone); - } else if (r < 0 || r == DM_MAPIO_REQUEUE) { - /* error the io and bail out, or requeue it if needed */ - dec_pending(tio->io, r); + break; + case DM_MAPIO_KILL: + dec_pending(tio->io, BLK_STS_IOERR); + free_tio(tio); + break; + case DM_MAPIO_REQUEUE: + dec_pending(tio->io, BLK_STS_DM_REQUEUE); free_tio(tio); - } else if (r != DM_MAPIO_SUBMITTED) { + break; + default: 
DMWARN("unimplemented target map return value: %d", r); BUG(); } @@ -1360,7 +1367,7 @@ static void __split_and_process_bio(struct mapped_device *md, ci.map = map; ci.md = md; ci.io = alloc_io(md); - ci.io->error = 0; + ci.io->status = 0; atomic_set(&ci.io->io_count, 1); ci.io->bio = bio; ci.io->md = md; @@ -1527,7 +1534,6 @@ void dm_init_normal_md_queue(struct mapped_device *md) * Initialize aspects of queue that aren't relevant for blk-mq */ md->queue->backing_dev_info->congested_fn = dm_any_congested; - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); } static void cleanup_mapped_device(struct mapped_device *md) @@ -2654,7 +2660,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu BUG(); } - pools->bs = bioset_create_nobvec(pool_size, front_pad); + pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER); if (!pools->bs) goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index 84e76ebac4d4..31bcbfb09fef 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -185,7 +185,7 @@ static int start_readonly; static bool create_on_open = true; /* bio_clone_mddev - * like bio_clone, but with a local bio set + * like bio_clone_bioset, but with a local bio set */ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, @@ -265,7 +265,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) unsigned int sectors; int cpu; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); @@ -273,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) } if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) - bio->bi_error = -EROFS; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } @@ -719,8 +719,8 @@ static void super_written(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; - if (bio->bi_error) { - pr_err("md: super_written 
gets error=%d\n", bio->bi_error); + if (bio->bi_status) { + pr_err("md: super_written gets error=%d\n", bio->bi_status); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -801,7 +801,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, submit_bio_wait(bio); - ret = !bio->bi_error; + ret = !bio->bi_status; bio_put(bio); return ret; } @@ -5428,7 +5428,7 @@ int md_run(struct mddev *mddev) } if (mddev->bio_set == NULL) { - mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!mddev->bio_set) return -ENOMEM; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e95d521d93e9..68d036e64041 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -73,12 +73,12 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) +static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) { struct bio *bio = mp_bh->master_bio; struct mpconf *conf = mp_bh->mddev->private; - bio->bi_error = err; + bio->bi_status = status; bio_endio(bio); mempool_free(mp_bh, conf->pool); } @@ -89,7 +89,7 @@ static void multipath_end_request(struct bio *bio) struct mpconf *conf = mp_bh->mddev->private; struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - if (!bio->bi_error) + if (!bio->bi_status) multipath_end_bh_io(mp_bh, 0); else if (!(bio->bi_opf & REQ_RAHEAD)) { /* @@ -102,7 +102,7 @@ static void multipath_end_request(struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else - multipath_end_bh_io(mp_bh, bio->bi_error); + multipath_end_bh_io(mp_bh, bio->bi_status); rdev_dec_pending(rdev, conf->mddev); } @@ -347,7 +347,7 @@ static void multipathd(struct md_thread *thread) pr_err("multipath: %s: 
unrecoverable IO read error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_iter.bi_sector); - multipath_end_bh_io(mp_bh, -EIO); + multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { pr_err("multipath: %s: redirecting sector %llu to another IO path\n", bdevname(bio->bi_bdev,b), diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e1a7e3d4c5e4..98ca2c1d3226 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -277,7 +277,7 @@ static void call_bio_endio(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); /* @@ -335,7 +335,7 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) static void raid1_end_read_request(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r1bio *r1_bio = bio->bi_private; struct r1conf *conf = r1_bio->mddev->private; struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; @@ -426,12 +426,12 @@ static void raid1_end_write_request(struct bio *bio) struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; - discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_error && !discard_error) { + if (bio->bi_status && !discard_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -802,7 +802,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1856,7 +1856,7 @@ static void end_sync_read(struct bio *bio) * or 
re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (!bio->bi_error) + if (!bio->bi_status) set_bit(R1BIO_Uptodate, &r1_bio->state); if (atomic_dec_and_test(&r1_bio->remaining)) @@ -1865,7 +1865,7 @@ static void end_sync_read(struct bio *bio) static void end_sync_write(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2058,7 +2058,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) idx ++; } set_bit(R1BIO_Uptodate, &r1_bio->state); - bio->bi_error = 0; + bio->bi_status = 0; return 1; } @@ -2082,16 +2082,16 @@ static void process_checks(struct r1bio *r1_bio) for (i = 0; i < conf->raid_disks * 2; i++) { int j; int size; - int error; + blk_status_t status; struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) continue; /* fixup the bio for reuse, but preserve errno */ - error = b->bi_error; + status = b->bi_status; bio_reset(b); - b->bi_error = error; + b->bi_status = status; b->bi_vcnt = vcnt; b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + @@ -2113,7 +2113,7 @@ static void process_checks(struct r1bio *r1_bio) } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - !r1_bio->bios[primary]->bi_error) { + !r1_bio->bios[primary]->bi_status) { r1_bio->bios[primary]->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[primary].rdev, mddev); break; @@ -2123,7 +2123,7 @@ static void process_checks(struct r1bio *r1_bio) int j; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - int error = sbio->bi_error; + blk_status_t status = sbio->bi_status; struct page **ppages = get_resync_pages(pbio)->pages; struct page **spages = get_resync_pages(sbio)->pages; struct bio_vec 
*bi; @@ -2132,12 +2132,12 @@ static void process_checks(struct r1bio *r1_bio) if (sbio->bi_end_io != end_sync_read) continue; /* Now we can 'fixup' the error value */ - sbio->bi_error = 0; + sbio->bi_status = 0; bio_for_each_segment_all(bi, sbio, j) page_len[j] = bi->bv_len; - if (!error) { + if (!status) { for (j = vcnt; j-- ; ) { if (memcmp(page_address(ppages[j]), page_address(spages[j]), @@ -2149,7 +2149,7 @@ static void process_checks(struct r1bio *r1_bio) if (j >= 0) atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && !error)) { + && !status)) { /* No need to write to this device. */ sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); @@ -2400,11 +2400,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) continue; - if (!bio->bi_error && + if (!bio->bi_status && test_bit(R1BIO_MadeGood, &r1_bio->state)) { rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } - if (bio->bi_error && + if (bio->bi_status && test_bit(R1BIO_WriteError, &r1_bio->state)) { if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) md_error(conf->mddev, rdev); @@ -2955,7 +2955,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->r1bio_pool) goto abort; - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto abort; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 797ed60abd5e..57a250fdbbcc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -336,7 +336,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct r10conf *conf = r10_bio->mddev->private; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); /* @@ -389,7 +389,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, static void 
raid10_end_read_request(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; int slot, dev; struct md_rdev *rdev; @@ -477,7 +477,7 @@ static void raid10_end_write_request(struct bio *bio) struct bio *to_put = NULL; bool discard_error; - discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -491,7 +491,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_error && !discard_error) { + if (bio->bi_status && !discard_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -913,7 +913,7 @@ static void flush_pending_writes(struct r10conf *conf) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1098,7 +1098,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1888,7 +1888,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) { struct r10conf *conf = r10_bio->mddev->private; - if (!bio->bi_error) + if (!bio->bi_status) set_bit(R10BIO_Uptodate, &r10_bio->state); else /* The write handler will notice the lack of @@ -1972,7 +1972,7 @@ static void end_sync_write(struct bio *bio) else rdev = conf->mirrors[d].rdev; - if (bio->bi_error) { + if (bio->bi_status) { if (repl) md_error(mddev, rdev); else { @@ -2021,7 +2021,7 @@ 
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) /* find the first device with a block */ for (i=0; i<conf->copies; i++) - if (!r10_bio->devs[i].bio->bi_error) + if (!r10_bio->devs[i].bio->bi_status) break; if (i == conf->copies) @@ -2050,7 +2050,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tpages = get_resync_pages(tbio)->pages; d = r10_bio->devs[i].devnum; rdev = conf->mirrors[d].rdev; - if (!r10_bio->devs[i].bio->bi_error) { + if (!r10_bio->devs[i].bio->bi_status) { /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; @@ -2633,7 +2633,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev = conf->mirrors[dev].rdev; if (r10_bio->devs[m].bio == NULL) continue; - if (!r10_bio->devs[m].bio->bi_error) { + if (!r10_bio->devs[m].bio->bi_status) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2649,7 +2649,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (r10_bio->devs[m].repl_bio == NULL) continue; - if (!r10_bio->devs[m].repl_bio->bi_error) { + if (!r10_bio->devs[m].repl_bio->bi_status) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2675,7 +2675,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) r10_bio->devs[m].addr, r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); - } else if (bio != NULL && bio->bi_error) { + } else if (bio != NULL && bio->bi_status) { fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); @@ -3267,7 +3267,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { @@ -3309,7 
+3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; sector = r10_bio->devs[i].addr; bio->bi_next = biolist; @@ -3375,7 +3375,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); - bio->bi_error = 0; + bio->bi_status = 0; generic_make_request(bio); } } @@ -3552,7 +3552,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) if (!conf->r10bio_pool) goto out; - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto out; @@ -4397,7 +4397,7 @@ read_more: read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); - read_bio->bi_error = 0; + read_bio->bi_status = 0; read_bio->bi_vcnt = 0; read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; @@ -4641,7 +4641,7 @@ static void end_reshape_write(struct bio *bio) rdev = conf->mirrors[d].rdev; } - if (bio->bi_error) { + if (bio->bi_status) { /* FIXME should record badblock */ md_error(mddev, rdev); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0a7af8b0a80a..bfa1e907c472 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -572,7 +572,7 @@ static void r5l_log_endio(struct bio *bio) struct r5l_log *log = io->log; unsigned long flags; - if (bio->bi_error) + if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); bio_put(bio); @@ -1247,7 +1247,7 @@ static void r5l_log_flush_endio(struct bio *bio) unsigned long flags; struct r5l_io_unit *io; - if (bio->bi_error) + if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); spin_lock_irqsave(&log->io_list_lock, flags); @@ -3063,7 +3063,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev 
*rdev) if (!log->io_pool) goto io_pool; - log->bs = bioset_create(R5L_POOL_SIZE, 0); + log->bs = bioset_create(R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!log->bs) goto io_bs; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index ccce92e68d7f..77cce3573aa8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -397,7 +397,7 @@ static void ppl_log_endio(struct bio *bio) pr_debug("%s: seq: %llu\n", __func__, io->seq); - if (bio->bi_error) + if (bio->bi_status) md_error(ppl_conf->mddev, log->rdev); list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { @@ -1150,7 +1150,7 @@ int ppl_init_log(struct r5conf *conf) goto err; } - ppl_conf->bs = bioset_create(conf->raid_disks, 0); + ppl_conf->bs = bioset_create(conf->raid_disks, 0, 0); if (!ppl_conf->bs) { ret = -ENOMEM; goto err; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ec0f951ae19f..62c965be97e1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2476,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi) pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2496,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (!bi->bi_error) { + if (!bi->bi_status) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2613,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi) } pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2621,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi) } if (replacement) { - if (bi->bi_error) + if (bi->bi_status) md_error(conf->mddev, rdev); 
else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (bi->bi_error) { + if (bi->bi_status) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2649,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && bi->bi_error && !replacement) + if (sh->batch_head && bi->bi_status && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); bio_reset(bi); @@ -3381,7 +3381,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = nextbi; @@ -3403,7 +3403,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = bi2; @@ -3429,7 +3429,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; bio_endio(bi); bi = nextbi; } @@ -5154,7 +5154,7 @@ static void raid5_align_endio(struct bio *bi) struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; - int error = bi->bi_error; + blk_status_t error = bi->bi_status; bio_put(bi); @@ -5731,7 +5731,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; break; } } @@ -6943,7 +6943,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } - conf->bio_split = 
bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto abort; conf->mddev = mddev; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 99e651c27fb7..22de7f5ed032 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1921,12 +1921,13 @@ static void msb_io_work(struct work_struct *work) spin_lock_irqsave(&msb->q_lock, flags); if (len) - if (!__blk_end_request(msb->req, 0, len)) + if (!__blk_end_request(msb->req, BLK_STS_OK, len)) msb->req = NULL; if (error && msb->req) { + blk_status_t ret = errno_to_blk_status(error); dbg_verbose("IO: ending one sector of the request with error"); - if (!__blk_end_request(msb->req, error, msb->page_size)) + if (!__blk_end_request(msb->req, ret, msb->page_size)) msb->req = NULL; } @@ -2014,7 +2015,7 @@ static void msb_submit_req(struct request_queue *q) WARN_ON(!msb->io_queue_stopped); while ((req = blk_fetch_request(q)) != NULL) - __blk_end_request_all(req, -ENODEV); + __blk_end_request_all(req, BLK_STS_IOERR); return; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c00d8a266878..8897962781bb 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -709,7 +709,8 @@ try_again: msb->req_sg); if (!msb->seg_count) { - chunk = __blk_end_request_cur(msb->block_req, -ENOMEM); + chunk = __blk_end_request_cur(msb->block_req, + BLK_STS_RESOURCE); continue; } @@ -776,7 +777,8 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error) if (error && !t_len) t_len = blk_rq_cur_bytes(msb->block_req); - chunk = __blk_end_request(msb->block_req, error, t_len); + chunk = __blk_end_request(msb->block_req, + errno_to_blk_status(error), t_len); error = mspro_block_issue_req(card, chunk); @@ -838,7 +840,7 @@ static void mspro_block_submit_req(struct request_queue *q) if (msb->eject) { while ((req = blk_fetch_request(q)) != 
NULL) - __blk_end_request_all(req, -ENODEV); + __blk_end_request_all(req, BLK_STS_IOERR); return; } diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 8273b078686d..6ff94a948a4b 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1184,9 +1184,10 @@ static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req) struct mmc_card *card = md->queue.card; unsigned int from, nr, arg; int err = 0, type = MMC_BLK_DISCARD; + blk_status_t status = BLK_STS_OK; if (!mmc_can_erase(card)) { - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; goto fail; } @@ -1212,10 +1213,12 @@ static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req) if (!err) err = mmc_erase(card, from, nr, arg); } while (err == -EIO && !mmc_blk_reset(md, card->host, type)); - if (!err) + if (err) + status = BLK_STS_IOERR; + else mmc_blk_reset_success(md, type); fail: - blk_end_request(req, err, blk_rq_bytes(req)); + blk_end_request(req, status, blk_rq_bytes(req)); } static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq, @@ -1225,9 +1228,10 @@ static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq, struct mmc_card *card = md->queue.card; unsigned int from, nr, arg; int err = 0, type = MMC_BLK_SECDISCARD; + blk_status_t status = BLK_STS_OK; if (!(mmc_can_secure_erase_trim(card))) { - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; goto out; } @@ -1254,8 +1258,10 @@ retry: err = mmc_erase(card, from, nr, arg); if (err == -EIO) goto out_retry; - if (err) + if (err) { + status = BLK_STS_IOERR; goto out; + } if (arg == MMC_SECURE_TRIM1_ARG) { if (card->quirks & MMC_QUIRK_INAND_CMD38) { @@ -1270,8 +1276,10 @@ retry: err = mmc_erase(card, from, nr, MMC_SECURE_TRIM2_ARG); if (err == -EIO) goto out_retry; - if (err) + if (err) { + status = BLK_STS_IOERR; goto out; + } } out_retry: @@ -1280,7 +1288,7 @@ out_retry: if (!err) mmc_blk_reset_success(md, type); out: - blk_end_request(req, err, blk_rq_bytes(req)); + blk_end_request(req, 
status, blk_rq_bytes(req)); } static void mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) @@ -1290,10 +1298,7 @@ static void mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req) int ret = 0; ret = mmc_flush_cache(card); - if (ret) - ret = -EIO; - - blk_end_request_all(req, ret); + blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK); } /* @@ -1641,7 +1646,7 @@ static void mmc_blk_rw_cmd_abort(struct mmc_queue *mq, struct mmc_card *card, { if (mmc_card_removed(card)) req->rq_flags |= RQF_QUIET; - while (blk_end_request(req, -EIO, blk_rq_cur_bytes(req))); + while (blk_end_request(req, BLK_STS_IOERR, blk_rq_cur_bytes(req))); mmc_queue_req_free(mq, mqrq); } @@ -1661,7 +1666,7 @@ static void mmc_blk_rw_try_restart(struct mmc_queue *mq, struct request *req, */ if (mmc_card_removed(mq->card)) { req->rq_flags |= RQF_QUIET; - blk_end_request_all(req, -EIO); + blk_end_request_all(req, BLK_STS_IOERR); mmc_queue_req_free(mq, mqrq); return; } @@ -1743,7 +1748,7 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req) */ mmc_blk_reset_success(md, type); - req_pending = blk_end_request(old_req, 0, + req_pending = blk_end_request(old_req, BLK_STS_OK, brq->data.bytes_xfered); /* * If the blk_end_request function returns non-zero even @@ -1811,7 +1816,7 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req) * time, so we only reach here after trying to * read a single sector. 
*/ - req_pending = blk_end_request(old_req, -EIO, + req_pending = blk_end_request(old_req, BLK_STS_IOERR, brq->data.blksz); if (!req_pending) { mmc_queue_req_free(mq, mq_rq); @@ -1860,7 +1865,7 @@ void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) ret = mmc_blk_part_switch(card, md); if (ret) { if (req) { - blk_end_request_all(req, -EIO); + blk_end_request_all(req, BLK_STS_IOERR); } goto out; } diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 5c37b6be3e7b..b659a28c8018 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -133,7 +133,7 @@ static void mmc_request_fn(struct request_queue *q) if (!mq) { while ((req = blk_fetch_request(q)) != NULL) { req->rq_flags |= RQF_QUIET; - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); } return; } @@ -388,7 +388,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, mmc_queue_setup_discard(mq->queue, card); if (card->bouncesz) { - blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); blk_queue_max_segments(mq->queue, card->bouncesz / 512); blk_queue_max_segment_size(mq->queue, card->bouncesz); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 6b8d5cd7dbf6..f336a9b85576 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -73,7 +73,7 @@ static void blktrans_dev_put(struct mtd_blktrans_dev *dev) } -static int do_blktrans_request(struct mtd_blktrans_ops *tr, +static blk_status_t do_blktrans_request(struct mtd_blktrans_ops *tr, struct mtd_blktrans_dev *dev, struct request *req) { @@ -84,33 +84,37 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, nsect = blk_rq_cur_bytes(req) >> tr->blkshift; buf = bio_data(req->bio); - if (req_op(req) == REQ_OP_FLUSH) - return tr->flush(dev); + if (req_op(req) == REQ_OP_FLUSH) { + if (tr->flush(dev)) + return BLK_STS_IOERR; + return BLK_STS_OK; + } if (blk_rq_pos(req) + 
blk_rq_cur_sectors(req) > get_capacity(req->rq_disk)) - return -EIO; + return BLK_STS_IOERR; switch (req_op(req)) { case REQ_OP_DISCARD: - return tr->discard(dev, block, nsect); + if (tr->discard(dev, block, nsect)) + return BLK_STS_IOERR; + return BLK_STS_OK; case REQ_OP_READ: for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->readsect(dev, block, buf)) - return -EIO; + return BLK_STS_IOERR; rq_flush_dcache_pages(req); - return 0; + return BLK_STS_OK; case REQ_OP_WRITE: if (!tr->writesect) - return -EIO; + return BLK_STS_IOERR; rq_flush_dcache_pages(req); for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->writesect(dev, block, buf)) - return -EIO; - return 0; + return BLK_STS_IOERR; default: - return -EIO; + return BLK_STS_IOERR; } } @@ -132,7 +136,7 @@ static void mtd_blktrans_work(struct work_struct *work) spin_lock_irq(rq->queue_lock); while (1) { - int res; + blk_status_t res; dev->bg_stop = false; if (!req && !(req = blk_fetch_request(rq))) { @@ -178,7 +182,7 @@ static void mtd_blktrans_request(struct request_queue *rq) if (!dev) while ((req = blk_fetch_request(rq)) != NULL) - __blk_end_request_all(req, -ENODEV); + __blk_end_request_all(req, BLK_STS_IOERR); else queue_work(dev->wq, &dev->work); } @@ -413,6 +417,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); + blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, new->rq); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, new->rq); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 5497e65439df..c3963f880448 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -313,10 +313,10 @@ static void ubiblock_do_work(struct work_struct *work) ret = ubiblock_read(pdu); rq_flush_dcache_pages(req); - blk_mq_end_request(req, ret); + blk_mq_end_request(req, errno_to_blk_status(ret)); } -static int ubiblock_queue_rq(struct blk_mq_hw_ctx 
*hctx, +static blk_status_t ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *req = bd->rq; @@ -327,9 +327,9 @@ static int ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx, case REQ_OP_READ: ubi_sgl_init(&pdu->usgl); queue_work(dev->wq, &pdu->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; default: - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } } diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 822198a75e96..f12d23c49771 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -186,7 +186,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) * another kernel subsystem, and we just pass it through. */ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; goto out; } @@ -205,7 +205,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) "io error in %s sector %lld, len %d,\n", (rw == READ) ? "READ" : "WRITE", (unsigned long long) iter.bi_sector, len); - bio->bi_error = err; + bio->bi_status = errno_to_blk_status(err); break; } } @@ -273,7 +273,6 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) blk_queue_make_request(q, nd_blk_make_request); blk_queue_max_hw_sectors(q, UINT_MAX); - blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); q->queuedata = nsblk; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 983718b8fd9b..b6ba0618ea46 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1210,7 +1210,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) * another kernel subsystem, and we just pass it through. 
*/ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; goto out; } @@ -1232,7 +1232,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) (op_is_write(bio_op(bio))) ? "WRITE" : "READ", (unsigned long long) iter.bi_sector, len); - bio->bi_error = err; + bio->bi_status = errno_to_blk_status(err); break; } } @@ -1297,7 +1297,6 @@ static int btt_blk_init(struct btt *btt) blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); - blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue); btt->btt_queue->queuedata = btt; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c544d466ea51..6b577afb1d44 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -49,19 +49,19 @@ static struct nd_region *to_region(struct pmem_device *pmem) return to_nd_region(to_dev(pmem)->parent); } -static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, - unsigned int len) +static blk_status_t pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) { struct device *dev = to_dev(pmem); sector_t sector; long cleared; - int rc = 0; + blk_status_t rc = BLK_STS_OK; sector = (offset - pmem->data_offset) / 512; cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len); if (cleared < len) - rc = -EIO; + rc = BLK_STS_IOERR; if (cleared > 0 && cleared / 512) { cleared /= 512; dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__, @@ -84,7 +84,7 @@ static void write_pmem(void *pmem_addr, struct page *page, kunmap_atomic(mem); } -static int read_pmem(struct page *page, unsigned int off, +static blk_status_t read_pmem(struct page *page, unsigned int off, void *pmem_addr, unsigned int len) { int rc; @@ -93,15 +93,15 @@ static int read_pmem(struct page *page, unsigned 
int off, rc = memcpy_mcsafe(mem + off, pmem_addr, len); kunmap_atomic(mem); if (rc) - return -EIO; - return 0; + return BLK_STS_IOERR; + return BLK_STS_OK; } -static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { - int rc = 0; + blk_status_t rc = BLK_STS_OK; bool bad_pmem = false; phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; @@ -111,7 +111,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (!is_write) { if (unlikely(bad_pmem)) - rc = -EIO; + rc = BLK_STS_IOERR; else { rc = read_pmem(page, off, pmem_addr, len); flush_dcache_page(page); @@ -149,7 +149,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { - int rc = 0; + blk_status_t rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -166,7 +166,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) bvec.bv_offset, op_is_write(bio_op(bio)), iter.bi_sector); if (rc) { - bio->bi_error = rc; + bio->bi_status = rc; break; } } @@ -184,7 +184,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, bool is_write) { struct pmem_device *pmem = bdev->bd_queue->queuedata; - int rc; + blk_status_t rc; rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector); @@ -197,7 +197,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, if (rc == 0) page_endio(page, is_write, 0); - return rc; + return blk_status_to_errno(rc); } /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ @@ -343,7 +343,6 @@ static int pmem_attach_disk(struct device *dev, blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_max_hw_sectors(q, UINT_MAX); - 
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); q->queuedata = pmem; diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 90745a616df7..46d6cb1e03bd 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -13,18 +13,6 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. -config BLK_DEV_NVME_SCSI - bool "SCSI emulation for NVMe device nodes" - depends on NVME_CORE - ---help--- - This adds support for the SG_IO ioctl on the NVMe character - and block devices nodes, as well as a translation for a small - number of selected SCSI commands to NVMe commands to the NVMe - driver. If you don't know what this means you probably want - to say N here, unless you run a distro that abuses the SCSI - emulation to provide stable device names for mount by id, like - some OpenSuSE and SLES versions. - config NVME_FABRICS tristate diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index f1a7d945fbb6..cc0aacb4c8b4 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o -nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 903d5813023a..d70df1d0072d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -27,7 +27,6 @@ #include <linux/nvme_ioctl.h> #include <linux/t10-pi.h> #include <linux/pm_qos.h> -#include <scsi/sg.h> #include <asm/unaligned.h> #include "nvme.h" @@ -45,7 +44,7 @@ module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); EXPORT_SYMBOL_GPL(nvme_io_timeout); -unsigned char shutdown_timeout = 5; +static unsigned char shutdown_timeout = 5; 
module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); @@ -65,34 +64,53 @@ static bool force_apst; module_param(force_apst, bool, 0644); MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); +static bool streams; +module_param(streams, bool, 0644); +MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); + +struct workqueue_struct *nvme_wq; +EXPORT_SYMBOL_GPL(nvme_wq); + static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; -static int nvme_error_status(struct request *req) +int nvme_reset_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_reset_ctrl); + +static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_reset_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->reset_work); + return ret; +} + +static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { case NVME_SC_SUCCESS: - return 0; + return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - - /* - * XXX: these errors are a nasty side-band protocol to - * drivers/md/dm-mpath.c:noretry_error() that aren't documented - * anywhere.. 
- */ - case NVME_SC_CMD_SEQ_ERROR: - return -EILSEQ; + return BLK_STS_NOSPC; case NVME_SC_ONCS_NOT_SUPPORTED: - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: - return -ENODATA; + return BLK_STS_MEDIUM; + default: + return BLK_STS_IOERR; } } @@ -165,7 +183,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: - case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -283,6 +300,105 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); +static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + + c.directive.opcode = nvme_admin_directive_send; + c.directive.nsid = cpu_to_le32(0xffffffff); + c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; + c.directive.dtype = NVME_DIR_IDENTIFY; + c.directive.tdtype = NVME_DIR_STREAMS; + c.directive.endir = enable ? 
NVME_DIR_ENDIR : 0; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); +} + +static int nvme_disable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, false); +} + +static int nvme_enable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, true); +} + +static int nvme_get_stream_params(struct nvme_ctrl *ctrl, + struct streams_directive_params *s, u32 nsid) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + memset(s, 0, sizeof(*s)); + + c.directive.opcode = nvme_admin_directive_recv; + c.directive.nsid = cpu_to_le32(nsid); + c.directive.numd = sizeof(*s); + c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; + c.directive.dtype = NVME_DIR_STREAMS; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); +} + +static int nvme_configure_directives(struct nvme_ctrl *ctrl) +{ + struct streams_directive_params s; + int ret; + + if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) + return 0; + if (!streams) + return 0; + + ret = nvme_enable_streams(ctrl); + if (ret) + return ret; + + ret = nvme_get_stream_params(ctrl, &s, 0xffffffff); + if (ret) + return ret; + + ctrl->nssa = le16_to_cpu(s.nssa); + if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + dev_info(ctrl->device, "too few streams (%u) available\n", + ctrl->nssa); + nvme_disable_streams(ctrl); + return 0; + } + + ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); + return 0; +} + +/* + * Check if 'req' has a write hint associated with it. If it does, assign + * a valid namespace stream to the write. 
+ */ +static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, + struct request *req, u16 *control, + u32 *dsmgmt) +{ + enum rw_hint streamid = req->write_hint; + + if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) + streamid = 0; + else { + streamid--; + if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) + return; + + *control |= NVME_RW_DTYPE_STREAMS; + *dsmgmt |= streamid << 16; + } + + if (streamid < ARRAY_SIZE(req->q->write_hints)) + req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -291,7 +407,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, cmnd->common.nsid = cpu_to_le32(ns->ns_id); } -static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, +static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; @@ -300,7 +416,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); if (!range) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; __rq_for_each_bio(bio, req) { u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); @@ -314,7 +430,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, if (WARN_ON_ONCE(n != segments)) { kfree(range); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } memset(cmnd, 0, sizeof(*cmnd)); @@ -328,15 +444,26 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, req->special_vec.bv_len = sizeof(*range) * segments; req->rq_flags |= RQF_SPECIAL_PAYLOAD; - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } -static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) +static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) { + 
struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; u32 dsmgmt = 0; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns && ns->ms && + (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && + !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) + return BLK_STS_NOTSUPP; + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) @@ -351,6 +478,9 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) + nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); + if (ns->ms) { switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: @@ -370,12 +500,13 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + return 0; } -int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd) { - int ret = BLK_MQ_RQ_QUEUE_OK; + blk_status_t ret = BLK_STS_OK; if (!(req->rq_flags & RQF_DONTPREP)) { nvme_req(req)->retries = 0; @@ -398,11 +529,11 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, break; case REQ_OP_READ: case REQ_OP_WRITE: - nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd); break; default: WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } cmd->common.command_id = req->tag; @@ -555,15 +686,16 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, result, timeout); } -static void nvme_keep_alive_end_io(struct request *rq, int error) +static void nvme_keep_alive_end_io(struct request 
*rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; blk_mq_free_request(rq); - if (error) { + if (status) { dev_err(ctrl->device, - "failed nvme_keep_alive_end_io error=%d\n", error); + "failed nvme_keep_alive_end_io error=%d\n", + status); return; } @@ -599,7 +731,7 @@ static void nvme_keep_alive_work(struct work_struct *work) if (nvme_keep_alive(ctrl)) { /* allocation failure, reset the controller */ dev_err(ctrl->device, "keep-alive failed\n"); - ctrl->ops->reset_ctrl(ctrl); + nvme_reset_ctrl(ctrl); return; } } @@ -623,7 +755,7 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); -int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; int error; @@ -643,6 +775,77 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) +{ + struct nvme_command c = { }; + int status; + void *data; + int pos; + int len; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; + + data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, data, + NVME_IDENTIFY_DATA_SIZE); + if (status) + goto free_data; + + for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { + struct nvme_ns_id_desc *cur = data + pos; + + if (cur->nidl == 0) + break; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_EUI64_LEN; + memcpy(ns->eui, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ns->ctrl->device, + "ctrl 
returned bogus length: %d for NVME_NIDT_NGUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_NGUID_LEN; + memcpy(ns->nguid, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_UUID_LEN; + uuid_copy(&ns->uuid, data + pos + sizeof(*cur)); + break; + default: + /* Skip unnkown types */ + len = cur->nidl; + break; + } + + len += sizeof(*cur); + } +free_data: + kfree(data); + return status; +} + static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) { struct nvme_command c = { }; @@ -653,7 +856,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } -int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, +static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id) { struct nvme_command c = { }; @@ -675,26 +878,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, return error; } -int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - void *buffer, size_t buflen, u32 *result) -{ - struct nvme_command c; - union nvme_result res; - int ret; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.fid = cpu_to_le32(fid); - - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, buffer, buflen, 0, - NVME_QID_ANY, 0, 0); - if (ret >= 0 && result) - *result = le32_to_cpu(res.u32); - return ret; -} - -int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, +static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, void *buffer, size_t buflen, u32 *result) { struct nvme_command c; @@ -713,28 +897,6 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned 
dword11, return ret; } -int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) -{ - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), - - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; -} - int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) { u32 q_count = (*count - 1) | ((*count - 1) << 16); @@ -752,7 +914,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) * access to the admin queue, as that might be only way to fix them up. */ if (status > 0) { - dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); + dev_err(ctrl->device, "Could not set queue count (%d)\n", status); *count = 0; } else { nr_io_queues = min(result & 0xffff, result >> 16) + 1; @@ -870,12 +1032,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); -#ifdef CONFIG_BLK_DEV_NVME_SCSI - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); -#endif default: #ifdef CONFIG_NVM if (ns->ndev) @@ -892,10 +1048,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } return nvme_ioctl(bdev, mode, cmd, arg); } #else @@ -983,6 +1135,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +static void nvme_set_chunk_size(struct nvme_ns *ns) +{ + u32 chunk_size 
= (((u32)ns->noiob) << (ns->lba_shift - 9)); + blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); +} + static void nvme_config_discard(struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -991,8 +1149,15 @@ static void nvme_config_discard(struct nvme_ns *ns) BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; + if (ctrl->nr_streams && ns->sws && ns->sgs) { + unsigned int sz = logical_block_size * ns->sws * ns->sgs; + + ns->queue->limits.discard_alignment = sz; + ns->queue->limits.discard_granularity = sz; + } else { + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + } blk_queue_max_discard_sectors(ns->queue, UINT_MAX); blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); @@ -1016,7 +1181,15 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); + memcpy(ns->nguid, (*id)->nguid, sizeof(ns->nguid)); + if (ns->ctrl->vs >= NVME_VS(1, 3, 0)) { + /* Don't treat error as fatal we potentially + * already have a NGUID or EUI-64 + */ + if (nvme_identify_ns_descs(ns, ns->ns_id)) + dev_warn(ns->ctrl->device, + "%s: Identify Descriptors failed\n", __func__); + } return 0; } @@ -1024,6 +1197,7 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; u16 bs; /* @@ -1034,12 +1208,15 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift 
= 9; bs = 1 << ns->lba_shift; + ns->noiob = le16_to_cpu(id->noiob); blk_mq_freeze_queue(disk->queue); - if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) + if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); + if (ns->noiob) + nvme_set_chunk_size(ns); if (ns->ms && !blk_get_integrity(disk) && !ns->ext) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) @@ -1047,7 +1224,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); } @@ -1283,7 +1460,7 @@ EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { - unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + unsigned long timeout = jiffies + (shutdown_timeout * HZ); u32 csts; int ret; @@ -1372,7 +1549,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl) if (!table) return; - if (ctrl->ps_max_latency_us == 0) { + if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. 
*/ apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); @@ -1528,6 +1705,31 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, string_matches(id->fr, q->fr, sizeof(id->fr)); } +static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + size_t nqnlen; + int off; + + nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); + if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { + strcpy(ctrl->subnqn, id->subnqn); + return; + } + + if (ctrl->vs >= NVME_VS(1, 2, 1)) + dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); + + /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, + "nqn.2014.08.org.nvmexpress:%4x%4x", + le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); + memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); + off += sizeof(id->sn); + memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); + off += sizeof(id->mn); + memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1539,7 +1741,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; u32 max_hw_sectors; - u8 prev_apsta; + bool prev_apst_enabled; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1563,6 +1765,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + nvme_init_subnqn(ctrl, id); + if (!ctrl->identified) { /* * Check for quirks. 
Quirk can depend on firmware version, @@ -1582,7 +1786,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { - dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); + dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; } @@ -1607,16 +1811,17 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->kas = le16_to_cpu(id->kas); ctrl->npss = id->npss; - prev_apsta = ctrl->apsta; + ctrl->apsta = id->apsta; + prev_apst_enabled = ctrl->apst_enabled; if (ctrl->quirks & NVME_QUIRK_NO_APST) { if (force_apst && id->apsta) { - dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); - ctrl->apsta = 1; + dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->apst_enabled = true; } else { - ctrl->apsta = 0; + ctrl->apst_enabled = false; } } else { - ctrl->apsta = id->apsta; + ctrl->apst_enabled = id->apsta; } memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); @@ -1634,22 +1839,25 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = -EINVAL; if (!ctrl->opts->discovery_nqn && !ctrl->kas) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "keep-alive support is mandatory for fabrics\n"); ret = -EINVAL; } } else { ctrl->cntlid = le16_to_cpu(id->cntlid); + ctrl->hmpre = le32_to_cpu(id->hmpre); + ctrl->hmmin = le32_to_cpu(id->hmmin); } kfree(id); - if (ctrl->apsta && !prev_apsta) + if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); - else if (!ctrl->apsta && prev_apsta) + else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); nvme_configure_apst(ctrl); + nvme_configure_directives(ctrl); ctrl->identified = true; @@ -1735,7 +1943,7 @@ static long nvme_dev_ioctl(struct file *file, 
unsigned int cmd, return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: dev_warn(ctrl->device, "resetting controller\n"); - return ctrl->ops->reset_ctrl(ctrl); + return nvme_reset_ctrl_sync(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); case NVME_IOCTL_RESCAN: @@ -1761,7 +1969,7 @@ static ssize_t nvme_sysfs_reset(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); int ret; - ret = ctrl->ops->reset_ctrl(ctrl); + ret = nvme_reset_ctrl_sync(ctrl); if (ret < 0) return ret; return count; @@ -1787,8 +1995,8 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); - if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) - return sprintf(buf, "eui.%16phN\n", ns->uuid); + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + return sprintf(buf, "eui.%16phN\n", ns->nguid); if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) return sprintf(buf, "eui.%8phN\n", ns->eui); @@ -1803,11 +2011,28 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); +static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + return sprintf(buf, "%pU\n", ns->nguid); +} +static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->uuid); + + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ + if (uuid_is_null(&ns->uuid)) { + printk_ratelimited(KERN_WARNING + "No UUID available providing old NGUID\n"); + return sprintf(buf, "%pU\n", ns->nguid); + } + return sprintf(buf, "%pU\n", &ns->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -1830,6 +2055,7 @@ static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, 
NULL); static struct attribute *nvme_ns_attrs[] = { &dev_attr_wwid.attr, &dev_attr_uuid.attr, + &dev_attr_nguid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, NULL, @@ -1842,7 +2068,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { - if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + if (uuid_is_null(&ns->uuid) || + !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { + if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) return 0; } if (a == &dev_attr_eui.attr) { @@ -1931,8 +2162,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", - ctrl->ops->get_subsysnqn(ctrl)); + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -1961,24 +2191,16 @@ static struct attribute *nvme_dev_attrs[] = { NULL }; -#define CHECK_ATTR(ctrl, a, name) \ - if ((a) == &dev_attr_##name.attr && \ - !(ctrl)->ops->get_##name) \ - return 0 - static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - if (a == &dev_attr_delete_controller.attr) { - if (!ctrl->ops->delete_ctrl) - return 0; - } - - CHECK_ATTR(ctrl, a, subsysnqn); - CHECK_ATTR(ctrl, a, address); + if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) + return 0; + if (a == &dev_attr_address.attr && !ctrl->ops->get_address) + return 0; return a->mode; } @@ -2019,6 +2241,32 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) return ret; } +static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) +{ + struct streams_directive_params s; + int ret; + + if (!ctrl->nr_streams) + return 0; + + ret = 
nvme_get_stream_params(ctrl, &s, ns->ns_id); + if (ret) + return ret; + + ns->sws = le32_to_cpu(s.sws); + ns->sgs = le16_to_cpu(s.sgs); + + if (ns->sws) { + unsigned int bs = 1 << ns->lba_shift; + + blk_queue_io_min(ns->queue, bs * ns->sws); + if (ns->sgs) + blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs); + } + + return 0; +} + static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; @@ -2048,6 +2296,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); + nvme_setup_streams_ns(ctrl, ns); sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); @@ -2056,7 +2305,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_nvm_ns_supported(ns, id) && nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__); + dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__); goto out_free_id; } @@ -2231,7 +2480,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl) * removal. 
*/ if (ctrl->state == NVME_CTRL_LIVE) - schedule_work(&ctrl->scan_work); + queue_work(nvme_wq, &ctrl->scan_work); } EXPORT_SYMBOL_GPL(nvme_queue_scan); @@ -2286,7 +2535,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, /*FALLTHRU*/ case NVME_SC_ABORT_REQ: ++ctrl->event_limit; - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); break; default: break; @@ -2309,7 +2558,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { ctrl->event_limit = NVME_NR_AERS; - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); @@ -2442,6 +2691,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ctrl->admin_q); + /* Forcibly start all queues to avoid having stuck requests */ blk_mq_start_hw_queues(ctrl->admin_q); @@ -2455,6 +2707,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ns->queue); + /* * Forcibly start all queues to avoid having stuck requests. 
* Note that we must ensure the queues are not stopped @@ -2533,7 +2788,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_unquiesce_queue(ns->queue); blk_mq_kick_requeue_list(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); @@ -2544,10 +2799,15 @@ int __init nvme_core_init(void) { int result; + nvme_wq = alloc_workqueue("nvme-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_wq) + return -ENOMEM; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", &nvme_dev_fops); if (result < 0) - return result; + goto destroy_wq; else if (result > 0) nvme_char_major = result; @@ -2559,8 +2819,10 @@ int __init nvme_core_init(void) return 0; - unregister_chrdev: +unregister_chrdev: __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +destroy_wq: + destroy_workqueue(nvme_wq); return result; } @@ -2568,6 +2830,7 @@ void nvme_core_exit(void) { class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + destroy_workqueue(nvme_wq); } MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index c190d7e36900..2e582a240943 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -58,7 +58,6 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) kref_init(&host->ref); memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); - uuid_gen(&host->id); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -75,7 +74,6 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref); - uuid_gen(&host->id); snprintf(host->nqn, NVMF_NQN_SIZE, "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); @@ -128,16 +126,6 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) EXPORT_SYMBOL_GPL(nvmf_get_address); /** - * nvmf_get_subsysnqn() - Get subsystem NQN - * @ctrl: Host NVMe controller 
instance which we got the NQN - */ -const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl) -{ - return ctrl->opts->subsysnqn; -} -EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn); - -/** * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. * @ctrl: Host NVMe controller instance maintaining the admin * queue used to submit the property read command to @@ -337,6 +325,24 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, } } break; + + case NVME_SC_CONNECT_INVALID_HOST: + dev_err(ctrl->device, + "Connect for subsystem %s is not allowed, hostnqn: %s\n", + data->subsysnqn, data->hostnqn); + break; + + case NVME_SC_CONNECT_CTRL_BUSY: + dev_err(ctrl->device, + "Connect command failed: controller is busy or not available\n"); + break; + + case NVME_SC_CONNECT_FORMAT: + dev_err(ctrl->device, + "Connect incompatible format: %d", + cmd->connect.recfmt); + break; + default: dev_err(ctrl->device, "Connect command failed, error wo/DNR bit: %d\n", @@ -376,13 +382,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.opcode = nvme_fabrics_command; cmd.connect.fctype = nvme_fabrics_type_connect; cmd.connect.qid = 0; - - /* - * fabrics spec sets a minimum of depth 32 for admin queue, - * so set the queue with this depth always until - * justification otherwise. 
- */ - cmd.connect.sqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + cmd.connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); /* * Set keep-alive timeout in seconds granularity (ms * 1000) @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) { if (ctrl->opts->max_reconnects != -1 && - ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects) + ctrl->nr_reconnects < ctrl->opts->max_reconnects) return true; return false; @@ -547,6 +547,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, + { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -558,6 +559,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, int token, ret = 0; size_t nqnlen = 0; int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; + uuid_t hostid; /* Set defaults */ opts->queue_size = NVMF_DEF_QUEUE_SIZE; @@ -568,6 +570,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, if (!options) return -ENOMEM; + uuid_gen(&hostid); + while ((p = strsep(&o, ",\n")) != NULL) { if (!*p) continue; @@ -724,6 +728,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->host_traddr = p; break; + case NVMF_OPT_HOST_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (uuid_parse(p, &hostid)) { + ret = -EINVAL; + goto out; + } + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -743,6 +758,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->host = nvmf_default_host; } + uuid_copy(&opts->host->id, &hostid); + out: if (!opts->discovery_nqn && !opts->kato) opts->kato = NVME_DEFAULT_KATO; @@ -803,7 +820,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ - NVMF_OPT_KATO | NVMF_OPT_HOSTNQN) + 
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ + NVMF_OPT_HOST_ID) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) @@ -854,6 +872,15 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } + if (strcmp(ctrl->subnqn, opts->subsysnqn)) { + dev_warn(ctrl->device, + "controller returned incorrect NQN: \"%s\".\n", + ctrl->subnqn); + mutex_unlock(&nvmf_transports_mutex); + ctrl->ops->delete_ctrl(ctrl); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&nvmf_transports_mutex); return ctrl; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 29be7600689d..bf33663218cd 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -56,6 +56,7 @@ enum { NVMF_OPT_RECONNECT_DELAY = 1 << 9, NVMF_OPT_HOST_TRADDR = 1 << 10, NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, + NVMF_OPT_HOST_ID = 1 << 12, }; /** @@ -80,7 +81,6 @@ enum { * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. * @kato: Keep-alive timeout. * @host: Virtual NVMe host, contains the NQN and Host ID. 
- * @nr_reconnects: number of reconnect attempted since the last ctrl failure * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero means remove * immediately; @@ -98,7 +98,6 @@ struct nvmf_ctrl_options { bool discovery_nqn; unsigned int kato; struct nvmf_host *host; - int nr_reconnects; int max_reconnects; }; @@ -140,7 +139,6 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); -const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5ee4c71d168d..ed87214fdc0e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -36,7 +36,7 @@ */ #define NVME_FC_NR_AEN_COMMANDS 1 #define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) #define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) enum nvme_fc_queue_flags { @@ -161,12 +161,12 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set tag_set; struct work_struct delete_work; - struct work_struct reset_work; struct delayed_work connect_work; struct kref ref; u32 flags; u32 iocnt; + wait_queue_head_t ioabort_wait; struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; @@ -214,7 +214,6 @@ static LIST_HEAD(nvme_fc_lport_list); static DEFINE_IDA(nvme_fc_local_port_cnt); static DEFINE_IDA(nvme_fc_ctrl_cnt); -static struct workqueue_struct *nvme_fc_wq; @@ -1241,8 +1240,10 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, spin_lock_irqsave(&ctrl->lock, flags); if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; + if (ctrl->flags & FCCTRL_TERMIO) { + if 
(!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); + } } if (op->flags & FCOP_FLAGS_RELEASED) complete_rq = true; @@ -1449,18 +1450,8 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, { struct nvme_fc_ctrl *ctrl = set->driver_data; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1]; - - return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); -} - -static int -nvme_fc_init_admin_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct nvme_fc_ctrl *ctrl = set->driver_data; - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_queue *queue = &ctrl->queues[0]; + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); } @@ -1758,16 +1749,16 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { + /* only proceed if in LIVE state - e.g. 
on first error */ + if (ctrl->ctrl.state != NVME_CTRL_LIVE) + return; + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport association error detected: %s\n", ctrl->cnum, errmsg); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); - /* stop the queues on error, cleanup is in reset thread */ - if (ctrl->queue_count > 1) - nvme_stop_queues(&ctrl->ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " @@ -1775,10 +1766,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) return; } - if (!queue_work(nvme_fc_wq, &ctrl->reset_work)) - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: error_recovery: Failed to schedule " - "reset work\n", ctrl->cnum); + nvme_reset_ctrl(&ctrl->ctrl); } static enum blk_eh_timer_return @@ -1887,7 +1875,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, * level FC exchange resource that is also outstanding. This must be * considered in all cleanup operations. */ -static int +static blk_status_t nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op, u32 data_len, enum nvmefc_fcp_datadir io_dir) @@ -1902,10 +1890,10 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * the target device is present */ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; if (!nvme_fc_ctrl_get(ctrl)) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; /* format the FC-NVME CMD IU and fcp_req */ cmdiu->connection_id = cpu_to_be64(queue->connection_id); @@ -1953,8 +1941,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, if (ret < 0) { nvme_cleanup_cmd(op->rq); nvme_fc_ctrl_put(ctrl); - return (ret == -ENOMEM || ret == -EAGAIN) ? 
- BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; + if (ret == -ENOMEM || ret == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } } @@ -1971,28 +1960,26 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, queue->lldd_handle, &op->fcp_req); if (ret) { - if (op->rq) { /* normal request */ + if (op->rq) /* normal request */ nvme_fc_unmap_data(ctrl, op->rq, op); - nvme_cleanup_cmd(op->rq); - } /* else - aen. no cleanup needed */ nvme_fc_ctrl_put(ctrl); if (ret != -EBUSY) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; if (op->rq) { blk_mq_stop_hw_queues(op->rq->q); blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY); } - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } -static int +static blk_status_t nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2005,7 +1992,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; u32 data_len; - int ret; + blk_status_t ret; ret = nvme_setup_cmd(ns, rq, sqe); if (ret) @@ -2060,7 +2047,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct nvme_fc_fcp_op *aen_op; unsigned long flags; bool terminating = false; - int ret; + blk_status_t ret; if (aer_idx > NVME_FC_NR_AEN_COMMANDS) return; @@ -2092,7 +2079,6 @@ __nvme_fc_final_op_cleanup(struct request *rq) op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); - nvme_cleanup_cmd(rq); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); nvme_fc_ctrl_put(ctrl); @@ -2310,7 +2296,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) int ret; bool changed; - ++ctrl->ctrl.opts->nr_reconnects; + ++ctrl->ctrl.nr_reconnects; /* * Create the admin queue @@ -2407,7 +2393,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - 
ctrl->ctrl.opts->nr_reconnects = 0; + ctrl->ctrl.nr_reconnects = 0; if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); @@ -2493,11 +2479,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) /* wait for all io that had to be aborted */ spin_lock_irqsave(&ctrl->lock, flags); - while (ctrl->iocnt) { - spin_unlock_irqrestore(&ctrl->lock, flags); - msleep(1000); - spin_lock_irqsave(&ctrl->lock, flags); - } + wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock); ctrl->flags &= ~FCCTRL_TERMIO; spin_unlock_irqrestore(&ctrl->lock, flags); @@ -2527,7 +2509,7 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) struct nvme_fc_ctrl *ctrl = container_of(work, struct nvme_fc_ctrl, delete_work); - cancel_work_sync(&ctrl->reset_work); + cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); /* @@ -2554,7 +2536,7 @@ __nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return true; - if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return true; return false; @@ -2581,7 +2563,7 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) ret = __nvme_fc_del_ctrl(ctrl); if (!ret) - flush_workqueue(nvme_fc_wq); + flush_workqueue(nvme_wq); nvme_put_ctrl(&ctrl->ctrl); @@ -2606,13 +2588,13 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) dev_info(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, + queue_delayed_work(nvme_wq, &ctrl->connect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. 
Removing controller\n", - ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); + ctrl->cnum, ctrl->ctrl.nr_reconnects); WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); } } @@ -2621,7 +2603,7 @@ static void nvme_fc_reset_ctrl_work(struct work_struct *work) { struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, reset_work); + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); int ret; /* will block will waiting for io to terminate */ @@ -2635,29 +2617,6 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); } -/* - * called by the nvme core layer, for sysfs interface that requests - * a reset of the nvme controller - */ -static int -nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!queue_work(nvme_fc_wq, &ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; -} - static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", .module = THIS_MODULE, @@ -2665,11 +2624,9 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_fc_reset_nvme_ctrl, .free_ctrl = nvme_fc_nvme_ctrl_freed, .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_del_nvme_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, .get_address = nvmf_get_address, }; @@ -2695,7 +2652,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) static const struct blk_mq_ops nvme_fc_admin_mq_ops = { .queue_rq = nvme_fc_queue_rq, .complete = nvme_fc_complete_rq, - .init_request = nvme_fc_init_admin_request, + .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, .reinit_request = nvme_fc_reinit_request, .init_hctx = 
nvme_fc_init_admin_hctx, @@ -2740,7 +2697,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, kref_init(&ctrl->ref); INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); spin_lock_init(&ctrl->lock); @@ -2807,6 +2764,9 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + /* as we're past the point where we transition to the ref * counting teardown path, if we return a bad pointer here, * the calling routine, thinking it's prior to the @@ -2965,20 +2925,7 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { - int ret; - - nvme_fc_wq = create_workqueue("nvme_fc_wq"); - if (!nvme_fc_wq) - return -ENOMEM; - - ret = nvmf_register_transport(&nvme_fc_transport); - if (ret) - goto err; - - return 0; -err: - destroy_workqueue(nvme_fc_wq); - return ret; + return nvmf_register_transport(&nvme_fc_transport); } static void __exit nvme_fc_exit_module(void) @@ -2989,8 +2936,6 @@ static void __exit nvme_fc_exit_module(void) nvmf_unregister_transport(&nvme_fc_transport); - destroy_workqueue(nvme_fc_wq); - ida_destroy(&nvme_fc_local_port_cnt); ida_destroy(&nvme_fc_ctrl_cnt); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index f5df78ed1e10..be8541335e31 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -242,7 +242,7 @@ static inline void _nvme_nvm_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 
NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); } @@ -480,7 +480,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, rqd->bio->bi_iter.bi_sector)); } -static void nvme_nvm_end_io(struct request *rq, int error) +static void nvme_nvm_end_io(struct request *rq, blk_status_t status) { struct nvm_rq *rqd = rq->end_io_data; @@ -509,7 +509,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); if (IS_ERR(rq)) { kfree(cmd); - return -ENOMEM; + return PTR_ERR(rq); } rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; @@ -571,13 +571,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -static void nvme_nvm_end_user_vio(struct request *rq, int error) -{ - struct completion *waiting = rq->end_io_data; - - complete(waiting); -} - static int nvme_nvm_submit_user_cmd(struct request_queue *q, struct nvme_ns *ns, struct nvme_nvm_command *vcmd, @@ -608,7 +601,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, rq->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - rq->end_io_data = &wait; if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); @@ -662,9 +654,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, } submit: - blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); - - wait_for_completion_io(&wait); + blk_execute_rq(q, NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) ret = -EINTR; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9d6a070d4391..d70ff0fdd36b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,12 +27,11 @@ extern unsigned char nvme_io_timeout; extern unsigned char admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) -extern unsigned char shutdown_timeout; -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) - #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 +extern struct workqueue_struct *nvme_wq; + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -131,6 +130,7 @@ struct nvme_ctrl { struct device *device; /* char device */ struct list_head node; struct ida ns_ida; + struct work_struct reset_work; struct opal_dev *opal_dev; @@ -138,6 +138,7 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; + char subnqn[NVMF_NQN_SIZE]; u16 cntlid; u32 ctrl_config; @@ -147,6 +148,8 @@ struct nvme_ctrl { u16 oncs; u16 vid; u16 oacs; + u16 nssa; + u16 nr_streams; atomic_t abort_limit; u8 event_limit; u8 vwc; @@ -165,6 +168,10 @@ struct nvme_ctrl { /* Power saving configuration */ u64 ps_max_latency_us; + bool apst_enabled; + + u32 hmpre; + u32 hmmin; /* Fabrics only */ u16 sqsize; @@ -172,12 +179,10 @@ struct nvme_ctrl { u32 iorcsz; u16 icdoff; u16 maxcmd; + int nr_reconnects; struct nvmf_ctrl_options *opts; }; -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ struct nvme_ns { struct list_head list; @@ -189,14 +194,18 @@ struct nvme_ns { int instance; u8 eui[8]; - u8 uuid[16]; + u8 nguid[16]; + 
uuid_t uuid; unsigned ns_id; int lba_shift; u16 ms; + u16 sgs; + u32 sws; bool ext; u8 pi_type; unsigned long flags; + u16 noiob; #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 @@ -214,11 +223,9 @@ struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); - int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); int (*delete_ctrl)(struct nvme_ctrl *ctrl); - const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; @@ -296,7 +303,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid); -int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); @@ -310,23 +317,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, - struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - void *buffer, size_t buflen, u32 *result); -int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - void *buffer, size_t buflen, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void 
nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); - -struct sg_io_hdr; - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); -int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); -int nvme_sg_get_version_num(int __user *ip); +int nvme_reset_ctrl(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 40c7581caeb0..33c3b9db7d36 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -17,28 +17,15 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> -#include <linux/cpu.h> -#include <linux/delay.h> #include <linux/dmi.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/idr.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> -#include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/pci.h> #include <linux/poison.h> -#include <linux/ptrace.h> -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/t10-pi.h> #include <linux/timer.h> #include <linux/types.h> @@ -49,7 +36,6 @@ #include "nvme.h" #define NVME_Q_DEPTH 1024 -#define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) @@ -66,12 +52,14 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static struct workqueue_struct *nvme_workq; +static unsigned int max_host_mem_size_mb = 128; +module_param(max_host_mem_size_mb, uint, 0444); +MODULE_PARM_DESC(max_host_mem_size_mb, + "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); struct nvme_dev; struct 
nvme_queue; -static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); @@ -92,9 +80,8 @@ struct nvme_dev { int q_depth; u32 db_stride; void __iomem *bar; - struct work_struct reset_work; + unsigned long bar_mapped_size; struct work_struct remove_work; - struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -104,10 +91,18 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; struct completion ioq_wait; + + /* shadow doorbell buffer support: */ u32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; u32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; + + /* host memory buffer support: */ + u64 host_mem_size; + u32 nr_host_mem_descs; + struct nvme_host_mem_buf_desc *host_mem_descs; + void **host_mem_desc_bufs; }; static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -185,8 +180,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); @@ -350,19 +345,6 @@ static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_i nvmeq->tags = NULL; } -static int nvme_admin_init_request(struct blk_mq_tag_set *set, - struct request *req, unsigned int hctx_idx, - unsigned int numa_node) -{ - struct nvme_dev *dev = set->driver_data; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[0]; - - BUG_ON(!nvmeq); - iod->nvmeq = nvmeq; - return 0; -} - 
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -382,7 +364,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, { struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; + struct nvme_queue *nvmeq = dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -427,7 +410,7 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } -static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) +static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); @@ -436,7 +419,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); if (!iod->sg) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } else { iod->sg = iod->inline_sg; } @@ -446,7 +429,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) iod->nents = 0; iod->length = size; - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void nvme_free_iod(struct nvme_dev *dev, struct request *req) @@ -616,21 +599,21 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req) return true; } -static int nvme_map_data(struct nvme_dev *dev, struct request *req, +static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; enum dma_data_direction dma_dir = rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; - int ret = BLK_MQ_RQ_QUEUE_ERROR; + blk_status_t ret = BLK_STS_IOERR; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); if (!iod->nents) goto out; - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, DMA_ATTR_NO_WARN)) goto out; @@ -638,7 +621,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, if (!nvme_setup_prps(dev, req)) goto out_unmap; - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_STS_IOERR; if (blk_integrity_rq(req)) { if (blk_rq_count_integrity_sg(q, req->bio) != 1) goto out_unmap; @@ -658,7 +641,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_unmap: dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); @@ -688,7 +671,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) /* * NOTE: ns is NULL when called on the admin queue. */ -static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; @@ -696,47 +679,34 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; - int ret = BLK_MQ_RQ_QUEUE_OK; - - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. 
- */ - if (ns && ns->ms && !blk_integrity_rq(req)) { - if (!(ns->pi_type && ns->ms == 8) && - !blk_rq_is_passthrough(req)) { - blk_mq_end_request(req, -EFAULT); - return BLK_MQ_RQ_QUEUE_OK; - } - } + blk_status_t ret; ret = nvme_setup_cmd(ns, req, &cmnd); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; ret = nvme_init_iod(req, dev); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) goto out_free_cmd; - if (blk_rq_nr_phys_segments(req)) + if (blk_rq_nr_phys_segments(req)) { ret = nvme_map_data(dev, req, &cmnd); - - if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out_cleanup_iod; + if (ret) + goto out_cleanup_iod; + } blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_STS_IOERR; spin_unlock_irq(&nvmeq->q_lock); goto out_cleanup_iod; } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); out_free_cmd: @@ -759,65 +729,75 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; } -static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) +static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) { - u16 head, phase; - - head = nvmeq->cq_head; - phase = nvmeq->cq_phase; - - while (nvme_cqe_valid(nvmeq, head, phase)) { - struct nvme_completion cqe = nvmeq->cqes[head]; - struct request *req; - - if (++head == nvmeq->q_depth) { - head = 0; - phase = !phase; - } - - if (tag && *tag == cqe.command_id) - *tag = -1; + u16 head = nvmeq->cq_head; - if (unlikely(cqe.command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d completed on queue %d\n", - cqe.command_id, le16_to_cpu(cqe.sq_id)); - continue; - } + if (likely(nvmeq->cq_vector >= 0)) { + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, + nvmeq->dbbuf_cq_ei)) + writel(head, 
nvmeq->q_db + nvmeq->dev->db_stride); + } +} - /* - * AEN requests are special as they don't time out and can - * survive any kind of queue freeze and often don't respond to - * aborts. We don't even bother to allocate a struct request - * for them but rather special case them here. - */ - if (unlikely(nvmeq->qid == 0 && - cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(&nvmeq->dev->ctrl, - cqe.status, &cqe.result); - continue; - } +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + struct request *req; - req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); - nvme_end_request(req, cqe.status, cqe.result); + if (unlikely(cqe->command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; } - if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
+ */ + if (unlikely(nvmeq->qid == 0 && + cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&nvmeq->dev->ctrl, + cqe->status, &cqe->result); return; + } - if (likely(nvmeq->cq_vector >= 0)) - if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, - nvmeq->dbbuf_cq_ei)) - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); - nvmeq->cq_head = head; - nvmeq->cq_phase = phase; + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + nvme_end_request(req, cqe->status, cqe->result); +} - nvmeq->cqe_seen = 1; +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { + *cqe = nvmeq->cqes[nvmeq->cq_head]; + + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; + } + return true; + } + return false; } static void nvme_process_cq(struct nvme_queue *nvmeq) { - __nvme_process_cq(nvmeq, NULL); + struct nvme_completion cqe; + int consumed = 0; + + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + } + + if (consumed) { + nvme_ring_cq_doorbell(nvmeq); + nvmeq->cqe_seen = 1; + } } static irqreturn_t nvme_irq(int irq, void *data) @@ -842,16 +822,28 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { - spin_lock_irq(&nvmeq->q_lock); - __nvme_process_cq(nvmeq, &tag); - spin_unlock_irq(&nvmeq->q_lock); + struct nvme_completion cqe; + int found = 0, consumed = 0; - if (tag == -1) - return 1; - } + if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return 0; - return 0; + spin_lock_irq(&nvmeq->q_lock); + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + + if (tag == cqe.command_id) { + found = 1; + break; + } + } + + if (consumed) + nvme_ring_cq_doorbell(nvmeq); + 
spin_unlock_irq(&nvmeq->q_lock); + + return found; } static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -939,7 +931,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static void abort_endio(struct request *req, int error) +static void abort_endio(struct request *req, blk_status_t error) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = iod->nvmeq; @@ -950,6 +942,51 @@ static void abort_endio(struct request *req, int error) blk_mq_free_request(req); } +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. */ + if (dev->ctrl.state == NVME_CTRL_RESETTING) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) +{ + /* Read a config register to help see what died. 
*/ + u16 pci_status; + int result; + + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, + &pci_status); + if (result == PCIBIOS_SUCCESSFUL) + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", + csts, pci_status); + else + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", + csts, result); +} + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -957,6 +994,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Reset immediately if the controller is failed + */ + if (nvme_should_reset(dev, csts)) { + nvme_warn_reset(dev, csts); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + return BLK_EH_HANDLED; + } /* * Did we miss an interrupt? 
@@ -993,7 +1041,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); /* * Mark the request as handled, since the inline shutdown @@ -1247,7 +1295,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { .complete = nvme_pci_complete_rq, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, - .init_request = nvme_admin_init_request, + .init_request = nvme_init_request, .timeout = nvme_timeout, }; @@ -1311,6 +1359,32 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return 0; } +static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (size <= dev->bar_mapped_size) + return 0; + if (size > pci_resource_len(pdev, 0)) + return -ENOMEM; + if (dev->bar) + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (!dev->bar) { + dev->bar_mapped_size = 0; + return -ENOMEM; + } + dev->bar_mapped_size = size; + dev->dbs = dev->bar + NVME_REG_DBS; + + return 0; +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1318,6 +1392,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; + result = nvme_remap_bar(dev, db_bar_size(dev, 0)); + if (result < 0) + return result; + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? NVME_CAP_NSSRC(cap) : 0; @@ -1358,66 +1436,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) -{ - - /* If true, indicates loss of adapter communication, possibly by a - * NVMe Subsystem reset. 
- */ - bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - - /* If there is a reset ongoing, we shouldn't reset again. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) - return false; - - /* We shouldn't reset unless the controller is on fatal error state - * _or_ if we lost the communication with it. - */ - if (!(csts & NVME_CSTS_CFS) && !nssro) - return false; - - /* If PCI error recovery process is happening, we cannot reset or - * the recovery mechanism will surely fail. - */ - if (pci_channel_offline(to_pci_dev(dev->dev))) - return false; - - return true; -} - -static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) -{ - /* Read a config register to help see what died. */ - u16 pci_status; - int result; - - result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, - &pci_status); - if (result == PCIBIOS_SUCCESSFUL) - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", - csts, pci_status); - else - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", - csts, result); -} - -static void nvme_watchdog_timer(unsigned long data) -{ - struct nvme_dev *dev = (struct nvme_dev *)data; - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - /* Skip controllers under certain specific conditions. 
*/ - if (nvme_should_reset(dev, csts)) { - if (!nvme_reset(dev)) - nvme_warn_reset(dev, csts); - return; - } - - mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); -} - static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i, max; @@ -1514,16 +1532,168 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) } } -static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) +{ + size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs); + struct nvme_command c; + u64 dma_addr; + int ret; + + dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len, + DMA_TO_DEVICE); + if (dma_mapping_error(dev->dev, dma_addr)) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); + c.features.dword11 = cpu_to_le32(bits); + c.features.dword12 = cpu_to_le32(dev->host_mem_size >> + ilog2(dev->ctrl.page_size)); + c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); + c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); + c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); + + ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); + if (ret) { + dev_warn(dev->ctrl.device, + "failed to set host mem (err %d, flags %#x).\n", + ret, bits); + } + dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE); + return ret; +} + +static void nvme_free_host_mem(struct nvme_dev *dev) +{ + int i; + + for (i = 0; i < dev->nr_host_mem_descs; i++) { + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; + size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr)); + } + + kfree(dev->host_mem_desc_bufs); + dev->host_mem_desc_bufs = NULL; + kfree(dev->host_mem_descs); + dev->host_mem_descs = NULL; +} + +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 
preferred) { - return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); + struct nvme_host_mem_buf_desc *descs; + u32 chunk_size, max_entries, i = 0; + void **bufs; + u64 size, tmp; + + /* start big and work our way down */ + chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER); +retry: + tmp = (preferred + chunk_size - 1); + do_div(tmp, chunk_size); + max_entries = tmp; + descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL); + if (!descs) + goto out; + + bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); + if (!bufs) + goto out_free_descs; + + for (size = 0; size < preferred; size += chunk_size) { + u32 len = min_t(u64, chunk_size, preferred - size); + dma_addr_t dma_addr; + + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + if (!bufs[i]) + break; + + descs[i].addr = cpu_to_le64(dma_addr); + descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + i++; + } + + if (!size || (min && size < min)) { + dev_warn(dev->ctrl.device, + "failed to allocate host memory buffer.\n"); + goto out_free_bufs; + } + + dev_info(dev->ctrl.device, + "allocated %lld MiB host memory buffer.\n", + size >> ilog2(SZ_1M)); + dev->nr_host_mem_descs = i; + dev->host_mem_size = size; + dev->host_mem_descs = descs; + dev->host_mem_desc_bufs = bufs; + return 0; + +out_free_bufs: + while (--i >= 0) { + size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr)); + } + + kfree(bufs); +out_free_descs: + kfree(descs); +out: + /* try a smaller chunk size if we failed early */ + if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) { + chunk_size /= 2; + goto retry; + } + dev->host_mem_descs = NULL; + return -ENOMEM; +} + +static void nvme_setup_host_mem(struct nvme_dev *dev) +{ + u64 max = (u64)max_host_mem_size_mb * SZ_1M; + u64 preferred = (u64)dev->ctrl.hmpre * 4096; + u64 min = (u64)dev->ctrl.hmmin * 4096; + u32 enable_bits = 
NVME_HOST_MEM_ENABLE; + + preferred = min(preferred, max); + if (min > max) { + dev_warn(dev->ctrl.device, + "min host memory (%lld MiB) above limit (%d MiB).\n", + min >> ilog2(SZ_1M), max_host_mem_size_mb); + nvme_free_host_mem(dev); + return; + } + + /* + * If we already have a buffer allocated check if we can reuse it. + */ + if (dev->host_mem_descs) { + if (dev->host_mem_size >= min) + enable_bits |= NVME_HOST_MEM_RETURN; + else + nvme_free_host_mem(dev); + } + + if (!dev->host_mem_descs) { + if (nvme_alloc_host_mem(dev, min, preferred)) + return; + } + + if (nvme_set_host_mem(dev, enable_bits)) + nvme_free_host_mem(dev); } static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); - int result, nr_io_queues, size; + int result, nr_io_queues; + unsigned long size; nr_io_queues = num_online_cpus(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); @@ -1542,20 +1712,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_release_cmb(dev); } - size = db_bar_size(dev, nr_io_queues); - if (size > 8192) { - iounmap(dev->bar); - do { - dev->bar = ioremap(pci_resource_start(pdev, 0), size); - if (dev->bar) - break; - if (!--nr_io_queues) - return -ENOMEM; - size = db_bar_size(dev, nr_io_queues); - } while (1); - dev->dbs = dev->bar + 4096; - adminq->q_db = dev->dbs; - } + do { + size = db_bar_size(dev, nr_io_queues); + result = nvme_remap_bar(dev, size); + if (!result) + break; + if (!--nr_io_queues) + return -ENOMEM; + } while (1); + adminq->q_db = dev->dbs; /* Deregister the admin queue's interrupt */ pci_free_irq(pdev, 0, adminq); @@ -1586,7 +1751,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return nvme_create_io_queues(dev); } -static void nvme_del_queue_end(struct request *req, int error) +static void nvme_del_queue_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1594,7 +1759,7 @@ static void 
nvme_del_queue_end(struct request *req, int error) complete(&nvmeq->dev->ioq_wait); } -static void nvme_del_cq_end(struct request *req, int error) +static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1799,8 +1964,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); - del_timer_sync(&dev->watchdog_timer); - mutex_lock(&dev->shutdown_lock); if (pci_is_enabled(pdev)) { u32 csts = readl(dev->bar + NVME_REG_CSTS); @@ -1816,8 +1979,20 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead && shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + if (!dead) { + if (shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + + /* + * If the controller is still alive tell it to stop using the + * host memory buffer. In theory the shutdown / reset should + * make sure that it doesn't access the host memoery anymore, + * but I'd rather be safe than sorry.. 
+ */ + if (dev->host_mem_descs) + nvme_set_host_mem(dev, 0); + + } nvme_stop_queues(&dev->ctrl); queues = dev->online_queues - 1; @@ -1900,7 +2075,8 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + struct nvme_dev *dev = + container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; @@ -1949,6 +2125,9 @@ static void nvme_reset_work(struct work_struct *work) "unable to allocate dma for dbbuf\n"); } + if (dev->ctrl.hmpre) + nvme_setup_host_mem(dev); + result = nvme_setup_io_queues(dev); if (result) goto out; @@ -1962,8 +2141,6 @@ static void nvme_reset_work(struct work_struct *work) if (dev->online_queues > 1) nvme_queue_async_events(&dev->ctrl); - mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); - /* * Keep the controller around but remove all namespaces if we don't have * any working I/O queue. 
@@ -2003,17 +2180,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } -static int nvme_reset(struct nvme_dev *dev) -{ - if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) - return -ENODEV; - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - if (!queue_work(nvme_workq, &dev->reset_work)) - return -EBUSY; - return 0; -} - static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { *val = readl(to_nvme_dev(ctrl)->bar + off); @@ -2032,16 +2198,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } -static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - int ret = nvme_reset(dev); - - if (!ret) - flush_work(&dev->reset_work); - return ret; -} - static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, @@ -2049,7 +2205,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, - .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, .submit_async_event = nvme_pci_submit_async_event, }; @@ -2061,8 +2216,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (pci_request_mem_regions(pdev, "nvme")) return -ENODEV; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) + if (nvme_remap_bar(dev, NVME_REG_DBS + 4096)) goto release; return 0; @@ -2116,10 +2270,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto free; - INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, - (unsigned long)dev); mutex_init(&dev->shutdown_lock); init_completion(&dev->ioq_wait); @@ -2137,7 +2289,7 @@ static int nvme_probe(struct pci_dev 
*pdev, const struct pci_device_id *id) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - queue_work(nvme_workq, &dev->reset_work); + queue_work(nvme_wq, &dev->ctrl.reset_work); return 0; release_pools: @@ -2158,7 +2310,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) if (prepare) nvme_dev_disable(dev, false); else - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2178,7 +2330,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - cancel_work_sync(&dev->reset_work); + cancel_work_sync(&dev->ctrl.reset_work); pci_set_drvdata(pdev, NULL); if (!pci_device_is_present(pdev)) { @@ -2186,9 +2338,10 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_disable(dev, false); } - flush_work(&dev->reset_work); + flush_work(&dev->ctrl.reset_work); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); @@ -2229,7 +2382,7 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_reset(ndev); + nvme_reset_ctrl(&ndev->ctrl); return 0; } #endif @@ -2268,7 +2421,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); return PCI_ERS_RESULT_RECOVERED; } @@ -2324,22 +2477,12 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result; - - nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); - if (!nvme_workq) - return -ENOMEM; - - result = pci_register_driver(&nvme_driver); - if (result) - destroy_workqueue(nvme_workq); - return result; + return pci_register_driver(&nvme_driver); } static void 
__exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - destroy_workqueue(nvme_workq); _nvme_check_size(); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 24397d306d53..6d4119dfbdaa 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -48,7 +48,7 @@ */ #define NVME_RDMA_NR_AEN_COMMANDS 1 #define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) struct nvme_rdma_device { struct ib_device *dev; @@ -80,10 +80,8 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_CONNECTED = (1 << 0), - NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1), - NVME_RDMA_Q_DELETING = (1 << 2), - NVME_RDMA_Q_LIVE = (1 << 3), + NVME_RDMA_Q_LIVE = 0, + NVME_RDMA_Q_DELETING = 1, }; struct nvme_rdma_queue { @@ -103,9 +101,6 @@ struct nvme_rdma_queue { }; struct nvme_rdma_ctrl { - /* read and written in the hot path */ - spinlock_t lock; - /* read only in the hot path */ struct nvme_rdma_queue *queues; u32 queue_count; @@ -113,7 +108,6 @@ struct nvme_rdma_ctrl { /* other member variables */ struct blk_mq_tag_set tag_set; struct work_struct delete_work; - struct work_struct reset_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -145,8 +139,6 @@ static DEFINE_MUTEX(device_list_mutex); static LIST_HEAD(nvme_rdma_ctrl_list); static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); -static struct workqueue_struct *nvme_rdma_wq; - /* * Disabling this option makes small I/O goes faster, but is fundamentally * unsafe. 
With it turned off we will have to register a global rkey that @@ -301,10 +293,12 @@ out: return ret; } -static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, - struct request *rq, unsigned int queue_idx) +static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) { + struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; @@ -315,22 +309,13 @@ static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, DMA_TO_DEVICE); } -static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx) -{ - return __nvme_rdma_exit_request(set->driver_data, rq, hctx_idx + 1); -} - -static void nvme_rdma_exit_admin_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx) -{ - return __nvme_rdma_exit_request(set->driver_data, rq, 0); -} - -static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, - struct request *rq, unsigned int queue_idx) +static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) { + struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; @@ -358,20 +343,6 @@ out_free_qe: return -ENOMEM; } -static int nvme_rdma_init_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) -{ - return __nvme_rdma_init_request(set->driver_data, rq, hctx_idx + 1); -} - -static int nvme_rdma_init_admin_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) -{ - return __nvme_rdma_init_request(set->driver_data, rq, 0); -} - static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -469,9 +440,6 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) struct nvme_rdma_device *dev; struct ib_device *ibdev; - if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags)) - return; - dev = queue->device; ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); @@ -483,17 +451,21 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) nvme_rdma_dev_put(dev); } -static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, - struct nvme_rdma_device *dev) +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { - struct ib_device *ibdev = dev->dev; + struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); - int ret; - queue->device = dev; + queue->device = nvme_rdma_find_get_device(queue->cm_id); + if (!queue->device) { + dev_err(queue->cm_id->device->dev.parent, + "no client data found!\n"); + return -ECONNREFUSED; + } + ibdev = queue->device->dev; /* * The admin queue is barely used once the controller is live, so don't @@ -506,12 +478,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, /* +1 for ib_stop_cq */ - queue->ib_cq = 
ib_alloc_cq(dev->dev, queue, - cq_factor * queue->queue_size + 1, comp_vector, - IB_POLL_SOFTIRQ); + queue->ib_cq = ib_alloc_cq(ibdev, queue, + cq_factor * queue->queue_size + 1, + comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); - goto out; + goto out_put_dev; } ret = nvme_rdma_create_qp(queue, send_wr_factor); @@ -524,7 +496,6 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, ret = -ENOMEM; goto out_destroy_qp; } - set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags); return 0; @@ -532,7 +503,8 @@ out_destroy_qp: ib_destroy_qp(queue->qp); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); -out: +out_put_dev: + nvme_rdma_dev_put(queue->device); return ret; } @@ -583,12 +555,10 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, } clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); - set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); return 0; out_destroy_cm_id: - nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); return ret; } @@ -718,11 +688,11 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) if (nvmf_should_reconnect(&ctrl->ctrl)) { dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + queue_delayed_work(nvme_wq, &ctrl->reconnect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_rdma_wq, &ctrl->delete_work); + queue_work(nvme_wq, &ctrl->delete_work); } } @@ -733,7 +703,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) bool changed; int ret; - ++ctrl->ctrl.opts->nr_reconnects; + ++ctrl->ctrl.nr_reconnects; if (ctrl->queue_count > 1) { nvme_rdma_free_io_queues(ctrl); @@ -749,7 +719,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto requeue; - ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + ret = nvme_rdma_init_queue(ctrl, 
0, NVME_AQ_DEPTH); if (ret) goto requeue; @@ -777,7 +747,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->ctrl.opts->nr_reconnects = 0; + ctrl->ctrl.nr_reconnects = 0; if (ctrl->queue_count > 1) { nvme_queue_scan(&ctrl->ctrl); @@ -790,7 +760,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", - ctrl->ctrl.opts->nr_reconnects); + ctrl->ctrl.nr_reconnects); nvme_rdma_reconnect_or_remove(ctrl); } @@ -802,10 +772,8 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); - for (i = 0; i < ctrl->queue_count; i++) { - clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags); + for (i = 0; i < ctrl->queue_count; i++) clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - } if (ctrl->queue_count > 1) nvme_stop_queues(&ctrl->ctrl); @@ -833,7 +801,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) return; - queue_work(nvme_rdma_wq, &ctrl->err_work); + queue_work(nvme_wq, &ctrl->err_work); } static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, @@ -1278,21 +1246,11 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev; int ret; - dev = nvme_rdma_find_get_device(queue->cm_id); - if (!dev) { - dev_err(queue->cm_id->device->dev.parent, - "no client data found!\n"); - return -ECONNREFUSED; - } - - ret = nvme_rdma_create_queue_ib(queue, dev); - if (ret) { - nvme_rdma_dev_put(dev); - goto out; - } + ret = nvme_rdma_create_queue_ib(queue); + if (ret) + return ret; ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { @@ -1306,7 +1264,6 @@ static int nvme_rdma_addr_resolved(struct 
nvme_rdma_queue *queue) out_destroy_queue: nvme_rdma_destroy_queue_ib(queue); -out: return ret; } @@ -1334,8 +1291,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) * specified by the Fabrics standard. */ if (priv.qid == 0) { - priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH); - priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); } else { /* * current interpretation of the fabrics spec @@ -1383,12 +1340,14 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, complete(&queue->cm_done); return 0; case RDMA_CM_EVENT_REJECTED: + nvme_rdma_destroy_queue_ib(queue); cm_error = nvme_rdma_conn_rejected(queue, ev); break; - case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: + nvme_rdma_destroy_queue_ib(queue); + case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); cm_error = -ECONNRESET; @@ -1435,8 +1394,8 @@ nvme_rdma_timeout(struct request *rq, bool reserved) /* * We cannot accept any other command until the Connect command has completed. */ -static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, - struct request *rq) +static inline blk_status_t +nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq) { if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { struct nvme_command *cmd = nvme_req(rq)->cmd; @@ -1452,16 +1411,15 @@ static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, * failover. 
*/ if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING) - return -EIO; - else - return -EAGAIN; + return BLK_STS_IOERR; + return BLK_STS_RESOURCE; /* try again later */ } } return 0; } -static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; @@ -1472,28 +1430,29 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *c = sqe->data; bool flush = false; struct ib_device *dev; - int ret; + blk_status_t ret; + int err; WARN_ON_ONCE(rq->tag < 0); ret = nvme_rdma_queue_is_ready(queue, rq); if (unlikely(ret)) - goto err; + return ret; dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); ret = nvme_setup_cmd(ns, rq, c); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; blk_mq_start_request(rq); - ret = nvme_rdma_map_data(queue, rq, c); - if (ret < 0) { + err = nvme_rdma_map_data(queue, rq, c); + if (err < 0) { dev_err(queue->ctrl->ctrl.device, - "Failed to map data (%d)\n", ret); + "Failed to map data (%d)\n", err); nvme_cleanup_cmd(rq); goto err; } @@ -1503,17 +1462,18 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (req_op(rq) == REQ_OP_FLUSH) flush = true; - ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); - if (ret) { + if (err) { nvme_rdma_unmap_data(queue, rq); goto err; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; err: - return (ret == -ENOMEM || ret == -EAGAIN) ? 
- BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; + if (err == -ENOMEM || err == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -1523,7 +1483,6 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) struct ib_wc wc; int found = 0; - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { struct ib_cqe *cqe = wc.wr_cqe; @@ -1560,8 +1519,8 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, - .init_request = nvme_rdma_init_admin_request, - .exit_request = nvme_rdma_exit_admin_request, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_admin_hctx, .timeout = nvme_rdma_timeout, @@ -1571,7 +1530,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) { int error; - error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); if (error) return error; @@ -1672,7 +1631,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_rdma_free_io_queues(ctrl); } - if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags)) + if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) nvme_shutdown_ctrl(&ctrl->ctrl); blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); @@ -1709,7 +1668,7 @@ static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!queue_work(nvme_rdma_wq, &ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return -EBUSY; return 0; @@ -1743,8 +1702,8 @@ static void nvme_rdma_remove_ctrl_work(struct work_struct *work) static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { - struct nvme_rdma_ctrl *ctrl = container_of(work, - 
struct nvme_rdma_ctrl, reset_work); + struct nvme_rdma_ctrl *ctrl = + container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); int ret; bool changed; @@ -1785,22 +1744,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) del_dead_ctrl: /* Deleting this dead controller... */ dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work)); -} - -static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!queue_work(nvme_rdma_wq, &ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; + WARN_ON(!queue_work(nvme_wq, &ctrl->delete_work)); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { @@ -1810,11 +1754,9 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_rdma_reset_ctrl, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_del_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, .get_address = nvmf_get_address, }; @@ -1919,8 +1861,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work); - spin_lock_init(&ctrl->lock); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; @@ -1939,12 +1880,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, /* sanity check icdoff */ if (ctrl->ctrl.icdoff) { dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + ret = -EINVAL; goto 
out_remove_admin_queue; } /* sanity check keyed sgls */ if (!(ctrl->ctrl.sgls & (1 << 20))) { dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + ret = -EINVAL; goto out_remove_admin_queue; } @@ -2033,7 +1976,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) } mutex_unlock(&nvme_rdma_ctrl_mutex); - flush_workqueue(nvme_rdma_wq); + flush_workqueue(nvme_wq); } static struct ib_client nvme_rdma_ib_client = { @@ -2046,13 +1989,9 @@ static int __init nvme_rdma_init_module(void) { int ret; - nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); - if (!nvme_rdma_wq) - return -ENOMEM; - ret = ib_register_client(&nvme_rdma_ib_client); if (ret) - goto err_destroy_wq; + return ret; ret = nvmf_register_transport(&nvme_rdma_transport); if (ret) @@ -2062,8 +2001,6 @@ static int __init nvme_rdma_init_module(void) err_unreg_client: ib_unregister_client(&nvme_rdma_ib_client); -err_destroy_wq: - destroy_workqueue(nvme_rdma_wq); return ret; } @@ -2071,7 +2008,6 @@ static void __exit nvme_rdma_cleanup_module(void) { nvmf_unregister_transport(&nvme_rdma_transport); ib_unregister_client(&nvme_rdma_ib_client); - destroy_workqueue(nvme_rdma_wq); } module_init(nvme_rdma_init_module); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c deleted file mode 100644 index 1f7671e631dd..000000000000 --- a/drivers/nvme/host/scsi.c +++ /dev/null @@ -1,2460 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- */ - -/* - * Refer to the SCSI-NVMe Translation spec for details on how - * each command is translated. - */ - -#include <linux/bio.h> -#include <linux/bitops.h> -#include <linux/blkdev.h> -#include <linux/compat.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/idr.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kthread.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/poison.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <asm/unaligned.h> -#include <scsi/sg.h> -#include <scsi/scsi.h> -#include <scsi/scsi_request.h> - -#include "nvme.h" - -static int sg_version_num = 30534; /* 2 digits for each component */ - -/* VPD Page Codes */ -#define VPD_SUPPORTED_PAGES 0x00 -#define VPD_SERIAL_NUMBER 0x80 -#define VPD_DEVICE_IDENTIFIERS 0x83 -#define VPD_EXTENDED_INQUIRY 0x86 -#define VPD_BLOCK_LIMITS 0xB0 -#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 - -/* format unit paramter list offsets */ -#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 -#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 -#define FORMAT_UNIT_PROT_INT_OFFSET 3 -#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 -#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 - -/* Misc. 
defines */ -#define FIXED_SENSE_DATA 0x70 -#define DESC_FORMAT_SENSE_DATA 0x72 -#define FIXED_SENSE_DATA_ADD_LENGTH 10 -#define LUN_ENTRY_SIZE 8 -#define LUN_DATA_HEADER_SIZE 8 -#define ALL_LUNS_RETURNED 0x02 -#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 -#define RESTRICTED_LUNS_RETURNED 0x00 -#define DOWNLOAD_SAVE_ACTIVATE 0x05 -#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E -#define ACTIVATE_DEFERRED_MICROCODE 0x0F -#define FORMAT_UNIT_IMMED_MASK 0x2 -#define FORMAT_UNIT_IMMED_OFFSET 1 -#define KELVIN_TEMP_FACTOR 273 -#define FIXED_FMT_SENSE_DATA_SIZE 18 -#define DESC_FMT_SENSE_DATA_SIZE 8 - -/* SCSI/NVMe defines and bit masks */ -#define INQ_STANDARD_INQUIRY_PAGE 0x00 -#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 -#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 -#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 -#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 -#define INQ_BDEV_LIMITS_PAGE 0xB0 -#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 -#define INQ_SERIAL_NUMBER_LENGTH 0x14 -#define INQ_NUM_SUPPORTED_VPD_PAGES 6 -#define VERSION_SPC_4 0x06 -#define ACA_UNSUPPORTED 0 -#define STANDARD_INQUIRY_LENGTH 36 -#define ADDITIONAL_STD_INQ_LENGTH 31 -#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C -#define RESERVED_FIELD 0 - -/* Mode Sense/Select defines */ -#define MODE_PAGE_INFO_EXCEP 0x1C -#define MODE_PAGE_CACHING 0x08 -#define MODE_PAGE_CONTROL 0x0A -#define MODE_PAGE_POWER_CONDITION 0x1A -#define MODE_PAGE_RETURN_ALL 0x3F -#define MODE_PAGE_BLK_DES_LEN 0x08 -#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 -#define MODE_PAGE_CACHING_LEN 0x14 -#define MODE_PAGE_CONTROL_LEN 0x0C -#define MODE_PAGE_POW_CND_LEN 0x28 -#define MODE_PAGE_INF_EXC_LEN 0x0C -#define MODE_PAGE_ALL_LEN 0x54 -#define MODE_SENSE6_MPH_SIZE 4 -#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 -#define MODE_SENSE_PAGE_CODE_OFFSET 2 -#define MODE_SENSE_PAGE_CODE_MASK 0x3F -#define MODE_SENSE_LLBAA_MASK 0x10 -#define MODE_SENSE_LLBAA_SHIFT 4 -#define MODE_SENSE_DBD_MASK 8 -#define MODE_SENSE_DBD_SHIFT 3 -#define MODE_SENSE10_MPH_SIZE 8 -#define 
MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 -#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 -#define MODE_SELECT_6_BD_OFFSET 3 -#define MODE_SELECT_10_BD_OFFSET 6 -#define MODE_SELECT_10_LLBAA_OFFSET 4 -#define MODE_SELECT_10_LLBAA_MASK 1 -#define MODE_SELECT_6_MPH_SIZE 4 -#define MODE_SELECT_10_MPH_SIZE 8 -#define CACHING_MODE_PAGE_WCE_MASK 0x04 -#define MODE_SENSE_BLK_DESC_ENABLED 0 -#define MODE_SENSE_BLK_DESC_COUNT 1 -#define MODE_SELECT_PAGE_CODE_MASK 0x3F -#define SHORT_DESC_BLOCK 8 -#define LONG_DESC_BLOCK 16 -#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 -#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A -#define MODE_PAGE_CACHING_LEN_FIELD 0x12 -#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A -#define MODE_SENSE_PC_CURRENT_VALUES 0 - -/* Log Sense defines */ -#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 -#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 -#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F -#define LOG_PAGE_TEMPERATURE_PAGE 0x0D -#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 -#define LOG_SENSE_CDB_PC_MASK 0xC0 -#define LOG_SENSE_CDB_PC_SHIFT 6 -#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 -#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F -#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 -#define LOG_INFO_EXCP_PAGE_LENGTH 0xC -#define REMAINING_TEMP_PAGE_LENGTH 0xC -#define LOG_TEMP_PAGE_LENGTH 0x10 -#define LOG_TEMP_UNKNOWN 0xFF -#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 - -/* Read Capacity defines */ -#define READ_CAP_10_RESP_SIZE 8 -#define READ_CAP_16_RESP_SIZE 32 - -/* NVMe Namespace and Command Defines */ -#define BYTES_TO_DWORDS 4 -#define NVME_MAX_FIRMWARE_SLOT 7 - -/* Report LUNs defines */ -#define REPORT_LUNS_FIRST_LUN_OFFSET 8 - -/* SCSI ADDITIONAL SENSE Codes */ - -#define SCSI_ASC_NO_SENSE 0x00 -#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 -#define SCSI_ASC_LUN_NOT_READY 0x04 -#define SCSI_ASC_WARNING 0x0B -#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 
-#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 -#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D -#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 -#define SCSI_ASC_ILLEGAL_COMMAND 0x20 -#define SCSI_ASC_ILLEGAL_BLOCK 0x21 -#define SCSI_ASC_INVALID_CDB 0x24 -#define SCSI_ASC_INVALID_LUN 0x25 -#define SCSI_ASC_INVALID_PARAMETER 0x26 -#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 -#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 - -/* SCSI ADDITIONAL SENSE Code Qualifiers */ - -#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 -#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 -#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 -#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 -#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 -#define SCSI_ASCQ_INVALID_LUN_ID 0x09 - -/* copied from drivers/usb/gadget/function/storage_common.h */ -static inline u32 get_unaligned_be24(u8 *buf) -{ - return 0xffffff & (u32) get_unaligned_be32(buf - 1); -} - -/* Struct to gather data that needs to be extracted from a SCSI CDB. - Not conforming to any particular CDB variant, but compatible with all. 
*/ - -struct nvme_trans_io_cdb { - u8 fua; - u8 prot_info; - u64 lba; - u32 xfer_len; -}; - - -/* Internal Helper Functions */ - - -/* Copy data to userspace memory */ - -static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, - unsigned long n) -{ - int i; - void *index = from; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - if (copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec))) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - if (copy_to_user(sgl.iov_base, index, xfer_len)) - return -EFAULT; - - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return 0; - } - - if (copy_to_user(hdr->dxferp, from, n)) - return -EFAULT; - return 0; -} - -/* Copy data from userspace memory */ - -static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, - unsigned long n) -{ - int i; - void *index = to; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - if (copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec))) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - if (copy_from_user(index, sgl.iov_base, xfer_len)) - return -EFAULT; - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return 0; - } - - if (copy_from_user(to, hdr->dxferp, n)) - return -EFAULT; - return 0; -} - -/* Status/Sense Buffer Writeback */ - -static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, - u8 asc, u8 ascq) -{ - u8 xfer_len; - u8 resp[DESC_FMT_SENSE_DATA_SIZE]; - - if (scsi_status_is_good(status)) { - hdr->status = SAM_STAT_GOOD; - hdr->masked_status = GOOD; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - hdr->sb_len_wr = 0; - } else { - hdr->status = status; - hdr->masked_status = 
status >> 1; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - - memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); - resp[0] = DESC_FORMAT_SENSE_DATA; - resp[1] = sense_key; - resp[2] = asc; - resp[3] = ascq; - - xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); - hdr->sb_len_wr = xfer_len; - if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) - return -EFAULT; - } - - return 0; -} - -/* - * Take a status code from a lowlevel routine, and if it was a positive NVMe - * error code update the sense data based on it. In either case the passed - * in value is returned again, unless an -EFAULT from copy_to_user overrides - * it. - */ -static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) -{ - u8 status, sense_key, asc, ascq; - int res; - - /* For non-nvme (Linux) errors, simply return the error code */ - if (nvme_sc < 0) - return nvme_sc; - - /* Mask DNR, More, and reserved fields */ - switch (nvme_sc & 0x7FF) { - /* Generic Command Status */ - case NVME_SC_SUCCESS: - status = SAM_STAT_GOOD; - sense_key = NO_SENSE; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_OPCODE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_COMMAND; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_FIELD: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_DATA_XFER_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_POWER_LOSS: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_WARNING; - ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; - break; - case NVME_SC_INTERNAL: - status = SAM_STAT_CHECK_CONDITION; - sense_key = HARDWARE_ERROR; - asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; - ascq = 
SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_REQ: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_QUEUE: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_FAIL: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_MISSING: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_NS: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - case NVME_SC_LBA_RANGE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_BLOCK; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_CAP_EXCEEDED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_NS_NOT_READY: - status = SAM_STAT_CHECK_CONDITION; - sense_key = NOT_READY; - asc = SCSI_ASC_LUN_NOT_READY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Command Specific Status */ - case NVME_SC_INVALID_FORMAT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_FORMAT_COMMAND_FAILED; - ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; - break; - case NVME_SC_BAD_ATTRIBUTES: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Media Errors */ - case NVME_SC_WRITE_FAULT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; - ascq = 
SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_READ_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_UNRECOVERED_READ_ERROR; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_GUARD_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; - break; - case NVME_SC_APPTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; - break; - case NVME_SC_REFTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; - break; - case NVME_SC_COMPARE_FAILED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MISCOMPARE; - asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ACCESS_DENIED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - - /* Unspecified/Default */ - case NVME_SC_CMDID_CONFLICT: - case NVME_SC_CMD_SEQ_ERROR: - case NVME_SC_CQ_INVALID: - case NVME_SC_QID_INVALID: - case NVME_SC_QUEUE_SIZE: - case NVME_SC_ABORT_LIMIT: - case NVME_SC_ABORT_MISSING: - case NVME_SC_ASYNC_LIMIT: - case NVME_SC_FIRMWARE_SLOT: - case NVME_SC_FIRMWARE_IMAGE: - case NVME_SC_INVALID_VECTOR: - case NVME_SC_INVALID_LOG_PAGE: - default: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - } - - res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); - return res ? 
res : nvme_sc; -} - -/* INQUIRY Helper Functions */ - -static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ns *id_ns; - int res; - int nvme_sc; - int xfer_len; - u8 resp_data_format = 0x02; - u8 protect; - u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(ctrl->firmware_rev); - - /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - if (id_ns->dps) - protect = 0x01; - else - protect = 0; - kfree(id_ns); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[2] = VERSION_SPC_4; - inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ - inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; - inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ - inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ - strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], ctrl->model, 16); - - while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) - fw_offset--; - fw_offset -= 4; - strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ - inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ - inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; - inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; - inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; - inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; - inq_response[9] = INQ_BDEV_LIMITS_PAGE; - - 
xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_trans_unit_serial_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ - inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - struct nvme_id_ns *id_ns; - int nvme_sc, res; - size_t len; - void *eui; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - - if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } - - if (bitmap_empty(eui, len * 8)) { - res = -EOPNOTSUPP; - goto out_free_id; - } - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[3] = 4 + len; /* Page Length */ - - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - - res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); -out_free_id: - kfree(id_ns); - return res; -} - -static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ctrl *id_ctrl; - int nvme_sc, 
res; - - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[3] = 0x48; /* Page Length */ - - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); - memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); - sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); - memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); - - res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); - kfree(id_ctrl); - return res; -} - -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int alloc_len) -{ - int res; - - if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) { - res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); - if (res != -EOPNOTSUPP) - return res; - } - - return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); -} - -static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res; - int nvme_sc; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ctrl *id_ctrl; - struct nvme_id_ns *id_ns; - int xfer_len; - u8 microcode = 0x80; - u8 spt; - u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; - u8 grd_chk, app_chk, ref_chk, protect; - u8 uask_sup = 0x20; - u8 v_sup; - u8 luiclr = 0x01; - - inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) - return -ENOMEM; - - nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); 
- res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free_inq; - - spt = spt_lut[id_ns->dpc & 0x07] << 3; - if (id_ns->dps) - protect = 0x01; - else - protect = 0; - kfree(id_ns); - - grd_chk = protect << 2; - app_chk = protect << 1; - ref_chk = protect; - - nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free_inq; - - v_sup = id_ctrl->vwc; - kfree(id_ctrl); - - memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; - inq_response[5] = uask_sup; - inq_response[6] = v_sup; - inq_response[7] = luiclr; - inq_response[8] = 0; - inq_response[9] = 0; - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free_inq: - kfree(inq_response); - return res; -} - -static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - __be32 max_sectors = cpu_to_be32( - nvme_block_nr(ns, queue_max_hw_sectors(ns->queue))); - __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors); - __be32 discard_desc_count = cpu_to_be32(0x100); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = VPD_BLOCK_LIMITS; - inq_response[3] = 0x3c; /* Page Length */ - memcpy(&inq_response[8], &max_sectors, sizeof(u32)); - memcpy(&inq_response[20], &max_discard, sizeof(u32)); - - if (max_discard) - memcpy(&inq_response[24], &discard_desc_count, sizeof(u32)); - - return nvme_trans_copy_to_user(hdr, inq_response, 0x3c); -} - -static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res; - int xfer_len; - - inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, 
GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ - inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ - inq_response[6] = 0x00; /* Form Factor */ - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - kfree(inq_response); - out_mem: - return res; -} - -/* LOG SENSE Helper Functions */ - -static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - - log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; - log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; - - xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - kfree(log_response); - out_mem: - return res; -} - -static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, - struct sg_io_hdr *hdr, int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - struct nvme_smart_log *smart_log; - u8 temp_c; - u16 temp_k; - - log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) - return -ENOMEM; - - res = nvme_get_log_page(ns->ctrl, &smart_log); - if (res < 0) - goto out_free_response; - - if (res != NVME_SC_SUCCESS) { - temp_c = LOG_TEMP_UNKNOWN; - } else { - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c = 
temp_k - KELVIN_TEMP_FACTOR; - } - kfree(smart_log); - - log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; - /* Informational Exceptions Log Parameter 1 Start */ - /* Parameter Code=0x0000 bytes 4,5 */ - log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ - log_response[7] = 0x04; /* PARAMETER LENGTH */ - /* Add sense Code and qualifier = 0x00 each */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[10] = temp_c; - - xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - out_free_response: - kfree(log_response); - return res; -} - -static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - struct nvme_smart_log *smart_log; - u32 feature_resp; - u8 temp_c_cur, temp_c_thresh; - u16 temp_k; - - log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) - return -ENOMEM; - - res = nvme_get_log_page(ns->ctrl, &smart_log); - if (res < 0) - goto out_free_response; - - if (res != NVME_SC_SUCCESS) { - temp_c_cur = LOG_TEMP_UNKNOWN; - } else { - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; - } - kfree(smart_log); - - /* Get Features for Temp Threshold */ - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, - &feature_resp); - if (res != NVME_SC_SUCCESS) - temp_c_thresh = LOG_TEMP_UNKNOWN; - else - temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; - - log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_TEMP_PAGE_LENGTH; - /* Temperature Log Parameter 1 (Temperature) Start */ - /* Parameter Code = 0x0000 */ - log_response[6] = 0x01; /* Format and Linking = 01b */ - log_response[7] 
= 0x02; /* Parameter Length */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[9] = temp_c_cur; - /* Temperature Log Parameter 2 (Reference Temperature) Start */ - log_response[11] = 0x01; /* Parameter Code = 0x0001 */ - log_response[12] = 0x01; /* Format and Linking = 01b */ - log_response[13] = 0x02; /* Parameter Length */ - /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ - log_response[15] = temp_c_thresh; - - xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - out_free_response: - kfree(log_response); - return res; -} - -/* MODE SENSE Helper Functions */ - -static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, - u16 mode_data_length, u16 blk_desc_len) -{ - /* Quick check to make sure I don't stomp on my own memory... */ - if ((cdb10 && len < 8) || (!cdb10 && len < 4)) - return -EINVAL; - - if (cdb10) { - resp[0] = (mode_data_length & 0xFF00) >> 8; - resp[1] = (mode_data_length & 0x00FF); - resp[3] = 0x10 /* DPOFUA */; - resp[4] = llbaa; - resp[5] = RESERVED_FIELD; - resp[6] = (blk_desc_len & 0xFF00) >> 8; - resp[7] = (blk_desc_len & 0x00FF); - } else { - resp[0] = (mode_data_length & 0x00FF); - resp[2] = 0x10 /* DPOFUA */; - resp[3] = (blk_desc_len & 0x00FF); - } - - return 0; -} - -static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len, u8 llbaa) -{ - int res; - int nvme_sc; - struct nvme_id_ns *id_ns; - u8 flbas; - u32 lba_length; - - if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) - return -EINVAL; - else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) - return -EINVAL; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - - if (llbaa == 0) { - __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); - /* 
Byte 4 is reserved */ - __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); - - memcpy(resp, &tmp_cap, sizeof(u32)); - memcpy(&resp[4], &tmp_len, sizeof(u32)); - } else { - __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); - __be32 tmp_len = cpu_to_be32(lba_length); - - memcpy(resp, &tmp_cap, sizeof(u64)); - /* Bytes 8, 9, 10, 11 are reserved */ - memcpy(&resp[12], &tmp_len, sizeof(u32)); - } - - kfree(id_ns); - return res; -} - -static int nvme_trans_fill_control_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_CONTROL_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_CONTROL; - resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; - resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, - * D_SENSE=1, GLTSD=1, RLEC=0 */ - resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ - /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ - resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ - /* resp[6] and [7] are obsolete, thus zero */ - resp[8] = 0xFF; /* Busy timeout period = 0xffff */ - resp[9] = 0xFF; - /* Bytes 10,11: Extended selftest completion time = 0x0000 */ - - return 0; -} - -static int nvme_trans_fill_caching_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res = 0; - int nvme_sc; - u32 feature_resp; - u8 vwc; - - if (len < MODE_PAGE_CACHING_LEN) - return -EINVAL; - - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, - &feature_resp); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - vwc = feature_resp & 0x00000001; - - resp[0] = MODE_PAGE_CACHING; - resp[1] = MODE_PAGE_CACHING_LEN_FIELD; - resp[2] = vwc << 2; - return 0; -} - -static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_POW_CND_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_POWER_CONDITION; - resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; - /* All other bytes are zero */ - - return 0; -} - -static int 
nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_INF_EXC_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_INFO_EXCEP; - resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; - resp[2] = 0x88; - /* All other bytes are zero */ - - return 0; -} - -static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res; - u16 mode_pages_offset_1 = 0; - u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; - - mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; - mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; - mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; - - res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], - MODE_PAGE_CACHING_LEN); - if (res) - return res; - res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], - MODE_PAGE_CONTROL_LEN); - if (res) - return res; - res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], - MODE_PAGE_POW_CND_LEN); - if (res) - return res; - return nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], - MODE_PAGE_INF_EXC_LEN); -} - -static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) -{ - if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { - /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ - return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; - } else { - return 0; - } -} - -static int nvme_trans_mode_page_create(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *cmd, - u16 alloc_len, u8 cdb10, - int (*mode_page_fill_func) - (struct nvme_ns *, - struct sg_io_hdr *hdr, u8 *, int), - u16 mode_pages_tot_len) -{ - int res; - int xfer_len; - u8 *response; - u8 dbd, llbaa; - u16 resp_size; - int mph_size; - u16 mode_pages_offset_1; - u16 blk_desc_len, blk_desc_offset, mode_data_length; - - dbd = (cmd[1] & MODE_SENSE_DBD_MASK) >> MODE_SENSE_DBD_SHIFT; - llbaa = (cmd[1] & MODE_SENSE_LLBAA_MASK) >> 
MODE_SENSE_LLBAA_SHIFT; - mph_size = cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE; - - blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); - - resp_size = mph_size + blk_desc_len + mode_pages_tot_len; - /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ - mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; - - blk_desc_offset = mph_size; - mode_pages_offset_1 = blk_desc_offset + blk_desc_len; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, - llbaa, mode_data_length, blk_desc_len); - if (res) - goto out_free; - if (blk_desc_len > 0) { - res = nvme_trans_fill_blk_desc(ns, hdr, - &response[blk_desc_offset], - blk_desc_len, llbaa); - if (res) - goto out_free; - } - res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], - mode_pages_tot_len); - if (res) - goto out_free; - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - out_free: - kfree(response); - out_mem: - return res; -} - -/* Read Capacity Helper Functions */ - -static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, - u8 cdb16) -{ - u8 flbas; - u32 lba_length; - u64 rlba; - u8 prot_en; - u8 p_type_lut[4] = {0, 0, 1, 2}; - __be64 tmp_rlba; - __be32 tmp_rlba_32; - __be32 tmp_len; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - rlba = le64_to_cpup(&id_ns->nsze) - 1; - (id_ns->dps) ? 
(prot_en = 0x01) : (prot_en = 0); - - if (!cdb16) { - if (rlba > 0xFFFFFFFF) - rlba = 0xFFFFFFFF; - tmp_rlba_32 = cpu_to_be32(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba_32, sizeof(u32)); - memcpy(&response[4], &tmp_len, sizeof(u32)); - } else { - tmp_rlba = cpu_to_be64(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba, sizeof(u64)); - memcpy(&response[8], &tmp_len, sizeof(u32)); - response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; - /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ - /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ - /* Bytes 16-31 - Reserved */ - } -} - -/* Start Stop Unit Helper Functions */ - -static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 buffer_id) -{ - struct nvme_command c; - int nvme_sc; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_activate_fw; - c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 opcode, u32 tot_len, u32 offset, - u8 buffer_id) -{ - int nvme_sc; - struct nvme_command c; - - if (hdr->iovec_count > 0) { - /* Assuming SGL is not allowed for this command */ - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_download_fw; - c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); - c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - - nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, - hdr->dxferp, tot_len, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -/* Mode Select Helper Functions */ - -static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, - u16 *bd_len, u8 *llbaa) -{ - if (cdb10) { - /* 10 Byte CDB */ - 
*bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + - parm_list[MODE_SELECT_10_BD_OFFSET + 1]; - *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & - MODE_SELECT_10_LLBAA_MASK; - } else { - /* 6 Byte CDB */ - *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; - } -} - -static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, - u16 idx, u16 bd_len, u8 llbaa) -{ - /* Store block descriptor info if a FORMAT UNIT comes later */ - /* TODO Saving 1st BD info; what to do if multiple BD received? */ - if (llbaa == 0) { - /* Standard Block Descriptor - spc4r34 7.5.5.1 */ - ns->mode_select_num_blocks = - (parm_list[idx + 1] << 16) + - (parm_list[idx + 2] << 8) + - (parm_list[idx + 3]); - - ns->mode_select_block_len = - (parm_list[idx + 5] << 16) + - (parm_list[idx + 6] << 8) + - (parm_list[idx + 7]); - } else { - /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ - ns->mode_select_num_blocks = - (((u64)parm_list[idx + 0]) << 56) + - (((u64)parm_list[idx + 1]) << 48) + - (((u64)parm_list[idx + 2]) << 40) + - (((u64)parm_list[idx + 3]) << 32) + - (((u64)parm_list[idx + 4]) << 24) + - (((u64)parm_list[idx + 5]) << 16) + - (((u64)parm_list[idx + 6]) << 8) + - ((u64)parm_list[idx + 7]); - - ns->mode_select_block_len = - (parm_list[idx + 12] << 24) + - (parm_list[idx + 13] << 16) + - (parm_list[idx + 14] << 8) + - (parm_list[idx + 15]); - } -} - -static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *mode_page, u8 page_code) -{ - int res = 0; - int nvme_sc; - unsigned dword11; - - switch (page_code) { - case MODE_PAGE_CACHING: - dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 
1 : 0); - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, - dword11, NULL, 0, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - break; - case MODE_PAGE_CONTROL: - break; - case MODE_PAGE_POWER_CONDITION: - /* Verify the OS is not trying to set timers */ - if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - return res; -} - -static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u16 parm_list_len, u8 pf, - u8 sp, u8 cdb10) -{ - int res; - u8 *parm_list; - u16 bd_len; - u8 llbaa = 0; - u16 index, saved_index; - u8 page_code; - u16 mp_size; - - /* Get parm list from data-in/out buffer */ - parm_list = kmalloc(parm_list_len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - - res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); - if (res) - goto out_mem; - - nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); - index = (cdb10) ? 
(MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); - - if (bd_len != 0) { - /* Block Descriptors present, parse */ - nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); - index += bd_len; - } - saved_index = index; - - /* Multiple mode pages may be present; iterate through all */ - /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - if ((page_code != MODE_PAGE_CACHING) && - (page_code != MODE_PAGE_CONTROL) && - (page_code != MODE_PAGE_POWER_CONDITION)) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - index += mp_size; - } while (index < parm_list_len); - - /* In 2nd Iteration, do the NVME Commands */ - index = saved_index; - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], - page_code); - if (res) - break; - index += mp_size; - } while (index < parm_list_len); - - out_mem: - kfree(parm_list); - out: - return res; -} - -/* Format Unit Helper Functions */ - -static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int res = 0; - int nvme_sc; - u8 flbas; - - /* - * SCSI Expects a MODE SELECT would have been issued prior to - * a FORMAT UNIT, and the block size and number would be used - * from the block descriptor in it. If a MODE SELECT had not - * been issued, FORMAT shall use the current values for both. 
- */ - - if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { - struct nvme_id_ns *id_ns; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - if (ns->mode_select_num_blocks == 0) - ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); - if (ns->mode_select_block_len == 0) { - flbas = (id_ns->flbas) & 0x0F; - ns->mode_select_block_len = - (1 << (id_ns->lbaf[flbas].ds)); - } - - kfree(id_ns); - } - - return 0; -} - -static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, - u8 format_prot_info, u8 *nvme_pf_code) -{ - int res; - u8 *parm_list; - u8 pf_usage, pf_code; - - parm_list = kmalloc(len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - res = nvme_trans_copy_from_user(hdr, parm_list, len); - if (res) - goto out_mem; - - if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & - FORMAT_UNIT_IMMED_MASK) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - - if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && - (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & - FORMAT_UNIT_PROT_FIELD_USAGE_MASK; - pf_code = (pf_usage << 2) | format_prot_info; - switch (pf_code) { - case 0: - *nvme_pf_code = 0; - break; - case 2: - *nvme_pf_code = 1; - break; - case 3: - *nvme_pf_code = 2; - break; - case 7: - *nvme_pf_code = 3; - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out_mem: - kfree(parm_list); - out: - return res; -} - -static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr 
*hdr, - u8 prot_info) -{ - int res; - int nvme_sc; - struct nvme_id_ns *id_ns; - u8 i; - u8 nlbaf; - u8 selected_lbaf = 0xFF; - u32 cdw10 = 0; - struct nvme_command c; - - /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - nlbaf = id_ns->nlbaf; - - for (i = 0; i < nlbaf; i++) { - if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { - selected_lbaf = i; - break; - } - } - if (selected_lbaf > 0x0F) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - cdw10 |= prot_info << 5; - cdw10 |= selected_lbaf & 0x0F; - memset(&c, 0, sizeof(c)); - c.format.opcode = nvme_admin_format_nvm; - c.format.nsid = cpu_to_le32(ns->ns_id); - c.format.cdw10 = cpu_to_le32(cdw10); - - nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); - res = nvme_trans_status_code(hdr, nvme_sc); - - kfree(id_ns); - return res; -} - -static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, - u32 max_blocks) -{ - /* If using iovecs, send one nvme command per vector */ - if (hdr->iovec_count > 0) - return hdr->iovec_count; - else if (cdb_info->xfer_len > max_blocks) - return ((cdb_info->xfer_len - 1) / max_blocks) + 1; - else - return 1; -} - -static u16 nvme_trans_io_get_control(struct nvme_ns *ns, - struct nvme_trans_io_cdb *cdb_info) -{ - u16 control = 0; - - /* When Protection information support is added, implement here */ - - if (cdb_info->fua > 0) - control |= NVME_RW_FUA; - - return control; -} - -static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, - struct 
nvme_trans_io_cdb *cdb_info, u8 is_write) -{ - int nvme_sc = NVME_SC_SUCCESS; - u32 num_cmds; - u64 unit_len; - u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ - u32 retcode; - u32 i = 0; - u64 nvme_offset = 0; - void __user *next_mapping_addr; - struct nvme_command c; - u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); - u16 control; - u32 max_blocks = queue_max_hw_sectors(ns->queue) >> (ns->lba_shift - 9); - - num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); - - /* - * This loop handles two cases. - * First, when an SGL is used in the form of an iovec list: - * - Use iov_base as the next mapping address for the nvme command_id - * - Use iov_len as the data transfer length for the command. - * Second, when we have a single buffer - * - If larger than max_blocks, split into chunks, offset - * each nvme command accordingly. - */ - for (i = 0; i < num_cmds; i++) { - memset(&c, 0, sizeof(c)); - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - retcode = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (retcode) - return -EFAULT; - unit_len = sgl.iov_len; - unit_num_blocks = unit_len >> ns->lba_shift; - next_mapping_addr = sgl.iov_base; - } else { - unit_num_blocks = min((u64)max_blocks, - (cdb_info->xfer_len - nvme_offset)); - unit_len = unit_num_blocks << ns->lba_shift; - next_mapping_addr = hdr->dxferp + - ((1 << ns->lba_shift) * nvme_offset); - } - - c.rw.opcode = opcode; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); - c.rw.length = cpu_to_le16(unit_num_blocks - 1); - control = nvme_trans_io_get_control(ns, cdb_info); - c.rw.control = cpu_to_le16(control); - - if (get_capacity(ns->disk) - unit_num_blocks < - cdb_info->lba + nvme_offset) { - nvme_sc = NVME_SC_LBA_RANGE; - break; - } - nvme_sc = nvme_submit_user_cmd(ns->queue, &c, - next_mapping_addr, unit_len, NULL, 0); - if (nvme_sc) - break; - - nvme_offset += 
unit_num_blocks; - } - - return nvme_trans_status_code(hdr, nvme_sc); -} - - -/* SCSI Command Translation Functions */ - -static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, - u8 *cmd) -{ - int res = 0; - struct nvme_trans_io_cdb cdb_info = { 0, }; - u8 opcode = cmd[0]; - u64 xfer_bytes; - u64 sum_iov_len = 0; - struct sg_iovec sgl; - int i; - size_t not_copied; - - /* - * The FUA and WPROTECT fields are not supported in 6-byte CDBs, - * but always in the same place for all others. - */ - switch (opcode) { - case WRITE_6: - case READ_6: - break; - default: - cdb_info.fua = cmd[1] & 0x8; - cdb_info.prot_info = (cmd[1] & 0xe0) >> 5; - if (cdb_info.prot_info && !ns->pi_type) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } - - switch (opcode) { - case WRITE_6: - case READ_6: - cdb_info.lba = get_unaligned_be24(&cmd[1]); - cdb_info.xfer_len = cmd[4]; - if (cdb_info.xfer_len == 0) - cdb_info.xfer_len = 256; - break; - case WRITE_10: - case READ_10: - cdb_info.lba = get_unaligned_be32(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be16(&cmd[7]); - break; - case WRITE_12: - case READ_12: - cdb_info.lba = get_unaligned_be32(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be32(&cmd[6]); - break; - case WRITE_16: - case READ_16: - cdb_info.lba = get_unaligned_be64(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be32(&cmd[10]); - break; - default: - /* Will never really reach here */ - res = -EIO; - goto out; - } - - /* Calculate total length of transfer (in bytes) */ - if (hdr->iovec_count > 0) { - for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) - return -EFAULT; - sum_iov_len += sgl.iov_len; - /* IO vector sizes should be multiples of block size */ - if (sgl.iov_len % (1 << ns->lba_shift) != 0) { - res = nvme_trans_completion(hdr, 
- SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - } - } else { - sum_iov_len = hdr->dxfer_len; - } - - /* As Per sg ioctl howto, if the lengths differ, use the lower one */ - xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); - - /* If block count and actual data buffer size dont match, error out */ - if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { - res = -EINVAL; - goto out; - } - - /* Check for 0 length transfer - it is not illegal */ - if (cdb_info.xfer_len == 0) - goto out; - - /* Send NVMe IO Command(s) */ - res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); - if (res) - goto out; - - out: - return res; -} - -static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u8 evpd; - u8 page_code; - int alloc_len; - u8 *inq_response; - - evpd = cmd[1] & 0x01; - page_code = cmd[2]; - alloc_len = get_unaligned_be16(&cmd[3]); - - inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH), - GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - if (evpd == 0) { - if (page_code == INQ_STANDARD_INQUIRY_PAGE) { - res = nvme_trans_standard_inquiry_page(ns, hdr, - inq_response, alloc_len); - } else { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } else { - switch (page_code) { - case VPD_SUPPORTED_PAGES: - res = nvme_trans_supported_vpd_pages(ns, hdr, - inq_response, alloc_len); - break; - case VPD_SERIAL_NUMBER: - res = nvme_trans_unit_serial_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_DEVICE_IDENTIFIERS: - res = nvme_trans_device_id_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_EXTENDED_INQUIRY: - res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); - break; - case VPD_BLOCK_LIMITS: - res = nvme_trans_bdev_limits_page(ns, hdr, inq_response, - alloc_len); - break; - 
case VPD_BLOCK_DEV_CHARACTERISTICS: - res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - } - kfree(inq_response); - out_mem: - return res; -} - -static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u16 alloc_len; - u8 pc; - u8 page_code; - - if (cmd[1] != LOG_SENSE_CDB_SP_NOT_ENABLED) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - page_code = cmd[2] & LOG_SENSE_CDB_PAGE_CODE_MASK; - pc = (cmd[2] & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; - if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - alloc_len = get_unaligned_be16(&cmd[7]); - switch (page_code) { - case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: - res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); - break; - case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: - res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); - break; - case LOG_PAGE_TEMPERATURE_PAGE: - res = nvme_trans_log_temperature(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - u8 cdb10 = 0; - u16 parm_list_len; - u8 page_format; - u8 save_pages; - - page_format = cmd[1] & MODE_SELECT_CDB_PAGE_FORMAT_MASK; - save_pages = cmd[1] & MODE_SELECT_CDB_SAVE_PAGES_MASK; - - if (cmd[0] == MODE_SELECT) { - parm_list_len = cmd[4]; - } else { - parm_list_len = cmd[7]; - cdb10 = 1; - } - - if (parm_list_len != 0) { - /* 
- * According to SPC-4 r24, a paramter list length field of 0 - * shall not be considered an error - */ - return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, - page_format, save_pages, cdb10); - } - - return 0; -} - -static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u16 alloc_len; - u8 cdb10 = 0; - - if (cmd[0] == MODE_SENSE) { - alloc_len = cmd[4]; - } else { - alloc_len = get_unaligned_be16(&cmd[7]); - cdb10 = 1; - } - - if ((cmd[2] & MODE_SENSE_PAGE_CONTROL_MASK) != - MODE_SENSE_PC_CURRENT_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - switch (cmd[2] & MODE_SENSE_PAGE_CODE_MASK) { - case MODE_PAGE_CACHING: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_caching_page, - MODE_PAGE_CACHING_LEN); - break; - case MODE_PAGE_CONTROL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_control_page, - MODE_PAGE_CONTROL_LEN); - break; - case MODE_PAGE_POWER_CONDITION: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_pow_cnd_page, - MODE_PAGE_POW_CND_LEN); - break; - case MODE_PAGE_INFO_EXCEP: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_inf_exc_page, - MODE_PAGE_INF_EXC_LEN); - break; - case MODE_PAGE_RETURN_ALL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_all_pages, - MODE_PAGE_ALL_LEN); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u8 cdb16) -{ - int res; - int nvme_sc; - u32 alloc_len; - u32 resp_size; - u32 xfer_len; - struct nvme_id_ns *id_ns; 
- u8 *response; - - if (cdb16) { - alloc_len = get_unaligned_be32(&cmd[10]); - resp_size = READ_CAP_16_RESP_SIZE; - } else { - alloc_len = READ_CAP_10_RESP_SIZE; - resp_size = READ_CAP_10_RESP_SIZE; - } - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_free_id; - } - nvme_trans_fill_read_cap(response, id_ns, cdb16); - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_free_id: - kfree(id_ns); - return res; -} - -static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - int nvme_sc; - u32 alloc_len, xfer_len, resp_size; - u8 *response; - struct nvme_id_ctrl *id_ctrl; - u32 ll_length, lun_id; - u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; - __be32 tmp_len; - - switch (cmd[2]) { - default: - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - case ALL_LUNS_RETURNED: - case ALL_WELL_KNOWN_LUNS_RETURNED: - case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; - resp_size = ll_length + LUN_DATA_HEADER_SIZE; - - alloc_len = get_unaligned_be32(&cmd[6]); - if (alloc_len < resp_size) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_free_id; - } - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_free_id; - } - - /* The first LUN ID will always be 0 per the SAM spec */ - for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { - /* - * Set the LUN Id and then increment to the 
next LUN - * location in the parameter data. - */ - __be64 tmp_id = cpu_to_be64(lun_id); - memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); - lun_id_offset += LUN_ENTRY_SIZE; - } - tmp_len = cpu_to_be32(ll_length); - memcpy(response, &tmp_len, sizeof(u32)); - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_free_id: - kfree(id_ctrl); - return res; -} - -static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u8 alloc_len, xfer_len, resp_size; - u8 desc_format; - u8 *response; - - desc_format = cmd[1] & 0x01; - alloc_len = cmd[4]; - - resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : - (FIXED_FMT_SENSE_DATA_SIZE)); - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out; - } - - if (desc_format) { - /* Descriptor Format Sense Data */ - response[0] = DESC_FORMAT_SENSE_DATA; - response[1] = NO_SENSE; - /* TODO How is LOW POWER CONDITION ON handled? 
(byte 2) */ - response[2] = SCSI_ASC_NO_SENSE; - response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ - } else { - /* Fixed Format Sense Data */ - response[0] = FIXED_SENSE_DATA; - /* Byte 1 = Obsolete */ - response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ - /* Bytes 3-6 - Information - set to zero */ - response[7] = FIXED_SENSE_DATA_ADD_LENGTH; - /* Bytes 8-11 - Cmd Specific Information - set to zero */ - response[12] = SCSI_ASC_NO_SENSE; - response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* Byte 14 = Field Replaceable Unit Code = 0 */ - /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out: - return res; -} - -static int nvme_trans_synchronize_cache(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int nvme_sc; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u8 parm_hdr_len = 0; - u8 nvme_pf_code = 0; - u8 format_prot_info, long_list, format_data; - - format_prot_info = (cmd[1] & 0xc0) >> 6; - long_list = cmd[1] & 0x20; - format_data = cmd[1] & 0x10; - - if (format_data != 0) { - if (format_prot_info != 0) { - if (long_list == 0) - parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; - else - parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; - } - } else if (format_data == 0 && format_prot_info != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - /* Get parm header from data-in/out buffer */ - /* - * According to the translation spec, the only fields in the parameter - * list we are 
concerned with are in the header. So allocate only that. - */ - if (parm_hdr_len > 0) { - res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, - format_prot_info, &nvme_pf_code); - if (res) - goto out; - } - - /* Attempt to activate any previously downloaded firmware image */ - res = nvme_trans_send_activate_fw_cmd(ns, hdr, 0); - - /* Determine Block size and count and send format command */ - res = nvme_trans_fmt_set_blk_size_count(ns, hdr); - if (res) - goto out; - - res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); - - out: - return res; -} - -static int nvme_trans_test_unit_ready(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *cmd) -{ - if (nvme_ctrl_ready(ns->ctrl)) - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - NOT_READY, SCSI_ASC_LUN_NOT_READY, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - else - return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); -} - -static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u32 buffer_offset, parm_list_length; - u8 buffer_id, mode; - - parm_list_length = get_unaligned_be24(&cmd[6]); - if (parm_list_length % BYTES_TO_DWORDS != 0) { - /* NVMe expects Firmware file to be a whole number of DWORDS */ - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - buffer_id = cmd[2]; - if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - mode = cmd[1] & 0x1f; - buffer_offset = get_unaligned_be24(&cmd[3]); - - switch (mode) { - case DOWNLOAD_SAVE_ACTIVATE: - res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - if (res) - goto out; - res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); - break; - case DOWNLOAD_SAVE_DEFER_ACTIVATE: - res = 
nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - break; - case ACTIVATE_DEFERRED_MICROCODE: - res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -struct scsi_unmap_blk_desc { - __be64 slba; - __be32 nlb; - u32 resv; -}; - -struct scsi_unmap_parm_list { - __be16 unmap_data_len; - __be16 unmap_blk_desc_data_len; - u32 resv; - struct scsi_unmap_blk_desc desc[0]; -}; - -static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - struct scsi_unmap_parm_list *plist; - struct nvme_dsm_range *range; - struct nvme_command c; - int i, nvme_sc, res; - u16 ndesc, list_len; - - list_len = get_unaligned_be16(&cmd[7]); - if (!list_len) - return -EINVAL; - - plist = kmalloc(list_len, GFP_KERNEL); - if (!plist) - return -ENOMEM; - - res = nvme_trans_copy_from_user(hdr, plist, list_len); - if (res) - goto out; - - ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; - if (!ndesc || ndesc > 256) { - res = -EINVAL; - goto out; - } - - range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL); - if (!range) { - res = -ENOMEM; - goto out; - } - - for (i = 0; i < ndesc; i++) { - range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); - range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); - range[i].cattr = 0; - } - - memset(&c, 0, sizeof(c)); - c.dsm.opcode = nvme_cmd_dsm; - c.dsm.nsid = cpu_to_le32(ns->ns_id); - c.dsm.nr = cpu_to_le32(ndesc - 1); - c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range, - ndesc * sizeof(*range)); - res = nvme_trans_status_code(hdr, nvme_sc); - - kfree(range); - out: - kfree(plist); - return res; -} - -static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) -{ - u8 cmd[16]; - int 
retcode; - unsigned int opcode; - - if (hdr->cmdp == NULL) - return -EMSGSIZE; - if (hdr->cmd_len > sizeof(cmd)) - return -EINVAL; - if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) - return -EFAULT; - - /* - * Prime the hdr with good status for scsi commands that don't require - * an nvme command for translation. - */ - retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); - if (retcode) - return retcode; - - opcode = cmd[0]; - - switch (opcode) { - case READ_6: - case READ_10: - case READ_12: - case READ_16: - retcode = nvme_trans_io(ns, hdr, 0, cmd); - break; - case WRITE_6: - case WRITE_10: - case WRITE_12: - case WRITE_16: - retcode = nvme_trans_io(ns, hdr, 1, cmd); - break; - case INQUIRY: - retcode = nvme_trans_inquiry(ns, hdr, cmd); - break; - case LOG_SENSE: - retcode = nvme_trans_log_sense(ns, hdr, cmd); - break; - case MODE_SELECT: - case MODE_SELECT_10: - retcode = nvme_trans_mode_select(ns, hdr, cmd); - break; - case MODE_SENSE: - case MODE_SENSE_10: - retcode = nvme_trans_mode_sense(ns, hdr, cmd); - break; - case READ_CAPACITY: - retcode = nvme_trans_read_capacity(ns, hdr, cmd, 0); - break; - case SERVICE_ACTION_IN_16: - switch (cmd[1]) { - case SAI_READ_CAPACITY_16: - retcode = nvme_trans_read_capacity(ns, hdr, cmd, 1); - break; - default: - goto out; - } - break; - case REPORT_LUNS: - retcode = nvme_trans_report_luns(ns, hdr, cmd); - break; - case REQUEST_SENSE: - retcode = nvme_trans_request_sense(ns, hdr, cmd); - break; - case SYNCHRONIZE_CACHE: - retcode = nvme_trans_synchronize_cache(ns, hdr); - break; - case FORMAT_UNIT: - retcode = nvme_trans_format_unit(ns, hdr, cmd); - break; - case TEST_UNIT_READY: - retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); - break; - case WRITE_BUFFER: - retcode = nvme_trans_write_buffer(ns, hdr, cmd); - break; - case UNMAP: - retcode = nvme_trans_unmap(ns, hdr, cmd); - break; - default: - out: - retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, - 
SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - return retcode; -} - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) -{ - struct sg_io_hdr hdr; - int retcode; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) - return -EFAULT; - if (hdr.interface_id != 'S') - return -EINVAL; - - /* - * A positive return code means a NVMe status, which has been - * translated to sense data. - */ - retcode = nvme_scsi_translate(ns, &hdr); - if (retcode < 0) - return retcode; - if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) - return -EFAULT; - return 0; -} - -int nvme_sg_get_version_num(int __user *ip) -{ - return put_user(sg_version_num, ip); -} diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index ff1f97006322..35f930db3c02 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -336,7 +336,7 @@ out: static void nvmet_execute_identify_nslist(struct nvmet_req *req) { - static const int buf_size = 4096; + static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); @@ -367,6 +367,64 @@ out: nvmet_req_complete(req, status); } +static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, + void *id, off_t *off) +{ + struct nvme_ns_id_desc desc = { + .nidt = type, + .nidl = len, + }; + u16 status; + + status = nvmet_copy_to_sgl(req, *off, &desc, sizeof(desc)); + if (status) + return status; + *off += sizeof(desc); + + status = nvmet_copy_to_sgl(req, *off, id, len); + if (status) + return status; + *off += len; + + return 0; +} + +static void nvmet_execute_identify_desclist(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + u16 status = 0; + off_t off = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + if (memchr_inv(&ns->uuid, 0, 
sizeof(ns->uuid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, + NVME_NIDT_UUID_LEN, + &ns->uuid, &off); + if (status) + goto out_put_ns; + } + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID, + NVME_NIDT_NGUID_LEN, + &ns->nguid, &off); + if (status) + goto out_put_ns; + } + + if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, + off) != NVME_IDENTIFY_DATA_SIZE - off) + status = NVME_SC_INTERNAL | NVME_SC_DNR; +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + /* * A "mimimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. We couldn't really @@ -504,7 +562,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) } break; case nvme_admin_identify: - req->data_len = 4096; + req->data_len = NVME_IDENTIFY_DATA_SIZE; switch (cmd->identify.cns) { case NVME_ID_CNS_NS: req->execute = nvmet_execute_identify_ns; @@ -515,6 +573,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_ID_CNS_NS_ACTIVE_LIST: req->execute = nvmet_execute_identify_nslist; return 0; + case NVME_ID_CNS_NS_DESC_LIST: + req->execute = nvmet_execute_identify_desclist; + return 0; } break; case nvme_admin_abort_cmd: diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index be8c800078e2..a358ecd93e11 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -305,11 +305,41 @@ out_unlock: CONFIGFS_ATTR(nvmet_ns_, device_path); +static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); +} + +static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret = 0; + + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto 
out_unlock; + } + + + if (uuid_parse(page, &ns->uuid)) + ret = -EINVAL; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) { return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); } +CONFIGFS_ATTR(nvmet_ns_, device_uuid); + static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, const char *page, size_t count) { @@ -379,6 +409,7 @@ CONFIGFS_ATTR(nvmet_ns_, enable); static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_device_uuid, &nvmet_ns_attr_enable, NULL, }; @@ -619,8 +650,45 @@ out_unlock: CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); +static ssize_t nvmet_subsys_version_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + if (NVME_TERTIARY(subsys->ver)) + return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver), + (int)NVME_TERTIARY(subsys->ver)); + else + return snprintf(page, PAGE_SIZE, "%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver)); +} + +static ssize_t nvmet_subsys_version_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + int major, minor, tertiary = 0; + int ret; + + + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); + if (ret != 2 && ret != 3) + return -EINVAL; + + down_write(&nvmet_config_sem); + subsys->ver = NVME_VS(major, minor, tertiary); + up_write(&nvmet_config_sem); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, version); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, + &nvmet_subsys_attr_version, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index eb9399ac97cf..b5b4ac103748 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ 
-380,6 +380,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; + uuid_gen(&ns->uuid); return ns; } @@ -926,7 +927,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return NULL; - subsys->ver = NVME_VS(1, 2, 1); /* NVMe 1.2.1 */ + subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ switch (type) { case NVME_NQN_NVME: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 1aaf597e81fc..8f3b57b4c97b 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -53,7 +53,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, e->portid = port->disc_addr.portid; /* we support only dynamic controllers */ e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); - e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->asqsz = cpu_to_le16(NVME_AQ_DEPTH); e->subtype = type; memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); @@ -185,7 +185,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: - req->data_len = 4096; + req->data_len = NVME_IDENTIFY_DATA_SIZE; switch (cmd->identify.cns) { case NVME_ID_CNS_CTRL: req->execute = diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 2006fae61980..7692a96c9065 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2096,20 +2096,22 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, /* clear any response payload */ memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); + fod->data_sg = NULL; + fod->data_sg_cnt = 0; + ret = nvmet_req_init(&fod->req, &fod->queue->nvme_cq, &fod->queue->nvme_sq, &nvmet_fc_tgt_fcp_ops); - if (!ret) { /* bad SQE content or invalid ctrl state */ - nvmet_fc_abort_op(tgtport, fod); + if (!ret) { + /* bad SQE content or invalid ctrl state */ + /* nvmet layer has already called 
op done to send rsp. */ return; } /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); - fod->data_sg = NULL; - fod->data_sg_cnt = 0; if (fod->total_length) { ret = nvmet_fc_alloc_tgt_pgs(fod); if (ret) { diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 294a6611fb24..1bb9d5b311b1 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -569,7 +569,6 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *tgt_fcpreq) { struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); - int active; /* * mark aborted only in case there were 2 threads in transport @@ -577,7 +576,6 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, * after the abort request */ spin_lock(&tfcp_req->reqlock); - active = tfcp_req->active; tfcp_req->aborted = true; spin_unlock(&tfcp_req->reqlock); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index c77940d80fc8..40128793e613 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -21,7 +21,7 @@ static void nvmet_bio_done(struct bio *bio) struct nvmet_req *req = bio->bi_private; nvmet_req_complete(req, - bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + bio->bi_status ? 
NVME_SC_INTERNAL | NVME_SC_DNR : 0); if (bio != &req->inline_bio) bio_put(bio); @@ -145,7 +145,7 @@ static void nvmet_execute_discard(struct nvmet_req *req) bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; if (status) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else { submit_bio(bio); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e503cfff0337..5f55c683b338 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -21,8 +21,6 @@ #include "../host/nvme.h" #include "../host/fabrics.h" -#define NVME_LOOP_AQ_DEPTH 256 - #define NVME_LOOP_MAX_SEGMENTS 256 /* @@ -31,7 +29,7 @@ */ #define NVME_LOOP_NR_AEN_COMMANDS 1 #define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) struct nvme_loop_iod { struct nvme_request nvme_req; @@ -45,7 +43,6 @@ struct nvme_loop_iod { }; struct nvme_loop_ctrl { - spinlock_t lock; struct nvme_loop_queue *queues; u32 queue_count; @@ -59,7 +56,6 @@ struct nvme_loop_ctrl { struct nvmet_ctrl *target_ctrl; struct work_struct delete_work; - struct work_struct reset_work; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -151,7 +147,7 @@ nvme_loop_timeout(struct request *rq, bool reserved) struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); /* queue error recovery */ - schedule_work(&iod->queue->ctrl->reset_work); + nvme_reset_ctrl(&iod->queue->ctrl->ctrl); /* fail with DNR on admin cmd timeout */ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; @@ -159,17 +155,17 @@ nvme_loop_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; } -static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_loop_queue *queue = hctx->driver_data; struct request *req = bd->rq; struct nvme_loop_iod *iod = 
blk_mq_rq_to_pdu(req); - int ret; + blk_status_t ret; ret = nvme_setup_cmd(ns, req, &iod->cmd); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; @@ -179,16 +175,15 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_cleanup_cmd(req); blk_mq_start_request(req); nvme_loop_queue_response(&iod->req); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } if (blk_rq_bytes(req)) { iod->sg_table.sgl = iod->first_sgl; - ret = sg_alloc_table_chained(&iod->sg_table, + if (sg_alloc_table_chained(&iod->sg_table, blk_rq_nr_phys_segments(req), - iod->sg_table.sgl); - if (ret) - return BLK_MQ_RQ_QUEUE_BUSY; + iod->sg_table.sgl)) + return BLK_STS_RESOURCE; iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); @@ -197,7 +192,7 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); schedule_work(&iod->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) @@ -234,15 +229,10 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), - hctx_idx + 1); -} + struct nvme_loop_ctrl *ctrl = set->driver_data; -static int nvme_loop_init_admin_request(struct blk_mq_tag_set *set, - struct request *req, unsigned int hctx_idx, - unsigned int numa_node) -{ - return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), 0); + return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), + (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0); } static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -280,7 +270,7 @@ static const struct blk_mq_ops nvme_loop_mq_ops = { static const struct blk_mq_ops nvme_loop_admin_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, - .init_request = nvme_loop_init_admin_request, + .init_request = nvme_loop_init_request, .init_hctx = nvme_loop_init_admin_hctx, .timeout = nvme_loop_timeout, }; @@ -467,7 +457,7 @@ static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!schedule_work(&ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return -EBUSY; return 0; @@ -501,8 +491,8 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) static void nvme_loop_reset_ctrl_work(struct work_struct *work) { - struct nvme_loop_ctrl *ctrl = container_of(work, - struct nvme_loop_ctrl, reset_work); + struct nvme_loop_ctrl *ctrl = + container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); bool changed; int ret; @@ -540,21 +530,6 @@ out_disable: nvme_put_ctrl(&ctrl->ctrl); } -static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!schedule_work(&ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; -} - static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .name = "loop", .module = THIS_MODULE, @@ -562,11 +537,9 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_loop_reset_ctrl, .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, .delete_ctrl = nvme_loop_del_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ 
-629,15 +602,13 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) goto out_put_ctrl; - spin_lock_init(&ctrl->lock); - ret = -ENOMEM; ctrl->ctrl.sqsize = opts->queue_size - 1; @@ -766,7 +737,7 @@ static void __exit nvme_loop_cleanup_module(void) __nvme_loop_del_ctrl(ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); - flush_scheduled_work(); + flush_workqueue(nvme_wq); } module_init(nvme_loop_init_module); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 8ff6e430b30a..747bbdb4f9c6 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -47,6 +47,7 @@ struct nvmet_ns { u32 blksize_shift; loff_t size; u8 nguid[16]; + uuid_t uuid; bool enabled; struct nvmet_subsys *subsys; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 9e45cde63376..56a4cba690b5 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1027,7 +1027,7 @@ nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; queue->send_queue_size = le16_to_cpu(req->hrqsize); - if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH) return NVME_RDMA_CM_INVALID_HSQSIZE; /* XXX: Should we enforce some kind of max for IO queues? 
*/ @@ -1307,53 +1307,44 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, /** * nvme_rdma_device_removal() - Handle RDMA device removal + * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) - * @addr: nvmet address (cm_id context) * * DEVICE_REMOVAL event notifies us that the RDMA device is about - * to unplug so we should take care of destroying our RDMA resources. - * This event will be generated for each allocated cm_id. + * to unplug. Note that this event can be generated on a normal + * queue cm_id and/or a device bound listener cm_id (where in this + * case queue will be null). * - * Note that this event can be generated on a normal queue cm_id - * and/or a device bound listener cm_id (where in this case - * queue will be null). - * - * we claim ownership on destroying the cm_id. For queues we move - * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port + * We registered an ib_client to handle device removal for queues, + * so we only need to handle the listening port cm_ids. In this case * we nullify the priv to prevent double cm_id destruction and destroying * the cm_id implicitely by returning a non-zero rc to the callout. */ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - unsigned long flags; - - if (!queue) { - struct nvmet_port *port = cm_id->context; + struct nvmet_port *port; + if (queue) { /* - * This is a listener cm_id. Make sure that - * future remove_port won't invoke a double - * cm_id destroy. use atomic xchg to make sure - * we don't compete with remove_port. - */ - if (xchg(&port->priv, NULL) != cm_id) - return 0; - } else { - /* - * This is a queue cm_id. Make sure that - * release queue will not destroy the cm_id - * and schedule all ctrl queues removal (only - * if the queue is not disconnecting already). + * This is a queue cm_id. 
we have registered + * an ib_client to handle queues removal + * so don't interfear and just return. */ - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state != NVMET_RDMA_Q_DISCONNECTING) - queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; - spin_unlock_irqrestore(&queue->state_lock, flags); - nvmet_rdma_queue_disconnect(queue); - flush_scheduled_work(); + return 0; } + port = cm_id->context; + + /* + * This is a listener cm_id. Make sure that + * future remove_port won't invoke a double + * cm_id destroy. use atomic xchg to make sure + * we don't compete with remove_port. + */ + if (xchg(&port->priv, NULL) != cm_id) + return 0; + /* * We need to return 1 so that the core will destroy * it's own ID. What a great API design.. @@ -1519,9 +1510,51 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { .delete_ctrl = nvmet_rdma_delete_ctrl, }; +static void nvmet_rdma_add_one(struct ib_device *ib_device) +{ +} + +static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvmet_rdma_queue *queue; + + /* Device is being removed, delete all queues using this device */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->dev->device != ib_device) + continue; + + pr_info("Removing queue %d\n", queue->idx); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); +} + +static struct ib_client nvmet_rdma_ib_client = { + .name = "nvmet_rdma", + .add = nvmet_rdma_add_one, + .remove = nvmet_rdma_remove_one +}; + static int __init nvmet_rdma_init(void) { - return nvmet_register_transport(&nvmet_rdma_ops); + int ret; + + ret = ib_register_client(&nvmet_rdma_ib_client); + if (ret) + return ret; + + ret = nvmet_register_transport(&nvmet_rdma_ops); + if (ret) + goto err_ib_client; + + return 0; + +err_ib_client: + ib_unregister_client(&nvmet_rdma_ib_client); + return ret; } static void __exit nvmet_rdma_exit(void) @@ 
-1544,6 +1577,7 @@ static void __exit nvmet_rdma_exit(void) mutex_unlock(&nvmet_rdma_queue_mutex); flush_scheduled_work(); + ib_unregister_client(&nvmet_rdma_ib_client); ida_destroy(&nvmet_rdma_queue_ida); } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 6fb3fd5efc11..b7cbd5d2cdea 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2672,7 +2672,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) */ if (basedev->state < DASD_STATE_READY) { while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); return; } @@ -2692,7 +2692,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "Rejecting write request %p", req); blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); continue; } if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) && @@ -2702,7 +2702,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "Rejecting failfast request %p", req); blk_start_request(req); - __blk_end_request_all(req, -ETIMEDOUT); + __blk_end_request_all(req, BLK_STS_TIMEOUT); continue; } cqr = basedev->discipline->build_cp(basedev, block, req); @@ -2734,7 +2734,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "on request %p", PTR_ERR(cqr), req); blk_start_request(req); - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); continue; } /* @@ -2755,21 +2755,29 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) { struct request *req; int status; - int error = 0; + blk_status_t error = BLK_STS_OK; req = (struct request *) cqr->callback_data; dasd_profile_end(cqr->block, cqr, req); + status = cqr->block->base->discipline->free_cp(cqr, req); if (status < 0) - error = status; + error = errno_to_blk_status(status); else if (status == 0) { - if (cqr->intrc == -EPERM) - error = -EBADE; - else if (cqr->intrc == 
-ENOLINK || - cqr->intrc == -ETIMEDOUT) - error = cqr->intrc; - else - error = -EIO; + switch (cqr->intrc) { + case -EPERM: + error = BLK_STS_NEXUS; + break; + case -ENOLINK: + error = BLK_STS_TRANSPORT; + break; + case -ETIMEDOUT: + error = BLK_STS_TIMEOUT; + break; + default: + error = BLK_STS_IOERR; + break; + } } __blk_end_request_all(req, error); } @@ -3190,7 +3198,7 @@ static void dasd_flush_request_queue(struct dasd_block *block) spin_lock_irq(&block->request_queue_lock); while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, -EIO); + __blk_end_request_all(req, BLK_STS_IOERR); spin_unlock_irq(&block->request_queue_lock); } diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 36e5280af3e4..06eb1de52d1c 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -845,7 +845,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) unsigned long source_addr; unsigned long bytes_done; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); bytes_done = 0; dev_info = bio->bi_bdev->bd_disk->private_data; diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 152de6817875..3c2c84b72877 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -231,7 +231,7 @@ static inline void scm_request_init(struct scm_blk_dev *bdev, aob->request.data = (u64) aobrq; scmrq->bdev = bdev; scmrq->retries = 4; - scmrq->error = 0; + scmrq->error = BLK_STS_OK; /* We don't use all msbs - place aidaws at the end of the aob page. 
*/ scmrq->next_aidaw = (void *) &aob->msb[nr_requests_per_io]; scm_request_cluster_init(scmrq); @@ -364,7 +364,7 @@ static void __scmrq_log_error(struct scm_request *scmrq) { struct aob *aob = scmrq->aob; - if (scmrq->error == -ETIMEDOUT) + if (scmrq->error == BLK_STS_TIMEOUT) SCM_LOG(1, "Request timeout"); else { SCM_LOG(1, "Request error"); @@ -377,7 +377,7 @@ static void __scmrq_log_error(struct scm_request *scmrq) scmrq->error); } -void scm_blk_irq(struct scm_device *scmdev, void *data, int error) +void scm_blk_irq(struct scm_device *scmdev, void *data, blk_status_t error) { struct scm_request *scmrq = data; struct scm_blk_dev *bdev = scmrq->bdev; @@ -397,7 +397,7 @@ static void scm_blk_handle_error(struct scm_request *scmrq) struct scm_blk_dev *bdev = scmrq->bdev; unsigned long flags; - if (scmrq->error != -EIO) + if (scmrq->error != BLK_STS_IOERR) goto restart; /* For -EIO the response block is valid. */ diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index 09218cdc5129..cd598d1a4eae 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -35,7 +35,7 @@ struct scm_request { struct aob *aob; struct list_head list; u8 retries; - int error; + blk_status_t error; #ifdef CONFIG_SCM_BLOCK_CLUSTER_WRITE struct { enum {CLUSTER_NONE, CLUSTER_READ, CLUSTER_WRITE} state; @@ -50,7 +50,7 @@ struct scm_request { int scm_blk_dev_setup(struct scm_blk_dev *, struct scm_device *); void scm_blk_dev_cleanup(struct scm_blk_dev *); void scm_blk_set_available(struct scm_blk_dev *); -void scm_blk_irq(struct scm_device *, void *, int); +void scm_blk_irq(struct scm_device *, void *, blk_status_t); void scm_request_finish(struct scm_request *); void scm_request_requeue(struct scm_request *); diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index b9d7e755c8a3..a48f0d40c1d2 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -190,7 +190,7 @@ static blk_qc_t xpram_make_request(struct request_queue 
*q, struct bio *bio) unsigned long page_addr; unsigned long bytes; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) diff --git a/drivers/s390/cio/eadm_sch.c b/drivers/s390/cio/eadm_sch.c index b3f44bc7f644..0f11f3bcac82 100644 --- a/drivers/s390/cio/eadm_sch.c +++ b/drivers/s390/cio/eadm_sch.c @@ -135,7 +135,7 @@ static void eadm_subchannel_irq(struct subchannel *sch) struct eadm_private *private = get_eadm_private(sch); struct eadm_scsw *scsw = &sch->schib.scsw.eadm; struct irb *irb = this_cpu_ptr(&cio_irb); - int error = 0; + blk_status_t error = BLK_STS_OK; EADM_LOG(6, "irq"); EADM_LOG_HEX(6, irb, sizeof(*irb)); @@ -144,10 +144,10 @@ static void eadm_subchannel_irq(struct subchannel *sch) if ((scsw->stctl & (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND)) && scsw->eswf == 1 && irb->esw.eadm.erw.r) - error = -EIO; + error = BLK_STS_IOERR; if (scsw->fctl & SCSW_FCTL_CLEAR_FUNC) - error = -ETIMEDOUT; + error = BLK_STS_TIMEOUT; eadm_subchannel_set_timeout(sch, 0); diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c index 15268edc54ae..1fa53ecdc2aa 100644 --- a/drivers/s390/cio/scm.c +++ b/drivers/s390/cio/scm.c @@ -71,7 +71,7 @@ void scm_driver_unregister(struct scm_driver *scmdrv) } EXPORT_SYMBOL_GPL(scm_driver_unregister); -void scm_irq_handler(struct aob *aob, int error) +void scm_irq_handler(struct aob *aob, blk_status_t error) { struct aob_rq_header *aobrq = (void *) aob->request.data; struct scm_device *scmdev = aobrq->scmdev; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 62fed9dc893e..14f377ac1280 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -214,7 +214,7 @@ static void jsfd_request(void) struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; size_t len = blk_rq_cur_bytes(req); - int err = -EIO; + blk_status_t err = BLK_STS_IOERR; if ((offset + 
len) > jdp->dsize) goto end; @@ -230,7 +230,7 @@ static void jsfd_request(void) } jsfd_read(bio_data(req->bio), jdp->dbase + offset, len); - err = 0; + err = BLK_STS_OK; end: if (!__blk_end_request_cur(req, err)) req = jsfd_next_request(); @@ -592,6 +592,7 @@ static int jsfd_init(void) put_disk(disk); goto out; } + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); jsfd_disk[i] = disk; } diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 8a1b94816419..a4f28b7e4c65 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -446,7 +446,7 @@ static void _put_request(struct request *rq) * code paths. */ if (unlikely(rq->bio)) - blk_end_request(rq, -ENOMEM, blk_rq_bytes(rq)); + blk_end_request(rq, BLK_STS_IOERR, blk_rq_bytes(rq)); else blk_put_request(rq); } @@ -474,10 +474,10 @@ void osd_end_request(struct osd_request *or) EXPORT_SYMBOL(osd_end_request); static void _set_error_resid(struct osd_request *or, struct request *req, - int error) + blk_status_t error) { or->async_error = error; - or->req_errors = scsi_req(req)->result ? : error; + or->req_errors = scsi_req(req)->result; or->sense_len = scsi_req(req)->sense_len; if (or->sense_len) memcpy(or->sense, scsi_req(req)->sense, or->sense_len); @@ -489,17 +489,19 @@ static void _set_error_resid(struct osd_request *or, struct request *req, int osd_execute_request(struct osd_request *or) { - int error; - blk_execute_rq(or->request->q, NULL, or->request, 0); - error = scsi_req(or->request)->result ? 
-EIO : 0; - _set_error_resid(or, or->request, error); - return error; + if (scsi_req(or->request)->result) { + _set_error_resid(or, or->request, BLK_STS_IOERR); + return -EIO; + } + + _set_error_resid(or, or->request, BLK_STS_OK); + return 0; } EXPORT_SYMBOL(osd_execute_request); -static void osd_request_async_done(struct request *req, int error) +static void osd_request_async_done(struct request *req, blk_status_t error) { struct osd_request *or = req->end_io_data; @@ -1572,13 +1574,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write, flags); if (IS_ERR(req)) return req; - scsi_req_init(req); for_each_bio(bio) { - struct bio *bounce_bio = bio; - - blk_queue_bounce(req->q, &bounce_bio); - ret = blk_rq_append_bio(req, bounce_bio); + ret = blk_rq_append_bio(req, bio); if (ret) return ERR_PTR(ret); } @@ -1617,7 +1615,6 @@ static int _init_blk_request(struct osd_request *or, ret = PTR_ERR(req); goto out; } - scsi_req_init(req); or->in.req = or->request->next_rq = req; } } else if (has_in) @@ -1914,7 +1911,7 @@ analyze: /* scsi sense is Empty, the request was never issued to target * linux return code might tell us what happened. 
*/ - if (or->async_error == -ENOMEM) + if (or->async_error == BLK_STS_RESOURCE) osi->osd_err_pri = OSD_ERR_PRI_RESOURCE; else osi->osd_err_pri = OSD_ERR_PRI_UNREACHABLE; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 67cbed92f07d..929ee7e88120 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -320,7 +320,7 @@ static int osst_chk_result(struct osst_tape * STp, struct osst_request * SRpnt) /* Wakeup from interrupt */ -static void osst_end_async(struct request *req, int update) +static void osst_end_async(struct request *req, blk_status_t status) { struct scsi_request *rq = scsi_req(req); struct osst_request *SRpnt = req->end_io_data; @@ -373,7 +373,6 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, return DRIVER_ERROR << 24; rq = scsi_req(req); - scsi_req_init(req); req->rq_flags |= RQF_QUIET; SRpnt->bio = NULL; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index ecc07dab893d..304a7158540f 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1874,7 +1874,7 @@ int scsi_decide_disposition(struct scsi_cmnd *scmd) } } -static void eh_lock_door_done(struct request *req, int uptodate) +static void eh_lock_door_done(struct request *req, blk_status_t status) { __blk_put_request(req->q, req); } @@ -1903,7 +1903,6 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) if (IS_ERR(req)) return; rq = scsi_req(req); - scsi_req_init(req); rq->cmd[0] = ALLOW_MEDIUM_REMOVAL; rq->cmd[1] = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 99e16ac479e3..550e29f903b7 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -250,7 +250,6 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, if (IS_ERR(req)) return ret; rq = scsi_req(req); - scsi_req_init(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, buffer, bufflen, __GFP_RECLAIM)) @@ -635,7 +634,7 @@ static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) 
cmd->request->next_rq->special = NULL; } -static bool scsi_end_request(struct request *req, int error, +static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes, unsigned int bidi_bytes) { struct scsi_cmnd *cmd = req->special; @@ -694,45 +693,28 @@ static bool scsi_end_request(struct request *req, int error, * @cmd: SCSI command (unused) * @result: scsi error code * - * Translate SCSI error code into standard UNIX errno. - * Return values: - * -ENOLINK temporary transport failure - * -EREMOTEIO permanent target failure, do not retry - * -EBADE permanent nexus failure, retry on other path - * -ENOSPC No write space available - * -ENODATA Medium error - * -EIO unspecified I/O error + * Translate SCSI error code into block errors. */ -static int __scsi_error_from_host_byte(struct scsi_cmnd *cmd, int result) +static blk_status_t __scsi_error_from_host_byte(struct scsi_cmnd *cmd, + int result) { - int error = 0; - - switch(host_byte(result)) { + switch (host_byte(result)) { case DID_TRANSPORT_FAILFAST: - error = -ENOLINK; - break; + return BLK_STS_TRANSPORT; case DID_TARGET_FAILURE: set_host_byte(cmd, DID_OK); - error = -EREMOTEIO; - break; + return BLK_STS_TARGET; case DID_NEXUS_FAILURE: - set_host_byte(cmd, DID_OK); - error = -EBADE; - break; + return BLK_STS_NEXUS; case DID_ALLOC_FAILURE: set_host_byte(cmd, DID_OK); - error = -ENOSPC; - break; + return BLK_STS_NOSPC; case DID_MEDIUM_ERROR: set_host_byte(cmd, DID_OK); - error = -ENODATA; - break; + return BLK_STS_MEDIUM; default: - error = -EIO; - break; + return BLK_STS_IOERR; } - - return error; } /* @@ -769,7 +751,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) int result = cmd->result; struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; - int error = 0; + blk_status_t error = BLK_STS_OK; struct scsi_sense_hdr sshdr; bool sense_valid = false; int sense_deferred = 0, level = 0; @@ -808,7 +790,7 @@ void 
scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) * both sides at once. */ scsi_req(req->next_rq)->resid_len = scsi_in(cmd)->resid; - if (scsi_end_request(req, 0, blk_rq_bytes(req), + if (scsi_end_request(req, BLK_STS_OK, blk_rq_bytes(req), blk_rq_bytes(req->next_rq))) BUG(); return; @@ -850,7 +832,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_print_sense(cmd); result = 0; /* for passthrough error may be set */ - error = 0; + error = BLK_STS_OK; } /* @@ -922,18 +904,18 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) action = ACTION_REPREP; } else if (sshdr.asc == 0x10) /* DIX */ { action = ACTION_FAIL; - error = -EILSEQ; + error = BLK_STS_PROTECTION; /* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */ } else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) { action = ACTION_FAIL; - error = -EREMOTEIO; + error = BLK_STS_TARGET; } else action = ACTION_FAIL; break; case ABORTED_COMMAND: action = ACTION_FAIL; if (sshdr.asc == 0x10) /* DIF */ - error = -EILSEQ; + error = BLK_STS_PROTECTION; break; case NOT_READY: /* If the device is in the process of becoming @@ -1134,6 +1116,20 @@ err_exit: } EXPORT_SYMBOL(scsi_init_io); +/** + * scsi_initialize_rq - initialize struct scsi_cmnd.req + * + * Called from inside blk_get_request(). + */ +void scsi_initialize_rq(struct request *rq) +{ + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); + + scsi_req_init(&cmd->req); +} +EXPORT_SYMBOL(scsi_initialize_rq); + +/* Called after a request has been started. 
*/ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) { void *buf = cmd->sense_buffer; @@ -1829,15 +1825,15 @@ out_delay: blk_delay_queue(q, SCSI_QUEUE_DELAY); } -static inline int prep_to_mq(int ret) +static inline blk_status_t prep_to_mq(int ret) { switch (ret) { case BLKPREP_OK: - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; case BLKPREP_DEFER: - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; default: - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } } @@ -1909,7 +1905,7 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *req = bd->rq; @@ -1917,14 +1913,14 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - int ret; + blk_status_t ret; int reason; ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret != BLK_STS_OK) goto out; - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; if (!get_device(&sdev->sdev_gendev)) goto out; @@ -1937,7 +1933,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { @@ -1955,11 +1951,11 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; goto out_dec_host_busy; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_dec_host_busy: atomic_dec(&shost->host_busy); @@ -1972,12 +1968,14 @@ out_put_device: put_device(&sdev->sdev_gendev); out: switch (ret) { - case BLK_MQ_RQ_QUEUE_BUSY: 
+ case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev)) blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); break; - case BLK_MQ_RQ_QUEUE_ERROR: + default: /* * Make sure to release all allocated ressources when * we hit an error, as we will never see this command @@ -1986,8 +1984,6 @@ out: if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); break; - default: - break; } return ret; } @@ -2057,6 +2053,8 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) { struct device *dev = shost->dma_dev; + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); + /* * this limit is imposed by hardware restrictions */ @@ -2139,6 +2137,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) q->request_fn = scsi_request_fn; q->init_rq_fn = scsi_init_rq; q->exit_rq_fn = scsi_exit_rq; + q->initialize_rq_fn = scsi_initialize_rq; if (blk_init_allocated_queue(q) < 0) { blk_cleanup_queue(q); @@ -2163,6 +2162,7 @@ static const struct blk_mq_ops scsi_mq_ops = { #endif .init_request = scsi_init_request, .exit_request = scsi_exit_request, + .initialize_rq_fn = scsi_initialize_rq, .map_queues = scsi_map_queues, }; @@ -2977,7 +2977,7 @@ scsi_internal_device_block(struct scsi_device *sdev, bool wait) if (wait) blk_mq_quiesce_queue(q); else - blk_mq_stop_hw_queues(q); + blk_mq_quiesce_queue_nowait(q); } else { spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); @@ -3031,7 +3031,7 @@ scsi_internal_device_unblock(struct scsi_device *sdev, return -EINVAL; if (q->mq_ops) { - blk_mq_start_stopped_hw_queues(q, false); + blk_mq_unquiesce_queue(q); } else { spin_lock_irqsave(q->queue_lock, flags); blk_start_queue(q); diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 0ebe2f1bb908..5006a656e16a 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -33,6 +33,7 @@ #include <linux/bsg.h> #include <scsi/scsi.h> 
+#include <scsi/scsi_cmnd.h> #include <scsi/scsi_request.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> @@ -172,7 +173,7 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost, struct sas_rphy *rphy) { struct request *req; - int ret; + blk_status_t ret; int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *); while ((req = blk_fetch_request(q)) != NULL) { @@ -230,6 +231,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) q = blk_alloc_queue(GFP_KERNEL); if (!q) return -ENOMEM; + q->initialize_rq_fn = scsi_initialize_rq; q->cmd_size = sizeof(struct scsi_request); if (rphy) { @@ -249,6 +251,11 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) if (error) goto out_cleanup_queue; + /* + * by default assume old behaviour and bounce for any highmem page + */ + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); + error = bsg_register_queue(q, dev, name, release); if (error) goto out_cleanup_queue; @@ -264,6 +271,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) q->queuedata = shost; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); return 0; out_cleanup_queue: diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82c33a6edbea..21225d62b0c1 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static void sg_rq_end_io(struct request *rq, int uptodate); +static void sg_rq_end_io(struct request *rq, blk_status_t status); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -808,7 +808,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, if (atomic_read(&sdp->detaching)) { if (srp->bio) { 
scsi_req_free_cmd(scsi_req(srp->rq)); - blk_end_request_all(srp->rq, -EIO); + blk_end_request_all(srp->rq, BLK_STS_IOERR); srp->rq = NULL; } @@ -1300,7 +1300,7 @@ sg_rq_end_io_usercontext(struct work_struct *work) * level when a command is completed (or has failed). */ static void -sg_rq_end_io(struct request *rq, int uptodate) +sg_rq_end_io(struct request *rq, blk_status_t status) { struct sg_request *srp = rq->end_io_data; struct scsi_request *req = scsi_req(rq); @@ -1732,8 +1732,6 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } req = scsi_req(rq); - scsi_req_init(rq); - if (hp->cmd_len > BLK_MAX_CDB) req->cmd = long_cmdp; memcpy(req->cmd, cmd, hp->cmd_len); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 1ea34d6f5437..8e5013d9cad4 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -511,7 +511,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) atomic64_dec(&STp->stats->in_flight); } -static void st_scsi_execute_end(struct request *req, int uptodate) +static void st_scsi_execute_end(struct request *req, blk_status_t status) { struct st_request *SRpnt = req->end_io_data; struct scsi_request *rq = scsi_req(req); @@ -549,7 +549,6 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, if (IS_ERR(req)) return DRIVER_ERROR << 24; rq = scsi_req(req); - scsi_req_init(req); req->rq_flags |= RQF_QUIET; mdata->null_mapped = 1; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index bb069ebe4aa6..c05d38016556 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -93,7 +93,7 @@ static int iblock_configure_device(struct se_device *dev) return -EINVAL; } - ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0); + ib_dev->ibd_bio_set = bioset_create(IBLOCK_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (!ib_dev->ibd_bio_set) { pr_err("IBLOCK: Unable to create bioset\n"); goto out; @@ -296,8 +296,8 @@ static void 
iblock_bio_done(struct bio *bio) struct se_cmd *cmd = bio->bi_private; struct iblock_req *ibr = cmd->priv; - if (bio->bi_error) { - pr_err("bio error: %p, err: %d\n", bio, bio->bi_error); + if (bio->bi_status) { + pr_err("bio error: %p, err: %d\n", bio, bio->bi_status); /* * Bump the ib_bio_err_cnt and release bio. */ @@ -354,11 +354,11 @@ static void iblock_end_io_flush(struct bio *bio) { struct se_cmd *cmd = bio->bi_private; - if (bio->bi_error) - pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_error); + if (bio->bi_status) + pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_status); if (cmd) { - if (bio->bi_error) + if (bio->bi_status) target_complete_cmd(cmd, SAM_STAT_CHECK_CONDITION); else target_complete_cmd(cmd, SAM_STAT_GOOD); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 3e4abb13f8ea..ceec0211e84e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -55,7 +55,7 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static void pscsi_req_done(struct request *, int); +static void pscsi_req_done(struct request *, blk_status_t); /* pscsi_attach_hba(): * @@ -992,8 +992,6 @@ pscsi_execute_cmd(struct se_cmd *cmd) goto fail; } - scsi_req_init(req); - if (sgl) { ret = pscsi_map_sg(cmd, sgl, sgl_nents, req); if (ret) @@ -1045,7 +1043,7 @@ static sector_t pscsi_get_blocks(struct se_device *dev) return 0; } -static void pscsi_req_done(struct request *req, int uptodate) +static void pscsi_req_done(struct request *req, blk_status_t status) { struct se_cmd *cmd = req->end_io_data; struct pscsi_plugin_task *pt = cmd->priv; @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve 
field set\n"); return -EINVAL; } @@ -1568,6 +1568,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_pos = iocb->aio_offset; req->common.ki_complete = aio_complete; req->common.ki_flags = iocb_flags(req->common.ki_filp); + req->common.ki_hint = file_write_hint(file); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1586,6 +1587,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 0a7404ef9335..a7df151f8aba 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -225,6 +225,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_init(&bio, vecs, nr_pages); bio.bi_bdev = bdev; bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -262,8 +263,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (vecs != inline_vecs) kfree(vecs); - if (unlikely(bio.bi_error)) - ret = bio.bi_error; + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); bio_uninit(&bio); @@ -291,16 +292,18 @@ static void blkdev_bio_end_io(struct bio *bio) bool should_dirty = dio->should_dirty; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_error && !dio->bio.bi_error) - dio->bio.bi_error = bio->bi_error; + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; } else { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; - ssize_t ret = dio->bio.bi_error; + ssize_t 
ret; - if (likely(!ret)) { + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); @@ -337,7 +340,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -361,12 +364,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) for (;;) { bio->bi_bdev = bdev; bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { - bio->bi_error = ret; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } @@ -415,7 +419,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); - ret = dio->bio.bi_error; + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; @@ -439,7 +444,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); if (!blkdev_dio_pool) return -ENOMEM; return 0; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b8622e4d1744..d87ac27a5f2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -310,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* diff --git a/fs/btrfs/check-integrity.c 
b/fs/btrfs/check-integrity.c index ab14c2e635ca..4ded1c3f92b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2129,7 +2129,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2143,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 10e6b282d09d..a2fad39f79ba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -155,7 +155,7 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -268,7 +268,7 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -287,7 +287,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -320,7 +320,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. 
*/ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -335,13 +335,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; + return BLK_STS_RESOURCE; refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -358,7 +358,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); if (!bio) { kfree(cb); - return -ENOMEM; + return BLK_STS_RESOURCE; } bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; @@ -368,17 +368,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, /* create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -400,7 +400,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -434,7 +434,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error 
= ret; + bio->bi_status = ret; bio_endio(bio); } @@ -569,7 +569,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -586,7 +586,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -600,7 +600,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -638,7 +638,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, __GFP_HIGHMEM); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; - ret = -ENOMEM; + ret = BLK_STS_RESOURCE; goto fail2; } } @@ -659,19 +659,19 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - 
comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -726,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 39ec43ab8df1..680d4265d601 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -48,12 +48,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4f8f75d9e839..a0d0c79d95ed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3078,8 +3078,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3094,7 +3094,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int 
btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5f678dcb20e6..6036d15b47b8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,7 +87,7 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; @@ -131,7 +131,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -799,7 +799,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -836,19 +836,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; - end_io_wq->error = 0; + end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -868,14 +868,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct 
async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -898,8 +898,8 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } @@ -916,18 +916,17 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, + struct inode *inode, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; async->inode = inode; async->bio = bio; @@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -972,12 +971,12 @@ static int 
btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -986,11 +985,11 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -998,7 +997,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1015,13 +1014,13 @@ static int check_async_write(unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, +static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(bio_flags); - int ret; + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1054,7 +1053,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1820,7 +1819,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = 
end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -3497,11 +3496,11 @@ static void btrfs_end_empty_barrier(struct bio *bio) * any device where the flush fails with eopnotsupp are flagged as not-barrier * capable */ -static int write_dev_flush(struct btrfs_device *device, int wait) +static blk_status_t write_dev_flush(struct btrfs_device *device, int wait) { struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio; - int ret = 0; + blk_status_t ret = 0; if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return 0; @@ -3513,8 +3512,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio->bi_error) { - ret = bio->bi_error; + if (bio->bi_status) { + ret = bio->bi_status; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); } @@ -3533,7 +3532,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = NULL; bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) - return -ENOMEM; + return BLK_STS_RESOURCE; bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; @@ -3558,7 +3557,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) struct btrfs_device *dev; int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 21f1ceb85b76..c581927555f3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -118,13 +118,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done); +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, + struct inode *inode, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d3619e010005..d1cd60140817 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -174,7 +174,8 @@ int __init extent_io_init(void) goto free_state_cache; btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio)); + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS); if (!btrfs_bioset) goto free_buffer_cache; @@ -2399,6 +2400,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2431,11 +2433,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { + if (status) { free_io_failure(BTRFS_I(inode), failrec); bio_put(bio); + ret = blk_status_to_errno(status); 
} return ret; @@ -2474,6 +2477,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; @@ -2503,7 +2507,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2536,7 +2540,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2556,7 +2560,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2615,7 +2619,7 @@ static void end_bio_extent_readpage(struct bio *bio) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = !bio->bi_error; + uptodate = !bio->bi_status; offset += len; continue; } @@ -2673,7 +2677,7 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } @@ -2743,7 +2747,7 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = 
bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2761,7 +2765,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -2826,6 +2830,7 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio->bi_write_hint = page->mapping->host->i_write_hint; bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); @@ -3707,7 +3712,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1eafa2f0ede3..487ca0207cb6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -92,9 +92,9 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset); +typedef blk_status_t (extent_submit_bio_hook_t)(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { /* * The following callbacks must be allways defined, the function diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31d7163..5b1c7090e546 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,7 +160,7 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, 
u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +191,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -303,12 +303,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,7 +433,7 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -452,7 +452,7 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da1096eb1a40..59e2dccdf75b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1875,12 +1875,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ssize_t num_written = 0; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; - 
loff_t pos; - size_t count; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + /* Don't sleep on inode rwsem */ + if (!inode_trylock(inode)) + return -EAGAIN; + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } else + inode_lock(inode); + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); @@ -1914,8 +1931,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { @@ -3071,13 +3086,19 @@ out: return offset; } +static int btrfs_file_open(struct inode *inode, struct file *filp) +{ + filp->f_mode |= FMODE_AIO_NOWAIT; + return generic_file_open(inode, filp); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, - .open = generic_file_open, + .open = btrfs_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef3c98c527c1..556c93060606 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -842,13 +842,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { 
+ async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1901,11 +1900,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btrfs_submit_bio_start(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret = 0; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1920,16 +1919,16 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btrfs_submit_bio_done(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1939,14 +1938,14 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. 
This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, +static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -1991,8 +1990,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -8037,7 +8036,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct bio_vec *bvec; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); @@ -8116,7 +8115,7 @@ static void btrfs_retry_endio(struct bio *bio) int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; @@ -8141,8 +8140,8 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; struct bio_vec *bvec; @@ -8184,7 +8183,7 @@ try_again: io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8211,8 +8210,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8232,7 +8231,7 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = 
dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -8243,11 +8242,11 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } @@ -8299,20 +8298,20 @@ static void btrfs_endio_direct_write(struct bio *bio) struct bio *dio_bio = dip->dio_bio; __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_error); + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; - dio_end_io(dio_bio, bio->bi_error); + dio_bio->bi_status = bio->bi_status; + dio_end_io(dio_bio); bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8321,7 +8320,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -8351,7 +8350,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: @@ -8368,14 +8367,14 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return bio; } -static inline int 
btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8406,7 +8405,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8649,7 +8648,7 @@ free_ordered: * callbacks - they require an allocated dip and a clone of dio_bio. */ if (io_bio && dip) { - io_bio->bi_error = -EIO; + io_bio->bi_status = BLK_STS_IOERR; bio_endio(io_bio); /* * The end io callbacks free our dip, do the final put on io_bio @@ -8668,12 +8667,12 @@ free_ordered: unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. 
*/ - dio_end_io(dio_bio, ret); + dio_end_io(dio_bio); } if (io_bio) bio_put(io_bio); @@ -8755,6 +8754,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; } ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb76325..f3d30d9ea8f9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -871,7 +871,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -884,7 +884,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -897,7 +897,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -914,7 +914,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -1092,7 +1092,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1448,7 +1448,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1991,7 +1991,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2530,7 +2530,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c7b45eb2403d..ba5595d19de1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -95,7 +95,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -1668,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1693,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + 
done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1705,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1937,7 +1937,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1992,7 +1992,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2007,7 +2007,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2341,7 +2341,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2377,7 +2377,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2588,7 +2588,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = 
bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2601,7 +2601,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -3004,7 +3004,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bb..84a495967e0a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6042,9 +6042,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -6082,13 +6083,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6199,7 +6200,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..5c2cba8d2387 100644 
--- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1829,7 +1829,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1883,7 +1884,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -3038,7 +3040,7 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } @@ -3091,7 +3093,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - struct writeback_control *wbc) + enum rw_hint write_hint, struct writeback_control *wbc) { struct bio *bio; @@ -3120,6 +3122,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; + bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3142,7 +3145,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, NULL); + return 
submit_bh_wbc(op, op_flags, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a409a84f1bca..6181e9526860 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, goto errout; } err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) + if (err == 0 && bio->bi_status) err = -EIO; bio_put(bio); if (err) diff --git a/fs/direct-io.c b/fs/direct-io.c index a04ebea77de8..08cf27811e5a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work) dio_complete(dio, 0, true); } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. @@ -348,13 +348,12 @@ static void dio_bio_end_io(struct bio *bio) /** * dio_end_io - handle the end io action for the given bio * @bio: The direct io bio thats being completed - * @error: Error if there was one * * This is meant to be called by any filesystem that uses their own dio_submit_t * so that the DIO specific endio actions are dealt with after the filesystem * has done it's completion work. */ -void dio_end_io(struct bio *bio, int error) +void dio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; @@ -386,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, else bio->bi_end_io = dio_bio_end_io; + bio->bi_write_hint = dio->iocb->ki_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -474,17 +475,20 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. 
*/ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; unsigned i; - int err; + blk_status_t err = bio->bi_status; - if (bio->bi_error) - dio->io_error = -EIO; + if (err) { + if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) + dio->io_error = -EAGAIN; + else + dio->io_error = -EIO; + } if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { - err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { @@ -495,7 +499,6 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) set_page_dirty_lock(page); put_page(page); } - err = bio->bi_error; bio_put(bio); } return err; @@ -539,7 +542,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } @@ -1197,6 +1200,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; dio->op_flags = REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_NOWAIT) + dio->op_flags |= REQ_NOWAIT; } else { dio->op = REQ_OP_READ; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 02ce7e7bbdf5..58e2eeaa0bc4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -37,7 +37,11 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - inode_lock_shared(inode); + if (!inode_trylock_shared(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock_shared(inode); + } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore @@ -179,7 +183,11 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = 
file_inode(iocb->ki_filp); ssize_t ret; - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -216,7 +224,12 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_dax_write_iter(iocb, from); #endif - inode_lock(inode); + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +248,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->private = &overwrite; /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -435,6 +454,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } + + /* Set the flags to support nowait AIO */ + filp->f_mode |= FMODE_AIO_NOWAIT; + return dquot_file_open(inode, filp); } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138ba739..c2fce4478cca 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, 
&head->b_state); @@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio) bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -349,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0; + io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -396,6 +398,7 @@ submit_and_retry: ret = io_submit_init_bio(io, bh); if (ret) return ret; + io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829d56de..40a5497b0f60 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); diff --git 
a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c0f6bdf817d..36fe82012a33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } #endif if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { if (!PageUptodate(page)) SetPageUptodate(page); } else { @@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio) unlock_page(page); mempool_free(page, sbi->write_io_dummy); - if (unlikely(bio->bi_error)) + if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true); continue; } fscrypt_pullback_bio_page(&page, true); - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96845854e7ee..ea9f455d94ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -749,7 +749,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - dc->error = bio->bi_error; + dc->error = blk_status_to_errno(bio->bi_status); dc->state = D_DONE; complete(&dc->wait); bio_put(bio); diff --git a/fs/fcntl.c b/fs/fcntl.c index f4e7267d117f..ed051f825bad 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -243,6 +243,67 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif +static bool rw_hint_valid(enum rw_hint hint) +{ + switch (hint) { + case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NONE: + case 
RWH_WRITE_LIFE_SHORT: + case RWH_WRITE_LIFE_MEDIUM: + case RWH_WRITE_LIFE_LONG: + case RWH_WRITE_LIFE_EXTREME: + return true; + default: + return false; + } +} + +static long fcntl_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 *argp = (u64 __user *)arg; + enum rw_hint hint; + u64 h; + + switch (cmd) { + case F_GET_FILE_RW_HINT: + h = file_write_hint(file); + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_FILE_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + spin_lock(&file->f_lock); + file->f_write_hint = hint; + spin_unlock(&file->f_lock); + return 0; + case F_GET_RW_HINT: + h = inode->i_write_hint; + if (copy_to_user(argp, &h, sizeof(*argp))) + return -EFAULT; + return 0; + case F_SET_RW_HINT: + if (copy_from_user(&h, argp, sizeof(h))) + return -EFAULT; + hint = (enum rw_hint) h; + if (!rw_hint_valid(hint)) + return -EINVAL; + + inode_lock(inode); + inode->i_write_hint = hint; + inode_unlock(inode); + return 0; + default: + return -EINVAL; + } +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -337,6 +398,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_GET_SEALS: err = shmem_fcntl(filp, cmd, arg); break; + case F_GET_RW_HINT: + case F_SET_RW_HINT: + case F_GET_FILE_RW_HINT: + case F_SET_FILE_RW_HINT: + err = fcntl_rw_hint(filp, cmd, arg); + break; default: break; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b7cf65d13561..aa3d44527fa2 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -815,7 +815,6 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; - int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index b1f9144b42c7..885d36e7a29f 
100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -170,7 +170,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) */ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, - int error) + blk_status_t error) { struct buffer_head *bh, *next; struct page *page = bvec->bv_page; @@ -209,15 +209,13 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_error) { - sdp->sd_log_error = bio->bi_error; - fs_err(sdp, "Error %d writing to log\n", bio->bi_error); - } + if (bio->bi_status) + fs_err(sdp, "Error %d writing to log\n", bio->bi_status); bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); else mempool_free(page, gfs2_page_pool); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 663ffc135ef3..fabe1614f879 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio) do { struct buffer_head *next = bh->b_this_page; len -= bh->b_size; - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bh = next; } while (bh && len); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b92135c202c2..e76058d34b74 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", bio->bi_error); + pr_warn("error %d reading superblock\n", bio->bi_status); unlock_page(page); } diff --git a/fs/inode.c b/fs/inode.c index db5914783a71..f0e5fc77e6a4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -146,6 +146,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; + inode->i_write_hint = 
WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; diff --git a/fs/iomap.c b/fs/iomap.c index 4b10892967a5..fa6cd5b3f578 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -672,8 +672,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_error) - iomap_dio_set_error(dio, bio->bi_error); + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (atomic_dec_and_test(&dio->ref)) { if (is_sync_kiocb(dio->iocb)) { @@ -793,6 +793,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_bdev = iomap->bdev; bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -881,6 +882,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, start, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + flags |= IOMAP_NOWAIT; + } + ret = filemap_write_and_wait_range(mapping, start, end); if (ret) goto out_free_dio; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bb1da1feafeb..a21f0e9eecd4 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio) bp->l_flag |= lbmDONE; - if (bio->bi_error) { + if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1403e5..ce93db3aef3c 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -337,7 +337,7 @@ static void 
metapage_write_end_io(struct bio *bio) BUG_ON(!PagePrivate(page)); - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } diff --git a/fs/mpage.c b/fs/mpage.c index baff8f820c29..d6d1486d6f99 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); @@ -614,6 +615,7 @@ alloc_new: goto confused; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0ca370d23ddb..d8863a804b15 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio) struct parallel_io *par = bio->bi_private; struct nfs_pgio_header *header = par->data; - if (bio->bi_error) { + if (bio->bi_status) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index fb5213afc854..c862c2489df0 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -219,6 +219,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, u8 *buf, *d, type, assoc; int error; + if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) + return -EINVAL; + buf = kzalloc(bufflen, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -229,7 +232,6 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, goto out_free_buf; } req = scsi_req(rq); - scsi_req_init(rq); error = 
blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); if (error) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6f87b2ac1aeb..e73c86d9855c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0da0332725aa..ffe003982d95 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (bio->bi_error) { - mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); - wc->wc_error = bio->bi_error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); diff --git a/fs/open.c b/fs/open.c index cd0c5be8d012..3fe0c4aa7d27 100644 --- a/fs/open.c +++ b/fs/open.c @@ -759,6 +759,7 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; + f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/read_write.c b/fs/read_write.c index 19d4d88fa285..d591eeed061f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -678,16 +678,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) - return -EOPNOTSUPP; - init_sync_kiocb(&kiocb, filp); - if (flags & RWF_HIPRI) - kiocb.ki_flags |= IOCB_HIPRI; - if (flags & RWF_DSYNC) - kiocb.ki_flags |= IOCB_DSYNC; - if (flags & RWF_SYNC) - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + ret = kiocb_set_rw_flags(&kiocb, flags); + if (ret) + return 
ret; kiocb.ki_pos = *ppos; if (type == READ) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3b91faacc1ba..d20c29b9c95b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,7 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - int error = ioend->io_bio->bi_error; + int error; /* * Just clean up the in-memory strutures if the fs has been shut down. @@ -289,6 +289,7 @@ xfs_end_io( /* * Clean up any COW blocks on an I/O error. */ + error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { switch (ioend->io_type) { case XFS_IO_COW: @@ -332,7 +333,7 @@ xfs_end_bio( else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else - xfs_destroy_ioend(ioend, bio->bi_error); + xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } STATIC int @@ -500,11 +501,12 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_bio->bi_error = status; + ioend->io_bio->bi_status = errno_to_blk_status(status); bio_endio(ioend->io_bio); return status; } + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); return 0; } @@ -564,6 +566,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); ioend->io_bio = new; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 16d6a578fc16..438505f395e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1227,8 +1227,11 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. 
*/ - if (bio->bi_error) - cmpxchg(&bp->b_io_error, 0, bio->bi_error); + if (bio->bi_status) { + int error = blk_status_to_errno(bio->bi_status); + + cmpxchg(&bp->b_io_error, 0, error); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5fb5a0958a14..17f27a2fb5e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -237,7 +237,11 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -541,7 +545,11 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -553,9 +561,15 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. 
*/ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + if (unaligned_io) { + /* If we are going to wait for other DIO to finish, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) { + if (atomic_read(&inode->i_dio_count)) + return -EAGAIN; + } else { + inode_dio_wait(inode); + } + } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -585,7 +599,12 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_ilock(ip, iolock); + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -892,6 +911,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; + file->f_mode |= FMODE_AIO_NOWAIT; return 0; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 94e5bdf7304c..05dc87e8c1f5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -995,6 +995,11 @@ xfs_file_iomap_begin( lockmode = xfs_ilock_data_map_shared(ip); } + if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = -EAGAIN; + goto out_unlock; + } + ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) length = mp->m_super->s_maxbytes - offset; @@ -1016,6 +1021,15 @@ xfs_file_iomap_begin( if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { if (flags & IOMAP_DIRECT) { + /* + * A reflinked inode will result in CoW alloc. + * FIXME: It could still overwrite on unshared extents + * and not need allocation. 
+ */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1033,6 +1047,14 @@ xfs_file_iomap_begin( if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* + * If nowait is set bail since we are going to make + * allocations. + */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } + /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric * with the work writeback does. This is a completely arbitrary diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 455a575f101d..97df4db13b2e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1766,7 +1766,8 @@ STATIC int __init xfs_init_zones(void) { xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, - offsetof(struct xfs_ioend, io_inline_bio)); + offsetof(struct xfs_ioend, io_inline_bio), + BIOSET_NEED_BVECS); if (!xfs_ioend_bioset) goto out; diff --git a/include/linux/bio.h b/include/linux/bio.h index a7e29fa0981f..664a27da276d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -118,7 +118,6 @@ static inline void *bio_data(struct bio *bio) /* * will die */ -#define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) /* @@ -373,8 +372,11 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, return bio_split(bio, sectors, gfp, bs); } -extern struct bio_set *bioset_create(unsigned int, unsigned int); -extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); +extern struct bio_set *bioset_create(unsigned int, unsigned int, int flags); +enum { + BIOSET_NEED_BVECS = BIT(0), + BIOSET_NEED_RESCUER = BIT(1), +}; extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); @@ 
-392,11 +394,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); } -static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, fs_bio_set); -} - static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); @@ -414,7 +411,13 @@ extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); +} + +static inline void bio_wouldblock_error(struct bio *bio) +{ + bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fcd641032f8d..23d32ff0b462 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -39,8 +39,6 @@ struct blk_mq_hw_ctx { struct blk_mq_tags *tags; struct blk_mq_tags *sched_tags; - struct srcu_struct queue_rq_srcu; - unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 @@ -62,6 +60,9 @@ struct blk_mq_hw_ctx { struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; #endif + + /* Must be the last member - see also blk_mq_hw_ctx_size(). 
*/ + struct srcu_struct queue_rq_srcu[0]; }; struct blk_mq_tag_set { @@ -87,7 +88,8 @@ struct blk_mq_queue_data { bool last; }; -typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, + const struct blk_mq_queue_data *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -142,6 +144,8 @@ struct blk_mq_ops { init_request_fn *init_request; exit_request_fn *exit_request; reinit_request_fn *reinit_request; + /* Called from inside blk_get_request() */ + void (*initialize_rq_fn)(struct request *rq); map_queues_fn *map_queues; @@ -155,10 +159,6 @@ struct blk_mq_ops { }; enum { - BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ - BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ - BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ - BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, @@ -204,10 +204,10 @@ enum { BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ }; -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, unsigned int flags); -struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, - unsigned int flags, unsigned int hctx_idx); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, + unsigned int op, unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { @@ -230,8 +230,8 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); -void blk_mq_end_request(struct request *rq, int error); -void __blk_mq_end_request(struct request *rq, int error); +void 
blk_mq_end_request(struct request *rq, blk_status_t error); +void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, @@ -247,6 +247,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); @@ -264,6 +266,8 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); +void blk_mq_quiesce_queue_nowait(struct request_queue *q); + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 61339bc44400..d2eb87c84d82 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,27 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +/* + * Block error status values. See block/blk-core:blk_errors for the details. 
+ */ +typedef u8 __bitwise blk_status_t; +#define BLK_STS_OK 0 +#define BLK_STS_NOTSUPP ((__force blk_status_t)1) +#define BLK_STS_TIMEOUT ((__force blk_status_t)2) +#define BLK_STS_NOSPC ((__force blk_status_t)3) +#define BLK_STS_TRANSPORT ((__force blk_status_t)4) +#define BLK_STS_TARGET ((__force blk_status_t)5) +#define BLK_STS_NEXUS ((__force blk_status_t)6) +#define BLK_STS_MEDIUM ((__force blk_status_t)7) +#define BLK_STS_PROTECTION ((__force blk_status_t)8) +#define BLK_STS_RESOURCE ((__force blk_status_t)9) +#define BLK_STS_IOERR ((__force blk_status_t)10) + +/* hack for device mapper, don't use elsewhere: */ +#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) + +#define BLK_STS_AGAIN ((__force blk_status_t)12) + struct blk_issue_stat { u64 stat; }; @@ -28,13 +49,14 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - int bi_error; + blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. 
*/ unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; + unsigned short bi_write_hint; struct bvec_iter bi_iter; @@ -205,6 +227,7 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -223,6 +246,7 @@ enum req_flag_bits { #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ddd36bd2173..25f6a0cb27d3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -55,7 +55,7 @@ struct blk_stat_callback; */ #define BLKCG_MAX_POLS 3 -typedef void (rq_end_io_fn)(struct request *, int); +typedef void (rq_end_io_fn)(struct request *, blk_status_t); #define BLK_RL_SYNCFULL (1U << 0) #define BLK_RL_ASYNCFULL (1U << 1) @@ -225,6 +225,8 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ + unsigned short write_hint; + unsigned long deadline; struct list_head timeout_list; @@ -412,8 +414,12 @@ struct request_queue { rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; lld_busy_fn *lld_busy_fn; + /* Called just after a request is allocated */ init_rq_fn *init_rq_fn; + /* Called just before a request is freed */ exit_rq_fn *exit_rq_fn; + /* Called from inside blk_get_request() */ + void (*initialize_rq_fn)(struct request *rq); const struct blk_mq_ops *mq_ops; @@ -590,6 +596,9 @@ struct request_queue { void *rq_alloc_data; struct work_struct release_work; + +#define BLK_MAX_WRITE_HINTS 5 + u64 write_hints[BLK_MAX_WRITE_HINTS]; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ @@ -622,6 +631,8 @@ struct request_queue { #define QUEUE_FLAG_STATS 27 /* track rq completion 
times */ #define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -633,6 +644,13 @@ struct request_queue { (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) +/* + * @q->queue_lock is set while a queue is being initialized. Since we know + * that no other threads access the queue object before @q->queue_lock has + * been set, it is safe to manipulate queue flags without holding the + * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and + * blk_init_allocated_queue(). + */ static inline void queue_lockdep_assert_held(struct request_queue *q) { if (q->queue_lock) @@ -712,10 +730,13 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) +#define blk_queue_scsi_passthrough(q) \ + test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) +#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) static inline bool blk_account_rq(struct request *rq) { @@ -814,7 +835,8 @@ static inline bool rq_mergeable(struct request *rq) static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { - if (bio_data(a) == bio_data(b)) + if (bio_page(a) == bio_page(b) && + bio_offset(a) == bio_offset(b)) return true; return false; @@ -862,19 +884,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) -#ifdef CONFIG_BOUNCE -extern int 
init_emergency_isa_pool(void); -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else -static inline int init_emergency_isa_pool(void) -{ - return 0; -} -static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) -{ -} -#endif /* CONFIG_MMU */ - struct rq_map_data { struct page **pages; int page_order; @@ -933,7 +942,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern struct request *blk_get_request(struct request_queue *, int, gfp_t); +extern struct request *blk_get_request(struct request_queue *, unsigned int op, + gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, @@ -941,12 +951,11 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); extern void blk_rq_unprep_clone(struct request *rq); -extern int blk_insert_cloned_request(struct request_queue *q, +extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_delay_queue(struct request_queue *, unsigned long); -extern void blk_queue_split(struct request_queue *, struct bio **, - struct bio_set *); +extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, @@ -967,7 +976,6 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct 
request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); -extern void blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); @@ -981,6 +989,9 @@ extern void blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +int blk_status_to_errno(blk_status_t status); +blk_status_t errno_to_blk_status(int errno); + bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -1113,16 +1124,16 @@ extern struct request *blk_fetch_request(struct request_queue *q); * blk_end_request() for parts of the original function. * This prevents code duplication in drivers. */ -extern bool blk_update_request(struct request *rq, int error, +extern bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_finish_request(struct request *rq, int error); -extern bool blk_end_request(struct request *rq, int error, +extern void blk_finish_request(struct request *rq, blk_status_t error); +extern bool blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void blk_end_request_all(struct request *rq, int error); -extern bool __blk_end_request(struct request *rq, int error, +extern void blk_end_request_all(struct request *rq, blk_status_t error); +extern bool __blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); -extern void __blk_end_request_all(struct request *rq, int error); -extern bool __blk_end_request_cur(struct request *rq, int error); +extern void __blk_end_request_all(struct request *rq, blk_status_t error); +extern bool __blk_end_request_cur(struct request *rq, blk_status_t error); extern void 
blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); @@ -1374,11 +1385,6 @@ enum blk_default_limits { #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) -static inline unsigned long queue_bounce_pfn(struct request_queue *q) -{ - return q->limits.bounce_pfn; -} - static inline unsigned long queue_segment_boundary(struct request_queue *q) { return q->limits.seg_boundary_mask; @@ -1780,7 +1786,7 @@ struct blk_integrity_iter { const char *disk_name; }; -typedef int (integrity_processing_fn) (struct blk_integrity_iter *); +typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); struct blk_integrity_profile { integrity_processing_fn *generate_fn; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index f4c639c0c362..456da5017b32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -72,9 +72,9 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone); * 2 : The target wants to push back the io */ typedef int (*dm_endio_fn) (struct dm_target *ti, - struct bio *bio, int error); + struct bio *bio, blk_status_t *error); typedef int (*dm_request_endio_fn) (struct dm_target *ti, - struct request *clone, int error, + struct request *clone, blk_status_t error, union map_info *map_context); typedef void (*dm_presuspend_fn) (struct dm_target *ti); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 0e306c5a86d6..5bc8f8682a3e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -104,8 +104,9 @@ struct elevator_mq_ops { int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); - struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *); - void 
(*put_request)(struct request *); + void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); + void (*prepare_request)(struct request *, struct bio *bio); + void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); @@ -114,8 +115,6 @@ struct elevator_mq_ops { void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); - int (*get_rq_priv)(struct request_queue *, struct request *, struct bio *); - void (*put_rq_priv)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e68cabb8457..65adbddb3163 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -20,6 +20,7 @@ #include <linux/rwsem.h> #include <linux/capability.h> #include <linux/semaphore.h> +#include <linux/fcntl.h> #include <linux/fiemap.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> @@ -143,6 +144,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +/* File is capable of returning -EAGAIN if AIO will block */ +#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are @@ -262,6 +266,18 @@ struct page; struct address_space; struct writeback_control; +/* + * Write life time hint values. 
+ */ +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +}; + #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) @@ -269,6 +285,7 @@ struct writeback_control; #define IOCB_DSYNC (1 << 4) #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) +#define IOCB_NOWAIT (1 << 7) struct kiocb { struct file *ki_filp; @@ -276,6 +293,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; + enum rw_hint ki_hint; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -283,16 +301,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) return kiocb->ki_complete == NULL; } -static inline int iocb_flags(struct file *file); - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_filp = filp, - .ki_flags = iocb_flags(filp), - }; -} - /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet @@ -593,6 +601,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED @@ -847,6 +856,7 @@ struct file { * Must not be taken from IRQ context. 
*/ spinlock_t f_lock; + enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -1022,8 +1032,6 @@ struct file_lock_context { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -#include <linux/fcntl.h> - extern void send_sigio(struct fown_struct *fown, int fd, int band); /* @@ -1874,6 +1882,25 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode) return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); } +static inline enum rw_hint file_write_hint(struct file *file) +{ + if (file->f_write_hint != WRITE_LIFE_NOT_SET) + return file->f_write_hint; + + return file_inode(file)->i_write_hint; +} + +static inline int iocb_flags(struct file *file); + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + .ki_flags = iocb_flags(filp), + .ki_hint = file_write_hint(filp), + }; +} + /* * Inode state bits. Protected by inode->i_lock * @@ -2518,6 +2545,8 @@ extern int filemap_fdatawait(struct address_space *); extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +extern bool filemap_range_has_page(struct address_space *, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); @@ -2844,7 +2873,7 @@ enum { DIO_SKIP_DIO_COUNT = 0x08, }; -void dio_end_io(struct bio *bio, int error); +void dio_end_io(struct bio *bio); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, @@ -3057,6 +3086,25 @@ static inline int iocb_flags(struct file *file) return res; } +static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) +{ + if (unlikely(flags & ~RWF_SUPPORTED)) + return -EOPNOTSUPP; + + if (flags & RWF_NOWAIT) { + if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT)) + 
return -EOPNOTSUPP; + ki->ki_flags |= IOCB_NOWAIT; + } + if (flags & RWF_HIPRI) + ki->ki_flags |= IOCB_HIPRI; + if (flags & RWF_DSYNC) + ki->ki_flags |= IOCB_DSYNC; + if (flags & RWF_SYNC) + ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + return 0; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; diff --git a/include/linux/ide.h b/include/linux/ide.h index 6980ca322074..dc152e4b7f73 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -671,7 +671,7 @@ struct ide_port_ops { void (*init_dev)(ide_drive_t *); void (*set_pio_mode)(struct hwif_s *, ide_drive_t *); void (*set_dma_mode)(struct hwif_s *, ide_drive_t *); - int (*reset_poll)(ide_drive_t *); + blk_status_t (*reset_poll)(ide_drive_t *); void (*pre_reset)(ide_drive_t *); void (*resetproc)(ide_drive_t *); void (*maskproc)(ide_drive_t *, int); @@ -1092,7 +1092,7 @@ int generic_ide_ioctl(ide_drive_t *, struct block_device *, unsigned, unsigned l extern int ide_vlb_clk; extern int ide_pci_clk; -int ide_end_rq(ide_drive_t *, struct request *, int, unsigned int); +int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int); void ide_kill_rq(ide_drive_t *, struct request *); void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); @@ -1123,7 +1123,7 @@ extern int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, int arg); void ide_complete_cmd(ide_drive_t *, struct ide_cmd *, u8, u8); -int ide_complete_rq(ide_drive_t *, int, unsigned int); +int ide_complete_rq(ide_drive_t *, blk_status_t, unsigned int); void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd); void ide_tf_dump(const char *, struct ide_cmd *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f753e788da31..69f4e9470084 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -52,6 +52,7 @@ struct iomap { #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. 
FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ +#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */ struct iomap_ops { /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e400a69fa1d3..6b8ee9e628e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,7 +87,7 @@ enum { NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; -#define NVMF_AQ_DEPTH 32 +#define NVME_AQ_DEPTH 32 enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ @@ -102,6 +102,7 @@ enum { NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) @@ -208,9 +209,15 @@ struct nvme_id_ctrl { __u8 tnvmcap[16]; __u8 unvmcap[16]; __le32 rpmbs; - __u8 rsvd316[4]; + __le16 edstt; + __u8 dsto; + __u8 fwug; __le16 kas; - __u8 rsvd322[190]; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __u8 rsvd332[180]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -246,6 +253,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, }; @@ -275,7 +283,7 @@ struct nvme_id_ns { __le16 nabsn; __le16 nabo; __le16 nabspf; - __u16 rsvd46; + __le16 noiob; __u8 nvmcap[16]; __u8 rsvd64[40]; __u8 nguid[16]; @@ -289,6 +297,7 @@ enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -296,6 +305,19 @@ enum { }; enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + 
NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + +enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, NVME_NS_FLBAS_META_EXT = 0x10, @@ -315,6 +337,22 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -536,6 +574,7 @@ enum { NVME_RW_PRINFO_PRCHK_APP = 1 << 11, NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, }; struct nvme_dsm_cmd { @@ -587,6 +626,11 @@ struct nvme_feat_auto_pst { __le64 entries[32]; }; +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -605,6 +649,8 @@ enum nvme_admin_opcode { nvme_admin_download_fw = 0x11, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -659,6 +705,8 @@ struct nvme_identify { __u32 rsvd11[5]; }; +#define NVME_IDENTIFY_DATA_SIZE 4096 + struct nvme_features { __u8 opcode; __u8 flags; @@ -668,7 +716,16 @@ struct nvme_features { union nvme_data_ptr dptr; __le32 fid; __le32 dword11; - __u32 rsvd12[4]; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; }; struct nvme_create_cq { @@ -757,6 +814,24 @@ struct nvme_get_log_page_command { __u32 rsvd14[2]; }; +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union 
nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + /* * Fabrics subcommands. */ @@ -887,6 +962,18 @@ struct nvme_dbbuf { __u32 rsvd12[6]; }; +struct streams_directive_params { + __u16 msl; + __u16 nssa; + __u16 nsso; + __u8 rsvd[10]; + __u32 sws; + __u16 sgs; + __u16 nsa; + __u16 nso; + __u8 rsvd2[6]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -907,6 +994,7 @@ struct nvme_command { struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; }; }; @@ -1051,4 +1139,8 @@ struct nvme_completion { #define NVME_VS(major, minor, tertiary) \ (((major) << 16) | ((minor) << 8) | (tertiary)) +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + #endif /* _LINUX_NVME_H */ diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index cb3c8fe6acd7..4b3286ac60c8 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -278,6 +278,8 @@ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); +size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, + size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index a09cca829082..a29d3086eb56 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -157,7 +157,7 @@ struct osd_request { osd_req_done_fn *async_done; void *async_private; - int async_error; + blk_status_t async_error; int req_errors; }; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 
b379f93a2c48..da9bf2bcdf1a 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -166,6 +166,7 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, extern void scsi_kunmap_atomic_sg(void *virt); extern int scsi_init_io(struct scsi_cmnd *cmd); +extern void scsi_initialize_rq(struct request *rq); extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index f0c76f9dc285..e0afa445ee4e 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -27,6 +27,6 @@ static inline void scsi_req_free_cmd(struct scsi_request *req) kfree(req->cmd); } -void scsi_req_init(struct request *); +void scsi_req_init(struct scsi_request *req); #endif /* _SCSI_SCSI_REQUEST_H */ diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index bb2554f7fbd1..a2d4a8ac94ca 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -79,7 +79,7 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. 
*/ __u64 aio_data; /* data to be returned in event's data */ - __u32 PADDED(aio_key, aio_reserved1); + __u32 PADDED(aio_key, aio_rw_flags); /* the kernel sets aio_key to the req # */ /* common fields */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 4bf9f1eabffc..2f6c77aebe1a 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 35 +#define DM_VERSION_MINOR 36 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2016-06-23)" +#define DM_VERSION_EXTRA "-ioctl (2017-06-09)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 813afd6eee71..ec69d55bcec7 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -43,6 +43,27 @@ /* (1U << 31) is reserved for signed error codes */ /* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + +/* * Types of directory notifications that may be requested. 
*/ #define DN_ACCESS 0x00000001 /* File accessed */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 24e61a54feaa..27d8c36c04af 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -360,5 +360,9 @@ struct fscrypt_key { #define RWF_HIPRI 0x00000001 /* high priority request, poll if possible */ #define RWF_DSYNC 0x00000002 /* per-IO O_DSYNC */ #define RWF_SYNC 0x00000004 /* per-IO O_SYNC */ +#define RWF_NOWAIT 0x00000008 /* per-IO, return -EAGAIN if operation would block */ + +#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC |\ + RWF_NOWAIT) #endif /* _UAPI_LINUX_FS_H */ diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec1f4f2..a3960f98679c 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -22,6 +22,7 @@ enum { LO_FLAGS_AUTOCLEAR = 4, LO_FLAGS_PARTSCAN = 8, LO_FLAGS_DIRECT_IO = 16, + LO_FLAGS_BLOCKSIZE = 32, }; #include <asm/posix_types.h> /* for __kernel_old_dev_t */ @@ -59,6 +60,8 @@ struct loop_info64 { __u64 lo_init[2]; }; +#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0] + /* * Loop filter types */ diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 155e33f81913..a50527ebf671 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -41,10 +41,14 @@ enum { #define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ +#define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ /* there is a gap here to match userspace */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ +/* values for cmd flags in the upper 16 bits of request type */ +#define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ + /* These are client behavior specific flags. 
*/ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on disconnect. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f80fd33639e0..57d22571f306 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; - int error; + blk_status_t error; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); - hb->error = 0; + hb->error = BLK_STS_OK; } static void hib_end_io(struct bio *bio) @@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio) struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), @@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); - if (bio->bi_error && !hb->error) - hb->error = bio->bi_error; + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); @@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static int hib_wait_io(struct hib_bio_batch *hb) +static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { wait_event(hb->wait, atomic_read(&hb->count) == 0); - return hb->error; + return blk_status_to_errno(hb->error); } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 193c5f5e3f79..bc364f86100a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + 
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu); } } @@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r); } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c6cf82242d65..be7b4dd6b68d 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -751,3 +751,38 @@ size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, return sg_copy_buffer(sgl, nents, buf, buflen, skip, true); } EXPORT_SYMBOL(sg_pcopy_to_buffer); + +/** + * sg_zero_buffer - Zero-out a part of a SG list + * @sgl: The SG list + * @nents: Number of SG entries + * @buflen: The number of bytes to zero out + * @skip: Number of bytes to skip before zeroing + * + * Returns the number of bytes zeroed. + **/ +size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, + size_t buflen, off_t skip) +{ + unsigned int offset = 0; + struct sg_mapping_iter miter; + unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG; + + sg_miter_start(&miter, sgl, nents, sg_flags); + + if (!sg_miter_skip(&miter, skip)) + return false; + + while (offset < buflen && sg_miter_next(&miter)) { + unsigned int len; + + len = min(miter.length, buflen - offset); + memset(miter.addr, 0, len); + + offset += len; + } + + sg_miter_stop(&miter); + return offset; +} +EXPORT_SYMBOL(sg_zero_buffer); diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..742034e56100 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -376,6 +376,38 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/** + * filemap_range_has_page - check if a page exists in range. 
+ * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. + */ +bool filemap_range_has_page(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; + struct pagevec pvec; + bool ret; + + if (end_byte < start_byte) + return false; + + if (mapping->nrpages == 0) + return false; + + pagevec_init(&pvec, 0); + if (!pagevec_lookup(&pvec, mapping, index, 1)) + return false; + ret = (pvec.pages[0]->index <= end); + pagevec_release(&pvec); + return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); + static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { @@ -2038,10 +2070,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + retval = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (retval < 0) + goto out; + } file_accessed(file); @@ -2642,6 +2681,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + if (limit != RLIM_INFINITY) { if (iocb->ki_pos >= limit) { send_sig(SIGXFSZ, current, 0); @@ -2710,9 +2752,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - written = 
filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(inode->i_mapping, pos, + pos + iov_iter_count(from))) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } /* * After a write we want buffered reads to be sure to go to disk to get diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..2da71e627812 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); /* * We failed to write the page out to swap-space. @@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", |