diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile | 4 | ||||
-rw-r--r-- | block/bdev.c | 36 | ||||
-rw-r--r-- | block/bfq-cgroup.c | 12 | ||||
-rw-r--r-- | block/bio.c | 44 | ||||
-rw-r--r-- | block/blk-cgroup.c | 10 | ||||
-rw-r--r-- | block/blk-core.c | 113 | ||||
-rw-r--r-- | block/blk-crypto-fallback.c | 118 | ||||
-rw-r--r-- | block/blk-crypto-profile.c | 565 | ||||
-rw-r--r-- | block/blk-crypto.c | 29 | ||||
-rw-r--r-- | block/blk-flush.c | 12 | ||||
-rw-r--r-- | block/blk-ia-ranges.c | 348 | ||||
-rw-r--r-- | block/blk-integrity.c | 4 | ||||
-rw-r--r-- | block/blk-merge.c | 23 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 4 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 19 | ||||
-rw-r--r-- | block/blk-mq-sched.h | 12 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 7 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 23 | ||||
-rw-r--r-- | block/blk-mq.c | 502 | ||||
-rw-r--r-- | block/blk-mq.h | 36 | ||||
-rw-r--r-- | block/blk-settings.c | 20 | ||||
-rw-r--r-- | block/blk-sysfs.c | 26 | ||||
-rw-r--r-- | block/blk-wbt.c | 3 | ||||
-rw-r--r-- | block/blk.h | 43 | ||||
-rw-r--r-- | block/bsg-lib.c | 32 | ||||
-rw-r--r-- | block/fops.c | 196 | ||||
-rw-r--r-- | block/genhd.c | 57 | ||||
-rw-r--r-- | block/ioctl.c | 20 | ||||
-rw-r--r-- | block/keyslot-manager.c | 579 | ||||
-rw-r--r-- | block/partitions/core.c | 2 | ||||
-rw-r--r-- | block/partitions/efi.c | 2 | ||||
-rw-r--r-- | block/partitions/ibm.c | 19 |
32 files changed, 1754 insertions, 1166 deletions
diff --git a/block/Makefile b/block/Makefile index 74df168729ec..44df57e562bf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,7 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ - disk-events.o + disk-events.o blk-ia-ranges.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o @@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bdev.c b/block/bdev.c index cff0bb3a4578..b4dab2fb6a74 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -185,14 +185,13 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -int __sync_blockdev(struct block_device *bdev, int wait) +int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); + return filemap_flush(bdev->bd_inode->i_mapping); } +EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block @@ -200,7 +199,9 @@ int __sync_blockdev(struct block_device *bdev, int wait) */ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + if (!bdev) + return 0; + return filemap_write_and_wait(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL(sync_blockdev); @@ -964,9 +965,11 @@ EXPORT_SYMBOL(blkdev_put); * @pathname: special file representing the block device * @dev: return value of the block device's dev_t * - * Get a reference to the blockdevice at @pathname in the current - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. + * Lookup the block device's dev_t at @pathname in the current + * namespace if possible and return it by @dev. + * + * RETURNS: + * 0 if succeeded, errno otherwise. */ int lookup_bdev(const char *pathname, dev_t *dev) { @@ -1018,7 +1021,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; @@ -1049,8 +1052,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); + if (!bdev->bd_openers) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. + */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 882ec4bc51ad..24a5c5329bcd 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) diff --git a/block/bio.c b/block/bio.c index 4f397ba47db5..15ab0d6d1c06 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1033,52 +1033,40 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); -void bio_release_pages(struct bio *bio, bool mark_dirty) +void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct bvec_iter_all iter_all; struct bio_vec *bvec; - if (bio_flagged(bio, BIO_NO_PAGE_REF)) - return; - bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); put_page(bvec->bv_page); } } -EXPORT_SYMBOL_GPL(bio_release_pages); +EXPORT_SYMBOL_GPL(__bio_release_pages); -static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { + size_t size = iov_iter_count(iter); + WARN_ON_ONCE(bio->bi_max_vecs); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + size_t max_sectors = queue_max_zone_append_sectors(q); + + size = min(size, max_sectors << SECTOR_SHIFT); + } + bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = iter->count; + bio->bi_iter.bi_size = size; bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) -{ - __bio_iov_bvec_set(bio, iter); - iov_iter_advance(iter, iter->count); - return 0; -} - -static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct iov_iter i = *iter; - - iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); - __bio_iov_bvec_set(bio, &i); - iov_iter_advance(iter, i.count); - return 0; -} - static void bio_put_pages(struct page **pages, size_t size, size_t off) { size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); @@ -1220,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) int ret = 0; if (iov_iter_is_bvec(iter)) { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - return bio_iov_bvec_set_append(bio, iter); - return bio_iov_bvec_set(bio, iter); + bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, bio->bi_iter.bi_size); + return 0; } do { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8908298d6ad3..88b1fce90520 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -634,6 +634,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, q = bdev_get_queue(bdev); + /* + * blkcg_deactivate_policy() requires queue to be frozen, we can grab + * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). + */ + ret = blk_queue_enter(q, 0); + if (ret) + return ret; + rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -703,6 +711,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: + blk_queue_exit(q); ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; @@ -715,6 +724,7 @@ fail_unlock: rcu_read_unlock(); fail: blkdev_put_no_open(bdev); + blk_queue_exit(q); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue diff --git a/block/blk-core.c b/block/blk-core.c index d0c2e11411d0..b043de2baaac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -386,30 +386,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static bool blk_try_enter_queue(struct request_queue *q, bool pm) -{ - rcu_read_lock(); - if (!percpu_ref_tryget_live(&q->q_usage_counter)) - goto fail; - - /* - * The code that increments the pm_only counter must ensure that the - * counter is globally visible before the queue is unfrozen. - */ - if (blk_queue_pm_only(q) && - (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) - goto fail_put; - - rcu_read_unlock(); - return true; - -fail_put: - percpu_ref_put(&q->q_usage_counter); -fail: - rcu_read_unlock(); - return false; -} - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -442,10 +418,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return 0; } -static inline int bio_queue_enter(struct bio *bio) +int __bio_queue_enter(struct request_queue *q, struct bio *bio) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - while (!blk_try_enter_queue(q, false)) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -597,34 +571,6 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -/** - * blk_get_request - allocate a request - * @q: request queue to allocate a request for - * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. - * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. - */ -struct request *blk_get_request(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags) -{ - struct request *req; - - WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); - - req = blk_mq_alloc_request(q, op, flags); - if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) - q->mq_ops->initialize_rq_fn(req); - - return req; -} -EXPORT_SYMBOL(blk_get_request); - -void blk_put_request(struct request *req) -{ - blk_mq_free_request(req); -} -EXPORT_SYMBOL(blk_put_request); - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; @@ -770,7 +716,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool submit_bio_checks(struct bio *bio) +noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); @@ -888,22 +834,23 @@ end_io: return false; } -static void __submit_bio(struct bio *bio) +static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (unlikely(bio_queue_enter(bio) != 0)) return; + if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio)) + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); +} + +static void __submit_bio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; - if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) - goto queue_exit; - if (!disk->fops->submit_bio) { + if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); - return; - } - disk->fops->submit_bio(bio); -queue_exit: - blk_queue_exit(disk->queue); + else + __submit_bio_fops(disk, bio); } /* @@ -1080,7 +1027,7 @@ EXPORT_SYMBOL(submit_bio); */ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); blk_qc_t cookie = READ_ONCE(bio->bi_cookie); int ret; @@ -1089,7 +1036,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; if (current->plug) - blk_flush_plug_list(current->plug, false); + blk_flush_plug(current->plug, false); if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) return 0; @@ -1550,11 +1497,12 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; - INIT_LIST_HEAD(&plug->mq_list); + plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; + plug->has_elevator = false; plug->nowait = false; INIT_LIST_HEAD(&plug->cb_list); @@ -1636,13 +1584,19 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) +void blk_flush_plug(struct blk_plug *plug, bool from_schedule) { - flush_plug_callbacks(plug, from_schedule); - - if (!list_empty(&plug->mq_list)) + if (!list_empty(&plug->cb_list)) + flush_plug_callbacks(plug, from_schedule); + if (!rq_list_empty(plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); - if (unlikely(!from_schedule && plug->cached_rq)) + /* + * Unconditionally flush out cached requests, even if the unplug + * event came from schedule. Since we know hold references to the + * queue for cached requests, we don't want a blocked task holding + * up a queue freeze/quiesce event. + */ + if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); } @@ -1658,11 +1612,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ void blk_finish_plug(struct blk_plug *plug) { - if (plug != current->plug) - return; - blk_flush_plug_list(plug, false); - - current->plug = NULL; + if (plug == current->plug) { + blk_flush_plug(plug, false); + current->plug = NULL; + } } EXPORT_SYMBOL(blk_finish_plug); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index ec4c7823541c..c87aba8584c6 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -12,9 +12,9 @@ #include <crypto/skcipher.h> #include <linux/blk-cgroup.h> #include <linux/blk-crypto.h> +#include <linux/blk-crypto-profile.h> #include <linux/blkdev.h> #include <linux/crypto.h> -#include <linux/keyslot-manager.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/random.h> @@ -73,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool; static DEFINE_MUTEX(tfms_init_lock); static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; -static struct blk_crypto_keyslot { +static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num crypto_mode; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; -static struct blk_keyslot_manager blk_crypto_ksm; +static struct blk_crypto_profile blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; static struct bio_set crypto_bio_split; @@ -89,9 +89,9 @@ static struct bio_set crypto_bio_split; */ static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; -static void blk_crypto_evict_keyslot(unsigned int slot) +static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; int err; @@ -104,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot) slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; } -static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int +blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; const enum blk_crypto_mode_num crypto_mode = key->crypto_cfg.crypto_mode; int err; if (crypto_mode != slotp->crypto_mode && slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, key->size); if (err) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return err; } return 0; } -static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return 0; } -/* - * The crypto API fallback KSM ops - only used for a bio when it specifies a - * blk_crypto_key that was not supported by the device's inline encryption - * hardware. - */ -static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { - .keyslot_program = blk_crypto_keyslot_program, - .keyslot_evict = blk_crypto_keyslot_evict, +static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { + .keyslot_program = blk_crypto_fallback_keyslot_program, + .keyslot_evict = blk_crypto_fallback_keyslot_evict, }; static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) @@ -160,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) bio_endio(src_bio); } -static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -187,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) return bio; } -static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static bool +blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) { struct skcipher_request *ciph_req; - const struct blk_crypto_keyslot *slotp; - int keyslot_idx = blk_ksm_get_slot_idx(slot); + const struct blk_crypto_fallback_keyslot *slotp; + int keyslot_idx = blk_crypto_keyslot_index(slot); slotp = &blk_crypto_keyslots[keyslot_idx]; ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], @@ -210,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, return true; } -static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; unsigned int i = 0; @@ -265,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) { struct bio *src_bio, *enc_bio; struct bio_crypt_ctx *bc; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; int data_unit_size; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -277,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) blk_status_t blk_st; /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_split_bio_if_needed(bio_ptr)) + if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) return false; src_bio = *bio_ptr; @@ -285,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_clone_bio(src_bio); + enc_bio = blk_crypto_fallback_clone_bio(src_bio); if (!enc_bio) { src_bio->bi_status = BLK_STS_RESOURCE; return false; } /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { src_bio->bi_status = blk_st; goto out_put_enc_bio; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { src_bio->bi_status = BLK_STS_RESOURCE; goto out_release_keyslot; } @@ -363,7 +361,7 @@ out_free_bounce_pages: out_free_ciph_req: skcipher_request_free(ciph_req); out_release_keyslot: - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) bio_put(enc_bio); @@ -381,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) container_of(work, struct bio_fallback_crypt_ctx, work); struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -394,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) blk_status_t blk_st; /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { bio->bi_status = blk_st; goto out_no_keyslot; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { bio->bi_status = BLK_STS_RESOURCE; goto out; } @@ -435,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) out: skcipher_request_free(ciph_req); - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); bio_endio(bio); @@ -474,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) * @bio_ptr: pointer to the bio to prepare * * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio - * for the first part, encrypts it, and update bio_ptr to point to the bounce - * bio. + * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a + * bounce bio for the first part, encrypts it, and updates bio_ptr to point to + * the bounce bio. * * For a READ operation, we mark the bio for decryption by using bi_private and * bi_end_io. @@ -500,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) return false; } - if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, - &bc->bc_key->crypto_cfg)) { + if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile, + &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; return false; } @@ -527,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { - return blk_ksm_evict_key(&blk_crypto_ksm, key); + return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key); } static bool blk_crypto_fallback_inited; @@ -535,6 +534,7 @@ static int blk_crypto_fallback_init(void) { int i; int err; + struct blk_crypto_profile *profile = &blk_crypto_fallback_profile; if (blk_crypto_fallback_inited) return 0; @@ -545,24 +545,24 @@ static int blk_crypto_fallback_init(void) if (err) goto out; - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots); if (err) goto fail_free_bioset; err = -ENOMEM; - blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; - blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + profile->ll_ops = blk_crypto_fallback_ll_ops; + profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; /* All blk-crypto modes have a crypto API fallback. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) - blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; - blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + profile->modes_supported[i] = 0xFFFFFFFF; + profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; blk_crypto_wq = alloc_workqueue("blk_crypto_wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, num_online_cpus()); if (!blk_crypto_wq) - goto fail_free_ksm; + goto fail_destroy_profile; blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, sizeof(blk_crypto_keyslots[0]), @@ -596,8 +596,8 @@ fail_free_keyslots: kfree(blk_crypto_keyslots); fail_free_wq: destroy_workqueue(blk_crypto_wq); -fail_free_ksm: - blk_ksm_destroy(&blk_crypto_ksm); +fail_destroy_profile: + blk_crypto_profile_destroy(profile); fail_free_bioset: bioset_exit(&crypto_bio_split); out: @@ -611,7 +611,7 @@ out: int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; - struct blk_crypto_keyslot *slotp; + struct blk_crypto_fallback_keyslot *slotp; unsigned int i; int err = 0; diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c new file mode 100644 index 000000000000..605ba0626a5c --- /dev/null +++ b/block/blk-crypto-profile.c @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/** + * DOC: blk-crypto profiles + * + * 'struct blk_crypto_profile' contains all generic inline encryption-related + * state for a particular inline encryption device. blk_crypto_profile serves + * as the way that drivers for inline encryption hardware expose their crypto + * capabilities and certain functions (e.g., functions to program and evict + * keys) to upper layers. Device drivers that want to support inline encryption + * construct a crypto profile, then associate it with the disk's request_queue. + * + * If the device has keyslots, then its blk_crypto_profile also handles managing + * these keyslots in a device-independent way, using the driver-provided + * functions to program and evict keys as needed. This includes keeping track + * of which key and how many I/O requests are using each keyslot, getting + * keyslots for I/O requests, and handling key eviction requests. + * + * For more information, see Documentation/block/inline-encryption.rst. + */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include <linux/blk-crypto-profile.h> +#include <linux/device.h> +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/pm_runtime.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/blk-integrity.h> + +struct blk_crypto_keyslot { + atomic_t slot_refs; + struct list_head idle_slot_node; + struct hlist_node hash_node; + const struct blk_crypto_key *key; + struct blk_crypto_profile *profile; +}; + +static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile) +{ + /* + * Calling into the driver requires profile->lock held and the device + * resumed. But we must resume the device first, since that can acquire + * and release profile->lock via blk_crypto_reprogram_all_keys(). + */ + if (profile->dev) + pm_runtime_get_sync(profile->dev); + down_write(&profile->lock); +} + +static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile) +{ + up_write(&profile->lock); + if (profile->dev) + pm_runtime_put_sync(profile->dev); +} + +/** + * blk_crypto_profile_init() - Initialize a blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Storage drivers must call this when starting to set up a blk_crypto_profile, + * before filling in additional fields. + * + * Return: 0 on success, or else a negative error code. + */ +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(profile, 0, sizeof(*profile)); + init_rwsem(&profile->lock); + + if (num_slots == 0) + return 0; + + /* Initialize keyslot management data. */ + + profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), + GFP_KERNEL); + if (!profile->slots) + return -ENOMEM; + + profile->num_slots = num_slots; + + init_waitqueue_head(&profile->idle_slots_wait_queue); + INIT_LIST_HEAD(&profile->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + profile->slots[slot].profile = profile; + list_add_tail(&profile->slots[slot].idle_slot_node, + &profile->idle_slots); + } + + spin_lock_init(&profile->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + /* + * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 + * buckets. This only makes a difference when there is only 1 keyslot. + */ + if (slot_hashtable_size < 2) + slot_hashtable_size = 2; + + profile->log_slot_ht_size = ilog2(slot_hashtable_size); + profile->slot_hashtable = + kvmalloc_array(slot_hashtable_size, + sizeof(profile->slot_hashtable[0]), GFP_KERNEL); + if (!profile->slot_hashtable) + goto err_destroy; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&profile->slot_hashtable[i]); + + return 0; + +err_destroy: + blk_crypto_profile_destroy(profile); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_init); + +static void blk_crypto_profile_destroy_callback(void *profile) +{ + blk_crypto_profile_destroy(profile); +} + +/** + * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init() + * @dev: the device which owns the blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be + * called automatically on driver detach. + * + * Return: 0 on success, or else a negative error code. + */ +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + int err = blk_crypto_profile_init(profile, num_slots); + + if (err) + return err; + + return devm_add_action_or_reset(dev, + blk_crypto_profile_destroy_callback, + profile); +} +EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init); + +static inline struct hlist_head * +blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + return &profile->slot_hashtable[ + hash_ptr(key, profile->log_slot_ht_size)]; +} + +static void +blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile = slot->profile; + unsigned long flags; + + spin_lock_irqsave(&profile->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); +} + +static struct blk_crypto_keyslot * +blk_crypto_find_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = + blk_crypto_hash_bucket_for_key(profile, key); + struct blk_crypto_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_crypto_keyslot * +blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_crypto_remove_slot_from_lru_list(slot); + } + return slot; +} + +/** + * blk_crypto_keyslot_index() - Get the index of a keyslot + * @slot: a keyslot that blk_crypto_get_keyslot() returned + * + * Return: the 0-based index of the keyslot within the device's keyslots. + */ +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot) +{ + return slot - slot->profile->slots; +} +EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index); + +/** + * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed. + * @profile: the crypto profile of the device the key will be used on + * @key: the key that will be used + * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct + * will be stored here; otherwise NULL will be stored here. + * + * If the device has keyslots, this gets a keyslot that's been programmed with + * the specified key. If the key is already in a slot, this reuses it; + * otherwise this waits for a slot to become idle and programs the key into it. + * + * This must be paired with a call to blk_crypto_put_keyslot(). + * + * Context: Process context. Takes and releases profile->lock. + * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or + * one wasn't needed; or a blk_status_t error on failure. + */ +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr) +{ + struct blk_crypto_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + + /* + * If the device has no concept of "keyslots", then there is no need to + * get one. + */ + if (profile->num_slots == 0) + return BLK_STS_OK; + + down_read(&profile->lock); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + up_read(&profile->lock); + if (slot) + goto success; + + for (;;) { + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + if (slot) { + blk_crypto_hw_exit(profile); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. + */ + if (!list_empty(&profile->idle_slots)) + break; + + blk_crypto_hw_exit(profile); + wait_event(profile->idle_slots_wait_queue, + !list_empty(&profile->idle_slots)); + } + + slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot, + idle_slot_node); + slot_idx = blk_crypto_keyslot_index(slot); + + err = profile->ll_ops.keyslot_program(profile, key, slot_idx); + if (err) { + wake_up(&profile->idle_slots_wait_queue); + blk_crypto_hw_exit(profile); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, + blk_crypto_hash_bucket_for_key(profile, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_crypto_remove_slot_from_lru_list(slot); + + blk_crypto_hw_exit(profile); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_crypto_put_keyslot() - Release a reference to a keyslot + * @slot: The keyslot to release the reference of (may be NULL). + * + * Context: Any context. + */ +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile; + unsigned long flags; + + if (!slot) + return; + + profile = slot->profile; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &profile->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &profile->idle_slots); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); + wake_up(&profile->idle_slots_wait_queue); + } +} + +/** + * __blk_crypto_cfg_supported() - Check whether the given crypto profile + * supports the given crypto configuration. + * @profile: the crypto profile to check + * @cfg: the crypto configuration to check for + * + * Return: %true if @profile supports the given @cfg. + */ +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg) +{ + if (!profile) + return false; + if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size)) + return false; + if (profile->max_dun_bytes_supported < cfg->dun_bytes) + return false; + return true; +} + +/** + * __blk_crypto_evict_key() - Evict a key from a device. + * @profile: the crypto profile of the device + * @key: the key to evict. It must not still be used in any I/O. + * + * If the device has keyslots, this finds the keyslot (if any) that contains the + * specified key and calls the driver's keyslot_evict function to evict it. + * + * Otherwise, this just calls the driver's keyslot_evict function if it is + * implemented, passing just the key (without any particular keyslot). This + * allows layered devices to evict the key from their underlying devices. + * + * Context: Process context. Takes and releases profile->lock. + * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY + * if the keyslot is still in use, or another -errno value on other + * error. + */ +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + int err = 0; + + if (profile->num_slots == 0) { + if (profile->ll_ops.keyslot_evict) { + blk_crypto_hw_enter(profile); + err = profile->ll_ops.keyslot_evict(profile, key, -1); + blk_crypto_hw_exit(profile); + return err; + } + return 0; + } + + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + goto out_unlock; + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + err = -EBUSY; + goto out_unlock; + } + err = profile->ll_ops.keyslot_evict(profile, key, + blk_crypto_keyslot_index(slot)); + if (err) + goto out_unlock; + + hlist_del(&slot->hash_node); + slot->key = NULL; + err = 0; +out_unlock: + blk_crypto_hw_exit(profile); + return err; +} + +/** + * blk_crypto_reprogram_all_keys() - Re-program all keyslots. + * @profile: The crypto profile + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. Takes and releases profile->lock. + */ +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile) +{ + unsigned int slot; + + if (profile->num_slots == 0) + return; + + /* This is for device initialization, so don't resume the device */ + down_write(&profile->lock); + for (slot = 0; slot < profile->num_slots; slot++) { + const struct blk_crypto_key *key = profile->slots[slot].key; + int err; + + if (!key) + continue; + + err = profile->ll_ops.keyslot_program(profile, key, slot); + WARN_ON(err); + } + up_write(&profile->lock); +} +EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys); + +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile) +{ + if (!profile) + return; + kvfree(profile->slot_hashtable); + kvfree_sensitive(profile->slots, + sizeof(profile->slots[0]) * profile->num_slots); + memzero_explicit(profile, sizeof(*profile)); +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy); + +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + return false; + } + q->crypto_profile = profile; + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_register); + +void blk_crypto_unregister(struct request_queue *q) +{ + q->crypto_profile = NULL; +} + +/** + * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities + * by child device + * @parent: the crypto profile for the parent device + * @child: the crypto profile for the child device, or NULL + * + * This clears all crypto capabilities in @parent that aren't set in @child. If + * @child is NULL, then this clears all parent capabilities. + * + * Only use this when setting up the crypto profile for a layered device, before + * it's been exposed yet. + */ +void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, + const struct blk_crypto_profile *child) +{ + if (child) { + unsigned int i; + + parent->max_dun_bytes_supported = + min(parent->max_dun_bytes_supported, + child->max_dun_bytes_supported); + for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) + parent->modes_supported[i] &= child->modes_supported[i]; + } else { + parent->max_dun_bytes_supported = 0; + memset(parent->modes_supported, 0, + sizeof(parent->modes_supported)); + } +} +EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); + +/** + * blk_crypto_has_capabilities() - Check whether @target supports at least all + * the crypto capabilities that @reference does. + * @target: the target profile + * @reference: the reference profile + * + * Return: %true if @target supports all the crypto capabilities of @reference. + */ +bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, + const struct blk_crypto_profile *reference) +{ + int i; + + if (!reference) + return true; + + if (!target) + return false; + + for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) { + if (reference->modes_supported[i] & ~target->modes_supported[i]) + return false; + } + + if (reference->max_dun_bytes_supported > + target->max_dun_bytes_supported) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); + +/** + * blk_crypto_update_capabilities() - Update the capabilities of a crypto + * profile to match those of another crypto + * profile. + * @dst: The crypto profile whose capabilities to update. + * @src: The crypto profile whose capabilities this function will update @dst's + * capabilities to. + * + * Blk-crypto requires that crypto capabilities that were + * advertised when a bio was created continue to be supported by the + * device until that bio is ended. This is turn means that a device cannot + * shrink its advertised crypto capabilities without any explicit + * synchronization with upper layers. So if there's no such explicit + * synchronization, @src must support all the crypto capabilities that + * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)). + * + * Note also that as long as the crypto capabilities are being expanded, the + * order of updates becoming visible is not important because it's alright + * for blk-crypto to see stale values - they only cause blk-crypto to + * believe that a crypto capability isn't supported when it actually is (which + * might result in blk-crypto-fallback being used if available, or the bio being + * failed). + */ +void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, + const struct blk_crypto_profile *src) +{ + memcpy(dst->modes_supported, src->modes_supported, + sizeof(dst->modes_supported)); + + dst->max_dun_bytes_supported = src->max_dun_bytes_supported; +} +EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 8f53f4a1f9e2..ec9efeeeca91 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -11,7 +11,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #include <linux/module.h> #include <linux/slab.h> @@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio) blk_status_t __blk_crypto_init_request(struct request *rq) { - return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, - &rq->crypt_keyslot); + return blk_crypto_get_keyslot(rq->q->crypto_profile, + rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); } /** @@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq) */ void __blk_crypto_free_request(struct request *rq) { - blk_ksm_put_slot(rq->crypt_keyslot); + blk_crypto_put_keyslot(rq->crypt_keyslot); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); blk_crypto_rq_set_defaults(rq); } @@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + struct blk_crypto_profile *profile; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bdev_get_queue(bio->bi_bdev)->ksm, - &bc_key->crypto_cfg)) + profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; + if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) return true; if (blk_crypto_fallback_bio_prep(bio_ptr)) @@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_ksm_crypto_cfg_supported(q->ksm, cfg); + __blk_crypto_cfg_supported(q->crypto_profile, cfg); } /** @@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q, int blk_crypto_start_using_key(const struct blk_crypto_key *key, struct request_queue *q) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, * evicted from any hardware that it might have been programmed into. The key * must not be in use by any in-flight IO when this function is called. * - * Return: 0 on success or if key is not present in the q's ksm, -err on error. + * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. */ int blk_crypto_evict_key(struct request_queue *q, const struct blk_crypto_key *key) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) - return blk_ksm_evict_key(q->ksm, key); + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request queue's associated inline encryption hardware didn't - * have support for the key, then the key might have been programmed - * into the fallback keyslot manager, so try to evict from there. + * If the request_queue didn't support the key, then blk-crypto-fallback + * may have been used, so try to evict the key from blk-crypto-fallback. */ return blk_crypto_fallback_evict_key(key); } diff --git a/block/blk-flush.c b/block/blk-flush.c index 4201728bf3a5..8e364bda5166 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. */ -void blk_insert_flush(struct request *rq) +bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ @@ -409,7 +409,7 @@ void blk_insert_flush(struct request *rq) */ if (!policy) { blk_mq_end_request(rq, 0); - return; + return true; } BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ @@ -420,10 +420,8 @@ void blk_insert_flush(struct request *rq) * for normal execution. */ if ((policy & REQ_FSEQ_DATA) && - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, false, false); - return; - } + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) + return false; /* * @rq should go through flush machinery. Mark it part of flush @@ -439,6 +437,8 @@ void blk_insert_flush(struct request *rq) spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); + + return true; } /** diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c new file mode 100644 index 000000000000..c246c425d0d7 --- /dev/null +++ b/block/blk-ia-ranges.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block device concurrent positioning ranges. + * + * Copyright (C) 2021 Western Digital Corporation or its Affiliates. + */ +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/init.h> + +#include "blk.h" + +static ssize_t +blk_ia_range_sector_show(struct blk_independent_access_range *iar, + char *buf) +{ + return sprintf(buf, "%llu\n", iar->sector); +} + +static ssize_t +blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar, + char *buf) +{ + return sprintf(buf, "%llu\n", iar->nr_sectors); +} + +struct blk_ia_range_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_independent_access_range *iar, char *buf); +}; + +static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { + .attr = { .name = "sector", .mode = 0444 }, + .show = blk_ia_range_sector_show, +}; + +static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { + .attr = { .name = "nr_sectors", .mode = 0444 }, + .show = blk_ia_range_nr_sectors_show, +}; + +static struct attribute *blk_ia_range_attrs[] = { + &blk_ia_range_sector_entry.attr, + &blk_ia_range_nr_sectors_entry.attr, + NULL, +}; +ATTRIBUTE_GROUPS(blk_ia_range); + +static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct blk_ia_range_sysfs_entry *entry = + container_of(attr, struct blk_ia_range_sysfs_entry, attr); + struct blk_independent_access_range *iar = + container_of(kobj, struct blk_independent_access_range, kobj); + ssize_t ret; + + mutex_lock(&iar->queue->sysfs_lock); + ret = entry->show(iar, buf); + mutex_unlock(&iar->queue->sysfs_lock); + + return ret; +} + +static const struct sysfs_ops blk_ia_range_sysfs_ops = { + .show = blk_ia_range_sysfs_show, +}; + +/* + * Independent access range entries are not freed individually, but alltogether + * with struct blk_independent_access_ranges and its array of ranges. Since + * kobject_add() takes a reference on the parent kobject contained in + * struct blk_independent_access_ranges, the array of independent access range + * entries cannot be freed until kobject_del() is called for all entries. + * So we do not need to do anything here, but still need this no-op release + * operation to avoid complaints from the kobject code. + */ +static void blk_ia_range_sysfs_nop_release(struct kobject *kobj) +{ +} + +static struct kobj_type blk_ia_range_ktype = { + .sysfs_ops = &blk_ia_range_sysfs_ops, + .default_groups = blk_ia_range_groups, + .release = blk_ia_range_sysfs_nop_release, +}; + +/* + * This will be executed only after all independent access range entries are + * removed with kobject_del(), at which point, it is safe to free everything, + * including the array of ranges. + */ +static void blk_ia_ranges_sysfs_release(struct kobject *kobj) +{ + struct blk_independent_access_ranges *iars = + container_of(kobj, struct blk_independent_access_ranges, kobj); + + kfree(iars); +} + +static struct kobj_type blk_ia_ranges_ktype = { + .release = blk_ia_ranges_sysfs_release, +}; + +/** + * disk_register_ia_ranges - register with sysfs a set of independent + * access ranges + * @disk: Target disk + * @new_iars: New set of independent access ranges + * + * Register with sysfs a set of independent access ranges for @disk. + * If @new_iars is not NULL, this set of ranges is registered and the old set + * specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is + * registered if it is not already. + */ +int disk_register_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *new_iars) +{ + struct request_queue *q = disk->queue; + struct blk_independent_access_ranges *iars; + int i, ret; + + lockdep_assert_held(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_lock); + + /* If a new range set is specified, unregister the old one */ + if (new_iars) { + if (q->ia_ranges) + disk_unregister_independent_access_ranges(disk); + q->ia_ranges = new_iars; + } + + iars = q->ia_ranges; + if (!iars) + return 0; + + /* + * At this point, iars is the new set of sector access ranges that needs + * to be registered with sysfs. + */ + WARN_ON(iars->sysfs_registered); + ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, + &q->kobj, "%s", "independent_access_ranges"); + if (ret) { + q->ia_ranges = NULL; + kfree(iars); + return ret; + } + + for (i = 0; i < iars->nr_ia_ranges; i++) { + iars->ia_range[i].queue = q; + ret = kobject_init_and_add(&iars->ia_range[i].kobj, + &blk_ia_range_ktype, &iars->kobj, + "%d", i); + if (ret) { + while (--i >= 0) + kobject_del(&iars->ia_range[i].kobj); + kobject_del(&iars->kobj); + kobject_put(&iars->kobj); + return ret; + } + } + + iars->sysfs_registered = true; + + return 0; +} + +void disk_unregister_independent_access_ranges(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct blk_independent_access_ranges *iars = q->ia_ranges; + int i; + + lockdep_assert_held(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_lock); + + if (!iars) + return; + + if (iars->sysfs_registered) { + for (i = 0; i < iars->nr_ia_ranges; i++) + kobject_del(&iars->ia_range[i].kobj); + kobject_del(&iars->kobj); + kobject_put(&iars->kobj); + } else { + kfree(iars); + } + + q->ia_ranges = NULL; +} + +static struct blk_independent_access_range * +disk_find_ia_range(struct blk_independent_access_ranges *iars, + sector_t sector) +{ + struct blk_independent_access_range *iar; + int i; + + for (i = 0; i < iars->nr_ia_ranges; i++) { + iar = &iars->ia_range[i]; + if (sector >= iar->sector && + sector < iar->sector + iar->nr_sectors) + return iar; + } + + return NULL; +} + +static bool disk_check_ia_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars) +{ + struct blk_independent_access_range *iar, *tmp; + sector_t capacity = get_capacity(disk); + sector_t sector = 0; + int i; + + /* + * While sorting the ranges in increasing LBA order, check that the + * ranges do not overlap, that there are no sector holes and that all + * sectors belong to one range. + */ + for (i = 0; i < iars->nr_ia_ranges; i++) { + tmp = disk_find_ia_range(iars, sector); + if (!tmp || tmp->sector != sector) { + pr_warn("Invalid non-contiguous independent access ranges\n"); + return false; + } + + iar = &iars->ia_range[i]; + if (tmp != iar) { + swap(iar->sector, tmp->sector); + swap(iar->nr_sectors, tmp->nr_sectors); + } + + sector += iar->nr_sectors; + } + + if (sector != capacity) { + pr_warn("Independent access ranges do not match disk capacity\n"); + return false; + } + + return true; +} + +static bool disk_ia_ranges_changed(struct gendisk *disk, + struct blk_independent_access_ranges *new) +{ + struct blk_independent_access_ranges *old = disk->queue->ia_ranges; + int i; + + if (!old) + return true; + + if (old->nr_ia_ranges != new->nr_ia_ranges) + return true; + + for (i = 0; i < old->nr_ia_ranges; i++) { + if (new->ia_range[i].sector != old->ia_range[i].sector || + new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors) + return true; + } + + return false; +} + +/** + * disk_alloc_independent_access_ranges - Allocate an independent access ranges + * data structure + * @disk: target disk + * @nr_ia_ranges: Number of independent access ranges + * + * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges + * access range descriptors. + */ +struct blk_independent_access_ranges * +disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges) +{ + struct blk_independent_access_ranges *iars; + + iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges), + GFP_KERNEL, disk->queue->node); + if (iars) + iars->nr_ia_ranges = nr_ia_ranges; + return iars; +} +EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges); + +/** + * disk_set_independent_access_ranges - Set a disk independent access ranges + * @disk: target disk + * @iars: independent access ranges structure + * + * Set the independent access ranges information of the request queue + * of @disk to @iars. If @iars is NULL and the independent access ranges + * structure already set is cleared. If there are no differences between + * @iars and the independent access ranges structure already set, @iars + * is freed. + */ +void disk_set_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars) +{ + struct request_queue *q = disk->queue; + + if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) { + kfree(iars); + iars = NULL; + } + + mutex_lock(&q->sysfs_dir_lock); + mutex_lock(&q->sysfs_lock); + + if (iars) { + if (!disk_check_ia_ranges(disk, iars)) { + kfree(iars); + iars = NULL; + goto reg; + } + + if (!disk_ia_ranges_changed(disk, iars)) { + kfree(iars); + goto unlock; + } + } + + /* + * This may be called for a registered queue. E.g. during a device + * revalidation. If that is the case, we need to unregister the old + * set of independent access ranges and register the new set. If the + * queue is not registered, registration of the device request queue + * will register the independent access ranges, so only swap in the + * new set and free the old one. + */ +reg: + if (blk_queue_registered(q)) { + disk_register_independent_access_ranges(disk, iars); + } else { + swap(q->ia_ranges, iars); + kfree(iars); + } + +unlock: + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); +} +EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index cef534a7cbc9..d670d54e5f7a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->ksm) { + if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - blk_ksm_unregister(disk->queue); + blk_crypto_unregister(disk->queue); } #endif } diff --git a/block/blk-merge.c b/block/blk-merge.c index ec727234ac48..893c1a60b701 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -383,7 +383,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, */ void blk_queue_split(struct bio **bio) { - struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue((*bio)->bi_bdev); unsigned int nr_segs; if (blk_may_split(q, *bio)) @@ -1067,9 +1067,8 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) + * @same_queue_rq: output value, will be true if there's an existing request + * from the passed in @q already in the plug list * * Determine whether @bio being queued on @q can be merged with the previous * request on %current's plugged list. Returns %true if merge was successful, @@ -1085,26 +1084,28 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) + unsigned int nr_segs, bool *same_queue_rq) { struct blk_plug *plug; struct request *rq; plug = blk_mq_plug(q, bio); - if (!plug || list_empty(&plug->mq_list)) + if (!plug || rq_list_empty(plug->mq_list)) return false; /* check the previously added entry for a quick merge attempt */ - rq = list_last_entry(&plug->mq_list, struct request, queuelist); - if (rq->q == q && same_queue_rq) { + rq = rq_list_peek(&plug->mq_list); + if (rq->q == q) { /* * Only blk-mq multiple hardware queues case checks the rq in * the same queue, there should be only one such rq in a queue */ - *same_queue_rq = rq; + *same_queue_rq = true; + + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; } - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) - return true; return false; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 68ca5d21cda7..4f2cf8399f3d 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -124,7 +124,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), - QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), @@ -309,6 +308,7 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(MQ_POLL_SLEPT), + RQF_NAME(ELV), }; #undef RQF_NAME @@ -550,7 +550,7 @@ static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e85b7556b096..4be652fa38e7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -361,7 +361,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; @@ -370,15 +370,20 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool ret = false; enum hctx_type type; - if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(q, bio, nr_segs); + if (bio_queue_enter(bio)) + return false; + + if (e && e->type->ops.bio_merge) { + ret = e->type->ops.bio_merge(q, bio, nr_segs); + goto out_put; + } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) - return false; + goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); @@ -391,6 +396,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, ret = true; spin_unlock(&ctx->lock); +out_put: + blk_queue_exit(q); return ret; } @@ -497,7 +504,7 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) { + if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list); if (list_empty(list)) goto out; @@ -541,7 +548,7 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int fla queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { - if (!blk_mq_is_shared_tags(q->tag_set->flags)) + if (!blk_mq_is_shared_tags(flags)) blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 98836106b25f..25d1034952b6 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -12,7 +12,7 @@ void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); @@ -43,16 +43,6 @@ static inline bool bio_mergeable(struct bio *bio) } static inline bool -blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs) -{ - if (blk_queue_nomerges(q) || !bio_mergeable(bio)) - return false; - - return __blk_mq_sched_bio_merge(q, bio, nr_segs); -} - -static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index b94c3e8ef392..995336abee33 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -399,9 +399,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { - int i; + unsigned int flags = tagset->flags; + int i, nr_tags; + + nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; - for (i = 0; i < tagset->nr_hw_queues; i++) { + for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 78ae2fb8e2a4..df787b5a23bd 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -4,29 +4,6 @@ struct blk_mq_alloc_data; -/* - * Tag address space map. - */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - - atomic_t active_queues; - - struct sbitmap_queue bitmap_tags; - struct sbitmap_queue breserved_tags; - - struct request **rqs; - struct request **static_rqs; - struct list_head page_list; - - /* - * used to clear request reference in rqs[] before freeing one - * request pool - */ - spinlock_t lock; -}; - extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9248edd8a7d3..629cf421417f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -19,7 +19,6 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> -#include <linux/list_sort.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/sysctl.h> @@ -242,27 +241,28 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + if (!q->quiesce_depth++) + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** - * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished + * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @q: request queue. * - * Note: this function does not prevent that the struct request end_io() - * callback function is invoked. Once this function is returned, we make - * sure no dispatch can happen until the queue is unquiesced via - * blk_mq_unquiesce_queue(). + * Note: it is driver's responsibility for making sure that quiesce has + * been started. */ -void blk_mq_quiesce_queue(struct request_queue *q) +void blk_mq_wait_quiesce_done(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned int i; bool rcu = false; - blk_mq_quiesce_queue_nowait(q); - queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(hctx->srcu); @@ -272,6 +272,22 @@ void blk_mq_quiesce_queue(struct request_queue *q) if (rcu) synchronize_rcu(); } +EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); + +/** + * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. Once this function is returned, we make + * sure no dispatch can happen until the queue is unquiesced via + * blk_mq_unquiesce_queue(). + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + blk_mq_quiesce_queue_nowait(q); + blk_mq_wait_quiesce_done(q); +} EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* @@ -283,10 +299,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + bool run_queue = false; + + spin_lock_irqsave(&q->queue_lock, flags); + if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { + ; + } else if (!--q->quiesce_depth) { + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + run_queue = true; + } + spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ - blk_mq_run_hw_queues(q, true); + if (run_queue) + blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); @@ -301,40 +328,37 @@ void blk_mq_wake_waiters(struct request_queue *q) } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, u64 alloc_time_ns) + struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - unsigned int rq_flags = 0; - if (e) { - rq_flags = RQF_ELV; - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = tag; - } else { - rq->tag = tag; - rq->internal_tag = BLK_MQ_NO_TAG; - } + rq->q = q; + rq->mq_ctx = ctx; + rq->mq_hctx = hctx; + rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) - rq_flags |= RQF_PM; + data->rq_flags |= RQF_PM; if (blk_queue_io_stat(q)) - rq_flags |= RQF_IO_STAT; - rq->rq_flags = rq_flags; + data->rq_flags |= RQF_IO_STAT; + rq->rq_flags = data->rq_flags; + + if (!(data->rq_flags & RQF_ELV)) { + rq->tag = tag; + rq->internal_tag = BLK_MQ_NO_TAG; + } else { + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = tag; + } + rq->timeout = 0; if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = q; - rq->mq_ctx = ctx; - rq->mq_hctx = hctx; - rq->cmd_flags = data->cmd_flags; rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME @@ -346,7 +370,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif - rq->timeout = 0; rq->end_io = NULL; rq->end_io_data = NULL; @@ -381,22 +404,28 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, u64 alloc_time_ns) { unsigned int tag, tag_offset; + struct blk_mq_tags *tags; struct request *rq; - unsigned long tags; + unsigned long tag_mask; int i, nr = 0; - tags = blk_mq_get_tags(data, data->nr_tags, &tag_offset); - if (unlikely(!tags)) + tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); + if (unlikely(!tag_mask)) return NULL; - for (i = 0; tags; i++) { - if (!(tags & (1UL << i))) + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; - tags &= ~(1UL << i); - rq = blk_mq_rq_ctx_init(data, tag, alloc_time_ns); + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); rq_list_add(data->cached_rq, rq); + nr++; } + /* caller already holds a reference, add for remainder */ + percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rq); @@ -405,7 +434,6 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; - struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; @@ -417,7 +445,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; - if (e) { + if (q->elevator) { + struct elevator_queue *e = q->elevator; + + data->rq_flags |= RQF_ELV; + /* * Flush/passthrough requests are special and go directly to the * dispatch list. Don't include reserved tags in the @@ -433,7 +465,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!e) + if (!(data->rq_flags & RQF_ELV)) blk_mq_tag_busy(data->hctx); /* @@ -465,7 +497,8 @@ retry: goto retry; } - return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); + return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, + alloc_time_ns); } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, @@ -544,12 +577,15 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (!q->elevator) blk_mq_tag_busy(data.hctx); + else + data.rq_flags |= RQF_ELV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, + alloc_time_ns); out_queue_exit: blk_queue_exit(q); @@ -580,7 +616,7 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if (rq->rq_flags & (RQF_ELVPRIV | RQF_ELV)) { + if (rq->rq_flags & RQF_ELVPRIV) { struct elevator_queue *e = q->elevator; if (e->type->ops.finish_request) @@ -609,34 +645,30 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) { - percpu_ref_get(&rq->q->q_usage_counter); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) blk_mq_free_request(rq); - } } static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { - if (error) + if (unlikely(error)) { bio->bi_status = error; - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - - bio_advance(bio, nbytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { + } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. */ - if (bio->bi_iter.bi_size) + if (bio->bi_iter.bi_size != nbytes) bio->bi_status = BLK_STS_IOERR; else bio->bi_iter.bi_sector = rq->__sector; } + bio_advance(bio, nbytes); + + if (unlikely(rq->rq_flags & RQF_QUIET)) + bio_set_flag(bio, BIO_QUIET); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); @@ -680,7 +712,7 @@ bool blk_update_request(struct request *req, blk_status_t error, { int total_bytes; - trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); + trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; @@ -799,6 +831,13 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; + /* + * All requests should have been marked as RQF_MQ_INFLIGHT, so + * update hctx->nr_active in batch + */ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, nr_tags); + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } @@ -806,7 +845,7 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; - struct blk_mq_hw_ctx *last_hctx = NULL; + struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; @@ -829,17 +868,17 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) blk_pm_mark_last_busy(rq); rq_qos_done(rq->q, rq); - if (nr_tags == TAG_COMP_BATCH || - (last_hctx && last_hctx != rq->mq_hctx)) { - blk_mq_flush_tag_batch(last_hctx, tags, nr_tags); + if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { + if (cur_hctx) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; + cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; - last_hctx = rq->mq_hctx; } if (nr_tags) - blk_mq_flush_tag_batch(last_hctx, tags, nr_tags); + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); @@ -1040,7 +1079,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -1121,17 +1159,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) -{ - if (tag < tags->nr_tags) { - prefetch(tags->rqs[tag]); - return tags->rqs[tag]; - } - - return NULL; -} -EXPORT_SYMBOL(blk_mq_tag_to_rq); - static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -1336,7 +1363,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static bool __blk_mq_get_driver_tag(struct request *rq) +static bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; @@ -1360,11 +1387,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -bool blk_mq_get_driver_tag(struct request *rq) +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && @@ -1594,6 +1619,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); + bool needs_resource = false; if (list_empty(list)) return false; @@ -1639,6 +1665,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, queued++; break; case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; @@ -1649,6 +1677,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); + needs_resource = true; break; default: errors++; @@ -1673,7 +1702,6 @@ out: /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); - bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; if (nr_budgets) blk_mq_release_budgets(q, list); @@ -1714,14 +1742,16 @@ out: * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do - * similar if we couldn't get budget and SCHED_RESTART is set. + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE || - no_budget_avail)) + else if (needs_restart && needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -2161,54 +2191,106 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static int plug_rq_cmp(void *priv, const struct list_head *a, - const struct list_head *b) +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, + bool from_schedule) +{ + if (hctx->queue->mq_ops->commit_rqs) { + trace_block_unplug(hctx->queue, *queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } + *queued = 0; +} + +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) { - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; - if (rqa->mq_ctx != rqb->mq_ctx) - return rqa->mq_ctx > rqb->mq_ctx; - if (rqa->mq_hctx != rqb->mq_hctx) - return rqa->mq_hctx > rqb->mq_hctx; + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; + + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, last); + blk_mq_commit_rqs(hctx, &queued, from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } - return blk_rq_pos(rqa) > blk_rq_pos(rqb); + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + unsigned int depth; LIST_HEAD(list); - if (list_empty(&plug->mq_list)) + if (rq_list_empty(plug->mq_list)) return; - list_splice_init(&plug->mq_list, &list); - - if (plug->rq_count > 2 && plug->multiple_queues) - list_sort(NULL, &list, plug_rq_cmp); - plug->rq_count = 0; + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + blk_mq_plug_issue_direct(plug, false); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; do { - struct list_head rq_list; - struct request *rq, *head_rq = list_entry_rq(list.next); - struct list_head *pos = &head_rq->queuelist; /* skip first */ - struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; - struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; - unsigned int depth = 1; - - list_for_each_continue(pos, &list) { - rq = list_entry_rq(pos); - BUG_ON(!rq->q); - if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) - break; - depth++; + struct request *rq; + + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } - list_cut_before(&rq_list, &list, pos); - trace_block_unplug(head_rq->q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_schedule); - } while(!list_empty(&list)); + } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -2388,16 +2470,17 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { - list_add_tail(&rq->queuelist, &plug->mq_list); - plug->rq_count++; - if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { - struct request *tmp; + if (!plug->multiple_queues) { + struct request *nxt = rq_list_peek(&plug->mq_list); - tmp = list_first_entry(&plug->mq_list, struct request, - queuelist); - if (tmp->q != rq->q) + if (nxt && nxt->q != rq->q) plug->multiple_queues = true; } + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; } /* @@ -2412,6 +2495,83 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static bool blk_attempt_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, bool *same_queue_rq) +{ + if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { + if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq)) + return true; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + return true; + } + return false; +} + +static struct request *blk_mq_get_new_requests(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio, + unsigned int nsegs, + bool *same_queue_rq) +{ + struct blk_mq_alloc_data data = { + .q = q, + .nr_tags = 1, + .cmd_flags = bio->bi_opf, + }; + struct request *rq; + + if (unlikely(bio_queue_enter(bio))) + return NULL; + if (unlikely(!submit_bio_checks(bio))) + goto put_exit; + if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + goto put_exit; + + rq_qos_throttle(q, bio); + + if (plug) { + data.nr_tags = plug->nr_ios; + plug->nr_ios = 1; + data.cached_rq = &plug->cached_rq; + } + + rq = __blk_mq_alloc_requests(&data); + if (rq) + return rq; + + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); +put_exit: + blk_queue_exit(q); + return NULL; +} + +static inline struct request *blk_mq_get_request(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio, + unsigned int nsegs, + bool *same_queue_rq) +{ + if (plug) { + struct request *rq; + + rq = rq_list_peek(&plug->cached_rq); + if (rq && rq->q == q) { + if (unlikely(!submit_bio_checks(bio))) + return NULL; + if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + return NULL; + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + rq_qos_throttle(q, bio); + return rq; + } + } + + return blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2429,53 +2589,26 @@ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = op_is_flush(bio->bi_opf); struct request *rq; struct blk_plug *plug; - struct request *same_queue_rq = NULL; + bool same_queue_rq = false; unsigned int nr_segs = 1; blk_status_t ret; + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) - goto queue_exit; - - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - goto queue_exit; - - if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - goto queue_exit; - - rq_qos_throttle(q, bio); + return; plug = blk_mq_plug(q, bio); - if (plug && plug->cached_rq) { - rq = rq_list_pop(&plug->cached_rq); - INIT_LIST_HEAD(&rq->queuelist); - } else { - struct blk_mq_alloc_data data = { - .q = q, - .nr_tags = 1, - .cmd_flags = bio->bi_opf, - }; - - if (plug) { - data.nr_tags = plug->nr_ios; - plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; - } - rq = __blk_mq_alloc_requests(&data); - if (unlikely(!rq)) { - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - goto queue_exit; - } - } + rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq); + if (unlikely(!rq)) + return; trace_block_getrq(bio); @@ -2491,14 +2624,12 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (unlikely(is_flush_fua)) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - /* Bypass scheduler for flush requests */ - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(rq->mq_hctx->flags) || - q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) + return; + + if (plug && (q->nr_hw_queues == 1 || + blk_mq_is_shared_tags(rq->mq_hctx->flags) || + q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. @@ -2509,14 +2640,16 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int request_count = plug->rq_count; struct request *last = NULL; - if (!request_count) + if (!request_count) { trace_block_plug(q); - else - last = list_entry_rq(plug->mq_list.prev); + } else if (!blk_queue_nomerges(q)) { + last = rq_list_peek(&plug->mq_list); + if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE) + last = NULL; + } - if (request_count >= blk_plug_max_rq_count(plug) || (last && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_flush_plug_list(plug, false); + if (request_count >= blk_plug_max_rq_count(plug) || last) { + blk_mq_flush_plug_list(plug, false); trace_block_plug(q); } @@ -2525,6 +2658,8 @@ void blk_mq_submit_bio(struct bio *bio) /* Insert the request at the IO scheduler queue */ blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { + struct request *next_rq = NULL; + /* * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be @@ -2532,19 +2667,16 @@ void blk_mq_submit_bio(struct bio *bio) * The plug list might get flushed before this. If that happens, * the plug list is empty, and same_queue_rq is invalid. */ - if (list_empty(&plug->mq_list)) - same_queue_rq = NULL; if (same_queue_rq) { - list_del_init(&same_queue_rq->queuelist); + next_rq = rq_list_pop(&plug->mq_list); plug->rq_count--; } blk_add_rq_to_plug(plug, rq); trace_block_plug(q); - if (same_queue_rq) { + if (next_rq) { trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(same_queue_rq->mq_hctx, - same_queue_rq); + blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq); } } else if ((q->nr_hw_queues > 1 && is_sync) || !rq->mq_hctx->dispatch_busy) { @@ -2557,10 +2689,6 @@ void blk_mq_submit_bio(struct bio *bio) /* Default case. */ blk_mq_sched_insert_request(rq, false, true, true); } - - return; -queue_exit: - blk_queue_exit(q); } static size_t order_to_size(unsigned int order) @@ -3546,7 +3674,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - __blk_mq_free_map_and_rqs(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -4054,8 +4181,13 @@ fallback: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { + int i = prev_nr_hw_queues; + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); + for (; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); + set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; diff --git a/block/blk-mq.h b/block/blk-mq.h index ebf67f4d4f2e..cb0b5482ca5e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -122,6 +122,7 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_release(struct request_queue *q); @@ -148,6 +149,7 @@ struct blk_mq_alloc_data { blk_mq_req_flags_t flags; unsigned int shallow_depth; unsigned int cmd_flags; + unsigned int rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; @@ -165,10 +167,9 @@ static inline bool blk_mq_is_shared_tags(unsigned int flags) static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (data->q->elevator) - return data->hctx->sched_tags; - - return data->hctx->tags; + if (!(data->rq_flags & RQF_ELV)) + return data->hctx->tags; + return data->hctx->sched_tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) @@ -224,12 +225,18 @@ static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) atomic_inc(&hctx->nr_active); } -static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) { if (blk_mq_is_shared_tags(hctx->flags)) - atomic_dec(&hctx->queue->nr_active_requests_shared_tags); + atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else - atomic_dec(&hctx->nr_active); + atomic_sub(val, &hctx->nr_active); +} + +static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_sub_active_requests(hctx, 1); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) @@ -258,7 +265,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -bool blk_mq_get_driver_tag(struct request *rq); +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq); + +static inline bool blk_mq_get_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag != BLK_MQ_NO_TAG && + !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + hctx->tags->rqs[rq->tag] = rq; + return true; + } + + return __blk_mq_get_driver_tag(hctx, rq); +} static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { diff --git a/block/blk-settings.c b/block/blk-settings.c index a7c857ad7d10..b880c70e22e4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -842,6 +842,24 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); +static bool disk_has_partitions(struct gendisk *disk) +{ + unsigned long idx; + struct block_device *part; + bool ret = false; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part)) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + /** * blk_queue_set_zoned - configure a disk queue zoned model. * @disk: the gendisk of the queue to configure @@ -876,7 +894,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - !xa_empty(&disk->part_tbl)) + disk_has_partitions(disk)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 36f14d658e81..cef1f713370b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -873,16 +873,15 @@ int blk_register_queue(struct gendisk *disk) } mutex_lock(&q->sysfs_lock); + + ret = disk_register_independent_access_ranges(disk, NULL); + if (ret) + goto put_dev; + if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) { - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); - kobject_put(&dev->kobj); - return ret; - } + if (ret) + goto put_dev; } blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); @@ -914,6 +913,16 @@ unlock: } return ret; + +put_dev: + disk_unregister_independent_access_ranges(disk); + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); + kobject_del(&q->kobj); + blk_trace_remove_sysfs(dev); + kobject_put(&dev->kobj); + + return ret; } /** @@ -958,6 +967,7 @@ void blk_unregister_queue(struct gendisk *disk) mutex_lock(&q->sysfs_lock); if (q->elevator) elv_unregister_queue(q); + disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 874c1c37bf0c..0c119be0e813 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -357,6 +357,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) unsigned int inflight = wbt_inflight(rwb); int status; + if (!rwb->rqos.q->disk) + return; + status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, diff --git a/block/blk.h b/block/blk.h index e80350327e6d..b4fed2033e48 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,6 +55,41 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); +int __bio_queue_enter(struct request_queue *q, struct bio *bio); +bool submit_bio_checks(struct bio *bio); + +static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) + goto fail; + + /* + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. + */ + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; + + rcu_read_unlock(); + return true; + +fail_put: + blk_queue_exit(q); +fail: + rcu_read_unlock(); + return false; +} + +static inline int bio_queue_enter(struct bio *bio) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (blk_try_enter_queue(q, false)) + return 0; + return __bio_queue_enter(q, bio); +} #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, @@ -218,7 +253,7 @@ void blk_add_timer(struct request *req); void blk_print_req_error(struct request *req, blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq); + unsigned int nr_segs, bool *same_queue_rq); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); @@ -236,7 +271,7 @@ void __blk_account_io_done(struct request *req, u64 now); */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) -void blk_insert_flush(struct request *rq); +bool blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); @@ -454,4 +489,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; +int disk_register_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *new_iars); +void disk_unregister_independent_access_ranges(struct gendisk *disk); + #endif /* BLK_INTERNAL_H */ diff --git a/block/bsg-lib.c b/block/bsg-lib.c index ccb98276c964..10aa378702fa 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, struct bsg_job *job; struct request *rq; struct bio *bio; + void *reply; int ret; if (hdr->protocol != BSG_PROTOCOL_SCSI || @@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, if (!capable(CAP_SYS_RAWIO)) return -EPERM; - rq = blk_get_request(q, hdr->dout_xfer_len ? + rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); rq->timeout = timeout; job = blk_mq_rq_to_pdu(rq); + reply = job->reply; + memset(job, 0, sizeof(*job)); + job->reply = reply; + job->reply_len = SCSI_SENSE_BUFFERSIZE; + job->dd_data = job + 1; + job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); if (IS_ERR(job->request)) { ret = PTR_ERR(job->request); - goto out_put_request; + goto out_free_rq; } if (hdr->dout_xfer_len && hdr->din_xfer_len) { - job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0); + job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0); if (IS_ERR(job->bidi_rq)) { ret = PTR_ERR(job->bidi_rq); goto out_free_job_request; @@ -134,11 +141,11 @@ out_unmap_bidi_rq: blk_rq_unmap_user(job->bidi_bio); out_free_bidi_rq: if (job->bidi_rq) - blk_put_request(job->bidi_rq); + blk_mq_free_request(job->bidi_rq); out_free_job_request: kfree(job->request); -out_put_request: - blk_put_request(rq); +out_free_rq: + blk_mq_free_request(rq); return ret; } @@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, return 0; } -/* called right before the request is given to the request_queue user */ -static void bsg_initialize_rq(struct request *req) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(req); - void *reply = job->reply; - - memset(job, 0, sizeof(*job)); - job->reply = reply; - job->reply_len = SCSI_SENSE_BUFFERSIZE; - job->dd_data = job + 1; -} - static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx) { @@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = { .queue_rq = bsg_queue_rq, .init_request = bsg_init_rq, .exit_request = bsg_exit_rq, - .initialize_rq_fn = bsg_initialize_rq, .complete = bsg_complete, .timeout = bsg_timeout, }; diff --git a/block/fops.c b/block/fops.c index 2c43e493e37c..ad732a36f9b3 100644 --- a/block/fops.c +++ b/block/fops.c @@ -76,7 +76,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, vecs, nr_pages); bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; + bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -124,9 +124,8 @@ out: } enum { - DIO_MULTI_BIO = 1, - DIO_SHOULD_DIRTY = 2, - DIO_IS_SYNC = 4, + DIO_SHOULD_DIRTY = 1, + DIO_IS_SYNC = 2, }; struct blkdev_dio { @@ -137,7 +136,7 @@ struct blkdev_dio { size_t size; atomic_t ref; unsigned int flags; - struct bio bio; + struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; @@ -150,7 +149,7 @@ static void blkdev_bio_end_io(struct bio *bio) if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!(dio->flags & DIO_MULTI_BIO) || atomic_dec_and_test(&dio->ref)) { + if (atomic_dec_and_test(&dio->ref)) { if (!(dio->flags & DIO_IS_SYNC)) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -164,9 +163,8 @@ static void blkdev_bio_end_io(struct bio *bio) ret = blk_status_to_errno(dio->bio.bi_status); } - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->flags & DIO_MULTI_BIO) - bio_put(&dio->bio); + dio->iocb->ki_complete(iocb, ret); + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; @@ -190,7 +188,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; - bool do_poll = (iocb->ki_flags & IOCB_HIPRI); bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; int ret = 0; @@ -202,11 +199,17 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); + atomic_set(&dio->ref, 1); + /* + * Grab an extra reference to ensure the dio structure which is embedded + * into the first bio stays around. + */ + bio_get(bio); + is_sync = is_sync_kiocb(iocb); if (is_sync) { dio->flags = DIO_IS_SYNC; dio->waiter = current; - bio_get(bio); } else { dio->flags = 0; dio->iocb = iocb; @@ -216,16 +219,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (is_read && iter_is_iovec(iter)) dio->flags |= DIO_SHOULD_DIRTY; - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!(iocb->ki_flags & IOCB_HIPRI)) - blk_start_plug(&plug); + blk_start_plug(&plug); for (;;) { bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; @@ -254,34 +252,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { - if (do_poll) - bio_set_polled(bio, iocb); submit_bio(bio); - if (do_poll) - WRITE_ONCE(iocb->private, bio); break; } - if (!(dio->flags & DIO_MULTI_BIO)) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. - */ - if (!is_sync) - bio_get(bio); - dio->flags |= DIO_MULTI_BIO; - atomic_set(&dio->ref, 2); - do_poll = false; - } else { - atomic_inc(&dio->ref); - } - + atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } - if (!(iocb->ki_flags & IOCB_HIPRI)) - blk_finish_plug(&plug); + blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; @@ -290,9 +269,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; - - if (!do_poll || !bio_poll(bio, NULL, 0)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -305,6 +282,94 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static void blkdev_bio_end_io_async(struct bio *bio) +{ + struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (likely(!bio->bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(bio->bi_status); + } + + iocb->ki_complete(iocb, ret); + + if (dio->flags & DIO_SHOULD_DIRTY) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, + struct iov_iter *iter, + unsigned int nr_pages) +{ + struct block_device *bdev = iocb->ki_filp->private_data; + struct blkdev_dio *dio; + struct bio *bio; + loff_t pos = iocb->ki_pos; + int ret = 0; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + dio = container_of(bio, struct blkdev_dio, bio); + dio->flags = 0; + dio->iocb = iocb; + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_end_io = blkdev_bio_end_io_async; + bio->bi_ioprio = iocb->ki_ioprio; + + if (iov_iter_is_bvec(iter)) { + /* + * Users don't rely on the iterator being in any particular + * state for async I/O returning -EIOCBQUEUED, hence we can + * avoid expensive iov_iter_advance(). Bypass + * bio_iov_iter_get_pages() and set the bvec directly. + */ + bio_iov_bvec_set(bio, iter); + } else { + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return ret; + } + } + dio->size = bio->bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio->bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) { + dio->flags |= DIO_SHOULD_DIRTY; + bio_set_pages_dirty(bio); + } + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + + if (iocb->ki_flags & IOCB_HIPRI) { + bio->bi_opf |= REQ_POLLED | REQ_NOWAIT; + submit_bio(bio); + WRITE_ONCE(iocb->private, bio); + } else { + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + submit_bio(bio); + } + return -EIOCBQUEUED; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { unsigned int nr_pages; @@ -313,9 +378,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - + if (likely(nr_pages <= BIO_MAX_VECS)) { + if (is_sync_kiocb(iocb)) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + return __blkdev_direct_IO_async(iocb, iter, nr_pages); + } return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } @@ -460,7 +527,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct block_device *bdev = iocb->ki_filp->private_data; struct inode *bd_inode = bdev->bd_inode; - loff_t size = i_size_read(bd_inode); + loff_t size = bdev_nr_bytes(bdev); struct blk_plug plug; size_t shorted = 0; ssize_t ret; @@ -498,22 +565,25 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct block_device *bdev = iocb->ki_filp->private_data; - loff_t size = i_size_read(bdev->bd_inode); + loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret; - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; - iov_iter_truncate(to, size); + if (unlikely(pos + iov_iter_count(to) > size)) { + if (pos >= size) + return 0; + size -= pos; + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } } ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); + + if (unlikely(shorted)) + iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } @@ -535,7 +605,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EOPNOTSUPP; /* Don't go off the end of the device. */ - isize = i_size_read(bdev->bd_inode); + isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { @@ -562,16 +632,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); + error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, 0); break; default: error = -EOPNOTSUPP; diff --git a/block/genhd.c b/block/genhd.c index 13494ed79675..ca2fbab1d425 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -58,6 +58,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors) spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } EXPORT_SYMBOL(set_capacity); @@ -471,11 +472,15 @@ int device_add_disk(struct device *parent, struct gendisk *disk, disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); - if (!disk->part0->bd_holder_dir) + if (!disk->part0->bd_holder_dir) { + ret = -ENOMEM; goto out_del_integrity; + } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - if (!disk->slave_dir) + if (!disk->slave_dir) { + ret = -ENOMEM; goto out_put_holder_dir; + } ret = bd_register_pending_holders(disk); if (ret < 0) @@ -592,16 +597,6 @@ void del_gendisk(struct gendisk *disk) * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(q); - blk_mq_freeze_queue_wait(q); - - rq_qos_exit(q); - blk_sync_queue(q); - blk_flush_integrity(); - /* - * Allow using passthrough request again after the queue is torn down. - */ - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); - __blk_mq_unfreeze_queue(q, true); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -624,9 +619,41 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); + + blk_mq_freeze_queue_wait(q); + + rq_qos_exit(q); + blk_sync_queue(q); + blk_flush_integrity(); + /* + * Allow using passthrough request again after the queue is torn down. + */ + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + } EXPORT_SYMBOL(del_gendisk); +/** + * invalidate_disk - invalidate the disk + * @disk: the struct gendisk to invalidate + * + * A helper to invalidates the disk. It will clean the disk's associated + * buffer/page caches and reset its internal states so that the disk + * can be reused by the drivers. + * + * Context: can sleep + */ +void invalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev = disk->part0; + + invalidate_bdev(bdev); + bdev->bd_inode->i_mapping->wb_err = 0; + set_capacity(disk, 0); +} +EXPORT_SYMBOL(invalidate_disk); + /* sysfs access to bad-blocks list. */ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, @@ -1392,12 +1419,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only) } EXPORT_SYMBOL(set_disk_ro); -int bdev_read_only(struct block_device *bdev) -{ - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); -} -EXPORT_SYMBOL(bdev_read_only); - void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); diff --git a/block/ioctl.c b/block/ioctl.c index 77b1b2453f39..d6af0ac97e57 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -132,7 +132,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, if (len & 511) return -EINVAL; - if (start + len > i_size_read(bdev->bd_inode)) + if (start + len > bdev_nr_bytes(bdev)) return -EINVAL; err = truncate_bdev_range(bdev, mode, start, start + len - 1); @@ -164,7 +164,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; @@ -543,7 +543,6 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; fmode_t mode = file->f_mode; - loff_t size; int ret; /* @@ -570,10 +569,9 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return put_ulong(argp, size >> 9); + return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ @@ -581,7 +579,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: @@ -615,7 +613,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; - loff_t size; /* * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have @@ -641,10 +638,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return compat_put_ulong(argp, size >> 9); + return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -652,7 +648,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZSET_32: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64_32: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c deleted file mode 100644 index 1792159d12d1..000000000000 --- a/block/keyslot-manager.c +++ /dev/null @@ -1,579 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2019 Google LLC - */ - -/** - * DOC: The Keyslot Manager - * - * Many devices with inline encryption support have a limited number of "slots" - * into which encryption contexts may be programmed, and requests can be tagged - * with a slot number to specify the key to use for en/decryption. - * - * As the number of slots is limited, and programming keys is expensive on - * many inline encryption hardware, we don't want to program the same key into - * multiple slots - if multiple requests are using the same key, we want to - * program just one slot with that key and use that slot for all requests. - * - * The keyslot manager manages these keyslots appropriately, and also acts as - * an abstraction between the inline encryption hardware and the upper layers. - * - * Lower layer devices will set up a keyslot manager in their request queue - * and tell it how to perform device specific operations like programming/ - * evicting keys from keyslots. - * - * Upper layers will call blk_ksm_get_slot_for_key() to program a - * key into some slot in the inline encryption hardware. - */ - -#define pr_fmt(fmt) "blk-crypto: " fmt - -#include <linux/keyslot-manager.h> -#include <linux/device.h> -#include <linux/atomic.h> -#include <linux/mutex.h> -#include <linux/pm_runtime.h> -#include <linux/wait.h> -#include <linux/blkdev.h> -#include <linux/blk-integrity.h> - -struct blk_ksm_keyslot { - atomic_t slot_refs; - struct list_head idle_slot_node; - struct hlist_node hash_node; - const struct blk_crypto_key *key; - struct blk_keyslot_manager *ksm; -}; - -static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm) -{ - /* - * Calling into the driver requires ksm->lock held and the device - * resumed. But we must resume the device first, since that can acquire - * and release ksm->lock via blk_ksm_reprogram_all_keys(). - */ - if (ksm->dev) - pm_runtime_get_sync(ksm->dev); - down_write(&ksm->lock); -} - -static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) -{ - up_write(&ksm->lock); - if (ksm->dev) - pm_runtime_put_sync(ksm->dev); -} - -static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm) -{ - return ksm->num_slots == 0; -} - -/** - * blk_ksm_init() - Initialize a keyslot manager - * @ksm: The keyslot_manager to initialize. - * @num_slots: The number of key slots to manage. - * - * Allocate memory for keyslots and initialize a keyslot manager. Called by - * e.g. storage drivers to set up a keyslot manager in their request_queue. - * - * Return: 0 on success, or else a negative error code. - */ -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) -{ - unsigned int slot; - unsigned int i; - unsigned int slot_hashtable_size; - - memset(ksm, 0, sizeof(*ksm)); - - if (num_slots == 0) - return -EINVAL; - - ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); - if (!ksm->slots) - return -ENOMEM; - - ksm->num_slots = num_slots; - - init_rwsem(&ksm->lock); - - init_waitqueue_head(&ksm->idle_slots_wait_queue); - INIT_LIST_HEAD(&ksm->idle_slots); - - for (slot = 0; slot < num_slots; slot++) { - ksm->slots[slot].ksm = ksm; - list_add_tail(&ksm->slots[slot].idle_slot_node, - &ksm->idle_slots); - } - - spin_lock_init(&ksm->idle_slots_lock); - - slot_hashtable_size = roundup_pow_of_two(num_slots); - /* - * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 - * buckets. This only makes a difference when there is only 1 keyslot. - */ - if (slot_hashtable_size < 2) - slot_hashtable_size = 2; - - ksm->log_slot_ht_size = ilog2(slot_hashtable_size); - ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, - sizeof(ksm->slot_hashtable[0]), - GFP_KERNEL); - if (!ksm->slot_hashtable) - goto err_destroy_ksm; - for (i = 0; i < slot_hashtable_size; i++) - INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); - - return 0; - -err_destroy_ksm: - blk_ksm_destroy(ksm); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_ksm_init); - -static void blk_ksm_destroy_callback(void *ksm) -{ - blk_ksm_destroy(ksm); -} - -/** - * devm_blk_ksm_init() - Resource-managed blk_ksm_init() - * @dev: The device which owns the blk_keyslot_manager. - * @ksm: The blk_keyslot_manager to initialize. - * @num_slots: The number of key slots to manage. - * - * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically - * on driver detach. - * - * Return: 0 on success, or else a negative error code. - */ -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots) -{ - int err = blk_ksm_init(ksm, num_slots); - - if (err) - return err; - - return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm); -} -EXPORT_SYMBOL_GPL(devm_blk_ksm_init); - -static inline struct hlist_head * -blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; -} - -static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm = slot->ksm; - unsigned long flags; - - spin_lock_irqsave(&ksm->idle_slots_lock, flags); - list_del(&slot->idle_slot_node); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); -} - -static struct blk_ksm_keyslot *blk_ksm_find_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); - struct blk_ksm_keyslot *slotp; - - hlist_for_each_entry(slotp, head, hash_node) { - if (slotp->key == key) - return slotp; - } - return NULL; -} - -static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - return NULL; - if (atomic_inc_return(&slot->slot_refs) == 1) { - /* Took first reference to this slot; remove it from LRU list */ - blk_ksm_remove_slot_from_lru_list(slot); - } - return slot; -} - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) -{ - return slot - slot->ksm->slots; -} -EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); - -/** - * blk_ksm_get_slot_for_key() - Program a key into a keyslot. - * @ksm: The keyslot manager to program the key into. - * @key: Pointer to the key object to program, including the raw key, crypto - * mode, and data unit size. - * @slot_ptr: A pointer to return the pointer of the allocated keyslot. - * - * Get a keyslot that's been programmed with the specified key. If one already - * exists, return it with incremented refcount. Otherwise, wait for a keyslot - * to become idle and program it. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the - * allocated keyslot), or some other blk_status_t otherwise (and - * keyslot is set to NULL). - */ -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr) -{ - struct blk_ksm_keyslot *slot; - int slot_idx; - int err; - - *slot_ptr = NULL; - - if (blk_ksm_is_passthrough(ksm)) - return BLK_STS_OK; - - down_read(&ksm->lock); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - up_read(&ksm->lock); - if (slot) - goto success; - - for (;;) { - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - if (slot) { - blk_ksm_hw_exit(ksm); - goto success; - } - - /* - * If we're here, that means there wasn't a slot that was - * already programmed with the key. So try to program it. - */ - if (!list_empty(&ksm->idle_slots)) - break; - - blk_ksm_hw_exit(ksm); - wait_event(ksm->idle_slots_wait_queue, - !list_empty(&ksm->idle_slots)); - } - - slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, - idle_slot_node); - slot_idx = blk_ksm_get_slot_idx(slot); - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); - if (err) { - wake_up(&ksm->idle_slots_wait_queue); - blk_ksm_hw_exit(ksm); - return errno_to_blk_status(err); - } - - /* Move this slot to the hash list for the new key. */ - if (slot->key) - hlist_del(&slot->hash_node); - slot->key = key; - hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); - - atomic_set(&slot->slot_refs, 1); - - blk_ksm_remove_slot_from_lru_list(slot); - - blk_ksm_hw_exit(ksm); -success: - *slot_ptr = slot; - return BLK_STS_OK; -} - -/** - * blk_ksm_put_slot() - Release a reference to a slot - * @slot: The keyslot to release the reference of. - * - * Context: Any context. - */ -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm; - unsigned long flags; - - if (!slot) - return; - - ksm = slot->ksm; - - if (atomic_dec_and_lock_irqsave(&slot->slot_refs, - &ksm->idle_slots_lock, flags)) { - list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); - wake_up(&ksm->idle_slots_wait_queue); - } -} - -/** - * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is - * supported by a ksm. - * @ksm: The keyslot manager to check - * @cfg: The crypto configuration to check for. - * - * Checks for crypto_mode/data unit size/dun bytes support. - * - * Return: Whether or not this ksm supports the specified crypto config. - */ -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg) -{ - if (!ksm) - return false; - if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & - cfg->data_unit_size)) - return false; - if (ksm->max_dun_bytes_supported < cfg->dun_bytes) - return false; - return true; -} - -/** - * blk_ksm_evict_key() - Evict a key from the lower layer device. - * @ksm: The keyslot manager to evict from - * @key: The key to evict - * - * Find the keyslot that the specified key was programmed into, and evict that - * slot from the lower layer device. The slot must not be in use by any - * in-flight IO when this function is called. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY - * if the keyslot is still in use, or another -errno value on other - * error. - */ -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - int err = 0; - - if (blk_ksm_is_passthrough(ksm)) { - if (ksm->ksm_ll_ops.keyslot_evict) { - blk_ksm_hw_enter(ksm); - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1); - blk_ksm_hw_exit(ksm); - return err; - } - return 0; - } - - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - goto out_unlock; - - if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { - err = -EBUSY; - goto out_unlock; - } - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, - blk_ksm_get_slot_idx(slot)); - if (err) - goto out_unlock; - - hlist_del(&slot->hash_node); - slot->key = NULL; - err = 0; -out_unlock: - blk_ksm_hw_exit(ksm); - return err; -} - -/** - * blk_ksm_reprogram_all_keys() - Re-program all keyslots. - * @ksm: The keyslot manager - * - * Re-program all keyslots that are supposed to have a key programmed. This is - * intended only for use by drivers for hardware that loses its keys on reset. - * - * Context: Process context. Takes and releases ksm->lock. - */ -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) -{ - unsigned int slot; - - if (blk_ksm_is_passthrough(ksm)) - return; - - /* This is for device initialization, so don't resume the device */ - down_write(&ksm->lock); - for (slot = 0; slot < ksm->num_slots; slot++) { - const struct blk_crypto_key *key = ksm->slots[slot].key; - int err; - - if (!key) - continue; - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); - WARN_ON(err); - } - up_write(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm) -{ - if (!ksm) - return; - kvfree(ksm->slot_hashtable); - kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); - memzero_explicit(ksm, sizeof(*ksm)); -} -EXPORT_SYMBOL_GPL(blk_ksm_destroy); - -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) -{ - if (blk_integrity_queue_supports_integrity(q)) { - pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - return false; - } - q->ksm = ksm; - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_register); - -void blk_ksm_unregister(struct request_queue *q) -{ - q->ksm = NULL; -} - -/** - * blk_ksm_intersect_modes() - restrict supported modes by child device - * @parent: The keyslot manager for parent device - * @child: The keyslot manager for child device, or NULL - * - * Clear any crypto mode support bits in @parent that aren't set in @child. - * If @child is NULL, then all parent bits are cleared. - * - * Only use this when setting up the keyslot manager for a layered device, - * before it's been exposed yet. - */ -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child) -{ - if (child) { - unsigned int i; - - parent->max_dun_bytes_supported = - min(parent->max_dun_bytes_supported, - child->max_dun_bytes_supported); - for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported); - i++) { - parent->crypto_modes_supported[i] &= - child->crypto_modes_supported[i]; - } - } else { - parent->max_dun_bytes_supported = 0; - memset(parent->crypto_modes_supported, 0, - sizeof(parent->crypto_modes_supported)); - } -} -EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes); - -/** - * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes - * and DUN bytes that another KSM supports. Here, - * "superset" refers to the mathematical meaning of the - * word - i.e. if two KSMs have the *same* capabilities, - * they *are* considered supersets of each other. - * @ksm_superset: The KSM that we want to verify is a superset - * @ksm_subset: The KSM that we want to verify is a subset - * - * Return: True if @ksm_superset supports a superset of the crypto modes and DUN - * bytes that @ksm_subset supports. - */ -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset) -{ - int i; - - if (!ksm_subset) - return true; - - if (!ksm_superset) - return false; - - for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) { - if (ksm_subset->crypto_modes_supported[i] & - (~ksm_superset->crypto_modes_supported[i])) { - return false; - } - } - - if (ksm_subset->max_dun_bytes_supported > - ksm_superset->max_dun_bytes_supported) { - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_is_superset); - -/** - * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of - * another KSM - * @target_ksm: The KSM whose restrictions to update. - * @reference_ksm: The KSM to whose restrictions this function will update - * @target_ksm's restrictions to. - * - * Blk-crypto requires that crypto capabilities that were - * advertised when a bio was created continue to be supported by the - * device until that bio is ended. This is turn means that a device cannot - * shrink its advertised crypto capabilities without any explicit - * synchronization with upper layers. So if there's no such explicit - * synchronization, @reference_ksm must support all the crypto capabilities that - * @target_ksm does - * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true). - * - * Note also that as long as the crypto capabilities are being expanded, the - * order of updates becoming visible is not important because it's alright - * for blk-crypto to see stale values - they only cause blk-crypto to - * believe that a crypto capability isn't supported when it actually is (which - * might result in blk-crypto-fallback being used if available, or the bio being - * failed). - */ -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm) -{ - memcpy(target_ksm->crypto_modes_supported, - reference_ksm->crypto_modes_supported, - sizeof(target_ksm->crypto_modes_supported)); - - target_ksm->max_dun_bytes_supported = - reference_ksm->max_dun_bytes_supported; -} -EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities); - -/** - * blk_ksm_init_passthrough() - Init a passthrough keyslot manager - * @ksm: The keyslot manager to init - * - * Initialize a passthrough keyslot manager. - * Called by e.g. storage drivers to set up a keyslot manager in their - * request_queue, when the storage driver wants to manage its keys by itself. - * This is useful for inline encryption hardware that doesn't have the concept - * of keyslots, and for layered devices. - */ -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm) -{ - memset(ksm, 0, sizeof(*ksm)); - init_rwsem(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough); diff --git a/block/partitions/core.c b/block/partitions/core.c index 9dbddc355b40..334b72ef1d73 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -91,6 +91,7 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } @@ -424,6 +425,7 @@ out_del: device_del(pdev); out_put: put_device(pdev); + return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7ca5c4c374d4..5e9be13a56a8 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len) */ static u64 last_lba(struct gendisk *disk) { - return div_u64(disk->part0->bd_inode->i_size, + return div_u64(bdev_nr_bytes(disk->part0), queue_logical_block_size(disk->queue)) - 1ULL; } diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 9bca396aef4a..403756dbd50d 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, char name[], union label_t *label, sector_t labelsect, - loff_t i_size, + sector_t nr_sectors, dasd_information2_t *info) { loff_t offset, geo_size, size; @@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state, } else { /* * Formated w/o large volume support. If the sanity check - * 'size based on geo == size based on i_size' is true, then + * 'size based on geo == size based on nr_sectors' is true, then * we can safely assume that we know the formatted size of * the disk, otherwise we need additional information * that we can only get from a real DASD device. */ geo_size = geo->cylinders * geo->heads * geo->sectors * secperblk; - size = i_size >> 9; + size = nr_sectors; if (size != geo_size) { if (!info) { strlcat(state->pp_buf, "\n", PAGE_SIZE); @@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, if (!strcmp(info->type, "ECKD")) if (geo_size < size) size = geo_size; - /* else keep size based on i_size */ + /* else keep size based on nr_sectors */ } } /* first and only partition starts in the first block after the label */ @@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state) struct gendisk *disk = state->disk; struct block_device *bdev = disk->part0; int blocksize, res; - loff_t i_size, offset, size; + loff_t offset, size; + sector_t nr_sectors; dasd_information2_t *info; struct hd_geometry *geo; char type[5] = {0,}; @@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state) blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_symbol; - i_size = i_size_read(bdev->bd_inode); - if (i_size == 0) + nr_sectors = bdev_nr_sectors(bdev); + if (nr_sectors == 0) goto out_symbol; info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); if (info == NULL) @@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state) label); } else if (!strncmp(type, "LNX1", 4)) { res = find_lnx1_partitions(state, geo, blocksize, name, - label, labelsect, i_size, + label, labelsect, nr_sectors, info); } else if (!strncmp(type, "CMS1", 4)) { res = find_cms1_partitions(state, geo, blocksize, name, @@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state) res = 1; if (info->format == DASD_FORMAT_LDL) { strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); - size = i_size >> 9; + size = nr_sectors; offset = (info->label_block + 1) * (blocksize >> 9); put_partition(state, 1, offset, size-offset); strlcat(state->pp_buf, "\n", PAGE_SIZE); |