author     Linus Torvalds <torvalds@linux-foundation.org>   2015-04-16 21:49:16 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-04-16 21:49:16 -0400
commit     d82312c80860b8b83cd4473ac6eafd244e712061
tree       028b2e843e9d59d35aeb8924582864f18aa4ca36
parent     7d69cff26ceadce8638cb65191285932a3de3d4c
parent     889fa31f00b218a2cef96c32a6b3f57e6d3bf918
download   linux-d82312c80860b8b83cd4473ac6eafd244e712061.tar.bz2
Merge branch 'for-4.1/core' of git://git.kernel.dk/linux-block
Pull block layer core bits from Jens Axboe:
"This is the core pull request for 4.1. Not a lot of stuff in here for
this round, mostly little fixes or optimizations. This pull request
contains:
- An optimization that speeds up queue runs on blk-mq, especially for
the case where there's a large difference between nr_cpu_ids and
the actual mapped software queues on a hardware queue. From Chong
Yuan.
- Honor node local allocations for requests on legacy devices. From
David Rientjes.
- Cleanup of blk_mq_rq_to_pdu() from me.
- exit_aio() fixup from me, greatly speeding up exiting multiple IO
contexts off exit_group(). For my particular test case, fio exit
took ~6 seconds. A typical case of both exposing RCU grace periods
to user space, and serializing exit of them.
- Make blk_mq_queue_enter() honor the gfp mask passed in, so we only
wait if __GFP_WAIT is set. From Keith Busch.
- blk-mq exports and two added helpers from Mike Snitzer, which will
be used by the dm-mq code.
- Cleanups of blk-mq queue init from Wei Fang and Xiaoguang Wang"
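
The exit_aio() item above is essentially a batching change: instead of declaring one on-stack completion per ioctx and waiting for each in turn (paying one grace period per iteration), teardown now hands every context a single shared waiter with a count and blocks once at the end. Below is a minimal userspace analogue of that pattern; the ctx_rq_wait-style struct, rq_wait_complete() and kill_one() are illustrative stand-ins, not the kernel code in fs/aio.c.

```c
/*
 * Userspace sketch of the "one counted waiter instead of N serialized
 * waits" pattern used by the exit_aio() fix.  Build with: cc -pthread.
 */
#include <pthread.h>
#include <stdio.h>

struct ctx_rq_wait {
	pthread_mutex_t lock;
	pthread_cond_t  comp;
	int             count;          /* contexts still draining */
};

static void rq_wait_init(struct ctx_rq_wait *w, int count)
{
	pthread_mutex_init(&w->lock, NULL);
	pthread_cond_init(&w->comp, NULL);
	w->count = count;
}

/* Called by each worker when its "context" has finished draining. */
static void rq_wait_complete(struct ctx_rq_wait *w)
{
	pthread_mutex_lock(&w->lock);
	if (--w->count == 0)
		pthread_cond_signal(&w->comp);
	pthread_mutex_unlock(&w->lock);
}

/* Worker thread standing in for one kioctx being torn down. */
static void *kill_one(void *arg)
{
	rq_wait_complete(arg);
	return NULL;
}

int main(void)
{
	struct ctx_rq_wait wait;
	pthread_t workers[8];
	int i;

	rq_wait_init(&wait, 8);

	/* Start all teardowns first ... */
	for (i = 0; i < 8; i++)
		pthread_create(&workers[i], NULL, kill_one, &wait);

	/* ... then block exactly once until every one has finished. */
	pthread_mutex_lock(&wait.lock);
	while (wait.count)
		pthread_cond_wait(&wait.comp, &wait.lock);
	pthread_mutex_unlock(&wait.lock);

	for (i = 0; i < 8; i++)
		pthread_join(workers[i], NULL);

	printf("all contexts drained with a single wait\n");
	return 0;
}
```

The fs/aio.c hunks further down show the real version, where atomic_sub_and_test() additionally accounts for table slots that held no context.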
* 'for-4.1/core' of git://git.kernel.dk/linux-block:
blk-mq: reduce unnecessary software queue looping
aio: fix serial draining in exit_aio()
blk-mq: cleanup blk_mq_rq_to_pdu()
blk-mq: put blk_queue_rq_timeout together in blk_mq_init_queue()
block: remove redundant check about 'set->nr_hw_queues' in blk_mq_alloc_tag_set()
block: allocate request memory local to request queue
blk-mq: don't wait in blk_mq_queue_enter() if __GFP_WAIT isn't set
blk-mq: export blk_mq_run_hw_queues
blk-mq: add blk_mq_init_allocated_queue and export blk_mq_register_disk
-rw-r--r--  block/blk-core.c        | 19
-rw-r--r--  block/blk-mq-sysfs.c    |  1
-rw-r--r--  block/blk-mq.c          | 67
-rw-r--r--  fs/aio.c                | 45
-rw-r--r--  include/linux/blk-mq.h  |  7

5 files changed, 93 insertions, 46 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 794c3e7f01cf..fd154b94447a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -557,6 +557,18 @@ void blk_cleanup_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
 
+/* Allocate memory local to the request queue */
+static void *alloc_request_struct(gfp_t gfp_mask, void *data)
+{
+	int nid = (int)(long)data;
+	return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
+}
+
+static void free_request_struct(void *element, void *unused)
+{
+	kmem_cache_free(request_cachep, element);
+}
+
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask)
 {
@@ -569,9 +581,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
 	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
-	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-					  mempool_free_slab, request_cachep,
-					  gfp_mask, q->node);
+	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
+					  free_request_struct,
+					  (void *)(long)q->node, gfp_mask,
+					  q->node);
 
 	if (!rl->rq_pool)
 		return -ENOMEM;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 1630a20d5dcf..b79685e06b70 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -436,6 +436,7 @@ int blk_mq_register_disk(struct gendisk *disk)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blk_mq_register_disk);
 
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 33c428530193..c82de08f3721 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
-static void blk_mq_run_queues(struct request_queue *q);
 
 /*
  * Check if any of the ctx's have pending work in this hardware queue
@@ -78,7 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
-static int blk_mq_queue_enter(struct request_queue *q)
+static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 {
 	while (true) {
 		int ret;
@@ -86,6 +85,9 @@ static int blk_mq_queue_enter(struct request_queue *q)
 		if (percpu_ref_tryget_live(&q->mq_usage_counter))
 			return 0;
 
+		if (!(gfp & __GFP_WAIT))
+			return -EBUSY;
+
 		ret = wait_event_interruptible(q->mq_freeze_wq,
 				!q->mq_freeze_depth || blk_queue_dying(q));
 		if (blk_queue_dying(q))
@@ -118,7 +120,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 
 	if (freeze) {
 		percpu_ref_kill(&q->mq_usage_counter);
-		blk_mq_run_queues(q);
+		blk_mq_run_hw_queues(q, false);
 	}
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@ -257,7 +259,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_alloc_data alloc_data;
 	int ret;
 
-	ret = blk_mq_queue_enter(q);
+	ret = blk_mq_queue_enter(q, gfp);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -904,7 +906,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 			&hctx->run_work, 0);
 }
 
-static void blk_mq_run_queues(struct request_queue *q)
+void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -915,9 +917,10 @@ static void blk_mq_run_queues(struct request_queue *q)
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
-		blk_mq_run_hw_queue(hctx, false);
+		blk_mq_run_hw_queue(hctx, async);
 	}
 }
+EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
@@ -1186,7 +1189,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	int rw = bio_data_dir(bio);
 	struct blk_mq_alloc_data alloc_data;
 
-	if (unlikely(blk_mq_queue_enter(q))) {
+	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
 		bio_endio(bio, -EIO);
 		return NULL;
 	}
@@ -1517,8 +1520,6 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
 	if (!bitmap->map)
 		return -ENOMEM;
 
-	bitmap->map_size = num_maps;
-
 	total = nr_cpu_ids;
 	for (i = 0; i < num_maps; i++) {
 		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
@@ -1759,8 +1760,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
-		cpumask_set_cpu(i, hctx->cpumask);
-		hctx->nr_ctx++;
 
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
@@ -1797,6 +1796,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
+		struct blk_mq_ctxmap *map = &hctx->ctx_map;
+
 		/*
 		 * If no software queues are mapped to this hardware queue,
 		 * disable it and free the request entries.
@@ -1813,6 +1814,13 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		}
 
 		/*
+		 * Set the map size to the number of mapped software queues.
+		 * This is more accurate and more efficient than looping
+		 * over all possibly mapped software queues.
+		 */
+		map->map_size = hctx->nr_ctx / map->bits_per_word;
+
+		/*
 		 * Initialize batch roundrobin counts
 		 */
 		hctx->next_cpu = cpumask_first(hctx->cpumask);
@@ -1889,9 +1897,25 @@ void blk_mq_release(struct request_queue *q)
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
+	struct request_queue *uninit_q, *q;
+
+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+	if (!uninit_q)
+		return ERR_PTR(-ENOMEM);
+
+	q = blk_mq_init_allocated_queue(set, uninit_q);
+	if (IS_ERR(q))
+		blk_cleanup_queue(uninit_q);
+
+	return q;
+}
+EXPORT_SYMBOL(blk_mq_init_queue);
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q)
+{
 	struct blk_mq_hw_ctx **hctxs;
 	struct blk_mq_ctx __percpu *ctx;
-	struct request_queue *q;
 	unsigned int *map;
 	int i;
 
@@ -1926,20 +1950,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		hctxs[i]->queue_num = i;
 	}
 
-	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
-	if (!q)
-		goto err_hctxs;
-
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
 	 * See blk_register_queue() for details.
 	 */
 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
 			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
-		goto err_mq_usage;
+		goto err_hctxs;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
-	blk_queue_rq_timeout(q, 30000);
+	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);
 
 	q->nr_queues = nr_cpu_ids;
 	q->nr_hw_queues = set->nr_hw_queues;
@@ -1965,9 +1985,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	else
 		blk_queue_make_request(q, blk_sq_make_request);
 
-	if (set->timeout)
-		blk_queue_rq_timeout(q, set->timeout);
-
 	/*
 	 * Do this after blk_queue_make_request() overrides it...
 	 */
@@ -1979,7 +1996,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
 	if (blk_mq_init_hw_queues(q, set))
-		goto err_mq_usage;
+		goto err_hctxs;
 
 	mutex_lock(&all_q_mutex);
 	list_add_tail(&q->all_q_node, &all_q_list);
@@ -1991,8 +2008,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
 	return q;
 
-err_mq_usage:
-	blk_cleanup_queue(q);
 err_hctxs:
 	kfree(map);
 	for (i = 0; i < set->nr_hw_queues; i++) {
@@ -2007,7 +2022,7 @@ err_percpu:
 	free_percpu(ctx);
 	return ERR_PTR(-ENOMEM);
 }
-EXPORT_SYMBOL(blk_mq_init_queue);
+EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
 void blk_mq_free_queue(struct request_queue *q)
 {
@@ -2159,7 +2174,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
 		return -EINVAL;
 
-	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
+	if (!set->ops->queue_rq || !set->ops->map_queue)
 		return -EINVAL;
 
 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -77,6 +77,11 @@ struct kioctx_cpu {
 	unsigned		reqs_available;
 };
 
+struct ctx_rq_wait {
+	struct completion comp;
+	atomic_t count;
+};
+
 struct kioctx {
 	struct percpu_ref	users;
 	atomic_t		dead;
@@ -115,7 +120,7 @@ struct kioctx {
 	/*
 	 * signals when all in-flight requests are done
	 */
-	struct completion *requests_done;
+	struct ctx_rq_wait	*rq_wait;
 
 	struct {
 		/*
@@ -572,8 +577,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
 	/* At this point we know that there are no any in-flight requests */
-	if (ctx->requests_done)
-		complete(ctx->requests_done);
+	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
+		complete(&ctx->rq_wait->comp);
 
 	INIT_WORK(&ctx->free_work, free_ioctx);
 	schedule_work(&ctx->free_work);
@@ -783,7 +788,7 @@ err:
  * the rapid destruction of the kioctx.
  */
 static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
-		struct completion *requests_done)
+		      struct ctx_rq_wait *wait)
 {
 	struct kioctx_table *table;
 
@@ -813,7 +818,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-		ctx->requests_done = requests_done;
+		ctx->rq_wait = wait;
 		percpu_ref_kill(&ctx->users);
 		return 0;
 	}
@@ -829,18 +834,24 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 void exit_aio(struct mm_struct *mm)
 {
 	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
-	int i;
+	struct ctx_rq_wait wait;
+	int i, skipped;
 
 	if (!table)
 		return;
 
+	atomic_set(&wait.count, table->nr);
+	init_completion(&wait.comp);
+
+	skipped = 0;
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
-		struct completion requests_done =
-			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
-		if (!ctx)
+		if (!ctx) {
+			skipped++;
 			continue;
+		}
+
 		/*
 		 * We don't need to bother with munmap() here - exit_mmap(mm)
 		 * is coming and it'll unmap everything. And we simply can't,
@@ -849,10 +860,12 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, &requests_done);
+		kill_ioctx(mm, ctx, &wait);
+	}
 
+	if (!atomic_sub_and_test(skipped, &wait.count)) {
 		/* Wait until all IO for the context are done. */
-		wait_for_completion(&requests_done);
+		wait_for_completion(&wait.comp);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -1331,15 +1344,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		struct completion requests_done =
-			COMPLETION_INITIALIZER_ONSTACK(requests_done);
+		struct ctx_rq_wait wait;
 		int ret;
 
+		init_completion(&wait.comp);
+		atomic_set(&wait.count, 1);
+
 		/* Pass requests_done to kill_ioctx() where it can be set
 		 * in a thread-safe way. If we try to set it here then we have
 		 * a race condition if two io_destroy() called simultaneously.
 		 */
-		ret = kill_ioctx(current->mm, ioctx, &requests_done);
+		ret = kill_ioctx(current->mm, ioctx, &wait);
 		percpu_ref_put(&ioctx->users);
 
 		/* Wait until all IO for the context are done. Otherwise kernel
@@ -1347,7 +1362,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 		 * is destroyed.
 		 */
 		if (!ret)
-			wait_for_completion(&requests_done);
+			wait_for_completion(&wait.comp);
 
 		return ret;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7aec86127335..8210e8797c12 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -164,6 +164,8 @@ enum {
 		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q);
 void blk_mq_finish_init(struct request_queue *q);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
@@ -218,6 +220,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
@@ -227,7 +230,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q);
 
 /*
  * Driver command data is immediately after the request. So subtract request
- * size to get back to the original request.
+ * size to get back to the original request, add request size to get the PDU.
  */
 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 {
@@ -235,7 +238,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 }
 static inline void *blk_mq_rq_to_pdu(struct request *rq)
 {
-	return (void *) rq + sizeof(*rq);
+	return rq + 1;
 }
 
 #define queue_for_each_hw_ctx(q, hctx, i)				\
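
The final hunk above replaces `(void *) rq + sizeof(*rq)` with the equivalent but tidier `rq + 1`. The standalone sketch below (using a made-up `struct fake_rq`, not the kernel's `struct request`) shows why the two forms yield the same address: pointer arithmetic on a struct pointer already advances in units of the struct size, which is exactly where blk-mq places the per-driver PDU.

```c
/*
 * Illustration of the blk_mq_rq_to_pdu() cleanup: "ptr + 1" already points
 * sizeof(*ptr) bytes past the struct, so the explicit byte arithmetic was
 * redundant.  struct fake_rq and the slot layout are stand-ins that mimic
 * how blk-mq allocates the request and driver PDU contiguously.
 */
#include <assert.h>
#include <stdio.h>

struct fake_rq {
	int tag;
	long payload[4];
};

static inline void *rq_to_pdu(struct fake_rq *rq)
{
	return rq + 1;                  /* PDU sits right after the request */
}

static inline struct fake_rq *rq_from_pdu(void *pdu)
{
	return (struct fake_rq *)pdu - 1;   /* step back over the request */
}

int main(void)
{
	struct {
		struct fake_rq rq;
		char pdu[32];           /* per-driver command data */
	} slot;

	assert(rq_to_pdu(&slot.rq) == (void *)slot.pdu);
	assert(rq_from_pdu(slot.pdu) == &slot.rq);
	printf("pdu lives %zu bytes after the request\n", sizeof(struct fake_rq));
	return 0;
}
```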
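For the new exports, here is a hedged sketch of how a stacked driver in the dm-mq mould might consume them: allocate the request_queue itself, hand it to blk_mq_init_allocated_queue() for blk-mq setup, and later kick all hardware queues with blk_mq_run_hw_queues() when resuming I/O. Everything prefixed `my_` is hypothetical; only the blk-mq calls come from this merge, and the tag set is assumed to have been prepared elsewhere with blk_mq_alloc_tag_set().

```c
/* Sketch only -- not the actual dm-mq code. */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>

static struct blk_mq_tag_set my_tag_set;   /* assumed initialized via blk_mq_alloc_tag_set() */

static struct request_queue *my_create_queue(int node)
{
	struct request_queue *q, *mq;

	/* Allocate the bare queue ourselves, as a stacking driver would ... */
	q = blk_alloc_queue_node(GFP_KERNEL, node);
	if (!q)
		return NULL;

	/* ... then let blk-mq initialize the already-allocated queue in place. */
	mq = blk_mq_init_allocated_queue(&my_tag_set, q);
	if (IS_ERR(mq)) {
		blk_cleanup_queue(q);
		return NULL;
	}
	return mq;
}

static void my_resume_io(struct request_queue *q)
{
	/* Restart dispatch on every hardware queue, asynchronously. */
	blk_mq_run_hw_queues(q, true);
}
```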