From d07335e51df0c6dec202d315fc4f1f7e100eec4e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Nov 2010 12:52:38 +0100 Subject: block: Rename "block_remap" tracepoint to "block_bio_remap" to clarify the event. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f1b390..151070541e21 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,7 +33,7 @@ #include "blk.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -1329,9 +1329,9 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_sector - p->start_sect); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, + bio->bi_sector - p->start_sect); } } @@ -1500,7 +1500,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, old_sector); + trace_block_bio_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; -- cgit v1.2.3 From 89b90be2d877a904b1704e4029db65655bfc6282 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jan 2011 15:01:47 +0100 Subject: block: make kblockd_workqueue smarter kblockd is used for unplugging and may affect IO latency and throughput and the max number of concurrent work items are bound by the number of block devices. Make it HIGHPRI workqueue w/ default max concurrency. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 151070541e21..3689319a5974 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2606,7 +2606,9 @@ int __init blk_dev_init(void) BUILD_BUG_ON(__REQ_NR_BITS > 8 * sizeof(((struct request *)0)->cmd_flags)); - kblockd_workqueue = create_workqueue("kblockd"); + /* used for unplugging and affects IO latency/throughput - HIGHPRI */ + kblockd_workqueue = alloc_workqueue("kblockd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); -- cgit v1.2.3 From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Wed, 5 Jan 2011 16:57:38 +0100 Subject: block: fix accounting bug on cross partition merges /proc/diskstats would display a strange output as follows. $ cat /proc/diskstats |grep sda 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691 ~~~~~~~~~~ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137 Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE. The detailed root cause is as follows. Assuming that there are two partition, sda1 and sda2. 1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight is 0 and sda2's one is 1. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 2. A bio belongs to sda1 is issued and is merged into the request mentioned on step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed from sda2 region to sda1 region. However the two partition's hd_struct->in_flight are not changed. | hd_struct->in_flight --------------------------- sda1 | 0 sda2 | 1 --------------------------- 3. The request is finished and blk_account_io_done() is called. In this case, sda2's hd_struct->in_flight, not a sda1's one, is decremented. | hd_struct->in_flight --------------------------- sda1 | -1 sda2 | 1 --------------------------- The patch fixes the problem by caching the partition lookup inside the request structure, hence making sure that the increment and decrement will always happen on the same partition struct. This also speeds up IO with accounting enabled, since it cuts down on the number of lookups we have to do. Also add a refcount to struct hd_struct to keep the partition in memory as long as users exist. We use kref_test_and_get() to ensure we don't add a reference to a partition which is going away. Signed-off-by: Jerome Marchand Signed-off-by: Yasuaki Ishimatsu Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++++++----- block/blk-merge.c | 3 ++- block/genhd.c | 1 + fs/partitions/check.c | 10 +++++++++- include/linux/blkdev.h | 1 + include/linux/genhd.h | 2 ++ 6 files changed, 36 insertions(+), 7 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 3689319a5974..500c080a6a6b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!kref_test_and_get(&part->ref)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. + */ + part = &rq->rq_disk->part0; + kref_get(&part->ref); + } part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..b06b83b89d89 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); + kref_put(&part->ref, __delete_partition); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 16ccc0d2d5d3..85c150598830 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; + kref_init(&disk->part0.ref); disk->minors = minors; rand_initialize_disk(disk); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index bdf8d3cc95a4..48209f58522b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -381,6 +381,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } +void __delete_partition(struct kref *ref) +{ + struct hd_struct *part = container_of(ref, struct hd_struct, ref); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); +} + void delete_partition(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = disk->part_tbl; @@ -399,7 +406,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + kref_put(&part->ref, __delete_partition); } static ssize_t whole_disk_show(struct device *dev, @@ -498,6 +505,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); + kref_init(&p->ref); return p; out_free_info: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..482a7fd48831 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -115,6 +115,7 @@ struct request { void *elevator_private3; struct gendisk *rq_disk; + struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP unsigned long long start_time_ns; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e4..2ba2792a3dd4 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -116,6 +116,7 @@ struct hd_struct { struct disk_stats dkstats; #endif struct rcu_head rcu_head; + struct kref ref; }; #define GENHD_FL_REMOVABLE 1 @@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); +extern void __delete_partition(struct kref *ref); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -- cgit v1.2.3 From 6c23a9681c0fe7fb7dd331b39dda11926f43746e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Jan 2011 08:43:37 +0100 Subject: block: add internal hd part table references We can't use krefs since it's apparently restricted to very basic reference counting. This reverts commit e4a683c8. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- block/blk-merge.c | 2 +- block/genhd.c | 3 ++- fs/partitions/check.c | 8 +++----- include/linux/genhd.h | 27 +++++++++++++++++++++++++-- include/linux/kref.h | 1 - lib/kref.c | 12 ------------ 7 files changed, 34 insertions(+), 25 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 500c080a6a6b..2f4002f79a24 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -70,7 +70,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!kref_test_and_get(&part->ref)) { + if (!hd_struct_try_get(part)) { /* * The partition is already being removed, * the request will be accounted on the disk only @@ -80,7 +80,7 @@ static void drive_stat_acct(struct request *rq, int new_io) * it as any other partition. */ part = &rq->rq_disk->part0; - kref_get(&part->ref); + hd_struct_get(part); } part_round_stats(cpu, part); part_inc_in_flight(part, rw); @@ -1818,7 +1818,7 @@ static void blk_account_io_done(struct request *req) part_round_stats(cpu, part); part_dec_in_flight(part, rw); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/blk-merge.c b/block/blk-merge.c index b06b83b89d89..00b7d31b38a2 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -356,7 +356,7 @@ static void blk_account_io_merge(struct request *req) part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); part_stat_unlock(); } } diff --git a/block/genhd.c b/block/genhd.c index 85c150598830..399d37ec7412 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1192,7 +1192,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) return NULL; } disk->part_tbl->part[0] = &disk->part0; - kref_init(&disk->part0.ref); + + hd_ref_init(&disk->part0); disk->minors = minors; rand_initialize_disk(disk); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 48209f58522b..011520df71ae 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -381,10 +381,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } -void __delete_partition(struct kref *ref) +void __delete_partition(struct hd_struct *part) { - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -406,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - kref_put(&part->ref, __delete_partition); + hd_struct_put(part); } static ssize_t whole_disk_show(struct device *dev, @@ -505,7 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - kref_init(&p->ref); + hd_ref_init(p); return p; out_free_info: diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 2ba2792a3dd4..2d0468145967 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -115,8 +115,8 @@ struct hd_struct { #else struct disk_stats dkstats; #endif + atomic_t ref; struct rcu_head rcu_head; - struct kref ref; }; #define GENHD_FL_REMOVABLE 1 @@ -584,7 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); -extern void __delete_partition(struct kref *ref); +extern void __delete_partition(struct hd_struct *); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); @@ -613,6 +613,29 @@ extern ssize_t part_fail_store(struct device *dev, const char *buf, size_t count); #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline void hd_ref_init(struct hd_struct *part) +{ + atomic_set(&part->ref, 1); + smp_mb(); +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + atomic_inc(&part->ref); + smp_mb__after_atomic_inc(); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return atomic_inc_not_zero(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + if (atomic_dec_and_test(&part->ref)) + __delete_partition(part); +} + #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } diff --git a/include/linux/kref.h b/include/linux/kref.h index 90b9e44abf54..6cc38fc07ab7 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -23,7 +23,6 @@ struct kref { void kref_init(struct kref *kref); void kref_get(struct kref *kref); -int kref_test_and_get(struct kref *kref); int kref_put(struct kref *kref, void (*release) (struct kref *kref)); #endif /* _KREF_H_ */ diff --git a/lib/kref.c b/lib/kref.c index e7a6e1067122..d3d227a08a4b 100644 --- a/lib/kref.c +++ b/lib/kref.c @@ -36,18 +36,6 @@ void kref_get(struct kref *kref) smp_mb__after_atomic_inc(); } -/** - * kref_test_and_get - increment refcount for object only if refcount is not - * zero. - * @kref: object. - * - * Return non-zero if the refcount was incremented, 0 otherwise - */ -int kref_test_and_get(struct kref *kref) -{ - return atomic_inc_not_zero(&kref->refcount); -} - /** * kref_put - decrement refcount for object. * @kref: object. -- cgit v1.2.3