From 96aa1b419d47286db446f292cf898bb1a8b27f24 Mon Sep 17 00:00:00 2001 From: Ciju Rajan K Date: Mon, 23 Aug 2010 10:56:30 +0200 Subject: blkio: Fix return code for mkdir calls If the cgroup hierarchy for blkio control groups is deeper than two levels, the kernel should not allow the creation of further levels. The mkdir system call does not expect EINVAL as a return value. This patch replaces EINVAL with the more appropriate EPERM. Signed-off-by: Ciju Rajan K Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a6809645d212..2fef1ef931a0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -966,7 +966,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* Currently we do not support hierarchy deeper than two level (0,1) */ if (parent != cgroup->top_cgroup) - return ERR_PTR(-EINVAL); + return ERR_PTR(-EPERM); blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) -- cgit v1.2.3 From b6508c1618e7aab085f191efb41b7b019a94ea38 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:23:33 +0200 Subject: cfq-iosched: Do not idle if slice_idle=0 Do not idle on either the cfq queue or the service tree if slice_idle=0: the user does not want any queue or service tree idling. Currently, even if slice_idle=0, we wait for a request to finish before expiring the queue, and that can lead to lower queue depths. Acked-by: Jeff Moyer Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index eb4086f7dfef..8830569542c4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1839,6 +1839,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!service_tree); BUG_ON(!service_tree->count); + if (!cfqd->cfq_slice_idle) + return false; + /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) return false; @@ -1879,7 +1882,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) + if (!cfq_should_idle(cfqd, cfqq)) return; /* -- cgit v1.2.3 From 02b35081fc98f681411586d3acf9eaad8b8f6e07 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:23:53 +0200 Subject: cfq-iosched: Do group share accounting in IOPS when slice_idle=0 o Implement another CFQ mode where we charge a group in terms of the number of requests dispatched instead of measuring time. Measuring in terms of time is not possible when we are driving deeper queue depths and there are requests from multiple cfq queues in the request queue. o This mode currently gets activated if one sets slice_idle=0 and the associated disk supports NCQ. The idea is that on an NCQ disk with idling disabled, most of the queues will dispatch one or more requests, then cfq queue expiry happens, and we don't have a way to measure time. So start providing fairness in terms of IOPS instead. o Currently IOPS mode works only with cfq group scheduling. CFQ follows different scheduling algorithms for queue and group scheduling. These IOPS stats are used only for group scheduling, hence in non-group mode nothing should change.
o For CFQ group scheduling, one can disable slice idling so that we don't idle on the queue and can drive deeper request queue depths (achieving better throughput); at the same time, group idle is enabled, so one should still get service differentiation among groups. Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8830569542c4..3fc6be110c1d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -378,6 +378,21 @@ CFQ_CFQQ_FNS(wait_busy); &cfqg->service_trees[i][j]: NULL) \ +static inline bool iops_mode(struct cfq_data *cfqd) +{ + /* + * If we are not idling on queues and it is a NCQ drive, parallel + * execution of requests is on and measuring time is not possible + * in most of the cases until and unless we drive shallower queue + * depths and that becomes a performance bottleneck. In such cases + * switch to start providing fairness in terms of number of IOs. + */ + if (!cfqd->cfq_slice_idle && cfqd->hw_tag) + return true; + else + return false; +} + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) @@ -906,7 +921,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = cfqq->allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); return slice_used; } @@ -914,19 +928,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl, charge_sl; + unsigned int used_sl, charge; int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - cfqg->service_tree_idle.count; BUG_ON(nr_sync < 0); - used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + used_sl = charge = cfq_cfqq_slice_usage(cfqq); - if (!cfq_cfqq_sync(cfqq) && !nr_sync) - charge_sl = cfqq->allocated_slice; + if (iops_mode(cfqd)) + charge = cfqq->slice_dispatch; + else if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge, cfqg); __cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ @@ -940,6 +956,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u", + used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd)); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } -- cgit v1.2.3 From 80bdf0c78fff075dfa21576273c8b0c7db22bdfe Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:24:26 +0200 Subject: cfq-iosched: Implement tunable group_idle o Implement a new tunable group_idle, which allows idling on the group instead of a cfq queue. Hence one can set slice_idle = 0 and not idle on the individual queues, but idle on the group. This way, on fast storage, we can get fairness between groups while overall throughput improves.
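To summarize how the two tunables interact, the idle decision after this patch can be sketched as follows (an illustrative distillation, not the kernel code: the helper name idle_window is made up, while cfq_should_idle() and the fields used are from the patch below):

	/*
	 * Illustrative sketch: with slice_idle=0 per-queue idling is off,
	 * but we may still idle on behalf of the whole group when the
	 * last queue in the group runs dry, preserving group fairness.
	 */
	static unsigned long idle_window(struct cfq_data *cfqd,
					 struct cfq_queue *cfqq)
	{
		if (cfq_should_idle(cfqd, cfqq))
			return cfqd->cfq_slice_idle;	/* per-queue idling */
		if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1)
			return cfqd->cfq_group_idle;	/* idle on the group */
		return 0;				/* no idling at all */
	}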
Signed-off-by: Vivek Goyal Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3fc6be110c1d..85e48192754d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_group_idle = HZ / 125; static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; @@ -198,6 +199,8 @@ struct cfq_group { struct hlist_node cfqd_node; atomic_t ref; #endif + /* number of requests that are on the dispatch list or inside driver */ + int dispatched; }; /* @@ -271,6 +274,7 @@ struct cfq_data { unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; + unsigned int cfq_group_idle; unsigned int cfq_latency; unsigned int cfq_group_isolation; @@ -1884,7 +1888,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; - unsigned long sl; + unsigned long sl, group_idle = 0; /* * SSD device without seek penalty, disable idling. But only do so @@ -1900,8 +1904,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfq_should_idle(cfqd, cfqq)) - return; + if (!cfq_should_idle(cfqd, cfqq)) { + /* no queue idling. Check for group idling */ + if (cfqd->cfq_group_idle) + group_idle = cfqd->cfq_group_idle; + else + return; + } /* * still active requests from this queue, don't idle @@ -1928,13 +1937,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; } + /* There are other queues in the group, don't do group idle */ + if (group_idle && cfqq->cfqg->nr_cfqq > 1) + return; + cfq_mark_cfqq_wait_request(cfqq); - sl = cfqd->cfq_slice_idle; + if (group_idle) + sl = cfqd->cfq_group_idle; + else + sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, + group_idle ? 1 : 0); } /* @@ -1950,6 +1967,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; + (RQ_CFQG(rq))->dispatched++; elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; @@ -2219,7 +2237,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) cfqq = NULL; goto keep_queue; } else - goto expire; + goto check_group_idle; } /* @@ -2247,8 +2265,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * flight or is idling for a new request, allow either of these * conditions to happen (or time out) before selecting a new queue. */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { + if (timer_pending(&cfqd->idle_slice_timer)) { + cfqq = NULL; + goto keep_queue; + } + + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } + + /* + * If group idle is enabled and there are requests dispatched from + * this group, wait for requests to complete. 
+ */ +check_group_idle: + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 + && cfqq->cfqg->dispatched) { cfqq = NULL; goto keep_queue; } @@ -3396,6 +3429,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; cfqq->dispatched--; + (RQ_CFQG(rq))->dispatched--; cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq_data_dir(rq), rq_is_sync(rq)); @@ -3425,7 +3459,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * the queue. */ if (cfq_should_wait_busy(cfqd, cfqq)) { - cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + unsigned long extend_sl = cfqd->cfq_slice_idle; + if (!cfqd->cfq_slice_idle) + extend_sl = cfqd->cfq_group_idle; + cfqq->slice_end = jiffies + extend_sl; cfq_mark_cfqq_wait_busy(cfqq); cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } @@ -3871,6 +3908,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; @@ -3943,6 +3981,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); @@ -3975,6 +4014,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, @@ -3996,6 +4036,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), CFQ_ATTR(group_isolation), __ATTR_NULL @@ -4049,6 +4090,12 @@ static int __init cfq_init(void) if (!cfq_slice_idle) cfq_slice_idle = 1; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfq_group_idle) + cfq_group_idle = 1; +#else + cfq_group_idle = 0; +#endif if (cfq_slab_setup()) return -ENOMEM; -- cgit v1.2.3 From c4e7893ebc3a5c507b53f59b9de448db20849944 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 23 Aug 2010 12:25:03 +0200 Subject: cfq-iosched: blktrace print per slice sector stats o Divyesh had gotten rid of this code in the past. I want to re-introduce it, as it helps me a lot during debugging.
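The lifecycle of the restored counter is simple, and each piece below comes straight from the diff: reset when a slice starts, accumulate on every dispatch, report when the slice is charged. In sketch form (format string abbreviated):

	/* __cfq_set_active_queue(): a new slice begins */
	cfqq->nr_sectors = 0;

	/* cfq_dispatch_insert(): request handed to the dispatch list */
	cfqq->nr_sectors += blk_rq_sectors(rq);

	/* cfq_group_served(): slice accounting, visible via blktrace */
	cfq_log_cfqq(cfqq->cfqd, cfqq, "... sect=%u", cfqq->nr_sectors);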
Reviewed-by: Jeff Moyer Reviewed-by: Divyesh Shah Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 85e48192754d..f65c6f01c475 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -148,6 +148,8 @@ struct cfq_queue { struct cfq_queue *new_cfqq; struct cfq_group *cfqg; struct cfq_group *orig_cfqg; + /* Number of sectors dispatched from queue in single dispatch round */ + unsigned long nr_sectors; }; /* @@ -960,8 +962,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u", - used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd)); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" + " sect=%u", used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } @@ -1609,6 +1612,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1971,6 +1975,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) elv_dispatch_sort(q, rq); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; + cfqq->nr_sectors += blk_rq_sectors(rq); cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), rq_data_dir(rq), rq_is_sync(rq)); } -- cgit v1.2.3 From c87ffbb812cf6150097a5095b031f4013a8f78a5 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 23 Aug 2010 12:30:29 +0200 Subject: block: put dev->kobj in blk_register_queue fail path The kernel needs to do a kobject_put() on dev->kobj if elv_register_queue() fails. Signed-off-by: Xiaotian Feng Cc: "Martin K. Petersen" Cc: Stephen Hemminger Cc: Nikanth Karthikesan Cc: David Teigland Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 001ab18078f5..0749b89c6885 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -511,6 +511,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + kobject_put(&dev->kobj); return ret; } -- cgit v1.2.3 From 5e00d1b5b4c10fb839afd5ce61db8e24339454b0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Aug 2010 14:31:06 +0200 Subject: BLOCK: fix bio.bi_rw handling The return of the bi_rw tests is no longer bool after commit 74450be1, but the results of such tests are still stored in bools. This doesn't fit for some compilers (gcc 4.5 here), so either use the !! idiom to get real bools, or use unsigned long where the result is assigned.
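A standalone userspace illustration of why !! normalization is the safe idiom when a flag test is wider than the variable receiving it (REQ_HIGH is a hypothetical flag name; this is an analogy to the unsigned long change for ff below, not a reproduction of the gcc 4.5 behavior):

	#include <stdbool.h>
	#include <stdio.h>

	#define REQ_HIGH (1ULL << 40)	/* hypothetical flag above bit 31 */

	int main(void)
	{
		unsigned long long bi_rw = REQ_HIGH;

		/* implicit narrowing silently drops the set bit: prints 0 */
		unsigned int truncated = bi_rw & REQ_HIGH;

		/* !! collapses the test to 0/1 before any narrowing: prints 1 */
		bool normalized = !!(bi_rw & REQ_HIGH);

		printf("truncated=%u normalized=%d\n", truncated, normalized);
		return 0;
	}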
Signed-off-by: Jiri Slaby Cc: Christoph Hellwig Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- drivers/block/loop.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ee1a1e7e63cc..32a1c123dfb3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1198,9 +1198,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const bool sync = (bio->bi_rw & REQ_SYNC); - const bool unplug = (bio->bi_rw & REQ_UNPLUG); - const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + const bool sync = !!(bio->bi_rw & REQ_SYNC); + const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); + const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; if ((bio->bi_rw & REQ_HARDBARRIER) && diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f3c636d23718..91797bbbe702 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -477,7 +477,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; if (bio_rw(bio) == WRITE) { - bool barrier = (bio->bi_rw & REQ_HARDBARRIER); + bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER); struct file *file = lo->lo_backing_file; if (barrier) { -- cgit v1.2.3 From 5dd531a03ad721b41911ddb32e6e0481404e7aaf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Aug 2010 13:52:19 +0200 Subject: block: add function call to switch the IO scheduler from a driver Currently drivers must do an elevator_exit() + elevator_init() to switch IO schedulers. There are a few problems with this: - Since commit 1abec4fdbb142e3ccb6ce99832fae42129134a96, elevator_init() requires a zeroed out q->elevator pointer. The two existing in-kernel users don't do that. - It will only work at initialization time, since using the above two-stage construct does not properly quiesce the queue. So add elevator_change(), which takes care of this, and convert the elv_iosched_store() sysfs interface to use this helper as well.
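With this in place, an in-kernel user might switch schedulers along these lines (a sketch only: my_driver_init_queue and the choice of "deadline" are illustrative, while elevator_change() and its int return come from the patch below):

	#include <linux/elevator.h>

	static int my_driver_init_queue(struct request_queue *q)
	{
		int err;

		/* replaces the old elevator_exit() + elevator_init() dance */
		err = elevator_change(q, "deadline");
		if (err)
			printk(KERN_ERR "elevator switch failed: %d\n", err);
		return err;
	}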
Reported-by: Peter Oberparleiter Reported-by: Kevin Vigor Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++++++++++++------------- include/linux/elevator.h | 1 + 2 files changed, 32 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index ec585c9554d3..205b09a5bd9e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1009,18 +1009,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { struct elevator_queue *old_elevator, *e; void *data; + int err; /* * Allocate new elevator */ e = elevator_alloc(q, new_e); if (!e) - return 0; + return -ENOMEM; data = elevator_init_queue(q, e); if (!data) { kobject_put(&e->kobj); - return 0; + return -ENOMEM; } /* @@ -1043,7 +1044,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) __elv_unregister_queue(old_elevator); - if (elv_register_queue(q)) + err = elv_register_queue(q); + if (err) goto fail_register; /* @@ -1056,7 +1058,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); - return 1; + return 0; fail_register: /* @@ -1071,17 +1073,19 @@ fail_register: queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); - return 0; + return err; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, - size_t count) +/* + * Switch this queue to the given IO scheduler. + */ +int elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; if (!q->elevator) - return count; + return -ENXIO; strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name)); @@ -1092,13 +1096,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { elevator_put(e); - return count; + return 0; } - if (!elevator_switch(q, e)) - printk(KERN_ERR "elevator: switch to %s failed\n", - elevator_name); - return count; + return elevator_switch(q, e); +} +EXPORT_SYMBOL(elevator_change); + +ssize_t elv_iosched_store(struct request_queue *q, const char *name, + size_t count) +{ + int ret; + + if (!q->elevator) + return count; + + ret = elevator_change(q, name); + if (!ret) + return count; + + printk(KERN_ERR "elevator: switch to %s failed\n", name); + return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2c958f4fce1e..926b50322a46 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -136,6 +136,7 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); extern void elevator_exit(struct elevator_queue *); +extern int elevator_change(struct request_queue *, const char *); extern int elv_rq_merge_ok(struct request *, struct bio *); /* -- cgit v1.2.3 From be14eb619108fa8b7120eb2c42d66d5f623ae10e Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 10 Sep 2010 09:03:21 +0200 Subject: block: Range check cpu in blk_cpu_to_group While testing CPU DLPAR, the following problem was discovered. We were DLPAR removing the first CPU, which in this case was logical CPUs 0-3. CPUs 0-2 were already marked offline and we were in the process of offlining CPU 3. 
After marking the CPU inactive and offline in cpu_disable, but before the cpu was completely idle (cpu_die), we ended up in __make_request on CPU 3. There we looked at the topology map to see which CPU to complete the I/O on and found no CPUs in the cpu_sibling_map. This resulted in the block layer setting the completion cpu to be NR_CPUS, which then caused an oops when we tried to complete the I/O. Fix this by sanity-checking the value we return from blk_cpu_to_group to be a valid cpu value. Signed-off-by: Brian King Signed-off-by: Jens Axboe --- block/blk.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 6e7dc87141e4..d6b911ac002c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -142,14 +142,18 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) static inline int blk_cpu_to_group(int cpu) { + int group = NR_CPUS; #ifdef CONFIG_SCHED_MC const struct cpumask *mask = cpu_coregroup_mask(cpu); - return cpumask_first(mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return cpumask_first(topology_thread_cpumask(cpu)); + group = cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif + if (likely(group < NR_CPUS)) + return group; + return cpu; } /* -- cgit v1.2.3 From a45dc2d2b8d1afa57c91dcfac224e50ffcd3f805 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 13 Sep 2010 21:32:19 +0200 Subject: block: fix blk_rq_map_kern bio direction flag This bug was introduced in 7b6d91daee5cac6402186ff224c3af39d79f4a0e "block: unify flags for struct bio and struct request" Cc: Boaz Harrosh Signed-off-by: Benny Halevy Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index c65d7593f7f1..ade0a08c9099 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -307,7 +307,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, return PTR_ERR(bio); if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << REQ_WRITE); + bio->bi_rw |= REQ_WRITE; if (do_copy) rq->cmd_flags |= REQ_COPY_USER; -- cgit v1.2.3 From 180be2a0420a0d680285e9352c0db85016b9ed2a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 14 Sep 2010 08:47:11 +0200 Subject: cfq-iosched: fix a kernel OOPs when usb key is inserted Mike reported a kernel crash when a usb key hotplug is performed while all kernel threads are not in the root cgroup but are running in one of the child cgroups of the blkio controller. BUG: unable to handle kernel NULL pointer dereference at 0000002c IP: [] cfq_get_queue+0x232/0x412 *pde = 00000000 Oops: 0000 [#1] PREEMPT last sysfs file: /sys/devices/pci0000:00/0000:00:1d.7/usb2/2-1/2-1:1.0/host3/scsi_host/host3/uevent [..] Pid: 30039, comm: scsi_scan_3 Not tainted 2.6.35.2-fg.roam #1 Volvi2 /Aspire 4315 EIP: 0060:[] EFLAGS: 00010086 CPU: 0 EIP is at cfq_get_queue+0x232/0x412 EAX: f705f9c0 EBX: e977abac ECX: 00000000 EDX: 00000000 ESI: f00da400 EDI: f00da4ec EBP: e977a800 ESP: dff8fd00 DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 Process scsi_scan_3 (pid: 30039, ti=dff8e000 task=f6b6c9a0 task.ti=dff8e000) Stack: 00000000 00000000 00000001 01ff0000 f00da508 00000000 f00da524 f00da540 <0> e7994940 dd631750 f705f9c0 e977a820 e977ac44 f00da4d0 00000001 f6b6c9a0 <0> 00000010 00008010 0000000b 00000000 00000001 e977a800 dd76fac0 00000246 Call Trace: [] ? cfq_set_request+0x228/0x34c [] ? cfq_set_request+0x0/0x34c [] ? elv_set_request+0xf/0x1c [] ?
get_request+0x1ad/0x22f [] ? get_request_wait+0x1f/0x11a [] ? kvasprintf+0x33/0x3b [] ? scsi_execute+0x1d/0x103 [] ? scsi_execute_req+0x58/0x83 [] ? scsi_probe_and_add_lun+0x188/0x7c2 [] ? attribute_container_add_device+0x15/0xfa [] ? kobject_get+0xf/0x13 [] ? get_device+0x10/0x14 [] ? scsi_alloc_target+0x217/0x24d [] ? __scsi_scan_target+0x95/0x480 [] ? dequeue_entity+0x14/0x1fe [] ? update_curr+0x165/0x1ab [] ? update_curr+0x165/0x1ab [] ? scsi_scan_channel+0x4a/0x76 [] ? scsi_scan_host_selected+0x77/0xad [] ? do_scan_async+0x0/0x11a [] ? do_scsi_scan_host+0x51/0x56 [] ? do_scan_async+0x0/0x11a [] ? do_scan_async+0xe/0x11a [] ? do_scan_async+0x0/0x11a [] ? kthread+0x5e/0x63 [] ? kthread+0x0/0x63 [] ? kernel_thread_helper+0x6/0x10 Code: 44 24 1c 54 83 44 24 18 54 83 fa 03 75 94 8b 06 c7 86 64 02 00 00 01 00 00 00 83 e0 03 09 f0 89 06 8b 44 24 28 8b 90 58 01 00 00 <8b> 42 2c 85 c0 75 03 8b 42 08 8d 54 24 48 52 8d 4c 24 50 51 68 EIP: [] cfq_get_queue+0x232/0x412 SS:ESP 0068:dff8fd00 CR2: 000000000000002c ---[ end trace 9a88306573f69b12 ]--- The problem here is that we don't have bdi->dev information available when a thread does some IO. Hence when dev_name() tries to access bdi->dev, it crashes. This problem does not happen if kernel threads are in the root group, as the root group is statically allocated at device initialization time and we don't hit this piece of code. Fix it by delaying the filling in of the device's major and minor number information in the blk_group. Initially a blk_group is created with 0 as the device information, and this information is filled in later once some more IO comes in from the same group. Reported-by: Mike Kazantsev Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f65c6f01c475..9eba291eb6fd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1019,10 +1019,20 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) */ atomic_set(&cfqg->ref, 1); - /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. See code above. + */ + if (bdi->dev) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + 0); + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ -- cgit v1.2.3 From f281fb5fe54e15a7ab802945e42f8e24fceb56b2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Sat, 25 Sep 2010 12:42:55 +0200 Subject: block: prevent merges of discard and write requests Add logic to prevent two I/O requests from being merged if only one of them is a discard. Ditto secure discard. Without this fix, it is possible for write requests to transform into discard requests. For example: Submit bio 1 to discard 8 sectors from sector n Submit bio 2 to write 8 sectors from sector n + 16 Submit bio 3 to write 8 sectors from sector n + 8 Bio 1 becomes request 1. Bio 2 becomes request 2.
Bio 3 is merged with request 2, and request 2 is subsequently merged with request 1, resulting in just one I/O request which discards all 24 sectors. Signed-off-by: Adrian Hunter (Moved the checks above the position checks /Jens) Signed-off-by: Jens Axboe --- block/blk-merge.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 3b0cd4249671..eafc94f68d79 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -361,6 +361,18 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (!rq_mergeable(req) || !rq_mergeable(next)) return 0; + /* + * Don't merge file system requests and discard requests + */ + if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) + return 0; + + /* + * Don't merge discard requests and secure discard requests + */ + if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) + return 0; + /* * not contiguous */ -- cgit v1.2.3
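The rule the new checks in attempt_merge() implement can be restated compactly: two requests may merge only if they agree on both the discard bit and the secure-discard bit (a distilled restatement of the checks above, not additional kernel code):

	/* both tests compare masked bits, so mixed pairs never merge */
	static bool discard_flags_compatible(unsigned int a, unsigned int b)
	{
		if ((a & REQ_DISCARD) != (b & REQ_DISCARD))
			return false;	/* write vs. discard */
		if ((a & REQ_SECURE) != (b & REQ_SECURE))
			return false;	/* discard vs. secure discard */
		return true;
	}

In the example from the commit message, bio 2 and bio 3 may still merge with each other (both are plain writes), but the combined request can no longer merge with request 1 (a discard), so the discard stays confined to its original 8 sectors.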