diff options
author | Michal Marek <mmarek@suse.cz> | 2011-06-07 15:37:51 +0200 |
---|---|---|
committer | Michal Marek <mmarek@suse.cz> | 2011-06-07 15:37:51 +0200 |
commit | 2e483528cebad089d0bb3f9aebb0ada22d968ffa (patch) | |
tree | d701405826b271e819a9a8500838cebd37b1364a /block | |
parent | 163d3fe6a2357aba7b18b938d6ae6ce9570324e4 (diff) | |
parent | 55922c9d1b84b89cb946c777fddccb3247e7df2c (diff) | |
download | linux-2e483528cebad089d0bb3f9aebb0ada22d968ffa.tar.bz2 |
Merge commit 'v3.0-rc1' into kbuild/kbuild
Diffstat (limited to 'block')
-rw-r--r-- | block/blk-cgroup.c | 229 | ||||
-rw-r--r-- | block/blk-cgroup.h | 43 | ||||
-rw-r--r-- | block/blk-core.c | 223 | ||||
-rw-r--r-- | block/blk-exec.c | 4 | ||||
-rw-r--r-- | block/blk-flush.c | 26 | ||||
-rw-r--r-- | block/blk-integrity.c | 12 | ||||
-rw-r--r-- | block/blk-ioc.c | 3 | ||||
-rw-r--r-- | block/blk-lib.c | 82 | ||||
-rw-r--r-- | block/blk-settings.c | 9 | ||||
-rw-r--r-- | block/blk-sysfs.c | 14 | ||||
-rw-r--r-- | block/blk-throttle.c | 324 | ||||
-rw-r--r-- | block/blk.h | 25 | ||||
-rw-r--r-- | block/cfq-iosched.c | 265 | ||||
-rw-r--r-- | block/elevator.c | 47 | ||||
-rw-r--r-- | block/genhd.c | 12 |
15 files changed, 879 insertions, 439 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2bef5705ce24..bcaf16ee6ad1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup); static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct task_struct *, bool); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup *, struct task_struct *, bool); +static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); +static void blkiocg_attach_task(struct cgroup *, struct task_struct *); static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); @@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, + .can_attach_task = blkiocg_can_attach_task, + .attach_task = blkiocg_attach_task, .destroy = blkiocg_destroy, .populate = blkiocg_populate, #ifdef CONFIG_BLK_CGROUP @@ -114,6 +112,13 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(task_blkio_cgroup); + static inline void blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) { @@ -378,25 +383,40 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, spin_lock_irqsave(&blkg->stats_lock, flags); blkg->stats.time += time; +#ifdef CONFIG_DEBUG_BLK_CGROUP blkg->stats.unaccounted_time += unaccounted_time; +#endif spin_unlock_irqrestore(&blkg->stats_lock, flags); } EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); +/* + * should be called under rcu read lock or queue lock to make sure blkg pointer + * is valid. + */ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, bool direction, bool sync) { - struct blkio_group_stats *stats; + struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; - spin_lock_irqsave(&blkg->stats_lock, flags); - stats = &blkg->stats; - stats->sectors += bytes >> 9; - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, - sync); - blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, - direction, sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. + */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + stats_cpu->sectors += bytes >> 9; + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], + 1, direction, sync); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], + bytes, direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); @@ -419,18 +439,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, } EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); +/* Merged stats are per cpu. */ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, bool sync) { + struct blkio_group_stats_cpu *stats_cpu; unsigned long flags; - spin_lock_irqsave(&blkg->stats_lock, flags); - blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, - sync); - spin_unlock_irqrestore(&blkg->stats_lock, flags); + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. + */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, + direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); +/* + * This function allocates the per cpu stats for blkio_group. Should be called + * from sleepable context as alloc_per_cpu() requires that. + */ +int blkio_alloc_blkg_stats(struct blkio_group *blkg) +{ + /* Allocate memory for per cpu stats */ + blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); + if (!blkg->stats_cpu) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid) @@ -501,6 +547,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); +static void blkio_reset_stats_cpu(struct blkio_group *blkg) +{ + struct blkio_group_stats_cpu *stats_cpu; + int i, j, k; + /* + * Note: On 64 bit arch this should not be an issue. This has the + * possibility of returning some inconsistent value on 32bit arch + * as 64bit update on 32bit is non atomic. Taking care of this + * corner case makes code very complicated, like sending IPIs to + * cpus, taking care of stats of offline cpus etc. + * + * reset stats is anyway more of a debug feature and this sounds a + * corner case. So I am not complicating the code yet until and + * unless this becomes a real issue. + */ + for_each_possible_cpu(i) { + stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); + stats_cpu->sectors = 0; + for(j = 0; j < BLKIO_STAT_CPU_NR; j++) + for (k = 0; k < BLKIO_STAT_TOTAL; k++) + stats_cpu->stat_arr_cpu[j][k] = 0; + } +} + static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -545,7 +615,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) } #endif spin_unlock(&blkg->stats_lock); + + /* Reset Per cpu stats which don't take blkg->stats_lock */ + blkio_reset_stats_cpu(blkg); } + spin_unlock_irq(&blkcg->lock); return 0; } @@ -591,6 +665,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, return val; } + +static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, + enum stat_type_cpu type, enum stat_sub_type sub_type) +{ + int cpu; + struct blkio_group_stats_cpu *stats_cpu; + u64 val = 0, tval; + + for_each_possible_cpu(cpu) { + unsigned int start; + stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); + + do { + start = u64_stats_fetch_begin(&stats_cpu->syncp); + if (type == BLKIO_STAT_CPU_SECTORS) + tval = stats_cpu->sectors; + else + tval = stats_cpu->stat_arr_cpu[type][sub_type]; + } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); + + val += tval; + } + + return val; +} + +static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) +{ + uint64_t disk_total, val; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_CPU_SECTORS) { + val = blkio_read_stat_cpu(blkg, type, 0); + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); + } + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + val = blkio_read_stat_cpu(blkg, type, sub_type); + cb->fill(cb, key_str, val); + } + + disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + + blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); + + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + /* This should be called with blkg->stats_lock held */ static uint64_t blkio_get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) @@ -602,9 +729,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, if (type == BLKIO_STAT_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, blkg->stats.time, cb, dev); - if (type == BLKIO_STAT_SECTORS) - return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, - blkg->stats.sectors, cb, dev); #ifdef CONFIG_DEBUG_BLK_CGROUP if (type == BLKIO_STAT_UNACCOUNTED_TIME) return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, @@ -868,7 +992,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, } /* - * Some rules/values in blkg have changed. Propogate those to respective + * Some rules/values in blkg have changed. Propagate those to respective * policies. */ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, @@ -903,7 +1027,7 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, } /* - * A policy node rule has been updated. Propogate this update to all the + * A policy node rule has been updated. Propagate this update to all the * block groups which might be affected by this update. */ static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, @@ -1068,8 +1192,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, } static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, - struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, - bool show_total) + struct cftype *cft, struct cgroup_map_cb *cb, + enum stat_type type, bool show_total, bool pcpu) { struct blkio_group *blkg; struct hlist_node *n; @@ -1080,10 +1204,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, if (blkg->dev) { if (!cftype_blkg_same_policy(cft, blkg)) continue; - spin_lock_irq(&blkg->stats_lock); - cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, - type); - spin_unlock_irq(&blkg->stats_lock); + if (pcpu) + cgroup_total += blkio_get_stat_cpu(blkg, cb, + blkg->dev, type); + else { + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, + blkg->dev, type); + spin_unlock_irq(&blkg->stats_lock); + } } } if (show_total) @@ -1107,47 +1236,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, switch(name) { case BLKIO_PROP_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_TIME, 0); + BLKIO_STAT_TIME, 0, 0); case BLKIO_PROP_sectors: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SECTORS, 0); + BLKIO_STAT_CPU_SECTORS, 0, 1); case BLKIO_PROP_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); case BLKIO_PROP_io_serviced: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); + BLKIO_STAT_CPU_SERVICED, 1, 1); case BLKIO_PROP_io_service_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_TIME, 1); + BLKIO_STAT_SERVICE_TIME, 1, 0); case BLKIO_PROP_io_wait_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_WAIT_TIME, 1); + BLKIO_STAT_WAIT_TIME, 1, 0); case BLKIO_PROP_io_merged: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_MERGED, 1); + BLKIO_STAT_CPU_MERGED, 1, 1); case BLKIO_PROP_io_queued: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_QUEUED, 1); + BLKIO_STAT_QUEUED, 1, 0); #ifdef CONFIG_DEBUG_BLK_CGROUP case BLKIO_PROP_unaccounted_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_UNACCOUNTED_TIME, 0); + BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); case BLKIO_PROP_dequeue: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_DEQUEUE, 0); + BLKIO_STAT_DEQUEUE, 0, 0); case BLKIO_PROP_avg_queue_size: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_AVG_QUEUE_SIZE, 0); + BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); case BLKIO_PROP_group_wait_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_GROUP_WAIT_TIME, 0); + BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); case BLKIO_PROP_idle_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_IDLE_TIME, 0); + BLKIO_STAT_IDLE_TIME, 0, 0); case BLKIO_PROP_empty_time: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_EMPTY_TIME, 0); + BLKIO_STAT_EMPTY_TIME, 0, 0); #endif default: BUG(); @@ -1157,10 +1286,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, switch(name){ case BLKIO_THROTL_io_service_bytes: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICE_BYTES, 1); + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); case BLKIO_THROTL_io_serviced: return blkio_read_blkg_stats(blkcg, cft, cb, - BLKIO_STAT_SERVICED, 1); + BLKIO_STAT_CPU_SERVICED, 1, 1); default: BUG(); } @@ -1485,9 +1614,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup_subsys *subsys, - struct cgroup *cgroup, struct task_struct *tsk, - bool threadgroup) +static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; int ret = 0; @@ -1502,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys, return ret; } -static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, - struct cgroup *prev, struct task_struct *tsk, - bool threadgroup) +static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { struct io_context *ioc; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 10919fae2d3a..a71d2904ffb9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -14,6 +14,7 @@ */ #include <linux/cgroup.h> +#include <linux/u64_stats_sync.h> enum blkio_policy_id { BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ @@ -36,22 +37,15 @@ enum stat_type { * request completion for IOs doen by this cgroup. This may not be * accurate when NCQ is turned on. */ BLKIO_STAT_SERVICE_TIME = 0, - /* Total bytes transferred */ - BLKIO_STAT_SERVICE_BYTES, - /* Total IOs serviced, post merge */ - BLKIO_STAT_SERVICED, /* Total time spent waiting in scheduler queue in ns */ BLKIO_STAT_WAIT_TIME, - /* Number of IOs merged */ - BLKIO_STAT_MERGED, /* Number of IOs queued up */ BLKIO_STAT_QUEUED, /* All the single valued stats go below this */ BLKIO_STAT_TIME, - BLKIO_STAT_SECTORS, +#ifdef CONFIG_DEBUG_BLK_CGROUP /* Time not charged to this cgroup */ BLKIO_STAT_UNACCOUNTED_TIME, -#ifdef CONFIG_DEBUG_BLK_CGROUP BLKIO_STAT_AVG_QUEUE_SIZE, BLKIO_STAT_IDLE_TIME, BLKIO_STAT_EMPTY_TIME, @@ -60,6 +54,18 @@ enum stat_type { #endif }; +/* Per cpu stats */ +enum stat_type_cpu { + BLKIO_STAT_CPU_SECTORS, + /* Total bytes transferred */ + BLKIO_STAT_CPU_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_CPU_SERVICED, + /* Number of IOs merged */ + BLKIO_STAT_CPU_MERGED, + BLKIO_STAT_CPU_NR +}; + enum stat_sub_type { BLKIO_STAT_READ = 0, BLKIO_STAT_WRITE, @@ -116,11 +122,11 @@ struct blkio_cgroup { struct blkio_group_stats { /* total disk time and nr sectors dispatched by this group */ uint64_t time; - uint64_t sectors; - /* Time not charged to this cgroup */ - uint64_t unaccounted_time; uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; #ifdef CONFIG_DEBUG_BLK_CGROUP + /* Time not charged to this cgroup */ + uint64_t unaccounted_time; + /* Sum of number of IOs queued across all samples */ uint64_t avg_queue_size_sum; /* Count of samples taken for average */ @@ -145,6 +151,13 @@ struct blkio_group_stats { #endif }; +/* Per cpu blkio group stats */ +struct blkio_group_stats_cpu { + uint64_t sectors; + uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; + struct u64_stats_sync syncp; +}; + struct blkio_group { /* An rcu protected unique identifier for the group */ void *key; @@ -160,6 +173,8 @@ struct blkio_group { /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; struct blkio_group_stats stats; + /* Per cpu stats pointer */ + struct blkio_group_stats_cpu __percpu *stats_cpu; }; struct blkio_policy_node { @@ -291,9 +306,11 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid); +extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -314,11 +331,15 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline struct blkio_cgroup * +task_blkio_cgroup(struct task_struct *tsk) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid) {} +static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } + static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index e0a062363937..d2f8f4049abd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -198,26 +198,13 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -/* - * Make sure that plugs that were pending when this function was entered, - * are now complete and requests pushed to the queue. -*/ -static inline void queue_sync_plugs(struct request_queue *q) -{ - /* - * If the current process is plugged and has barriers submitted, - * we will livelock if we don't unplug first. - */ - blk_flush_plug(current); -} - static void blk_delay_work(struct work_struct *work) { struct request_queue *q; q = container_of(work, struct request_queue, delay_work.work); spin_lock_irq(q->queue_lock); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); } @@ -233,7 +220,8 @@ static void blk_delay_work(struct work_struct *work) */ void blk_delay_queue(struct request_queue *q, unsigned long msecs) { - schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs)); + queue_delayed_work(kblockd_workqueue, &q->delay_work, + msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_delay_queue); @@ -251,7 +239,7 @@ void blk_start_queue(struct request_queue *q) WARN_ON(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); - __blk_run_queue(q, false); + __blk_run_queue(q); } EXPORT_SYMBOL(blk_start_queue); @@ -298,38 +286,44 @@ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_delayed_work_sync(&q->delay_work); - queue_sync_plugs(q); } EXPORT_SYMBOL(blk_sync_queue); /** * __blk_run_queue - run a single device queue * @q: The queue to run - * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. * * Description: * See @blk_run_queue. This variant must be called with the queue lock * held and interrupts disabled. - * */ -void __blk_run_queue(struct request_queue *q, bool force_kblockd) +void __blk_run_queue(struct request_queue *q) { if (unlikely(blk_queue_stopped(q))) return; - /* - * Only recurse once to avoid overrunning the stack, let the unplug - * handling reinvoke the handler shortly if we already got there. - */ - if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else - queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + q->request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); /** + * blk_run_queue_async - run a single device queue in workqueue context + * @q: The queue to run + * + * Description: + * Tells kblockd to perform the equivalent of @blk_run_queue on behalf + * of us. + */ +void blk_run_queue_async(struct request_queue *q) +{ + if (likely(!blk_queue_stopped(q))) { + __cancel_delayed_work(&q->delay_work); + queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + } +} +EXPORT_SYMBOL(blk_run_queue_async); + +/** * blk_run_queue - run a single device queue * @q: The queue to run * @@ -342,7 +336,7 @@ void blk_run_queue(struct request_queue *q) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_run_queue); @@ -351,6 +345,7 @@ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } +EXPORT_SYMBOL(blk_put_queue); /* * Note: If a driver supplied the queue lock, it should not zap that lock @@ -572,11 +567,10 @@ int blk_get_queue(struct request_queue *q) return 1; } +EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) { - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - if (rq->cmd_flags & REQ_ELVPRIV) elv_put_request(q, rq); mempool_free(rq, q->rq.rq_pool); @@ -991,7 +985,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, blk_queue_end_tag(q, rq); add_acct_request(q, rq, where); - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_insert_request); @@ -1116,14 +1110,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; - /* - * Debug stuff, kill later - */ - if (!rq_mergeable(req)) { - blk_dump_rq_flags(req, "back"); - return false; - } - if (!ll_back_merge_fn(q, req, bio)) return false; @@ -1138,6 +1124,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); return true; } @@ -1145,15 +1132,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; - sector_t sector; - - /* - * Debug stuff, kill later - */ - if (!rq_mergeable(req)) { - blk_dump_rq_flags(req, "front"); - return false; - } if (!ll_front_merge_fn(q, req, bio)) return false; @@ -1163,8 +1141,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); - sector = bio->bi_sector; - bio->bi_next = req->bio; req->bio = bio; @@ -1179,12 +1155,13 @@ static bool bio_attempt_front_merge(struct request_queue *q, req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); return true; } /* * Attempts to merge with the plugged list in the current process. Returns - * true if merge was succesful, otherwise false. + * true if merge was successful, otherwise false. */ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, struct bio *bio) @@ -1264,14 +1241,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) el_ret = elv_merge(q, &req, bio); if (el_ret == ELEVATOR_BACK_MERGE) { - BUG_ON(req->cmd_flags & REQ_ON_PLUG); if (bio_attempt_back_merge(q, req, bio)) { if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out_unlock; } } else if (el_ret == ELEVATOR_FRONT_MERGE) { - BUG_ON(req->cmd_flags & REQ_ON_PLUG); if (bio_attempt_front_merge(q, req, bio)) { if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); @@ -1311,23 +1286,27 @@ get_rq: plug = current->plug; if (plug) { - if (!plug->should_sort && !list_empty(&plug->list)) { + /* + * If this is the first request added after a plug, fire + * of a plug trace. If others have been added before, check + * if we have multiple devices in this plug. If so, make a + * note to sort the list before dispatch. + */ + if (list_empty(&plug->list)) + trace_block_plug(q); + else if (!plug->should_sort) { struct request *__rq; __rq = list_entry_rq(plug->list.prev); if (__rq->q != q) plug->should_sort = 1; } - /* - * Debug flag, kill later - */ - req->cmd_flags |= REQ_ON_PLUG; list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); - __blk_run_queue(q, false); + __blk_run_queue(q); out_unlock: spin_unlock_irq(q->queue_lock); } @@ -1548,7 +1527,8 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - blk_throtl_bio(q, &bio); + if (blk_throtl_bio(q, &bio)) + goto end_io; /* * If bio = NULL, bio has been throttled and will be submitted @@ -2163,7 +2143,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - printk(KERN_ERR "blk: request botched\n"); + blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } @@ -2644,6 +2624,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; /* @@ -2665,40 +2646,97 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - return !(rqa->q == rqb->q); + return !(rqa->q <= rqb->q); } -static void flush_plug_list(struct blk_plug *plug) +/* + * If 'from_schedule' is true, then postpone the dispatch of requests + * until a safe kblockd context. We due this to avoid accidental big + * additional stack usage in driver dispatch, in places where the originally + * plugger did not intend it. + */ +static void queue_unplugged(struct request_queue *q, unsigned int depth, + bool from_schedule) + __releases(q->queue_lock) +{ + trace_block_unplug(q, depth, !from_schedule); + + /* + * If we are punting this to kblockd, then we can safely drop + * the queue_lock before waking kblockd (which needs to take + * this lock). + */ + if (from_schedule) { + spin_unlock(q->queue_lock); + blk_run_queue_async(q); + } else { + __blk_run_queue(q); + spin_unlock(q->queue_lock); + } + +} + +static void flush_plug_callbacks(struct blk_plug *plug) +{ + LIST_HEAD(callbacks); + + if (list_empty(&plug->cb_list)) + return; + + list_splice_init(&plug->cb_list, &callbacks); + + while (!list_empty(&callbacks)) { + struct blk_plug_cb *cb = list_first_entry(&callbacks, + struct blk_plug_cb, + list); + list_del(&cb->list); + cb->callback(cb); + } +} + +void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; unsigned long flags; struct request *rq; + LIST_HEAD(list); + unsigned int depth; BUG_ON(plug->magic != PLUG_MAGIC); + flush_plug_callbacks(plug); if (list_empty(&plug->list)) return; - if (plug->should_sort) - list_sort(NULL, &plug->list, plug_rq_cmp); + list_splice_init(&plug->list, &list); + + if (plug->should_sort) { + list_sort(NULL, &list, plug_rq_cmp); + plug->should_sort = 0; + } q = NULL; + depth = 0; + + /* + * Save and disable interrupts here, to avoid doing it for every + * queue lock we have to take. + */ local_irq_save(flags); - while (!list_empty(&plug->list)) { - rq = list_entry_rq(plug->list.next); + while (!list_empty(&list)) { + rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); - BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); BUG_ON(!rq->q); if (rq->q != q) { - if (q) { - __blk_run_queue(q, false); - spin_unlock(q->queue_lock); - } + /* + * This drops the queue lock + */ + if (q) + queue_unplugged(q, depth, from_schedule); q = rq->q; + depth = 0; spin_lock(q->queue_lock); } - rq->cmd_flags &= ~REQ_ON_PLUG; - /* * rq is already accounted, so use raw insert */ @@ -2706,38 +2744,27 @@ static void flush_plug_list(struct blk_plug *plug) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); - } - if (q) { - __blk_run_queue(q, false); - spin_unlock(q->queue_lock); + depth++; } - BUG_ON(!list_empty(&plug->list)); - local_irq_restore(flags); -} - -static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug) -{ - flush_plug_list(plug); + /* + * This drops the queue lock + */ + if (q) + queue_unplugged(q, depth, from_schedule); - if (plug == tsk->plug) - tsk->plug = NULL; + local_irq_restore(flags); } void blk_finish_plug(struct blk_plug *plug) { - if (plug) - __blk_finish_plug(current, plug); -} -EXPORT_SYMBOL(blk_finish_plug); + blk_flush_plug_list(plug, false); -void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug) -{ - __blk_finish_plug(tsk, plug); - tsk->plug = plug; + if (plug == current->plug) + current->plug = NULL; } -EXPORT_SYMBOL(__blk_flush_plug); +EXPORT_SYMBOL(blk_finish_plug); int __init blk_dev_init(void) { diff --git a/block/blk-exec.c b/block/blk-exec.c index 7482b7fa863b..8a0e7ec056e7 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -55,8 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, WARN_ON(irqs_disabled()); spin_lock_irq(q->queue_lock); __elv_add_request(q, rq, where); - __blk_run_queue(q, false); - /* the queue is stopped so it won't be plugged+unplugged */ + __blk_run_queue(q); + /* the queue is stopped so it won't be run */ if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); spin_unlock_irq(q->queue_lock); diff --git a/block/blk-flush.c b/block/blk-flush.c index 93d5fd8e51eb..bb21e4c36f70 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error) } /* - * Moving a request silently to empty queue_head may stall the - * queue. Kick the queue in those cases. This function is called - * from request completion path and calling directly into - * request_fn may confuse the driver. Always use kblockd. + * Kick the queue to avoid stall for two cases: + * 1. Moving a request silently to empty queue_head may stall the + * queue. + * 2. When flush request is running in non-queueable queue, the + * queue is hold. Restart the queue after flush request is finished + * to avoid stall. + * This function is called from request completion path and calling + * directly into request_fn may confuse the driver. Always use + * kblockd. */ - if (queued) - __blk_run_queue(q, true); + if (queued || q->flush_queue_delayed) + blk_run_queue_async(q); + q->flush_queue_delayed = 0; } /** @@ -261,7 +267,7 @@ static bool blk_kick_flush(struct request_queue *q) q->flush_rq.end_io = flush_end_io; q->flush_pending_idx ^= 1; - elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE); + list_add_tail(&q->flush_rq.queuelist, &q->queue_head); return true; } @@ -274,14 +280,14 @@ static void flush_data_end_io(struct request *rq, int error) * the comment in flush_end_io(). */ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) - __blk_run_queue(q, true); + blk_run_queue_async(q); } /** * blk_insert_flush - insert a new FLUSH/FUA request * @rq: request to insert * - * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions. + * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. * @@ -312,7 +318,7 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - list_add(&rq->queuelist, &q->queue_head); + list_add_tail(&rq->queuelist, &q->queue_head); return; } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 54bcba6c02a7..129b9e209a3b 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -30,6 +30,8 @@ static struct kmem_cache *integrity_cachep; +static const char *bi_unsupported_name = "unsupported"; + /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -358,6 +360,14 @@ static struct kobj_type integrity_ktype = { .release = blk_integrity_release, }; +bool blk_integrity_is_initialized(struct gendisk *disk) +{ + struct blk_integrity *bi = blk_get_integrity(disk); + + return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); +} +EXPORT_SYMBOL(blk_integrity_is_initialized); + /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware @@ -407,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi->get_tag_fn = template->get_tag_fn; bi->tag_size = template->tag_size; } else - bi->name = "unsupported"; + bi->name = bi_unsupported_name; return 0; } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index b791022beef3..c898049dafd5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + ret->cgroup_changed = 0; +#endif } return ret; diff --git a/block/blk-lib.c b/block/blk-lib.c index 25de73e4759b..78e627e2581d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,17 +9,20 @@ #include "blk.h" -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } +struct bio_batch { + atomic_t done; + unsigned long flags; + struct completion *wait; +}; - if (bio->bi_private) - complete(bio->bi_private); +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + if (err && (err != -EOPNOTSUPP)) + clear_bit(BIO_UPTODATE, &bb->flags); + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); bio_put(bio); } @@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; + struct bio_batch bb; struct bio *bio; int ret = 0; @@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, type |= REQ_SECURE; } - while (nr_sects && !ret) { + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + while (nr_sects) { bio = bio_alloc(gfp_mask, 1); if (!bio) { ret = -ENOMEM; @@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; + bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; - bio->bi_private = &wait; + bio->bi_private = &bb; if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; @@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, nr_sects = 0; } - bio_get(bio); + atomic_inc(&bb.done); submit_bio(type, bio); + } + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) wait_for_completion(&wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } + if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -EIO; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); -struct bio_batch -{ - atomic_t done; - unsigned long flags; - struct completion *wait; -}; - -static void bio_batch_end_io(struct bio *bio, int err) -{ - struct bio_batch *bb = bio->bi_private; - - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bb->flags); - else - clear_bit(BIO_UPTODATE, &bb->flags); - } - if (bb) - if (atomic_dec_and_test(&bb->done)) - complete(bb->wait); - bio_put(bio); -} - /** * blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bb.flags = 1 << BIO_UPTODATE; bb.wait = &wait; -submit: ret = 0; while (nr_sects != 0) { bio = bio_alloc(gfp_mask, @@ -168,9 +151,6 @@ submit: while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - if (sz == 0) - /* bio has maximum size possible */ - break; ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); nr_sects -= ret >> 9; sector += ret >> 9; @@ -190,16 +170,6 @@ submit: /* One of bios in the batch was completed with error.*/ ret = -EIO; - if (ret) - goto out; - - if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { - ret = -EOPNOTSUPP; - goto out; - } - if (nr_sects != 0) - goto submit; -out: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-settings.c b/block/blk-settings.c index 1fa769293597..fa1eb0449a05 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; - lim->discard_zeroes_data = -1; + lim->discard_zeroes_data = 1; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_set_default_limits(&q->limits); blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); + q->limits.discard_zeroes_data = 0; /* * by default assume old behaviour and bounce for any highmem page @@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush) } EXPORT_SYMBOL_GPL(blk_queue_flush); +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + q->flush_not_queueable = !queueable; +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 261c75c665ae..d935bd859c87 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_SYNC); - } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + } else { blk_clear_queue_full(q, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_queue_full(q, BLK_RW_ASYNC); - } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + } else { blk_clear_queue_full(q, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } @@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_discard_sectors << 9, page); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_discard_sectors << 9); } static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) @@ -498,7 +499,6 @@ int blk_register_queue(struct gendisk *disk) { int ret; struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; if (WARN_ON(!q)) @@ -509,8 +509,10 @@ int blk_register_queue(struct gendisk *disk) return ret; ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) + if (ret < 0) { + blk_trace_remove_sysfs(dev); return ret; + } kobject_uevent(&q->kobj, KOBJ_ADD); @@ -521,7 +523,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) { kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); - blk_trace_remove_sysfs(disk_to_dev(disk)); + blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5352bdafbcf0..a62be8d0dc1b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -77,7 +77,9 @@ struct throtl_grp { unsigned long slice_end[2]; /* Some throttle limits got updated for the group */ - bool limits_changed; + int limits_changed; + + struct rcu_head rcu_head; }; struct throtl_data @@ -88,7 +90,7 @@ struct throtl_data /* service tree for active throtl groups */ struct throtl_rb_root tg_service_tree; - struct throtl_grp root_tg; + struct throtl_grp *root_tg; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ @@ -102,7 +104,7 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; - bool limits_changed; + int limits_changed; }; enum tg_state_flags { @@ -151,57 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) return tg; } -static void throtl_put_tg(struct throtl_grp *tg) +static void throtl_free_tg(struct rcu_head *head) { - BUG_ON(atomic_read(&tg->ref) <= 0); - if (!atomic_dec_and_test(&tg->ref)) - return; + struct throtl_grp *tg; + + tg = container_of(head, struct throtl_grp, rcu_head); + free_percpu(tg->blkg.stats_cpu); kfree(tg); } -static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct cgroup *cgroup) +static void throtl_put_tg(struct throtl_grp *tg) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - struct throtl_grp *tg = NULL; - void *key = td; - struct backing_dev_info *bdi = &td->queue->backing_dev_info; - unsigned int major, minor; + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; /* - * TODO: Speed up blkiocg_lookup_group() by maintaining a radix - * tree of blkg (instead of traversing through hash list all - * the time. + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. + * + * Having a reference to blkg under an rcu allows acess to only + * values local to groups like group stats and group rate limits */ + call_rcu(&tg->rcu_head, throtl_free_tg); +} - /* - * This is the common case when there are no blkio cgroups. - * Avoid lookup in this case - */ - if (blkcg == &blkio_root_cgroup) - tg = &td->root_tg; - else - tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); - - /* Fill in device details for root group */ - if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - tg->blkg.dev = MKDEV(major, minor); - goto done; - } - - if (tg) - goto done; - - tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); - if (!tg) - goto done; - +static void throtl_init_group(struct throtl_grp *tg) +{ INIT_HLIST_NODE(&tg->tg_node); RB_CLEAR_NODE(&tg->rb_node); bio_list_init(&tg->bio_lists[0]); bio_list_init(&tg->bio_lists[1]); - td->limits_changed = false; + tg->limits_changed = false; + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; /* * Take the initial reference that will be released on destroy @@ -210,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, * exit or cgroup deletion path depending on who is exiting first. */ atomic_set(&tg->ref, 1); +} + +/* Should be called with rcu read lock held (needed for blkcg) */ +static void +throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) +{ + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +} + +static void +__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + if (!tg || tg->blkg.dev) + return; + + /* + * Fill in device details for a group which might not have been + * filled at group creation time as queue was being instantiated + * and driver had not attached a device yet + */ + if (bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + } +} + +/* + * Should be called with without queue lock held. Here queue lock will be + * taken rarely. It will be taken only once during life time of a group + * if need be + */ +static void +throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!tg || tg->blkg.dev) + return; + + spin_lock_irq(td->queue->queue_lock); + __throtl_tg_fill_dev_details(td, tg); + spin_unlock_irq(td->queue->queue_lock); +} + +static void throtl_init_add_tg_lists(struct throtl_data *td, + struct throtl_grp *tg, struct blkio_cgroup *blkcg) +{ + __throtl_tg_fill_dev_details(td, tg); /* Add group onto cgroup list */ - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, - MKDEV(major, minor), BLKIO_POLICY_THROTL); + tg->blkg.dev, BLKIO_POLICY_THROTL); tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; -done: + throtl_add_group_to_td_list(td, tg); +} + +/* Should be called without queue lock and outside of rcu period */ +static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL; + int ret; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + return NULL; + + ret = blkio_alloc_blkg_stats(&tg->blkg); + + if (ret) { + kfree(tg); + return NULL; + } + + throtl_init_group(tg); return tg; } -static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +static struct +throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) { - struct cgroup *cgroup; struct throtl_grp *tg = NULL; + void *key = td; + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + tg = td->root_tg; + else + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + __throtl_tg_fill_dev_details(td, tg); + return tg; +} + +/* + * This function returns with queue lock unlocked in case of error, like + * request queue is no more + */ +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL, *__tg = NULL; + struct blkio_cgroup *blkcg; + struct request_queue *q = td->queue; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - tg = throtl_find_alloc_tg(td, cgroup); - if (!tg) - tg = &td->root_tg; + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + rcu_read_unlock(); + return tg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc + * + * Take the request queue reference to make sure queue does not + * go away once we return from allocation. + */ + blk_get_queue(q); + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + tg = throtl_alloc_tg(td); + /* + * We might have slept in group allocation. Make sure queue is not + * dead + */ + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { + blk_put_queue(q); + if (tg) + kfree(tg); + + return ERR_PTR(-ENODEV); + } + blk_put_queue(q); + + /* Group allocated and queue is still alive. take the lock */ + spin_lock_irq(q->queue_lock); + + /* + * Initialize the new group. After sleeping, read the blkcg again. + */ + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __tg = throtl_find_tg(td, blkcg); + + if (__tg) { + kfree(tg); + rcu_read_unlock(); + return __tg; + } + + /* Group allocation failed. Account the IO to root group */ + if (!tg) { + tg = td->root_tg; + return tg; + } + + throtl_init_add_tg_lists(td, tg, blkcg); rcu_read_unlock(); return tg; } @@ -545,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, return 0; } +static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) + return 1; + return 0; +} + /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched @@ -609,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->bytes_disp[rw] += bio->bi_size; tg->io_disp[rw]++; - /* - * TODO: This will take blkg->stats_lock. Figure out a way - * to avoid this cost. - */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); } @@ -916,7 +1055,7 @@ static void throtl_update_blkio_group_common(struct throtl_data *td, /* * For all update functions, key should be a valid pointer because these * update functions are called under blkcg_lock, that means, blkg is - * valid and in turn key is valid. queue exit path can not race becuase + * valid and in turn key is valid. queue exit path can not race because * of blkcg_lock * * Can not take queue lock in update functions as queue lock under blkcg_lock @@ -990,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) struct throtl_grp *tg; struct bio *bio = *biop; bool rw = bio_data_dir(bio), update_disptime = true; + struct blkio_cgroup *blkcg; if (bio->bi_rw & REQ_THROTTLED) { bio->bi_rw &= ~REQ_THROTTLED; return 0; } + /* + * A throtl_grp pointer retrieved under rcu can be used to access + * basic fields like stats and io rates. If a group has no rules, + * just update the dispatch stats in lockless manner and return. + */ + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + throtl_tg_fill_dev_details(td, tg); + + if (tg_no_rule_group(tg, rw)) { + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, + rw, bio->bi_rw & REQ_SYNC); + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + /* + * Either group has not been allocated yet or it is not an unlimited + * IO group + */ + spin_lock_irq(q->queue_lock); tg = throtl_get_tg(td); + if (IS_ERR(tg)) { + if (PTR_ERR(tg) == -ENODEV) { + /* + * Queue is gone. No queue lock held here. + */ + return -ENODEV; + } + } + if (tg->nr_queued[rw]) { /* * There is already another bio queued in same dir. No @@ -1061,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; td->limits_changed = false; + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); - /* Init root group */ - tg = &td->root_tg; - INIT_HLIST_NODE(&tg->tg_node); - RB_CLEAR_NODE(&tg->rb_node); - bio_list_init(&tg->bio_lists[0]); - bio_list_init(&tg->bio_lists[1]); - - /* Practically unlimited BW */ - tg->bps[0] = tg->bps[1] = -1; - tg->iops[0] = tg->iops[1] = -1; - td->limits_changed = false; + /* alloc and Init root group. */ + td->queue = q; + tg = throtl_alloc_tg(td); - /* - * Set root group reference to 2. One reference will be dropped when - * all groups on tg_list are being deleted during queue exit. Other - * reference will remain there as we don't want to delete this group - * as it is statically allocated and gets destroyed when throtl_data - * goes away. - */ - atomic_set(&tg->ref, 2); - hlist_add_head(&tg->tg_node, &td->tg_list); - td->nr_undestroyed_grps++; + if (!tg) { + kfree(td); + return -ENOMEM; + } - INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + td->root_tg = tg; rcu_read_lock(); - blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, - 0, BLKIO_POLICY_THROTL); + throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); rcu_read_unlock(); /* Attach throtl data to request queue */ - td->queue = q; q->td = td; return 0; } diff --git a/block/blk.h b/block/blk.h index c8db371a921d..d6586287adc9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -32,7 +32,7 @@ enum rq_atomic_flags { /* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them suceeds + * sure that only one of them succeeds */ static inline int blk_mark_rq_complete(struct request *rq) { @@ -62,7 +62,28 @@ static inline struct request *__elv_next_request(struct request_queue *q) return rq; } - if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (q->flush_pending_idx != q->flush_running_idx && + !queue_flush_queueable(q)) { + q->flush_queue_delayed = 1; + return NULL; + } + if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || + !q->elevator->ops->elevator_dispatch_fn(q, 0)) return NULL; } } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7785169f3c8f..7c52d6888924 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -300,7 +300,9 @@ struct cfq_data { /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; - struct rcu_head rcu; + + /* Number of groups which are on blkcg->blkg_list */ + unsigned int nr_blkcg_linked_grps; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq2 == NULL) return rq1; - if (rq_is_sync(rq1) && !rq_is_sync(rq2)) - return rq1; - else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) - return rq2; - if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) - return rq1; - else if ((rq2->cmd_flags & REQ_META) && - !(rq1->cmd_flags & REQ_META)) - return rq2; + if (rq_is_sync(rq1) != rq_is_sync(rq2)) + return rq_is_sync(rq1) ? rq1 : rq2; + + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) + return rq1->cmd_flags & REQ_META ? rq1 : rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -888,7 +886,7 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) /* * Currently put the group at the end. Later implement something * so that groups get lesser vtime based on their weights, so that - * if group does not loose all if it was not continously backlogged. + * if group does not loose all if it was not continuously backlogged. */ n = rb_last(&st->rb); if (n) { @@ -1014,29 +1012,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, cfqg->needs_update = true; } -static struct cfq_group * -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, + struct cfq_group *cfqg, struct blkio_cgroup *blkcg) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); - struct cfq_group *cfqg = NULL; - void *key = cfqd; - int i, j; - struct cfq_rb_root *st; struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; unsigned int major, minor; - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. + */ + if (bdi->dev) { sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfqg->blkg.dev = MKDEV(major, minor); - goto done; - } - if (cfqg || !create) - goto done; + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, 0); + + cfqd->nr_blkcg_linked_grps++; + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); +} + +/* + * Should be called from sleepable context. No request queue lock as per + * cpu stats are allocated dynamically and alloc_percpu needs to be called + * from sleepable context. + */ +static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = NULL; + int i, j, ret; + struct cfq_rb_root *st; cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); if (!cfqg) - goto done; + return NULL; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; @@ -1050,43 +1066,94 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) */ cfqg->ref = 1; + ret = blkio_alloc_blkg_stats(&cfqg->blkg); + if (ret) { + kfree(cfqg); + return NULL; + } + + return cfqg; +} + +static struct cfq_group * +cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) +{ + struct cfq_group *cfqg = NULL; + void *key = cfqd; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + /* - * Add group onto cgroup list. It might happen that bdi->dev is - * not initialized yet. Initialize this new group without major - * and minor info and this info will be filled in once a new thread - * comes for IO. See code above. + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case */ - if (bdi->dev) { - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - MKDEV(major, minor)); - } else - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, - 0); - - cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + if (blkcg == &blkio_root_cgroup) + cfqg = &cfqd->root_group; + else + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); - /* Add group on cfqd list */ - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + } -done: return cfqg; } /* - * Search for the cfq group current task belongs to. If create = 1, then also - * create the cfq group if it does not exist. request_queue lock must be held. + * Search for the cfq group current task belongs to. request_queue lock must + * be held. */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { - struct cgroup *cgroup; - struct cfq_group *cfqg = NULL; + struct blkio_cgroup *blkcg; + struct cfq_group *cfqg = NULL, *__cfqg = NULL; + struct request_queue *q = cfqd->queue; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); - if (!cfqg && create) + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_cfqg(cfqd, blkcg); + if (cfqg) { + rcu_read_unlock(); + return cfqg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + * + * Not taking any queue reference here and assuming that queue is + * around by the time we return. CFQ queue allocation code does + * the same. It might be racy though. + */ + + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + cfqg = cfq_alloc_cfqg(cfqd); + + spin_lock_irq(q->queue_lock); + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __cfqg = cfq_find_cfqg(cfqd, blkcg); + + if (__cfqg) { + kfree(cfqg); + rcu_read_unlock(); + return __cfqg; + } + + if (!cfqg) cfqg = &cfqd->root_group; + + cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); rcu_read_unlock(); return cfqg; } @@ -1119,6 +1186,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) return; for_each_cfqg_st(cfqg, i, j, st) BUG_ON(!RB_EMPTY_ROOT(&st->rb)); + free_percpu(cfqg->blkg.stats_cpu); kfree(cfqg); } @@ -1177,7 +1245,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) } #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) { return &cfqd->root_group; } @@ -1211,7 +1279,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; int new_cfqq = 1; - int group_changed = 0; service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq)); @@ -1282,7 +1349,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; - if ((add_front || !new_cfqq) && !group_changed) + if (add_front || !new_cfqq) return; cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } @@ -2030,7 +2097,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); + return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); } /* @@ -2582,28 +2649,20 @@ static void cfq_put_queue(struct cfq_queue *cfqq) } /* - * Must always be called with the rcu_read_lock() held + * Call func for each cic attached to this ioc. */ static void -__call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) +call_for_each_cic(struct io_context *ioc, + void (*func)(struct io_context *, struct cfq_io_context *)) { struct cfq_io_context *cic; struct hlist_node *n; + rcu_read_lock(); + hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) func(ioc, cic); -} -/* - * Call func for each cic attached to this ioc. - */ -static void -call_for_each_cic(struct io_context *ioc, - void (*func)(struct io_context *, struct cfq_io_context *)) -{ - rcu_read_lock(); - __call_for_each_cic(ioc, func); rcu_read_unlock(); } @@ -2664,7 +2723,7 @@ static void cfq_free_io_context(struct io_context *ioc) * should be ok to iterate over the known list, we will see all cic's * since no new ones are added. */ - __call_for_each_cic(ioc, cic_free_func); + call_for_each_cic(ioc, cic_free_func); } static void cfq_put_cooperator(struct cfq_queue *cfqq) @@ -2920,7 +2979,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_group *cfqg; retry: - cfqg = cfq_get_cfqg(cfqd, 1); + cfqg = cfq_get_cfqg(cfqd); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -3368,7 +3427,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->busy_queues > 1) { cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); } else { cfq_blkiocg_update_idle_time_stats( &cfqq->cfqg->blkg); @@ -3383,7 +3442,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); } } @@ -3743,7 +3802,7 @@ static void cfq_kick_queue(struct work_struct *work) struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue, false); + __blk_run_queue(cfqd->queue); spin_unlock_irq(q->queue_lock); } @@ -3824,15 +3883,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } -static void cfq_cfqd_free(struct rcu_head *head) -{ - kfree(container_of(head, struct cfq_data, rcu)); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; + bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -3851,7 +3906,13 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); cfq_release_cfq_groups(cfqd); - cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); + + /* + * If there are groups which we could not unlink from blkcg list, + * wait for a rcu period for them to be freed. + */ + if (cfqd->nr_blkcg_linked_grps) + wait = true; spin_unlock_irq(q->queue_lock); @@ -3861,8 +3922,25 @@ static void cfq_exit_queue(struct elevator_queue *e) ida_remove(&cic_index_ida, cfqd->cic_index); spin_unlock(&cic_index_lock); - /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ - call_rcu(&cfqd->rcu, cfq_cfqd_free); + /* + * Wait for cfqg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other unlinked groups out + * there. This can happen if cgroup deletion path claimed the + * responsibility of cleaning up a group before queue cleanup code + * get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* Free up per cpu stats for root group */ + free_percpu(cfqd->root_group.blkg.stats_cpu); +#endif + kfree(cfqd); } static int cfq_alloc_cic_index(void) @@ -3895,8 +3973,12 @@ static void *cfq_init_queue(struct request_queue *q) return NULL; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); - if (!cfqd) + if (!cfqd) { + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, i); + spin_unlock(&cic_index_lock); return NULL; + } /* * Don't need take queue_lock in the routine, since we are @@ -3918,14 +4000,29 @@ static void *cfq_init_queue(struct request_queue *q) #ifdef CONFIG_CFQ_GROUP_IOSCHED /* - * Take a reference to root group which we never drop. This is just - * to make sure that cfq_put_cfqg() does not try to kfree root group + * Set root group reference to 2. One reference will be dropped when + * all groups on cfqd->cfqg_list are being deleted during queue exit. + * Other reference will remain there as we don't want to delete this + * group as it is statically allocated and gets destroyed when + * throtl_data goes away. */ - cfqg->ref = 1; + cfqg->ref = 2; + + if (blkio_alloc_blkg_stats(&cfqg->blkg)) { + kfree(cfqg); + kfree(cfqd); + return NULL; + } + rcu_read_lock(); + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); rcu_read_unlock(); + cfqd->nr_blkcg_linked_grps++; + + /* Add group on cfqd->cfqg_list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we diff --git a/block/elevator.c b/block/elevator.c index c387d3168734..b0b38ce0dcb6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name) e = elevator_find(name); if (!e) { - char elv[ELV_NAME_MAX + strlen("-iosched")]; - spin_unlock(&elv_list_lock); - - snprintf(elv, sizeof(elv), "%s-iosched", name); - - request_module("%s", elv); + request_module("%s-iosched", name); spin_lock(&elv_list_lock); e = elevator_find(name); } @@ -421,8 +416,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) struct list_head *entry; int stop_flags; - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - if (q->last_merge == rq) q->last_merge = NULL; @@ -610,7 +603,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) rq->cmd_flags &= ~REQ_STARTED; - elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); + __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); } void elv_drain_elevator(struct request_queue *q) @@ -642,7 +635,7 @@ void elv_quiesce_start(struct request_queue *q) */ elv_drain_elevator(q); while (q->rq.elvpriv) { - __blk_run_queue(q, false); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); @@ -655,12 +648,24 @@ void elv_quiesce_end(struct request_queue *q) queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); } -void elv_insert(struct request_queue *q, struct request *rq, int where) +void __elv_add_request(struct request_queue *q, struct request *rq, int where) { trace_block_rq_insert(q, rq); rq->q = q; + if (rq->cmd_flags & REQ_SOFTBARRIER) { + /* barriers are scheduling boundary, update end_sector */ + if (rq->cmd_type == REQ_TYPE_FS || + (rq->cmd_flags & REQ_DISCARD)) { + q->end_sector = rq_end_sector(rq); + q->boundary_rq = rq; + } + } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + (where == ELEVATOR_INSERT_SORT || + where == ELEVATOR_INSERT_SORT_MERGE)) + where = ELEVATOR_INSERT_BACK; + switch (where) { case ELEVATOR_INSERT_REQUEUE: case ELEVATOR_INSERT_FRONT: @@ -682,7 +687,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * with anything. There's no point in delaying queue * processing. */ - __blk_run_queue(q, false); + __blk_run_queue(q); break; case ELEVATOR_INSERT_SORT_MERGE: @@ -722,24 +727,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) BUG(); } } - -void __elv_add_request(struct request_queue *q, struct request *rq, int where) -{ - BUG_ON(rq->cmd_flags & REQ_ON_PLUG); - - if (rq->cmd_flags & REQ_SOFTBARRIER) { - /* barriers are scheduling boundary, update end_sector */ - if (rq->cmd_type == REQ_TYPE_FS || - (rq->cmd_flags & REQ_DISCARD)) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && - where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - elv_insert(q, rq, where); -} EXPORT_SYMBOL(__elv_add_request); void elv_add_request(struct request_queue *q, struct request *rq, int where) diff --git a/block/genhd.c b/block/genhd.c index c91a2dac6b6b..95822ae25cfe 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -739,7 +739,7 @@ void __init printk_all_partitions(void) /* * Don't show empty devices or things that have been - * surpressed + * suppressed */ if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) @@ -1588,9 +1588,13 @@ static void disk_events_workfn(struct work_struct *work) spin_unlock_irq(&ev->lock); - /* tell userland about new events */ + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported. Unlisted events are processed the + * same internally but never get reported to userland. + */ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if (events & (1 << i)) + if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i]; if (nr_events) @@ -1724,7 +1728,7 @@ static void disk_add_events(struct gendisk *disk) { struct disk_events *ev; - if (!disk->fops->check_events || !(disk->events | disk->async_events)) + if (!disk->fops->check_events) return; ev = kzalloc(sizeof(*ev), GFP_KERNEL); |