path: root/block
author    Michal Marek <mmarek@suse.cz>  2011-06-07 15:37:51 +0200
committer Michal Marek <mmarek@suse.cz>  2011-06-07 15:37:51 +0200
commit    2e483528cebad089d0bb3f9aebb0ada22d968ffa (patch)
tree      d701405826b271e819a9a8500838cebd37b1364a /block
parent    163d3fe6a2357aba7b18b938d6ae6ce9570324e4 (diff)
parent    55922c9d1b84b89cb946c777fddccb3247e7df2c (diff)
download  linux-2e483528cebad089d0bb3f9aebb0ada22d968ffa.tar.bz2
Merge commit 'v3.0-rc1' into kbuild/kbuild
Diffstat (limited to 'block')
-rw-r--r--  block/blk-cgroup.c     229
-rw-r--r--  block/blk-cgroup.h      43
-rw-r--r--  block/blk-core.c       223
-rw-r--r--  block/blk-exec.c         4
-rw-r--r--  block/blk-flush.c       26
-rw-r--r--  block/blk-integrity.c   12
-rw-r--r--  block/blk-ioc.c          3
-rw-r--r--  block/blk-lib.c         82
-rw-r--r--  block/blk-settings.c     9
-rw-r--r--  block/blk-sysfs.c       14
-rw-r--r--  block/blk-throttle.c   324
-rw-r--r--  block/blk.h             25
-rw-r--r--  block/cfq-iosched.c    265
-rw-r--r--  block/elevator.c        47
-rw-r--r--  block/genhd.c           12
15 files changed, 879 insertions(+), 439 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2bef5705ce24..bcaf16ee6ad1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
-static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
- struct task_struct *, bool);
-static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
- struct cgroup *, struct task_struct *, bool);
+static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
+static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
@@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
- .can_attach = blkiocg_can_attach,
- .attach = blkiocg_attach,
+ .can_attach_task = blkiocg_can_attach_task,
+ .attach_task = blkiocg_attach_task,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
@@ -114,6 +112,13 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
+struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, blkio_subsys_id),
+ struct blkio_cgroup, css);
+}
+EXPORT_SYMBOL_GPL(task_blkio_cgroup);
+
static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
@@ -378,25 +383,40 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
spin_lock_irqsave(&blkg->stats_lock, flags);
blkg->stats.time += time;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
blkg->stats.unaccounted_time += unaccounted_time;
+#endif
spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+/*
+ * should be called under rcu read lock or queue lock to make sure blkg pointer
+ * is valid.
+ */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
uint64_t bytes, bool direction, bool sync)
{
- struct blkio_group_stats *stats;
+ struct blkio_group_stats_cpu *stats_cpu;
unsigned long flags;
- spin_lock_irqsave(&blkg->stats_lock, flags);
- stats = &blkg->stats;
- stats->sectors += bytes >> 9;
- blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
- sync);
- blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
- direction, sync);
- spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ /*
+ * Disabling interrupts to provide mutual exclusion between two
+ * writes on same cpu. It probably is not needed for 64bit. Not
+ * optimizing that case yet.
+ */
+ local_irq_save(flags);
+
+ stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+
+ u64_stats_update_begin(&stats_cpu->syncp);
+ stats_cpu->sectors += bytes >> 9;
+ blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
+ 1, direction, sync);
+ blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
+ bytes, direction, sync);
+ u64_stats_update_end(&stats_cpu->syncp);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
@@ -419,18 +439,44 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
+/* Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
bool sync)
{
+ struct blkio_group_stats_cpu *stats_cpu;
unsigned long flags;
- spin_lock_irqsave(&blkg->stats_lock, flags);
- blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
- sync);
- spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ /*
+ * Disabling interrupts to provide mutual exclusion between two
+ * writes on same cpu. It probably is not needed for 64bit. Not
+ * optimizing that case yet.
+ */
+ local_irq_save(flags);
+
+ stats_cpu = this_cpu_ptr(blkg->stats_cpu);
+
+ u64_stats_update_begin(&stats_cpu->syncp);
+ blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
+ direction, sync);
+ u64_stats_update_end(&stats_cpu->syncp);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
+/*
+ * This function allocates the per cpu stats for blkio_group. Should be called
+ * from sleepable context as alloc_per_cpu() requires that.
+ */
+int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+{
+ /* Allocate memory for per cpu stats */
+ blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+ if (!blkg->stats_cpu)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev,
enum blkio_policy_id plid)
@@ -501,6 +547,30 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
+static void blkio_reset_stats_cpu(struct blkio_group *blkg)
+{
+ struct blkio_group_stats_cpu *stats_cpu;
+ int i, j, k;
+ /*
+ * Note: On 64 bit arch this should not be an issue. This has the
+ * possibility of returning some inconsistent value on 32bit arch
+ * as 64bit update on 32bit is non atomic. Taking care of this
+ * corner case makes code very complicated, like sending IPIs to
+ * cpus, taking care of stats of offline cpus etc.
+ *
+ * reset stats is anyway more of a debug feature and this sounds a
+ * corner case. So I am not complicating the code yet until and
+ * unless this becomes a real issue.
+ */
+ for_each_possible_cpu(i) {
+ stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
+ stats_cpu->sectors = 0;
+ for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
+ for (k = 0; k < BLKIO_STAT_TOTAL; k++)
+ stats_cpu->stat_arr_cpu[j][k] = 0;
+ }
+}
+
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
@@ -545,7 +615,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
}
#endif
spin_unlock(&blkg->stats_lock);
+
+ /* Reset Per cpu stats which don't take blkg->stats_lock */
+ blkio_reset_stats_cpu(blkg);
}
+
spin_unlock_irq(&blkcg->lock);
return 0;
}
@@ -591,6 +665,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
return val;
}
+
+static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
+ enum stat_type_cpu type, enum stat_sub_type sub_type)
+{
+ int cpu;
+ struct blkio_group_stats_cpu *stats_cpu;
+ u64 val = 0, tval;
+
+ for_each_possible_cpu(cpu) {
+ unsigned int start;
+ stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
+
+ do {
+ start = u64_stats_fetch_begin(&stats_cpu->syncp);
+ if (type == BLKIO_STAT_CPU_SECTORS)
+ tval = stats_cpu->sectors;
+ else
+ tval = stats_cpu->stat_arr_cpu[type][sub_type];
+ } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
+
+ val += tval;
+ }
+
+ return val;
+}
+
+static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
+ struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
+{
+ uint64_t disk_total, val;
+ char key_str[MAX_KEY_LEN];
+ enum stat_sub_type sub_type;
+
+ if (type == BLKIO_STAT_CPU_SECTORS) {
+ val = blkio_read_stat_cpu(blkg, type, 0);
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
+ }
+
+ for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+ sub_type++) {
+ blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+ val = blkio_read_stat_cpu(blkg, type, sub_type);
+ cb->fill(cb, key_str, val);
+ }
+
+ disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
+ blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
+
+ blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+ cb->fill(cb, key_str, disk_total);
+ return disk_total;
+}
+
/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
@@ -602,9 +729,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
if (type == BLKIO_STAT_TIME)
return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
blkg->stats.time, cb, dev);
- if (type == BLKIO_STAT_SECTORS)
- return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
- blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
if (type == BLKIO_STAT_UNACCOUNTED_TIME)
return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
@@ -868,7 +992,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
}
/*
- * Some rules/values in blkg have changed. Propogate those to respective
+ * Some rules/values in blkg have changed. Propagate those to respective
* policies.
*/
static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
@@ -903,7 +1027,7 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
}
/*
- * A policy node rule has been updated. Propogate this update to all the
+ * A policy node rule has been updated. Propagate this update to all the
* block groups which might be affected by this update.
*/
static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
@@ -1068,8 +1192,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
}
static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
- struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
- bool show_total)
+ struct cftype *cft, struct cgroup_map_cb *cb,
+ enum stat_type type, bool show_total, bool pcpu)
{
struct blkio_group *blkg;
struct hlist_node *n;
@@ -1080,10 +1204,15 @@ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
if (blkg->dev) {
if (!cftype_blkg_same_policy(cft, blkg))
continue;
- spin_lock_irq(&blkg->stats_lock);
- cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
- type);
- spin_unlock_irq(&blkg->stats_lock);
+ if (pcpu)
+ cgroup_total += blkio_get_stat_cpu(blkg, cb,
+ blkg->dev, type);
+ else {
+ spin_lock_irq(&blkg->stats_lock);
+ cgroup_total += blkio_get_stat(blkg, cb,
+ blkg->dev, type);
+ spin_unlock_irq(&blkg->stats_lock);
+ }
}
}
if (show_total)
@@ -1107,47 +1236,47 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
switch(name) {
case BLKIO_PROP_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_TIME, 0);
+ BLKIO_STAT_TIME, 0, 0);
case BLKIO_PROP_sectors:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_SECTORS, 0);
+ BLKIO_STAT_CPU_SECTORS, 0, 1);
case BLKIO_PROP_io_service_bytes:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_SERVICE_BYTES, 1);
+ BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
case BLKIO_PROP_io_serviced:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_SERVICED, 1);
+ BLKIO_STAT_CPU_SERVICED, 1, 1);
case BLKIO_PROP_io_service_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_SERVICE_TIME, 1);
+ BLKIO_STAT_SERVICE_TIME, 1, 0);
case BLKIO_PROP_io_wait_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_WAIT_TIME, 1);
+ BLKIO_STAT_WAIT_TIME, 1, 0);
case BLKIO_PROP_io_merged:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_MERGED, 1);
+ BLKIO_STAT_CPU_MERGED, 1, 1);
case BLKIO_PROP_io_queued:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_QUEUED, 1);
+ BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
case BLKIO_PROP_unaccounted_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_UNACCOUNTED_TIME, 0);
+ BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
case BLKIO_PROP_dequeue:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_DEQUEUE, 0);
+ BLKIO_STAT_DEQUEUE, 0, 0);
case BLKIO_PROP_avg_queue_size:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+ BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
case BLKIO_PROP_group_wait_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_GROUP_WAIT_TIME, 0);
+ BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
case BLKIO_PROP_idle_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_IDLE_TIME, 0);
+ BLKIO_STAT_IDLE_TIME, 0, 0);
case BLKIO_PROP_empty_time:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_EMPTY_TIME, 0);
+ BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
default:
BUG();
@@ -1157,10 +1286,10 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
switch(name){
case BLKIO_THROTL_io_service_bytes:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_SERVICE_BYTES, 1);
+ BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
case BLKIO_THROTL_io_serviced:
return blkio_read_blkg_stats(blkcg, cft, cb,
- BLKIO_STAT_SERVICED, 1);
+ BLKIO_STAT_CPU_SERVICED, 1, 1);
default:
BUG();
}
@@ -1485,9 +1614,7 @@ done:
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
- struct cgroup *cgroup, struct task_struct *tsk,
- bool threadgroup)
+static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
int ret = 0;
@@ -1502,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
return ret;
}
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
- struct cgroup *prev, struct task_struct *tsk,
- bool threadgroup)
+static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 10919fae2d3a..a71d2904ffb9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -14,6 +14,7 @@
*/
#include <linux/cgroup.h>
+#include <linux/u64_stats_sync.h>
enum blkio_policy_id {
BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
@@ -36,22 +37,15 @@ enum stat_type {
* request completion for IOs doen by this cgroup. This may not be
* accurate when NCQ is turned on. */
BLKIO_STAT_SERVICE_TIME = 0,
- /* Total bytes transferred */
- BLKIO_STAT_SERVICE_BYTES,
- /* Total IOs serviced, post merge */
- BLKIO_STAT_SERVICED,
/* Total time spent waiting in scheduler queue in ns */
BLKIO_STAT_WAIT_TIME,
- /* Number of IOs merged */
- BLKIO_STAT_MERGED,
/* Number of IOs queued up */
BLKIO_STAT_QUEUED,
/* All the single valued stats go below this */
BLKIO_STAT_TIME,
- BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Time not charged to this cgroup */
BLKIO_STAT_UNACCOUNTED_TIME,
-#ifdef CONFIG_DEBUG_BLK_CGROUP
BLKIO_STAT_AVG_QUEUE_SIZE,
BLKIO_STAT_IDLE_TIME,
BLKIO_STAT_EMPTY_TIME,
@@ -60,6 +54,18 @@ enum stat_type {
#endif
};
+/* Per cpu stats */
+enum stat_type_cpu {
+ BLKIO_STAT_CPU_SECTORS,
+ /* Total bytes transferred */
+ BLKIO_STAT_CPU_SERVICE_BYTES,
+ /* Total IOs serviced, post merge */
+ BLKIO_STAT_CPU_SERVICED,
+ /* Number of IOs merged */
+ BLKIO_STAT_CPU_MERGED,
+ BLKIO_STAT_CPU_NR
+};
+
enum stat_sub_type {
BLKIO_STAT_READ = 0,
BLKIO_STAT_WRITE,
@@ -116,11 +122,11 @@ struct blkio_cgroup {
struct blkio_group_stats {
/* total disk time and nr sectors dispatched by this group */
uint64_t time;
- uint64_t sectors;
- /* Time not charged to this cgroup */
- uint64_t unaccounted_time;
uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* Time not charged to this cgroup */
+ uint64_t unaccounted_time;
+
/* Sum of number of IOs queued across all samples */
uint64_t avg_queue_size_sum;
/* Count of samples taken for average */
@@ -145,6 +151,13 @@ struct blkio_group_stats {
#endif
};
+/* Per cpu blkio group stats */
+struct blkio_group_stats_cpu {
+ uint64_t sectors;
+ uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
+ struct u64_stats_sync syncp;
+};
+
struct blkio_group {
/* An rcu protected unique identifier for the group */
void *key;
@@ -160,6 +173,8 @@ struct blkio_group {
/* Need to serialize the stats in the case of reset/update */
spinlock_t stats_lock;
struct blkio_group_stats stats;
+ /* Per cpu stats pointer */
+ struct blkio_group_stats_cpu __percpu *stats_cpu;
};
struct blkio_policy_node {
@@ -291,9 +306,11 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
extern struct blkio_cgroup blkio_root_cgroup;
extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
+extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev,
enum blkio_policy_id plid);
+extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
void *key);
@@ -314,11 +331,15 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
struct cgroup;
static inline struct blkio_cgroup *
cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
+static inline struct blkio_cgroup *
+task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev,
enum blkio_policy_id plid) {}
+static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
+
static inline int
blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
diff --git a/block/blk-core.c b/block/blk-core.c
index e0a062363937..d2f8f4049abd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -198,26 +198,13 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-/*
- * Make sure that plugs that were pending when this function was entered,
- * are now complete and requests pushed to the queue.
-*/
-static inline void queue_sync_plugs(struct request_queue *q)
-{
- /*
- * If the current process is plugged and has barriers submitted,
- * we will livelock if we don't unplug first.
- */
- blk_flush_plug(current);
-}
-
static void blk_delay_work(struct work_struct *work)
{
struct request_queue *q;
q = container_of(work, struct request_queue, delay_work.work);
spin_lock_irq(q->queue_lock);
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
spin_unlock_irq(q->queue_lock);
}
@@ -233,7 +220,8 @@ static void blk_delay_work(struct work_struct *work)
*/
void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
- schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
+ queue_delayed_work(kblockd_workqueue, &q->delay_work,
+ msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_delay_queue);
@@ -251,7 +239,7 @@ void blk_start_queue(struct request_queue *q)
WARN_ON(!irqs_disabled());
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);
@@ -298,38 +286,44 @@ void blk_sync_queue(struct request_queue *q)
{
del_timer_sync(&q->timeout);
cancel_delayed_work_sync(&q->delay_work);
- queue_sync_plugs(q);
}
EXPORT_SYMBOL(blk_sync_queue);
/**
* __blk_run_queue - run a single device queue
* @q: The queue to run
- * @force_kblockd: Don't run @q->request_fn directly. Use kblockd.
*
* Description:
* See @blk_run_queue. This variant must be called with the queue lock
* held and interrupts disabled.
- *
*/
-void __blk_run_queue(struct request_queue *q, bool force_kblockd)
+void __blk_run_queue(struct request_queue *q)
{
if (unlikely(blk_queue_stopped(q)))
return;
- /*
- * Only recurse once to avoid overrunning the stack, let the unplug
- * handling reinvoke the handler shortly if we already got there.
- */
- if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
- q->request_fn(q);
- queue_flag_clear(QUEUE_FLAG_REENTER, q);
- } else
- queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
+ q->request_fn(q);
}
EXPORT_SYMBOL(__blk_run_queue);
/**
+ * blk_run_queue_async - run a single device queue in workqueue context
+ * @q: The queue to run
+ *
+ * Description:
+ * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
+ * of us.
+ */
+void blk_run_queue_async(struct request_queue *q)
+{
+ if (likely(!blk_queue_stopped(q))) {
+ __cancel_delayed_work(&q->delay_work);
+ queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
+ }
+}
+EXPORT_SYMBOL(blk_run_queue_async);
+
+/**
* blk_run_queue - run a single device queue
* @q: The queue to run
*
@@ -342,7 +336,7 @@ void blk_run_queue(struct request_queue *q)
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);
@@ -351,6 +345,7 @@ void blk_put_queue(struct request_queue *q)
{
kobject_put(&q->kobj);
}
+EXPORT_SYMBOL(blk_put_queue);
/*
* Note: If a driver supplied the queue lock, it should not zap that lock
@@ -572,11 +567,10 @@ int blk_get_queue(struct request_queue *q)
return 1;
}
+EXPORT_SYMBOL(blk_get_queue);
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
- BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
if (rq->cmd_flags & REQ_ELVPRIV)
elv_put_request(q, rq);
mempool_free(rq, q->rq.rq_pool);
@@ -991,7 +985,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
blk_queue_end_tag(q, rq);
add_acct_request(q, rq, where);
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_insert_request);
@@ -1116,14 +1110,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
- /*
- * Debug stuff, kill later
- */
- if (!rq_mergeable(req)) {
- blk_dump_rq_flags(req, "back");
- return false;
- }
-
if (!ll_back_merge_fn(q, req, bio))
return false;
@@ -1138,6 +1124,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
+ elv_bio_merged(q, req, bio);
return true;
}
@@ -1145,15 +1132,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
struct request *req, struct bio *bio)
{
const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
- sector_t sector;
-
- /*
- * Debug stuff, kill later
- */
- if (!rq_mergeable(req)) {
- blk_dump_rq_flags(req, "front");
- return false;
- }
if (!ll_front_merge_fn(q, req, bio))
return false;
@@ -1163,8 +1141,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
- sector = bio->bi_sector;
-
bio->bi_next = req->bio;
req->bio = bio;
@@ -1179,12 +1155,13 @@ static bool bio_attempt_front_merge(struct request_queue *q,
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
+ elv_bio_merged(q, req, bio);
return true;
}
/*
* Attempts to merge with the plugged list in the current process. Returns
- * true if merge was succesful, otherwise false.
+ * true if merge was successful, otherwise false.
*/
static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
struct bio *bio)
@@ -1264,14 +1241,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
el_ret = elv_merge(q, &req, bio);
if (el_ret == ELEVATOR_BACK_MERGE) {
- BUG_ON(req->cmd_flags & REQ_ON_PLUG);
if (bio_attempt_back_merge(q, req, bio)) {
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
} else if (el_ret == ELEVATOR_FRONT_MERGE) {
- BUG_ON(req->cmd_flags & REQ_ON_PLUG);
if (bio_attempt_front_merge(q, req, bio)) {
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
@@ -1311,23 +1286,27 @@ get_rq:
plug = current->plug;
if (plug) {
- if (!plug->should_sort && !list_empty(&plug->list)) {
+ /*
+ * If this is the first request added after a plug, fire
+ * of a plug trace. If others have been added before, check
+ * if we have multiple devices in this plug. If so, make a
+ * note to sort the list before dispatch.
+ */
+ if (list_empty(&plug->list))
+ trace_block_plug(q);
+ else if (!plug->should_sort) {
struct request *__rq;
__rq = list_entry_rq(plug->list.prev);
if (__rq->q != q)
plug->should_sort = 1;
}
- /*
- * Debug flag, kill later
- */
- req->cmd_flags |= REQ_ON_PLUG;
list_add_tail(&req->queuelist, &plug->list);
drive_stat_acct(req, 1);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
out_unlock:
spin_unlock_irq(q->queue_lock);
}
@@ -1548,7 +1527,8 @@ static inline void __generic_make_request(struct bio *bio)
goto end_io;
}
- blk_throtl_bio(q, &bio);
+ if (blk_throtl_bio(q, &bio))
+ goto end_io;
/*
* If bio = NULL, bio has been throttled and will be submitted
@@ -2163,7 +2143,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
* size, something has gone terribly wrong.
*/
if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
- printk(KERN_ERR "blk: request botched\n");
+ blk_dump_rq_flags(req, "request botched");
req->__data_len = blk_rq_cur_bytes(req);
}
@@ -2644,6 +2624,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
+ INIT_LIST_HEAD(&plug->cb_list);
plug->should_sort = 0;
/*
@@ -2665,40 +2646,97 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
struct request *rqa = container_of(a, struct request, queuelist);
struct request *rqb = container_of(b, struct request, queuelist);
- return !(rqa->q == rqb->q);
+ return !(rqa->q <= rqb->q);
}
-static void flush_plug_list(struct blk_plug *plug)
+/*
+ * If 'from_schedule' is true, then postpone the dispatch of requests
+ * until a safe kblockd context. We due this to avoid accidental big
+ * additional stack usage in driver dispatch, in places where the originally
+ * plugger did not intend it.
+ */
+static void queue_unplugged(struct request_queue *q, unsigned int depth,
+ bool from_schedule)
+ __releases(q->queue_lock)
+{
+ trace_block_unplug(q, depth, !from_schedule);
+
+ /*
+ * If we are punting this to kblockd, then we can safely drop
+ * the queue_lock before waking kblockd (which needs to take
+ * this lock).
+ */
+ if (from_schedule) {
+ spin_unlock(q->queue_lock);
+ blk_run_queue_async(q);
+ } else {
+ __blk_run_queue(q);
+ spin_unlock(q->queue_lock);
+ }
+
+}
+
+static void flush_plug_callbacks(struct blk_plug *plug)
+{
+ LIST_HEAD(callbacks);
+
+ if (list_empty(&plug->cb_list))
+ return;
+
+ list_splice_init(&plug->cb_list, &callbacks);
+
+ while (!list_empty(&callbacks)) {
+ struct blk_plug_cb *cb = list_first_entry(&callbacks,
+ struct blk_plug_cb,
+ list);
+ list_del(&cb->list);
+ cb->callback(cb);
+ }
+}
+
+void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct request_queue *q;
unsigned long flags;
struct request *rq;
+ LIST_HEAD(list);
+ unsigned int depth;
BUG_ON(plug->magic != PLUG_MAGIC);
+ flush_plug_callbacks(plug);
if (list_empty(&plug->list))
return;
- if (plug->should_sort)
- list_sort(NULL, &plug->list, plug_rq_cmp);
+ list_splice_init(&plug->list, &list);
+
+ if (plug->should_sort) {
+ list_sort(NULL, &list, plug_rq_cmp);
+ plug->should_sort = 0;
+ }
q = NULL;
+ depth = 0;
+
+ /*
+ * Save and disable interrupts here, to avoid doing it for every
+ * queue lock we have to take.
+ */
local_irq_save(flags);
- while (!list_empty(&plug->list)) {
- rq = list_entry_rq(plug->list.next);
+ while (!list_empty(&list)) {
+ rq = list_entry_rq(list.next);
list_del_init(&rq->queuelist);
- BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
BUG_ON(!rq->q);
if (rq->q != q) {
- if (q) {
- __blk_run_queue(q, false);
- spin_unlock(q->queue_lock);
- }
+ /*
+ * This drops the queue lock
+ */
+ if (q)
+ queue_unplugged(q, depth, from_schedule);
q = rq->q;
+ depth = 0;
spin_lock(q->queue_lock);
}
- rq->cmd_flags &= ~REQ_ON_PLUG;
-
/*
* rq is already accounted, so use raw insert
*/
@@ -2706,38 +2744,27 @@ static void flush_plug_list(struct blk_plug *plug)
__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
else
__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
- }
- if (q) {
- __blk_run_queue(q, false);
- spin_unlock(q->queue_lock);
+ depth++;
}
- BUG_ON(!list_empty(&plug->list));
- local_irq_restore(flags);
-}
-
-static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
-{
- flush_plug_list(plug);
+ /*
+ * This drops the queue lock
+ */
+ if (q)
+ queue_unplugged(q, depth, from_schedule);
- if (plug == tsk->plug)
- tsk->plug = NULL;
+ local_irq_restore(flags);
}
void blk_finish_plug(struct blk_plug *plug)
{
- if (plug)
- __blk_finish_plug(current, plug);
-}
-EXPORT_SYMBOL(blk_finish_plug);
+ blk_flush_plug_list(plug, false);
-void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
-{
- __blk_finish_plug(tsk, plug);
- tsk->plug = plug;
+ if (plug == current->plug)
+ current->plug = NULL;
}
-EXPORT_SYMBOL(__blk_flush_plug);
+EXPORT_SYMBOL(blk_finish_plug);
int __init blk_dev_init(void)
{
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 7482b7fa863b..8a0e7ec056e7 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -55,8 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
WARN_ON(irqs_disabled());
spin_lock_irq(q->queue_lock);
__elv_add_request(q, rq, where);
- __blk_run_queue(q, false);
- /* the queue is stopped so it won't be plugged+unplugged */
+ __blk_run_queue(q);
+ /* the queue is stopped so it won't be run */
if (rq->cmd_type == REQ_TYPE_PM_RESUME)
q->request_fn(q);
spin_unlock_irq(q->queue_lock);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 93d5fd8e51eb..bb21e4c36f70 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -212,13 +212,19 @@ static void flush_end_io(struct request *flush_rq, int error)
}
/*
- * Moving a request silently to empty queue_head may stall the
- * queue. Kick the queue in those cases. This function is called
- * from request completion path and calling directly into
- * request_fn may confuse the driver. Always use kblockd.
+ * Kick the queue to avoid stall for two cases:
+ * 1. Moving a request silently to empty queue_head may stall the
+ * queue.
+ * 2. When flush request is running in non-queueable queue, the
+ * queue is hold. Restart the queue after flush request is finished
+ * to avoid stall.
+ * This function is called from request completion path and calling
+ * directly into request_fn may confuse the driver. Always use
+ * kblockd.
*/
- if (queued)
- __blk_run_queue(q, true);
+ if (queued || q->flush_queue_delayed)
+ blk_run_queue_async(q);
+ q->flush_queue_delayed = 0;
}
/**
@@ -261,7 +267,7 @@ static bool blk_kick_flush(struct request_queue *q)
q->flush_rq.end_io = flush_end_io;
q->flush_pending_idx ^= 1;
- elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE);
+ list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
return true;
}
@@ -274,14 +280,14 @@ static void flush_data_end_io(struct request *rq, int error)
* the comment in flush_end_io().
*/
if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
- __blk_run_queue(q, true);
+ blk_run_queue_async(q);
}
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
- * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
+ * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
@@ -312,7 +318,7 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- list_add(&rq->queuelist, &q->queue_head);
+ list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 54bcba6c02a7..129b9e209a3b 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -30,6 +30,8 @@
static struct kmem_cache *integrity_cachep;
+static const char *bi_unsupported_name = "unsupported";
+
/**
* blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
* @q: request queue
@@ -358,6 +360,14 @@ static struct kobj_type integrity_ktype = {
.release = blk_integrity_release,
};
+bool blk_integrity_is_initialized(struct gendisk *disk)
+{
+ struct blk_integrity *bi = blk_get_integrity(disk);
+
+ return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0);
+}
+EXPORT_SYMBOL(blk_integrity_is_initialized);
+
/**
* blk_integrity_register - Register a gendisk as being integrity-capable
* @disk: struct gendisk pointer to make integrity-aware
@@ -407,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
bi->get_tag_fn = template->get_tag_fn;
bi->tag_size = template->tag_size;
} else
- bi->name = "unsupported";
+ bi->name = bi_unsupported_name;
return 0;
}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index b791022beef3..c898049dafd5 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -96,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ret->cic_list);
ret->ioc_data = NULL;
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
+ ret->cgroup_changed = 0;
+#endif
}
return ret;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 25de73e4759b..78e627e2581d 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,17 +9,20 @@
#include "blk.h"
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
+struct bio_batch {
+ atomic_t done;
+ unsigned long flags;
+ struct completion *wait;
+};
- if (bio->bi_private)
- complete(bio->bi_private);
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+ struct bio_batch *bb = bio->bi_private;
+ if (err && (err != -EOPNOTSUPP))
+ clear_bit(BIO_UPTODATE, &bb->flags);
+ if (atomic_dec_and_test(&bb->done))
+ complete(bb->wait);
bio_put(bio);
}
@@ -41,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
unsigned int max_discard_sectors;
+ struct bio_batch bb;
struct bio *bio;
int ret = 0;
@@ -67,7 +71,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
type |= REQ_SECURE;
}
- while (nr_sects && !ret) {
+ atomic_set(&bb.done, 1);
+ bb.flags = 1 << BIO_UPTODATE;
+ bb.wait = &wait;
+
+ while (nr_sects) {
bio = bio_alloc(gfp_mask, 1);
if (!bio) {
ret = -ENOMEM;
@@ -75,9 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
bio->bi_sector = sector;
- bio->bi_end_io = blkdev_discard_end_io;
+ bio->bi_end_io = bio_batch_end_io;
bio->bi_bdev = bdev;
- bio->bi_private = &wait;
+ bio->bi_private = &bb;
if (nr_sects > max_discard_sectors) {
bio->bi_size = max_discard_sectors << 9;
@@ -88,45 +96,21 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
nr_sects = 0;
}
- bio_get(bio);
+ atomic_inc(&bb.done);
submit_bio(type, bio);
+ }
+ /* Wait for bios in-flight */
+ if (!atomic_dec_and_test(&bb.done))
wait_for_completion(&wait);
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
- else if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
- bio_put(bio);
- }
+ if (!test_bit(BIO_UPTODATE, &bb.flags))
+ ret = -EIO;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_discard);
-struct bio_batch
-{
- atomic_t done;
- unsigned long flags;
- struct completion *wait;
-};
-
-static void bio_batch_end_io(struct bio *bio, int err)
-{
- struct bio_batch *bb = bio->bi_private;
-
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bb->flags);
- else
- clear_bit(BIO_UPTODATE, &bb->flags);
- }
- if (bb)
- if (atomic_dec_and_test(&bb->done))
- complete(bb->wait);
- bio_put(bio);
-}
-
/**
* blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
@@ -151,7 +135,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
-submit:
ret = 0;
while (nr_sects != 0) {
bio = bio_alloc(gfp_mask,
@@ -168,9 +151,6 @@ submit:
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
- if (sz == 0)
- /* bio has maximum size possible */
- break;
ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
nr_sects -= ret >> 9;
sector += ret >> 9;
@@ -190,16 +170,6 @@ submit:
/* One of bios in the batch was completed with error.*/
ret = -EIO;
- if (ret)
- goto out;
-
- if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- if (nr_sects != 0)
- goto submit;
-out:
return ret;
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1fa769293597..fa1eb0449a05 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -120,7 +120,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
- lim->discard_zeroes_data = -1;
+ lim->discard_zeroes_data = 1;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@@ -166,6 +166,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
blk_set_default_limits(&q->limits);
blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+ q->limits.discard_zeroes_data = 0;
/*
* by default assume old behaviour and bounce for any highmem page
@@ -790,6 +791,12 @@ void blk_queue_flush(struct request_queue *q, unsigned int flush)
}
EXPORT_SYMBOL_GPL(blk_queue_flush);
+void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
+{
+ q->flush_not_queueable = !queueable;
+}
+EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
+
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 261c75c665ae..d935bd859c87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
blk_set_queue_full(q, BLK_RW_SYNC);
- } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
+ } else {
blk_clear_queue_full(q, BLK_RW_SYNC);
wake_up(&rl->wait[BLK_RW_SYNC]);
}
if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
blk_set_queue_full(q, BLK_RW_ASYNC);
- } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
+ } else {
blk_clear_queue_full(q, BLK_RW_ASYNC);
wake_up(&rl->wait[BLK_RW_ASYNC]);
}
@@ -152,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
{
- return queue_var_show(q->limits.max_discard_sectors << 9, page);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)q->limits.max_discard_sectors << 9);
}
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
@@ -498,7 +499,6 @@ int blk_register_queue(struct gendisk *disk)
{
int ret;
struct device *dev = disk_to_dev(disk);
-
struct request_queue *q = disk->queue;
if (WARN_ON(!q))
@@ -509,8 +509,10 @@ int blk_register_queue(struct gendisk *disk)
return ret;
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
- if (ret < 0)
+ if (ret < 0) {
+ blk_trace_remove_sysfs(dev);
return ret;
+ }
kobject_uevent(&q->kobj, KOBJ_ADD);
@@ -521,7 +523,7 @@ int blk_register_queue(struct gendisk *disk)
if (ret) {
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
- blk_trace_remove_sysfs(disk_to_dev(disk));
+ blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5352bdafbcf0..a62be8d0dc1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -77,7 +77,9 @@ struct throtl_grp {
unsigned long slice_end[2];
/* Some throttle limits got updated for the group */
- bool limits_changed;
+ int limits_changed;
+
+ struct rcu_head rcu_head;
};
struct throtl_data
@@ -88,7 +90,7 @@ struct throtl_data
/* service tree for active throtl groups */
struct throtl_rb_root tg_service_tree;
- struct throtl_grp root_tg;
+ struct throtl_grp *root_tg;
struct request_queue *queue;
/* Total Number of queued bios on READ and WRITE lists */
@@ -102,7 +104,7 @@ struct throtl_data
/* Work for dispatching throttled bios */
struct delayed_work throtl_work;
- bool limits_changed;
+ int limits_changed;
};
enum tg_state_flags {
@@ -151,57 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
return tg;
}
-static void throtl_put_tg(struct throtl_grp *tg)
+static void throtl_free_tg(struct rcu_head *head)
{
- BUG_ON(atomic_read(&tg->ref) <= 0);
- if (!atomic_dec_and_test(&tg->ref))
- return;
+ struct throtl_grp *tg;
+
+ tg = container_of(head, struct throtl_grp, rcu_head);
+ free_percpu(tg->blkg.stats_cpu);
kfree(tg);
}
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
- struct cgroup *cgroup)
+static void throtl_put_tg(struct throtl_grp *tg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct throtl_grp *tg = NULL;
- void *key = td;
- struct backing_dev_info *bdi = &td->queue->backing_dev_info;
- unsigned int major, minor;
+ BUG_ON(atomic_read(&tg->ref) <= 0);
+ if (!atomic_dec_and_test(&tg->ref))
+ return;
/*
- * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
- * tree of blkg (instead of traversing through hash list all
- * the time.
+ * A group is freed in rcu manner. But having an rcu lock does not
+ * mean that one can access all the fields of blkg and assume these
+ * are valid. For example, don't try to follow throtl_data and
+ * request queue links.
+ *
+ * Having a reference to blkg under an rcu allows acess to only
+ * values local to groups like group stats and group rate limits
*/
+ call_rcu(&tg->rcu_head, throtl_free_tg);
+}
- /*
- * This is the common case when there are no blkio cgroups.
- * Avoid lookup in this case
- */
- if (blkcg == &blkio_root_cgroup)
- tg = &td->root_tg;
- else
- tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
-
- /* Fill in device details for root group */
- if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- tg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
-
- if (tg)
- goto done;
-
- tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
- if (!tg)
- goto done;
-
+static void throtl_init_group(struct throtl_grp *tg)
+{
INIT_HLIST_NODE(&tg->tg_node);
RB_CLEAR_NODE(&tg->rb_node);
bio_list_init(&tg->bio_lists[0]);
bio_list_init(&tg->bio_lists[1]);
- td->limits_changed = false;
+ tg->limits_changed = false;
+
+ /* Practically unlimited BW */
+ tg->bps[0] = tg->bps[1] = -1;
+ tg->iops[0] = tg->iops[1] = -1;
/*
* Take the initial reference that will be released on destroy
@@ -210,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
* exit or cgroup deletion path depending on who is exiting first.
*/
atomic_set(&tg->ref, 1);
+}
+
+/* Should be called with rcu read lock held (needed for blkcg) */
+static void
+throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
+{
+ hlist_add_head(&tg->tg_node, &td->tg_list);
+ td->nr_undestroyed_grps++;
+}
+
+static void
+__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+ struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+ unsigned int major, minor;
+
+ if (!tg || tg->blkg.dev)
+ return;
+
+ /*
+ * Fill in device details for a group which might not have been
+ * filled at group creation time as queue was being instantiated
+ * and driver had not attached a device yet
+ */
+ if (bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ tg->blkg.dev = MKDEV(major, minor);
+ }
+}
+
+/*
+ * Should be called with without queue lock held. Here queue lock will be
+ * taken rarely. It will be taken only once during life time of a group
+ * if need be
+ */
+static void
+throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+ if (!tg || tg->blkg.dev)
+ return;
+
+ spin_lock_irq(td->queue->queue_lock);
+ __throtl_tg_fill_dev_details(td, tg);
+ spin_unlock_irq(td->queue->queue_lock);
+}
+
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+ struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+ __throtl_tg_fill_dev_details(td, tg);
/* Add group onto cgroup list */
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
- MKDEV(major, minor), BLKIO_POLICY_THROTL);
+ tg->blkg.dev, BLKIO_POLICY_THROTL);
tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
- hlist_add_head(&tg->tg_node, &td->tg_list);
- td->nr_undestroyed_grps++;
-done:
+ throtl_add_group_to_td_list(td, tg);
+}
+
+/* Should be called without queue lock and outside of rcu period */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+ struct throtl_grp *tg = NULL;
+ int ret;
+
+ tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+ if (!tg)
+ return NULL;
+
+ ret = blkio_alloc_blkg_stats(&tg->blkg);
+
+ if (ret) {
+ kfree(tg);
+ return NULL;
+ }
+
+ throtl_init_group(tg);
return tg;
}
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
- struct cgroup *cgroup;
struct throtl_grp *tg = NULL;
+ void *key = td;
+
+ /*
+ * This is the common case when there are no blkio cgroups.
+ * Avoid lookup in this case
+ */
+ if (blkcg == &blkio_root_cgroup)
+ tg = td->root_tg;
+ else
+ tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+ __throtl_tg_fill_dev_details(td, tg);
+ return tg;
+}
+
+/*
+ * This function returns with queue lock unlocked in case of error, like
+ * request queue is no more
+ */
+static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+{
+ struct throtl_grp *tg = NULL, *__tg = NULL;
+ struct blkio_cgroup *blkcg;
+ struct request_queue *q = td->queue;
rcu_read_lock();
- cgroup = task_cgroup(current, blkio_subsys_id);
- tg = throtl_find_alloc_tg(td, cgroup);
- if (!tg)
- tg = &td->root_tg;
+ blkcg = task_blkio_cgroup(current);
+ tg = throtl_find_tg(td, blkcg);
+ if (tg) {
+ rcu_read_unlock();
+ return tg;
+ }
+
+ /*
+ * Need to allocate a group. Allocation of group also needs allocation
+ * of per cpu stats which in-turn takes a mutex() and can block. Hence
+ * we need to drop rcu lock and queue_lock before we call alloc
+ *
+ * Take the request queue reference to make sure queue does not
+ * go away once we return from allocation.
+ */
+ blk_get_queue(q);
+ rcu_read_unlock();
+ spin_unlock_irq(q->queue_lock);
+
+ tg = throtl_alloc_tg(td);
+ /*
+ * We might have slept in group allocation. Make sure queue is not
+ * dead
+ */
+ if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+ blk_put_queue(q);
+ if (tg)
+ kfree(tg);
+
+ return ERR_PTR(-ENODEV);
+ }
+ blk_put_queue(q);
+
+ /* Group allocated and queue is still alive. take the lock */
+ spin_lock_irq(q->queue_lock);
+
+ /*
+ * Initialize the new group. After sleeping, read the blkcg again.
+ */
+ rcu_read_lock();
+ blkcg = task_blkio_cgroup(current);
+
+ /*
+ * If some other thread already allocated the group while we were
+ * not holding queue lock, free up the group
+ */
+ __tg = throtl_find_tg(td, blkcg);
+
+ if (__tg) {
+ kfree(tg);
+ rcu_read_unlock();
+ return __tg;
+ }
+
+ /* Group allocation failed. Account the IO to root group */
+ if (!tg) {
+ tg = td->root_tg;
+ return tg;
+ }
+
+ throtl_init_add_tg_lists(td, tg, blkcg);
rcu_read_unlock();
return tg;
}
@@ -545,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0;
}
+static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
+ if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
+ return 1;
+ return 0;
+}
+
/*
* Returns whether one can dispatch a bio or not. Also returns approx number
* of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -609,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++;
- /*
- * TODO: This will take blkg->stats_lock. Figure out a way
- * to avoid this cost.
- */
blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
}
@@ -916,7 +1055,7 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
/*
* For all update functions, key should be a valid pointer because these
* update functions are called under blkcg_lock, that means, blkg is
- * valid and in turn key is valid. queue exit path can not race becuase
+ * valid and in turn key is valid. queue exit path can not race because
* of blkcg_lock
*
* Can not take queue lock in update functions as queue lock under blkcg_lock
@@ -990,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
struct throtl_grp *tg;
struct bio *bio = *biop;
bool rw = bio_data_dir(bio), update_disptime = true;
+ struct blkio_cgroup *blkcg;
if (bio->bi_rw & REQ_THROTTLED) {
bio->bi_rw &= ~REQ_THROTTLED;
return 0;
}
+ /*
+ * A throtl_grp pointer retrieved under rcu can be used to access
+ * basic fields like stats and io rates. If a group has no rules,
+ * just update the dispatch stats in lockless manner and return.
+ */
+
+ rcu_read_lock();
+ blkcg = task_blkio_cgroup(current);
+ tg = throtl_find_tg(td, blkcg);
+ if (tg) {
+ throtl_tg_fill_dev_details(td, tg);
+
+ if (tg_no_rule_group(tg, rw)) {
+ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
+ rw, bio->bi_rw & REQ_SYNC);
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Either group has not been allocated yet or it is not an unlimited
+ * IO group
+ */
+
spin_lock_irq(q->queue_lock);
tg = throtl_get_tg(td);
+ if (IS_ERR(tg)) {
+ if (PTR_ERR(tg) == -ENODEV) {
+ /*
+ * Queue is gone. No queue lock held here.
+ */
+ return -ENODEV;
+ }
+ }
+
if (tg->nr_queued[rw]) {
/*
* There is already another bio queued in same dir. No
@@ -1061,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q)
INIT_HLIST_HEAD(&td->tg_list);
td->tg_service_tree = THROTL_RB_ROOT;
td->limits_changed = false;
+ INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
- /* Init root group */
- tg = &td->root_tg;
- INIT_HLIST_NODE(&tg->tg_node);
- RB_CLEAR_NODE(&tg->rb_node);
- bio_list_init(&tg->bio_lists[0]);
- bio_list_init(&tg->bio_lists[1]);
-
- /* Practically unlimited BW */
- tg->bps[0] = tg->bps[1] = -1;
- tg->iops[0] = tg->iops[1] = -1;
- td->limits_changed = false;
+ /* alloc and Init root group. */
+ td->queue = q;
+ tg = throtl_alloc_tg(td);
- /*
- * Set root group reference to 2. One reference will be dropped when
- * all groups on tg_list are being deleted during queue exit. Other
- * reference will remain there as we don't want to delete this group
- * as it is statically allocated and gets destroyed when throtl_data
- * goes away.
- */
- atomic_set(&tg->ref, 2);
- hlist_add_head(&tg->tg_node, &td->tg_list);
- td->nr_undestroyed_grps++;
+ if (!tg) {
+ kfree(td);
+ return -ENOMEM;
+ }
- INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+ td->root_tg = tg;
rcu_read_lock();
- blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
- 0, BLKIO_POLICY_THROTL);
+ throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
rcu_read_unlock();
/* Attach throtl data to request queue */
- td->queue = q;
q->td = td;
return 0;
}
diff --git a/block/blk.h b/block/blk.h
index c8db371a921d..d6586287adc9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -32,7 +32,7 @@ enum rq_atomic_flags {
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them suceeds
+ * sure that only one of them succeeds
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
@@ -62,7 +62,28 @@ static inline struct request *__elv_next_request(struct request_queue *q)
return rq;
}
- if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+ /*
+ * Flush request is running and flush request isn't queueable
+ * in the drive, we can hold the queue till flush request is
+ * finished. Even we don't do this, driver can't dispatch next
+ * requests and will requeue them. And this can improve
+ * throughput too. For example, we have request flush1, write1,
+ * flush 2. flush1 is dispatched, then queue is hold, write1
+ * isn't inserted to queue. After flush1 is finished, flush2
+ * will be dispatched. Since disk cache is already clean,
+ * flush2 will be finished very soon, so looks like flush2 is
+ * folded to flush1.
+ * Since the queue is hold, a flag is set to indicate the queue
+ * should be restarted later. Please see flush_end_io() for
+ * details.
+ */
+ if (q->flush_pending_idx != q->flush_running_idx &&
+ !queue_flush_queueable(q)) {
+ q->flush_queue_delayed = 1;
+ return NULL;
+ }
+ if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
+ !q->elevator->ops->elevator_dispatch_fn(q, 0))
return NULL;
}
}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7785169f3c8f..7c52d6888924 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -300,7 +300,9 @@ struct cfq_data {
/* List of cfq groups being managed on this device*/
struct hlist_head cfqg_list;
- struct rcu_head rcu;
+
+ /* Number of groups which are on blkcg->blkg_list */
+ unsigned int nr_blkcg_linked_grps;
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
if (rq2 == NULL)
return rq1;
- if (rq_is_sync(rq1) && !rq_is_sync(rq2))
- return rq1;
- else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
- return rq2;
- if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
- return rq1;
- else if ((rq2->cmd_flags & REQ_META) &&
- !(rq1->cmd_flags & REQ_META))
- return rq2;
+ if (rq_is_sync(rq1) != rq_is_sync(rq2))
+ return rq_is_sync(rq1) ? rq1 : rq2;
+
+ if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
+ return rq1->cmd_flags & REQ_META ? rq1 : rq2;
s1 = blk_rq_pos(rq1);
s2 = blk_rq_pos(rq2);
@@ -888,7 +886,7 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
/*
* Currently put the group at the end. Later implement something
* so that groups get lesser vtime based on their weights, so that
- * if group does not loose all if it was not continously backlogged.
+ * if group does not loose all if it was not continuously backlogged.
*/
n = rb_last(&st->rb);
if (n) {
@@ -1014,29 +1012,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
cfqg->needs_update = true;
}
-static struct cfq_group *
-cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
+ struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
{
- struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- struct cfq_group *cfqg = NULL;
- void *key = cfqd;
- int i, j;
- struct cfq_rb_root *st;
struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
unsigned int major, minor;
- cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ /*
+ * Add group onto cgroup list. It might happen that bdi->dev is
+ * not initialized yet. Initialize this new group without major
+ * and minor info and this info will be filled in once a new thread
+ * comes for IO.
+ */
+ if (bdi->dev) {
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfqg->blkg.dev = MKDEV(major, minor);
- goto done;
- }
- if (cfqg || !create)
- goto done;
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+ (void *)cfqd, MKDEV(major, minor));
+ } else
+ cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+ (void *)cfqd, 0);
+
+ cfqd->nr_blkcg_linked_grps++;
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+/*
+ * Should be called from a sleepable context, without the request queue
+ * lock held: per-cpu stats are allocated dynamically and alloc_percpu()
+ * needs to be called from a sleepable context.
+ */
+static struct cfq_group *cfq_alloc_cfqg(struct cfq_data *cfqd)
+{
+ struct cfq_group *cfqg = NULL;
+ int i, j, ret;
+ struct cfq_rb_root *st;
cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
if (!cfqg)
- goto done;
+ return NULL;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
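
cfq_init_add_cfqg_lists() above derives the device number by parsing dev_name(bdi->dev) as "major:minor". A small user-space sketch of that parse; SKETCH_MKDEV() only imitates the kernel's MKDEV() packing and the sample device name is invented:

/* User-space sketch of the "%u:%u" device-name parse done above. */
#include <stdio.h>

#define SKETCH_MINORBITS	20
#define SKETCH_MKDEV(ma, mi)	(((ma) << SKETCH_MINORBITS) | (mi))

int main(void)
{
	const char *dev_name = "8:16";	/* e.g. what dev_name(bdi->dev) might return */
	unsigned int major, minor;

	if (sscanf(dev_name, "%u:%u", &major, &minor) == 2)
		printf("dev = %#x\n", (unsigned)SKETCH_MKDEV(major, minor));
	return 0;
}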
@@ -1050,43 +1066,94 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
*/
cfqg->ref = 1;
+ ret = blkio_alloc_blkg_stats(&cfqg->blkg);
+ if (ret) {
+ kfree(cfqg);
+ return NULL;
+ }
+
+ return cfqg;
+}
+
+static struct cfq_group *
+cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+{
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+
/*
- * Add group onto cgroup list. It might happen that bdi->dev is
- * not initialized yet. Initialize this new group without major
- * and minor info and this info will be filled in once a new thread
- * comes for IO. See code above.
+ * This is the common case when there are no blkio cgroups.
+ * Avoid the lookup in that case.
*/
- if (bdi->dev) {
- sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- MKDEV(major, minor));
- } else
- cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
- 0);
-
- cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+ if (blkcg == &blkio_root_cgroup)
+ cfqg = &cfqd->root_group;
+ else
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
- /* Add group on cfqd list */
- hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ }
-done:
return cfqg;
}
/*
- * Search for the cfq group current task belongs to. If create = 1, then also
- * create the cfq group if it does not exist. request_queue lock must be held.
+ * Search for the cfq group the current task belongs to. The request_queue
+ * lock must be held.
*/
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
{
- struct cgroup *cgroup;
- struct cfq_group *cfqg = NULL;
+ struct blkio_cgroup *blkcg;
+ struct cfq_group *cfqg = NULL, *__cfqg = NULL;
+ struct request_queue *q = cfqd->queue;
rcu_read_lock();
- cgroup = task_cgroup(current, blkio_subsys_id);
- cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
- if (!cfqg && create)
+ blkcg = task_blkio_cgroup(current);
+ cfqg = cfq_find_cfqg(cfqd, blkcg);
+ if (cfqg) {
+ rcu_read_unlock();
+ return cfqg;
+ }
+
+ /*
+ * Need to allocate a group. Allocating the group also requires
+ * allocating per-cpu stats, which in turn takes a mutex and can
+ * block, so we must drop the rcu lock and queue_lock before the
+ * allocation.
+ *
+ * We take no queue reference here and assume the queue is still
+ * around by the time we return. The CFQ queue allocation code does
+ * the same, though it might be racy.
+ */
+
+ rcu_read_unlock();
+ spin_unlock_irq(q->queue_lock);
+
+ cfqg = cfq_alloc_cfqg(cfqd);
+
+ spin_lock_irq(q->queue_lock);
+
+ rcu_read_lock();
+ blkcg = task_blkio_cgroup(current);
+
+ /*
+ * If some other thread already allocated the group while we were
+ * not holding the queue lock, free the one we allocated.
+ */
+ __cfqg = cfq_find_cfqg(cfqd, blkcg);
+
+ if (__cfqg) {
+ kfree(cfqg);
+ rcu_read_unlock();
+ return __cfqg;
+ }
+
+ if (!cfqg)
cfqg = &cfqd->root_group;
+
+ cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
rcu_read_unlock();
return cfqg;
}
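
cfq_get_cfqg() above drops the rcu lock and queue_lock around the blocking allocation and then re-checks for a racing insertion. A condensed pthread sketch of that unlock/allocate/relock/recheck pattern, with a single hypothetical slot standing in for the blkcg group lookup:

/* Sketch of the unlock-allocate-relock-recheck pattern used above. */
#include <pthread.h>
#include <stdlib.h>

struct group { int weight; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *slot;			/* stands in for the blkcg lookup */

static struct group *get_group(void)
{
	struct group *g, *existing;

	pthread_mutex_lock(&lock);
	if (slot) {				/* fast path: already there */
		g = slot;
		pthread_mutex_unlock(&lock);
		return g;
	}
	pthread_mutex_unlock(&lock);		/* drop the lock: allocation may block */

	g = calloc(1, sizeof(*g));

	pthread_mutex_lock(&lock);
	existing = slot;			/* re-check: someone may have raced us */
	if (existing) {
		pthread_mutex_unlock(&lock);
		free(g);			/* lose the race gracefully */
		return existing;
	}
	if (g)
		slot = g;			/* publish our allocation */
	pthread_mutex_unlock(&lock);
	return g;				/* may be NULL if the allocation failed */
}

int main(void)
{
	return get_group() ? 0 : 1;
}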
@@ -1119,6 +1186,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+ free_percpu(cfqg->blkg.stats_cpu);
kfree(cfqg);
}
@@ -1177,7 +1245,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
}
#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
{
return &cfqd->root_group;
}
@@ -1211,7 +1279,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_rb_root *service_tree;
int left;
int new_cfqq = 1;
- int group_changed = 0;
service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
cfqq_type(cfqq));
@@ -1282,7 +1349,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
rb_link_node(&cfqq->rb_node, parent, p);
rb_insert_color(&cfqq->rb_node, &service_tree->rb);
service_tree->count++;
- if ((add_front || !new_cfqq) && !group_changed)
+ if (add_front || !new_cfqq)
return;
cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
}
@@ -2030,7 +2097,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
- return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
+ return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
}
/*
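
The cfq_prio_to_maxrq() change above is a pure simplification: with CFQ_PRIO_LISTS defined as IOPRIO_BE_NR (8), as in this tree, the old and new expressions are algebraically equal. A quick stand-alone check (the base_rq value is arbitrary):

/* Check that the rewritten formula matches the old one, assuming
 * CFQ_PRIO_LISTS == IOPRIO_BE_NR == 8 as defined in this tree. */
#include <assert.h>

#define IOPRIO_BE_NR	8
#define CFQ_PRIO_LISTS	IOPRIO_BE_NR

int main(void)
{
	const unsigned base_rq = 4;	/* stand-in for cfqd->cfq_slice_async_rq */
	unsigned prio;

	for (prio = 0; prio < IOPRIO_BE_NR; prio++) {
		unsigned old = 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - prio));
		unsigned new = 2 * base_rq * (IOPRIO_BE_NR - prio);

		assert(old == new);
	}
	return 0;
}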
@@ -2582,28 +2649,20 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
}
/*
- * Must always be called with the rcu_read_lock() held
+ * Call func for each cic attached to this ioc.
*/
static void
-__call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
+call_for_each_cic(struct io_context *ioc,
+ void (*func)(struct io_context *, struct cfq_io_context *))
{
struct cfq_io_context *cic;
struct hlist_node *n;
+ rcu_read_lock();
+
hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
func(ioc, cic);
-}
-/*
- * Call func for each cic attached to this ioc.
- */
-static void
-call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
-{
- rcu_read_lock();
- __call_for_each_cic(ioc, func);
rcu_read_unlock();
}
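
After the fold above, call_for_each_cic() takes the read-side lock itself, so callers no longer need a bare __call_for_each_cic() variant. A loose user-space analogue of that shape, with a pthread rwlock standing in for rcu_read_lock() and an invented cic list:

/* Sketch of an iterator that takes the read-side lock on behalf of callers. */
#include <pthread.h>
#include <stdio.h>

struct cic { int id; struct cic *next; };

static pthread_rwlock_t cic_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct cic *cic_list;

static void for_each_cic(void (*func)(struct cic *))
{
	struct cic *c;

	pthread_rwlock_rdlock(&cic_lock);	/* plays the role of rcu_read_lock() */
	for (c = cic_list; c; c = c->next)
		func(c);
	pthread_rwlock_unlock(&cic_lock);
}

static void print_cic(struct cic *c)
{
	printf("cic %d\n", c->id);
}

int main(void)
{
	struct cic a = { 1, NULL }, b = { 2, &a };

	cic_list = &b;
	for_each_cic(print_cic);
	return 0;
}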
@@ -2664,7 +2723,7 @@ static void cfq_free_io_context(struct io_context *ioc)
* should be ok to iterate over the known list, we will see all cic's
* since no new ones are added.
*/
- __call_for_each_cic(ioc, cic_free_func);
+ call_for_each_cic(ioc, cic_free_func);
}
static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2920,7 +2979,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
struct cfq_group *cfqg;
retry:
- cfqg = cfq_get_cfqg(cfqd, 1);
+ cfqg = cfq_get_cfqg(cfqd);
cic = cfq_cic_lookup(cfqd, ioc);
/* cic always exists here */
cfqq = cic_to_cfqq(cic, is_sync);
@@ -3368,7 +3427,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqd->busy_queues > 1) {
cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
- __blk_run_queue(cfqd->queue, false);
+ __blk_run_queue(cfqd->queue);
} else {
cfq_blkiocg_update_idle_time_stats(
&cfqq->cfqg->blkg);
@@ -3383,7 +3442,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* this new queue is RT and the current one is BE
*/
cfq_preempt_queue(cfqd, cfqq);
- __blk_run_queue(cfqd->queue, false);
+ __blk_run_queue(cfqd->queue);
}
}
@@ -3743,7 +3802,7 @@ static void cfq_kick_queue(struct work_struct *work)
struct request_queue *q = cfqd->queue;
spin_lock_irq(q->queue_lock);
- __blk_run_queue(cfqd->queue, false);
+ __blk_run_queue(cfqd->queue);
spin_unlock_irq(q->queue_lock);
}
@@ -3824,15 +3883,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
cfq_put_queue(cfqd->async_idle_cfqq);
}
-static void cfq_cfqd_free(struct rcu_head *head)
-{
- kfree(container_of(head, struct cfq_data, rcu));
-}
-
static void cfq_exit_queue(struct elevator_queue *e)
{
struct cfq_data *cfqd = e->elevator_data;
struct request_queue *q = cfqd->queue;
+ bool wait = false;
cfq_shutdown_timer_wq(cfqd);
@@ -3851,7 +3906,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
cfq_put_async_queues(cfqd);
cfq_release_cfq_groups(cfqd);
- cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
+
+ /*
+ * If there are groups which we could not unlink from blkcg list,
+ * wait for a rcu period for them to be freed.
+ */
+ if (cfqd->nr_blkcg_linked_grps)
+ wait = true;
spin_unlock_irq(q->queue_lock);
@@ -3861,8 +3922,25 @@ static void cfq_exit_queue(struct elevator_queue *e)
ida_remove(&cic_index_ida, cfqd->cic_index);
spin_unlock(&cic_index_lock);
- /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
- call_rcu(&cfqd->rcu, cfq_cfqd_free);
+ /*
+ * Wait for cfqg->blkg->key accessors to exit their grace periods.
+ * Do this wait only if there are other unlinked groups out
+ * there. This can happen if the cgroup deletion path claimed the
+ * responsibility of cleaning up a group before the queue cleanup code
+ * got to the group.
+ *
+ * Do not call synchronize_rcu() unconditionally as there are drivers
+ * which create/delete request queues hundreds of times during scan/boot
+ * and synchronize_rcu() can take significant time and slow down boot.
+ */
+ if (wait)
+ synchronize_rcu();
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ /* Free up per cpu stats for root group */
+ free_percpu(cfqd->root_group.blkg.stats_cpu);
+#endif
+ kfree(cfqd);
}
static int cfq_alloc_cic_index(void)
@@ -3895,8 +3973,12 @@ static void *cfq_init_queue(struct request_queue *q)
return NULL;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!cfqd)
+ if (!cfqd) {
+ spin_lock(&cic_index_lock);
+ ida_remove(&cic_index_ida, i);
+ spin_unlock(&cic_index_lock);
return NULL;
+ }
/*
* Don't need take queue_lock in the routine, since we are
@@ -3918,14 +4000,29 @@ static void *cfq_init_queue(struct request_queue *q)
#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
- * Take a reference to root group which we never drop. This is just
- * to make sure that cfq_put_cfqg() does not try to kfree root group
+ * Set the root group reference to 2. One reference will be dropped when
+ * all groups on cfqd->cfqg_list are deleted during queue exit. The
+ * other reference remains because we don't want to delete this
+ * group: it is statically allocated and gets destroyed when
+ * cfq_data goes away.
*/
- cfqg->ref = 1;
+ cfqg->ref = 2;
+
+ if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+ kfree(cfqg);
+ kfree(cfqd);
+ return NULL;
+ }
+
rcu_read_lock();
+
cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
(void *)cfqd, 0);
rcu_read_unlock();
+ cfqd->nr_blkcg_linked_grps++;
+
+ /* Add group on cfqd->cfqg_list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
#endif
/*
* Not strictly needed (since RB_ROOT just clears the node and we
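
The root-group reference count of 2 set above means list teardown drops one reference while the second pins the statically embedded group, so a put never frees it; it goes away only with the containing cfq_data. A small sketch of that pinning idea with invented types:

/* Sketch of the "extra reference pins an embedded object" idea. */
#include <assert.h>
#include <stdlib.h>

struct group { int ref; int embedded; };

struct parent {
	struct group root;		/* statically embedded, freed with the parent */
};

static void put_group(struct group *g)
{
	assert(g->ref > 0);
	if (--g->ref)
		return;
	assert(!g->embedded);		/* the embedded root must never reach zero here */
	free(g);
}

int main(void)
{
	struct parent *p = calloc(1, sizeof(*p));

	p->root.embedded = 1;
	p->root.ref = 2;		/* one for the group list, one pinning it */

	put_group(&p->root);		/* list teardown drops its reference ... */
	assert(p->root.ref == 1);	/* ... but the pin keeps it alive */

	free(p);			/* the root goes away only with the parent */
	return 0;
}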
diff --git a/block/elevator.c b/block/elevator.c
index c387d3168734..b0b38ce0dcb6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name)
e = elevator_find(name);
if (!e) {
- char elv[ELV_NAME_MAX + strlen("-iosched")];
-
spin_unlock(&elv_list_lock);
-
- snprintf(elv, sizeof(elv), "%s-iosched", name);
-
- request_module("%s", elv);
+ request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
e = elevator_find(name);
}
@@ -421,8 +416,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
struct list_head *entry;
int stop_flags;
- BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
if (q->last_merge == rq)
q->last_merge = NULL;
@@ -610,7 +603,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
rq->cmd_flags &= ~REQ_STARTED;
- elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
+ __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
}
void elv_drain_elevator(struct request_queue *q)
@@ -642,7 +635,7 @@ void elv_quiesce_start(struct request_queue *q)
*/
elv_drain_elevator(q);
while (q->rq.elvpriv) {
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
spin_unlock_irq(q->queue_lock);
msleep(10);
spin_lock_irq(q->queue_lock);
@@ -655,12 +648,24 @@ void elv_quiesce_end(struct request_queue *q)
queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
}
-void elv_insert(struct request_queue *q, struct request *rq, int where)
+void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
trace_block_rq_insert(q, rq);
rq->q = q;
+ if (rq->cmd_flags & REQ_SOFTBARRIER) {
+ /* barriers are scheduling boundary, update end_sector */
+ if (rq->cmd_type == REQ_TYPE_FS ||
+ (rq->cmd_flags & REQ_DISCARD)) {
+ q->end_sector = rq_end_sector(rq);
+ q->boundary_rq = rq;
+ }
+ } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
+ (where == ELEVATOR_INSERT_SORT ||
+ where == ELEVATOR_INSERT_SORT_MERGE))
+ where = ELEVATOR_INSERT_BACK;
+
switch (where) {
case ELEVATOR_INSERT_REQUEUE:
case ELEVATOR_INSERT_FRONT:
@@ -682,7 +687,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
* with anything. There's no point in delaying queue
* processing.
*/
- __blk_run_queue(q, false);
+ __blk_run_queue(q);
break;
case ELEVATOR_INSERT_SORT_MERGE:
@@ -722,24 +727,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
BUG();
}
}
-
-void __elv_add_request(struct request_queue *q, struct request *rq, int where)
-{
- BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
- if (rq->cmd_flags & REQ_SOFTBARRIER) {
- /* barriers are scheduling boundary, update end_sector */
- if (rq->cmd_type == REQ_TYPE_FS ||
- (rq->cmd_flags & REQ_DISCARD)) {
- q->end_sector = rq_end_sector(rq);
- q->boundary_rq = rq;
- }
- } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
- where == ELEVATOR_INSERT_SORT)
- where = ELEVATOR_INSERT_BACK;
-
- elv_insert(q, rq, where);
-}
EXPORT_SYMBOL(__elv_add_request);
void elv_add_request(struct request_queue *q, struct request *rq, int where)
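
The merged __elv_add_request() above now also redirects sort-style insertions of requests that never went through the elevator (no REQ_ELVPRIV) to the back of the queue. A rough model of just that fixup; the flag values and enum below are invented:

/* Rough model of the insertion-point fixup done at the top of __elv_add_request(). */
#include <stdio.h>

enum where { INSERT_FRONT, INSERT_BACK, INSERT_SORT, INSERT_SORT_MERGE, INSERT_REQUEUE };

#define F_SOFTBARRIER	0x1
#define F_ELVPRIV	0x2

static enum where fixup_where(unsigned flags, enum where where)
{
	/* Requests the elevator never saw cannot be sorted: push them to the back. */
	if (!(flags & F_SOFTBARRIER) && !(flags & F_ELVPRIV) &&
	    (where == INSERT_SORT || where == INSERT_SORT_MERGE))
		return INSERT_BACK;
	return where;
}

int main(void)
{
	printf("%d\n", fixup_where(0, INSERT_SORT));		/* -> INSERT_BACK */
	printf("%d\n", fixup_where(F_ELVPRIV, INSERT_SORT));	/* -> INSERT_SORT */
	return 0;
}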
diff --git a/block/genhd.c b/block/genhd.c
index c91a2dac6b6b..95822ae25cfe 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -739,7 +739,7 @@ void __init printk_all_partitions(void)
/*
* Don't show empty devices or things that have been
- * surpressed
+ * suppressed
*/
if (get_capacity(disk) == 0 ||
(disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
@@ -1588,9 +1588,13 @@ static void disk_events_workfn(struct work_struct *work)
spin_unlock_irq(&ev->lock);
- /* tell userland about new events */
+ /*
+ * Tell userland about new events. Only the events listed in
+ * @disk->events are reported. Unlisted events are processed the
+ * same internally but never get reported to userland.
+ */
for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
- if (events & (1 << i))
+ if (events & disk->events & (1 << i))
envp[nr_events++] = disk_uevents[i];
if (nr_events)
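
The disk_events_workfn() hunk above adds a mask so that only events advertised in disk->events reach userland. A trivial illustration of the filtering, with invented event bits:

/* Tiny illustration of the added mask: only advertised bits get reported. */
#include <stdio.h>

#define EV_MEDIA_CHANGE		(1 << 0)
#define EV_EJECT_REQUEST	(1 << 1)

int main(void)
{
	unsigned pending  = EV_MEDIA_CHANGE | EV_EJECT_REQUEST;	/* what happened */
	unsigned listed   = EV_MEDIA_CHANGE;		/* what the driver advertises */
	unsigned reported = pending & listed;		/* what userland gets told */

	printf("reported mask: %#x\n", reported);	/* prints 0x1 */
	return 0;
}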
@@ -1724,7 +1728,7 @@ static void disk_add_events(struct gendisk *disk)
{
struct disk_events *ev;
- if (!disk->fops->check_events || !(disk->events | disk->async_events))
+ if (!disk->fops->check_events)
return;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);