From 9195291e5f05e01d67f9a09c756b8aca8f009089 Mon Sep 17 00:00:00 2001
From: Divyesh Shah <dpshah@google.com>
Date: Thu, 1 Apr 2010 15:01:41 -0700
Subject: blkio: Increment the blkio cgroup stats for real now

We also add start_time_ns and io_start_time_ns fields to struct request
here to record the time when a request is created and when it is
dispatched to device. We use ns uints here as ms and jiffies are
not very useful for non-rotational media.

Signed-off-by: Divyesh Shah<dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8bae7bb..f3fff8bf85ee 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -194,7 +194,10 @@ struct request {
 
 	struct gendisk *rq_disk;
 	unsigned long start_time;
-
+#ifdef CONFIG_BLK_CGROUP
+	unsigned long long start_time_ns;
+	unsigned long long io_start_time_ns;    /* when passed to hardware */
+#endif
 	/* Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 */
@@ -1196,6 +1199,21 @@ static inline void put_dev_sector(Sector p)
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 
+#ifdef CONFIG_BLK_CGROUP
+static inline void set_start_time_ns(struct request *req)
+{
+	req->start_time_ns = sched_clock();
+}
+
+static inline void set_io_start_time_ns(struct request *req)
+{
+	req->io_start_time_ns = sched_clock();
+}
+#else
+static inline void set_start_time_ns(struct request *req) {}
+static inline void set_io_start_time_ns(struct request *req) {}
+#endif
+
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
-- 
cgit v1.2.3


From 31373d09da5b7fe21fe6f781e92bd534a3495f00 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Tue, 6 Apr 2010 14:25:14 +0200
Subject: laptop-mode: Make flushes per-device

One of the features of laptop-mode is that it forces a writeout of dirty
pages if something else triggers a physical read or write from a device.
The current implementation flushes pages on all devices, rather than only
the one that triggered the flush. This patch alters the behaviour so that
only the recently accessed block device is flushed, preventing other
disks being spun up for no terribly good reason.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            |  5 ++++-
 include/linux/backing-dev.h |  3 +++
 include/linux/writeback.h   |  4 +++-
 mm/page-writeback.c         | 39 ++++++++++++++++++++-------------------
 4 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1d94f15d7f0d..4b1b29ef2cb0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -451,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	 */
 	blk_sync_queue(q);
 
+	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
 	mutex_lock(&q->sysfs_lock);
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 	mutex_unlock(&q->sysfs_lock);
@@ -511,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		return NULL;
 	}
 
+	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
+		    laptop_mode_timer_fn, (unsigned long) q);
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
@@ -2101,7 +2104,7 @@ static void blk_finish_request(struct request *req, int error)
 	BUG_ON(blk_queued_rq(req));
 
 	if (unlikely(laptop_mode) && blk_fs_request(req))
-		laptop_io_completion();
+		laptop_io_completion(&req->q->backing_dev_info);
 
 	blk_delete_timer(req);
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fcbc26af00e4..2742e1adfc30 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/timer.h>
 #include <linux/writeback.h>
 #include <asm/atomic.h>
 
@@ -88,6 +89,8 @@ struct backing_dev_info {
 
 	struct device *dev;
 
+	struct timer_list laptop_mode_wb_timer;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 36520ded3e06..eb38a2c645f6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -96,8 +96,10 @@ static inline void inode_sync_wait(struct inode *inode)
 /*
  * mm/page-writeback.c
  */
-void laptop_io_completion(void);
+void laptop_io_completion(struct backing_dev_info *info);
 void laptop_sync_completion(void);
+void laptop_mode_sync(struct work_struct *work);
+void laptop_mode_timer_fn(unsigned long data);
 void throttle_vm_writeout(gfp_t gfp_mask);
 
 /* These are exported to sysctl. */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8b..d0f2b3765f8d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
 /*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
@@ -697,21 +693,19 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	return 0;
 }
 
-static void do_laptop_sync(struct work_struct *work)
+void laptop_mode_timer_fn(unsigned long data)
 {
-	wakeup_flusher_threads(0);
-	kfree(work);
-}
+	struct request_queue *q = (struct request_queue *)data;
+	int nr_pages = global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS);
 
-static void laptop_timer_fn(unsigned long unused)
-{
-	struct work_struct *work;
+	/*
+	 * We want to write everything out, not just down to the dirty
+	 * threshold
+	 */
 
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		INIT_WORK(work, do_laptop_sync);
-		schedule_work(work);
-	}
+	if (bdi_has_dirty_io(&q->backing_dev_info))
+		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
 }
 
 /*
@@ -719,9 +713,9 @@ static void laptop_timer_fn(unsigned long unused)
  * of all dirty data a few seconds from now.  If the flush is already scheduled
  * then push it back - the user is still using the disk.
  */
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
 {
-	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 
 /*
@@ -731,7 +725,14 @@ void laptop_io_completion(void)
  */
 void laptop_sync_completion(void)
 {
-	del_timer(&laptop_mode_wb_timer);
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+		del_timer(&bdi->laptop_mode_wb_timer);
+
+	rcu_read_unlock();
 }
 
 /*
-- 
cgit v1.2.3


From 84c124da9ff50bd71fab9c939ee5b7cd8bef2bd9 Mon Sep 17 00:00:00 2001
From: Divyesh Shah <dpshah@google.com>
Date: Fri, 9 Apr 2010 08:31:19 +0200
Subject: blkio: Changes to IO controller additional stats patches

that include some minor fixes and addresses all comments.

Changelog: (most based on Vivek Goyal's comments)
o renamed blkiocg_reset_write to blkiocg_reset_stats
o more clarification in the documentation on io_service_time and io_wait_time
o Initialize blkg->stats_lock
o rename io_add_stat to blkio_add_stat and declare it static
o use bool for direction and sync
o derive direction and sync info from existing rq methods
o use 12 for major:minor string length
o define io_service_time better to cover the NCQ case
o add a separate reset_stats interface
o make the indexed stats a 2d array to simplify macro and function pointer code
o blkio.time now exports in jiffies as before
o Added stats description in patch description and
  Documentation/cgroup/blkio-controller.txt
o Prefix all stats functions with blkio and make them static as applicable
o replace IO_TYPE_MAX with IO_TYPE_TOTAL
o Moved #define constant to top of blk-cgroup.c
o Pass dev_t around instead of char *
o Add note to documentation file about resetting stats
o use BLK_CGROUP_MODULE in addition to BLK_CGROUP config option in #ifdef
  statements
o Avoid struct request specific knowledge in blk-cgroup. blk-cgroup.h now has
  rq_direction() and rq_sync() functions which are used by CFQ and when using
  io-controller at a higher level, bio_* functions can be added.

Signed-off-by: Divyesh Shah<dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/cgroups/blkio-controller.txt |  48 +++++++-
 block/blk-cgroup.c                         | 190 +++++++++++++----------------
 block/blk-cgroup.h                         |  64 ++++++----
 block/cfq-iosched.c                        |   8 +-
 include/linux/blkdev.h                     |  18 +++
 5 files changed, 198 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..ed04fe9cce1a 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -77,7 +77,6 @@ Details of cgroup files
 =======================
 - blkio.weight
 	- Specifies per cgroup weight.
-
 	  Currently allowed range of weights is from 100 to 1000.
 
 - blkio.time
@@ -92,6 +91,49 @@ Details of cgroup files
 	  third field specifies the number of sectors transferred by the
 	  group to/from the device.
 
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs completed to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
 - blkio.dequeue
 	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
 	  gives the statistics about how many a times a group was dequeued
@@ -99,6 +141,10 @@ Details of cgroup files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
+
 CFQ sysfs tunable
 =================
 /sys/block/<disk>/queue/iosched/group_isolation
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9af7257f429c..6797df508821 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -18,6 +18,8 @@
 #include <linux/blkdev.h>
 #include "blk-cgroup.h"
 
+#define MAX_KEY_LEN 100
+
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 
@@ -56,24 +58,27 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
+void blkio_group_init(struct blkio_group *blkg)
+{
+	spin_lock_init(&blkg->stats_lock);
+}
+EXPORT_SYMBOL_GPL(blkio_group_init);
+
 /*
  * Add to the appropriate stat variable depending on the request type.
  * This should be called with the blkg->stats_lock held.
  */
-void io_add_stat(uint64_t *stat, uint64_t add, unsigned int flags)
+static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
+				bool sync)
 {
-	if (flags & REQ_RW)
-		stat[IO_WRITE] += add;
+	if (direction)
+		stat[BLKIO_STAT_WRITE] += add;
 	else
-		stat[IO_READ] += add;
-	/*
-	 * Everywhere in the block layer, an IO is treated as sync if it is a
-	 * read or a SYNC write. We follow the same norm.
-	 */
-	if (!(flags & REQ_RW) || flags & REQ_RW_SYNC)
-		stat[IO_SYNC] += add;
+		stat[BLKIO_STAT_READ] += add;
+	if (sync)
+		stat[BLKIO_STAT_SYNC] += add;
 	else
-		stat[IO_ASYNC] += add;
+		stat[BLKIO_STAT_ASYNC] += add;
 }
 
 void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
@@ -86,23 +91,25 @@ void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
 
-void blkiocg_update_request_dispatch_stats(struct blkio_group *blkg,
-						struct request *rq)
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+				uint64_t bytes, bool direction, bool sync)
 {
 	struct blkio_group_stats *stats;
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	stats = &blkg->stats;
-	stats->sectors += blk_rq_sectors(rq);
-	io_add_stat(stats->io_serviced, 1, rq->cmd_flags);
-	io_add_stat(stats->io_service_bytes, blk_rq_sectors(rq) << 9,
-			rq->cmd_flags);
+	stats->sectors += bytes >> 9;
+	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
+			sync);
+	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
+			direction, sync);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
+EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
 
-void blkiocg_update_request_completion_stats(struct blkio_group *blkg,
-						struct request *rq)
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
 {
 	struct blkio_group_stats *stats;
 	unsigned long flags;
@@ -110,16 +117,15 @@ void blkiocg_update_request_completion_stats(struct blkio_group *blkg,
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	stats = &blkg->stats;
-	if (time_after64(now, rq->io_start_time_ns))
-		io_add_stat(stats->io_service_time, now - rq->io_start_time_ns,
-				rq->cmd_flags);
-	if (time_after64(rq->io_start_time_ns, rq->start_time_ns))
-		io_add_stat(stats->io_wait_time,
-				rq->io_start_time_ns - rq->start_time_ns,
-				rq->cmd_flags);
+	if (time_after64(now, io_start_time))
+		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
+				now - io_start_time, direction, sync);
+	if (time_after64(io_start_time, start_time))
+		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
+				io_start_time - start_time, direction, sync);
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_request_completion_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 			struct blkio_group *blkg, void *key, dev_t dev)
@@ -230,7 +236,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 }
 
 static int
-blkiocg_reset_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
 	struct blkio_cgroup *blkcg;
 	struct blkio_group *blkg;
@@ -249,29 +255,32 @@ blkiocg_reset_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	return 0;
 }
 
-void get_key_name(int type, char *disk_id, char *str, int chars_left)
+static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
+				int chars_left, bool diskname_only)
 {
-	strlcpy(str, disk_id, chars_left);
+	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
 	chars_left -= strlen(str);
 	if (chars_left <= 0) {
 		printk(KERN_WARNING
 			"Possibly incorrect cgroup stat display format");
 		return;
 	}
+	if (diskname_only)
+		return;
 	switch (type) {
-	case IO_READ:
+	case BLKIO_STAT_READ:
 		strlcat(str, " Read", chars_left);
 		break;
-	case IO_WRITE:
+	case BLKIO_STAT_WRITE:
 		strlcat(str, " Write", chars_left);
 		break;
-	case IO_SYNC:
+	case BLKIO_STAT_SYNC:
 		strlcat(str, " Sync", chars_left);
 		break;
-	case IO_ASYNC:
+	case BLKIO_STAT_ASYNC:
 		strlcat(str, " Async", chars_left);
 		break;
-	case IO_TYPE_MAX:
+	case BLKIO_STAT_TOTAL:
 		strlcat(str, " Total", chars_left);
 		break;
 	default:
@@ -279,63 +288,47 @@ void get_key_name(int type, char *disk_id, char *str, int chars_left)
 	}
 }
 
-typedef uint64_t (get_var) (struct blkio_group *, int);
+static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
+				struct cgroup_map_cb *cb, dev_t dev)
+{
+	blkio_get_key_name(0, dev, str, chars_left, true);
+	cb->fill(cb, str, val);
+	return val;
+}
 
-#define MAX_KEY_LEN 100
-uint64_t get_typed_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb,
-		get_var *getvar, char *disk_id)
+/* This should be called with blkg->stats_lock held */
+static uint64_t blkio_get_stat(struct blkio_group *blkg,
+		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
 {
 	uint64_t disk_total;
 	char key_str[MAX_KEY_LEN];
-	int type;
+	enum stat_sub_type sub_type;
+
+	if (type == BLKIO_STAT_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.time, cb, dev);
+	if (type == BLKIO_STAT_SECTORS)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.sectors, cb, dev);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	if (type == BLKIO_STAT_DEQUEUE)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.dequeue, cb, dev);
+#endif
 
-	for (type = 0; type < IO_TYPE_MAX; type++) {
-		get_key_name(type, disk_id, key_str, MAX_KEY_LEN);
-		cb->fill(cb, key_str, getvar(blkg, type));
+	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+			sub_type++) {
+		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
 	}
-	disk_total = getvar(blkg, IO_READ) + getvar(blkg, IO_WRITE);
-	get_key_name(IO_TYPE_MAX, disk_id, key_str, MAX_KEY_LEN);
+	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
+			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
+	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
 	cb->fill(cb, key_str, disk_total);
 	return disk_total;
 }
 
-uint64_t get_stat(struct blkio_group *blkg, struct cgroup_map_cb *cb,
-		get_var *getvar, char *disk_id)
-{
-	uint64_t var = getvar(blkg, 0);
-	cb->fill(cb, disk_id, var);
-	return var;
-}
-
-#define GET_STAT_INDEXED(__VAR)						\
-uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int type)		\
-{									\
-	return blkg->stats.__VAR[type];					\
-}									\
-
-GET_STAT_INDEXED(io_service_bytes);
-GET_STAT_INDEXED(io_serviced);
-GET_STAT_INDEXED(io_service_time);
-GET_STAT_INDEXED(io_wait_time);
-#undef GET_STAT_INDEXED
-
-#define GET_STAT(__VAR, __CONV)						\
-uint64_t get_##__VAR##_stat(struct blkio_group *blkg, int dummy)	\
-{									\
-	uint64_t data = blkg->stats.__VAR;				\
-	if (__CONV)							\
-		data = (uint64_t)jiffies_to_msecs(data) * NSEC_PER_MSEC;\
-	return data;							\
-}
-
-GET_STAT(time, 1);
-GET_STAT(sectors, 0);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-GET_STAT(dequeue, 0);
-#endif
-#undef GET_STAT
-
-#define SHOW_FUNCTION_PER_GROUP(__VAR, get_stats, getvar, show_total)	\
+#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
 static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
 		struct cftype *cftype, struct cgroup_map_cb *cb)	\
 {									\
@@ -343,7 +336,6 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
 	struct blkio_group *blkg;					\
 	struct hlist_node *n;						\
 	uint64_t cgroup_total = 0;					\
-	char disk_id[10];						\
 									\
 	if (!cgroup_lock_live_group(cgroup))				\
 		return -ENODEV;						\
@@ -353,10 +345,8 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
 		if (blkg->dev) {					\
 			spin_lock_irq(&blkg->stats_lock);		\
-			snprintf(disk_id, 10, "%u:%u", MAJOR(blkg->dev),\
-					MINOR(blkg->dev));		\
-			cgroup_total += get_stats(blkg, cb, getvar,	\
-						disk_id);		\
+			cgroup_total += blkio_get_stat(blkg, cb,	\
+						blkg->dev, type);	\
 			spin_unlock_irq(&blkg->stats_lock);		\
 		}							\
 	}								\
@@ -367,16 +357,14 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
 	return 0;							\
 }
 
-SHOW_FUNCTION_PER_GROUP(time, get_stat, get_time_stat, 0);
-SHOW_FUNCTION_PER_GROUP(sectors, get_stat, get_sectors_stat, 0);
-SHOW_FUNCTION_PER_GROUP(io_service_bytes, get_typed_stat,
-			get_io_service_bytes_stat, 1);
-SHOW_FUNCTION_PER_GROUP(io_serviced, get_typed_stat, get_io_serviced_stat, 1);
-SHOW_FUNCTION_PER_GROUP(io_service_time, get_typed_stat,
-			get_io_service_time_stat, 1);
-SHOW_FUNCTION_PER_GROUP(io_wait_time, get_typed_stat, get_io_wait_time_stat, 1);
+SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
+SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
+SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
+SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue, get_stat, get_dequeue_stat, 0);
+SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
 #endif
 #undef SHOW_FUNCTION_PER_GROUP
 
@@ -398,32 +386,30 @@ struct cftype blkio_files[] = {
 	{
 		.name = "time",
 		.read_map = blkiocg_time_read,
-		.write_u64 = blkiocg_reset_write,
 	},
 	{
 		.name = "sectors",
 		.read_map = blkiocg_sectors_read,
-		.write_u64 = blkiocg_reset_write,
 	},
 	{
 		.name = "io_service_bytes",
 		.read_map = blkiocg_io_service_bytes_read,
-		.write_u64 = blkiocg_reset_write,
 	},
 	{
 		.name = "io_serviced",
 		.read_map = blkiocg_io_serviced_read,
-		.write_u64 = blkiocg_reset_write,
 	},
 	{
 		.name = "io_service_time",
 		.read_map = blkiocg_io_service_time_read,
-		.write_u64 = blkiocg_reset_write,
 	},
 	{
 		.name = "io_wait_time",
 		.read_map = blkiocg_io_wait_time_read,
-		.write_u64 = blkiocg_reset_write,
+	},
+	{
+		.name = "reset_stats",
+		.write_u64 = blkiocg_reset_stats,
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
        {
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 80010ef64ab0..b22e55390a4f 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,12 +23,31 @@ extern struct cgroup_subsys blkio_subsys;
 #define blkio_subsys_id blkio_subsys.subsys_id
 #endif
 
-enum io_type {
-	IO_READ = 0,
-	IO_WRITE,
-	IO_SYNC,
-	IO_ASYNC,
-	IO_TYPE_MAX
+enum stat_type {
+	/* Total time spent (in ns) between request dispatch to the driver and
+	 * request completion for IOs doen by this cgroup. This may not be
+	 * accurate when NCQ is turned on. */
+	BLKIO_STAT_SERVICE_TIME = 0,
+	/* Total bytes transferred */
+	BLKIO_STAT_SERVICE_BYTES,
+	/* Total IOs serviced, post merge */
+	BLKIO_STAT_SERVICED,
+	/* Total time spent waiting in scheduler queue in ns */
+	BLKIO_STAT_WAIT_TIME,
+	/* All the single valued stats go below this */
+	BLKIO_STAT_TIME,
+	BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	BLKIO_STAT_DEQUEUE
+#endif
+};
+
+enum stat_sub_type {
+	BLKIO_STAT_READ = 0,
+	BLKIO_STAT_WRITE,
+	BLKIO_STAT_SYNC,
+	BLKIO_STAT_ASYNC,
+	BLKIO_STAT_TOTAL
 };
 
 struct blkio_cgroup {
@@ -42,13 +61,7 @@ struct blkio_group_stats {
 	/* total disk time and nr sectors dispatched by this group */
 	uint64_t time;
 	uint64_t sectors;
-	/* Total disk time used by IOs in ns */
-	uint64_t io_service_time[IO_TYPE_MAX];
-	uint64_t io_service_bytes[IO_TYPE_MAX]; /* Total bytes transferred */
-	/* Total IOs serviced, post merge */
-	uint64_t io_serviced[IO_TYPE_MAX];
-	/* Total time spent waiting in scheduler queue in ns */
-	uint64_t io_wait_time[IO_TYPE_MAX];
+	uint64_t stat_arr[BLKIO_STAT_WAIT_TIME + 1][BLKIO_STAT_TOTAL];
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* How many times this group has been removed from service tree */
 	unsigned long dequeue;
@@ -65,7 +78,7 @@ struct blkio_group {
 	char path[128];
 #endif
 	/* The device MKDEV(major, minor), this group has been created for */
-	dev_t   dev;
+	dev_t dev;
 
 	/* Need to serialize the stats in the case of reset/update */
 	spinlock_t stats_lock;
@@ -128,21 +141,21 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
+void blkio_group_init(struct blkio_group *blkg);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 					unsigned long time);
-void blkiocg_update_request_dispatch_stats(struct blkio_group *blkg,
-						struct request *rq);
-void blkiocg_update_request_completion_stats(struct blkio_group *blkg,
-						struct request *rq);
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
+						bool direction, bool sync);
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
 cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 
+static inline void blkio_group_init(struct blkio_group *blkg) {}
 static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev)
-{
-}
+			struct blkio_group *blkg, void *key, dev_t dev) {}
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
@@ -151,9 +164,10 @@ static inline struct blkio_group *
 blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
 						unsigned long time) {}
-static inline void blkiocg_update_request_dispatch_stats(
-		struct blkio_group *blkg, struct request *rq) {}
-static inline void blkiocg_update_request_completion_stats(
-		struct blkio_group *blkg, struct request *rq) {}
+static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+				uint64_t bytes, bool direction, bool sync) {}
+static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
+		uint64_t start_time, uint64_t io_start_time, bool direction,
+		bool sync) {}
 #endif
 #endif /* _BLK_CGROUP_H */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 42028e7128a7..5617ae030b15 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -955,6 +955,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
 	RB_CLEAR_NODE(&cfqg->rb_node);
+	blkio_group_init(&cfqg->blkg);
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -1865,7 +1866,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	elv_dispatch_sort(q, rq);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
-	blkiocg_update_request_dispatch_stats(&cfqq->cfqg->blkg, rq);
+	blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
+					rq_data_dir(rq), rq_is_sync(rq));
 }
 
 /*
@@ -3286,7 +3288,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
-	blkiocg_update_request_completion_stats(&cfqq->cfqg->blkg, rq);
+	blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
+			rq_io_start_time_ns(rq), rq_data_dir(rq),
+			rq_is_sync(rq));
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3fff8bf85ee..d483c494672a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1209,9 +1209,27 @@ static inline void set_io_start_time_ns(struct request *req)
 {
 	req->io_start_time_ns = sched_clock();
 }
+
+static inline uint64_t rq_start_time_ns(struct request *req)
+{
+        return req->start_time_ns;
+}
+
+static inline uint64_t rq_io_start_time_ns(struct request *req)
+{
+        return req->io_start_time_ns;
+}
 #else
 static inline void set_start_time_ns(struct request *req) {}
 static inline void set_io_start_time_ns(struct request *req) {}
+static inline uint64_t rq_start_time_ns(struct request *req)
+{
+	return 0;
+}
+static inline uint64_t rq_io_start_time_ns(struct request *req)
+{
+	return 0;
+}
 #endif
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
-- 
cgit v1.2.3


From 812d402648f4fc1ab1091b2172a46fc1b367c724 Mon Sep 17 00:00:00 2001
From: Divyesh Shah <dpshah@google.com>
Date: Thu, 8 Apr 2010 21:14:23 -0700
Subject: blkio: Add io_merged stat

This includes both the number of bios merged into requests belonging to this
cgroup as well as the number of requests merged together.
In the past, we've observed different merging behavior across upstream kernels,
some by design some actual bugs. This stat helps a lot in debugging such
problems when applications report decreased throughput with a new kernel
version.

This needed adding an extra elevator function to capture bios being merged as I
did not want to pollute elevator code with blkiocg knowledge and hence needed
the accounting invocation to come from CFQ.

Signed-off-by: Divyesh Shah<dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/cgroups/blkio-controller.txt |  5 +++++
 block/blk-cgroup.c                         | 17 +++++++++++++++++
 block/blk-cgroup.h                         |  8 +++++++-
 block/blk-core.c                           |  2 ++
 block/cfq-iosched.c                        | 11 +++++++++++
 block/elevator.c                           |  9 +++++++++
 include/linux/elevator.h                   |  6 ++++++
 7 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index ed04fe9cce1a..810e30171a54 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -134,6 +134,11 @@ Details of cgroup files
 	  minor number of the device, third field specifies the operation type
 	  and the fourth field specifies the io_wait_time in ns.
 
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
 - blkio.dequeue
 	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
 	  gives the statistics about how many a times a group was dequeued
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6797df508821..d23b538858ce 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -127,6 +127,18 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
 
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+					bool sync)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
+			sync);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
+
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 			struct blkio_group *blkg, void *key, dev_t dev)
 {
@@ -363,6 +375,7 @@ SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
 SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
 SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
 SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
 #endif
@@ -407,6 +420,10 @@ struct cftype blkio_files[] = {
 		.name = "io_wait_time",
 		.read_map = blkiocg_io_wait_time_read,
 	},
+	{
+		.name = "io_merged",
+		.read_map = blkiocg_io_merged_read,
+	},
 	{
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b22e55390a4f..470a29db6bec 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -34,6 +34,8 @@ enum stat_type {
 	BLKIO_STAT_SERVICED,
 	/* Total time spent waiting in scheduler queue in ns */
 	BLKIO_STAT_WAIT_TIME,
+	/* Number of IOs merged */
+	BLKIO_STAT_MERGED,
 	/* All the single valued stats go below this */
 	BLKIO_STAT_TIME,
 	BLKIO_STAT_SECTORS,
@@ -61,7 +63,7 @@ struct blkio_group_stats {
 	/* total disk time and nr sectors dispatched by this group */
 	uint64_t time;
 	uint64_t sectors;
-	uint64_t stat_arr[BLKIO_STAT_WAIT_TIME + 1][BLKIO_STAT_TOTAL];
+	uint64_t stat_arr[BLKIO_STAT_MERGED + 1][BLKIO_STAT_TOTAL];
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* How many times this group has been removed from service tree */
 	unsigned long dequeue;
@@ -148,6 +150,8 @@ void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
 						bool direction, bool sync);
 void blkiocg_update_completion_stats(struct blkio_group *blkg,
 	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+					bool sync);
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
@@ -169,5 +173,7 @@ static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
 		uint64_t start_time, uint64_t io_start_time, bool direction,
 		bool sync) {}
+static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+						bool direction, bool sync) {}
 #endif
 #endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 4b1b29ef2cb0..e9a5ae25db8c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1202,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
+		elv_bio_merged(q, req, bio);
 		if (!attempt_back_merge(q, req))
 			elv_merged_request(q, req, el_ret);
 		goto out;
@@ -1235,6 +1236,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
+		elv_bio_merged(q, req, bio);
 		if (!attempt_front_merge(q, req))
 			elv_merged_request(q, req, el_ret);
 		goto out;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5617ae030b15..4eb1906cf6c6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1467,6 +1467,14 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 	}
 }
 
+static void cfq_bio_merged(struct request_queue *q, struct request *req,
+				struct bio *bio)
+{
+	struct cfq_queue *cfqq = RQ_CFQQ(req);
+	blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, bio_data_dir(bio),
+					cfq_bio_sync(bio));
+}
+
 static void
 cfq_merged_requests(struct request_queue *q, struct request *rq,
 		    struct request *next)
@@ -1484,6 +1492,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
+	blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, rq_data_dir(next),
+					rq_is_sync(next));
 }
 
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -3861,6 +3871,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merge_req_fn =	cfq_merged_requests,
 		.elevator_allow_merge_fn =	cfq_allow_merge,
+		.elevator_bio_merged_fn =	cfq_bio_merged,
 		.elevator_dispatch_fn =		cfq_dispatch_requests,
 		.elevator_add_req_fn =		cfq_insert_request,
 		.elevator_activate_req_fn =	cfq_activate_request,
diff --git a/block/elevator.c b/block/elevator.c
index 76e3702d5381..5e734592bb40 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	q->last_merge = rq;
 }
 
+void elv_bio_merged(struct request_queue *q, struct request *rq,
+			struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->ops->elevator_bio_merged_fn)
+		e->ops->elevator_bio_merged_fn(q, rq, bio);
+}
+
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
 	/*
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1cb3372e65d8..2c958f4fce1e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int
 
 typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
 
+typedef void (elevator_bio_merged_fn) (struct request_queue *,
+						struct request *, struct bio *);
+
 typedef int (elevator_dispatch_fn) (struct request_queue *, int);
 
 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
@@ -36,6 +39,7 @@ struct elevator_ops
 	elevator_merged_fn *elevator_merged_fn;
 	elevator_merge_req_fn *elevator_merge_req_fn;
 	elevator_allow_merge_fn *elevator_allow_merge_fn;
+	elevator_bio_merged_fn *elevator_bio_merged_fn;
 
 	elevator_dispatch_fn *elevator_dispatch_fn;
 	elevator_add_req_fn *elevator_add_req_fn;
@@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
+extern void elv_bio_merged(struct request_queue *q, struct request *,
+				struct bio *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern int elv_queue_empty(struct request_queue *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
-- 
cgit v1.2.3


From 7f1dc8a2d2f45fc557b27fd56115338b1d34fc24 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 21 Apr 2010 17:44:16 +0200
Subject: blkio: Fix blkio crash during rq stat update

blkio + cfq was crashing even when two sequential readers were put in two
separate cgroups (group_isolation=0).

The reason being that cfqq can migrate across groups based on its being
sync-noidle or not, it can happen that at request insertion time, cfqq
belonged to one cfqg and at request dispatch time, it belonged to root
group. In this case request stats per cgroup can go wrong and it also runs
into BUG_ON().

This patch implements rq stashing away a cfq group pointer and not relying
on cfqq->cfqg pointer alone for rq stat accounting.

[   65.163523] ------------[ cut here ]------------
[   65.164301] kernel BUG at block/blk-cgroup.c:117!
[   65.164301] invalid opcode: 0000 [#1] SMP
[   65.164301] last sysfs file: /sys/devices/pci0000:00/0000:00:05.0/0000:60:00.1/host9/rport-9:0-0/target9:0:0/9:0:0:2/block/sde/stat
[   65.164301] CPU 1
[   65.164301] Modules linked in: dm_round_robin dm_multipath qla2xxx scsi_transport_fc dm_zero dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
[   65.164301]
[   65.164301] Pid: 4505, comm: fio Not tainted 2.6.34-rc4-blk-for-35 #34 0A98h/HP xw8600 Workstation
[   65.164301] RIP: 0010:[<ffffffff8121924f>]  [<ffffffff8121924f>] blkiocg_update_io_remove_stats+0x5b/0xaf
[   65.164301] RSP: 0018:ffff8800ba5a79e8  EFLAGS: 00010046
[   65.164301] RAX: 0000000000000096 RBX: ffff8800bb268d60 RCX: 0000000000000000
[   65.164301] RDX: ffff8800bb268eb8 RSI: 0000000000000000 RDI: ffff8800bb268e00
[   65.164301] RBP: ffff8800ba5a7a08 R08: 0000000000000064 R09: 0000000000000001
[   65.164301] R10: 0000000000079640 R11: ffff8800a0bd5bf0 R12: ffff8800bab4af01
[   65.164301] R13: ffff8800bab4af00 R14: ffff8800bb1d8928 R15: 0000000000000000
[   65.164301] FS:  00007f18f75056f0(0000) GS:ffff880001e40000(0000) knlGS:0000000000000000
[   65.164301] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   65.164301] CR2: 000000000040e7f0 CR3: 00000000ba52b000 CR4: 00000000000006e0
[   65.164301] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   65.164301] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   65.164301] Process fio (pid: 4505, threadinfo ffff8800ba5a6000, task ffff8800ba45ae80)
[   65.164301] Stack:
[   65.164301]  ffff8800ba5a7a08 ffff8800ba722540 ffff8800bab4af68 ffff8800bab4af68
[   65.164301] <0> ffff8800ba5a7a38 ffffffff8121d814 ffff8800ba722540 ffff8800bab4af68
[   65.164301] <0> ffff8800ba722540 ffff8800a08f6800 ffff8800ba5a7a68 ffffffff8121d8ca
[   65.164301] Call Trace:
[   65.164301]  [<ffffffff8121d814>] cfq_remove_request+0xe4/0x116
[   65.164301]  [<ffffffff8121d8ca>] cfq_dispatch_insert+0x84/0xe1
[   65.164301]  [<ffffffff8121e833>] cfq_dispatch_requests+0x767/0x8e8
[   65.164301]  [<ffffffff8120e524>] ? submit_bio+0xc3/0xcc
[   65.164301]  [<ffffffff810ad657>] ? sync_page_killable+0x0/0x35
[   65.164301]  [<ffffffff8120ea8d>] blk_peek_request+0x191/0x1a7
[   65.164301]  [<ffffffffa000109c>] ? dm_get_live_table+0x44/0x4f [dm_mod]
[   65.164301]  [<ffffffffa0002799>] dm_request_fn+0x38/0x14c [dm_mod]
[   65.164301]  [<ffffffff810ad657>] ? sync_page_killable+0x0/0x35
[   65.164301]  [<ffffffff8120f600>] __generic_unplug_device+0x32/0x37
[   65.164301]  [<ffffffff8120f8a0>] generic_unplug_device+0x2e/0x3c
[   65.164301]  [<ffffffffa00011a6>] dm_unplug_all+0x42/0x5b [dm_mod]
[   65.164301]  [<ffffffff8120b063>] blk_unplug+0x29/0x2d
[   65.164301]  [<ffffffff8120b079>] blk_backing_dev_unplug+0x12/0x14
[   65.164301]  [<ffffffff81108a82>] block_sync_page+0x35/0x39
[   65.164301]  [<ffffffff810ad64e>] sync_page+0x41/0x4a
[   65.164301]  [<ffffffff810ad665>] sync_page_killable+0xe/0x35
[   65.164301]  [<ffffffff81589027>] __wait_on_bit_lock+0x46/0x8f
[   65.164301]  [<ffffffff810ad52d>] __lock_page_killable+0x66/0x6d
[   65.164301]  [<ffffffff81055fd4>] ? wake_bit_function+0x0/0x33
[   65.164301]  [<ffffffff810ad560>] lock_page_killable+0x2c/0x2e
[   65.164301]  [<ffffffff810aebfd>] generic_file_aio_read+0x361/0x4f0
[   65.164301]  [<ffffffff810e906c>] do_sync_read+0xcb/0x108
[   65.164301]  [<ffffffff811e32a3>] ? security_file_permission+0x16/0x18
[   65.164301]  [<ffffffff810e96d3>] vfs_read+0xab/0x108
[   65.164301]  [<ffffffff810e97f0>] sys_read+0x4a/0x6e
[   65.164301]  [<ffffffff81002b5b>] system_call_fastpath+0x16/0x1b
[   65.164301] Code: 00 74 1c 48 8b 8b 60 01 00 00 48 85 c9 75 04 0f 0b eb fe 48 ff c9 48 89 8b 60 01 00 00 eb 1a 48 8b 8b 58 01 00 00 48 85 c9 75 04 <0f> 0b eb fe 48 ff c9 48 89 8b 58 01 00 00 45 84 e4 74 16 48 8b
[   65.164301] RIP  [<ffffffff8121924f>] blkiocg_update_io_remove_stats+0x5b/0xaf
[   65.164301]  RSP <ffff8800ba5a79e8>
[   65.164301] ---[ end trace 1b2b828753032e68 ]---

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c    | 36 ++++++++++++++++++++++++++----------
 include/linux/blkdev.h |  3 ++-
 2 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 62defd05518f..d5927b53020e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
 #define RQ_CIC(rq)		\
 	((struct cfq_io_context *) (rq)->elevator_private)
 #define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
+#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private3)
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
@@ -1001,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 	return cfqg;
 }
 
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+	atomic_inc(&cfqg->ref);
+	return cfqg;
+}
+
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
 	/* Currently, all async queues are mapped to root group */
@@ -1084,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 {
 	return &cfqd->root_group;
 }
+
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+	return NULL;
+}
+
 static inline void
 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 	cfqq->cfqg = cfqg;
@@ -1386,12 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
-	blkiocg_update_io_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq),
+	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
 						rq_is_sync(rq));
 	cfq_add_rq_rb(rq);
-	blkiocg_update_io_add_stats(
-			&cfqq->cfqg->blkg, &cfqq->cfqd->serving_group->blkg,
-			rq_data_dir(rq), rq_is_sync(rq));
+	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
+			rq_is_sync(rq));
 }
 
 static struct request *
@@ -1447,7 +1460,7 @@ static void cfq_remove_request(struct request *rq)
 	cfq_del_rq_rb(rq);
 
 	cfqq->cfqd->rq_queued--;
-	blkiocg_update_io_remove_stats(&cfqq->cfqg->blkg, rq_data_dir(rq),
+	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
 						rq_is_sync(rq));
 	if (rq_is_meta(rq)) {
 		WARN_ON(!cfqq->meta_pending);
@@ -1483,8 +1496,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
-	struct cfq_queue *cfqq = RQ_CFQQ(req);
-	blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, bio_data_dir(bio),
+	blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
 					cfq_bio_sync(bio));
 }
 
@@ -1505,7 +1517,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
-	blkiocg_update_io_merged_stats(&cfqq->cfqg->blkg, rq_data_dir(next),
+	blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
 					rq_is_sync(next));
 }
 
@@ -3240,8 +3252,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
-
-	blkiocg_update_io_add_stats(&cfqq->cfqg->blkg,
+	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
 			&cfqd->serving_group->blkg, rq_data_dir(rq),
 			rq_is_sync(rq));
 	cfq_rq_enqueued(cfqd, cfqq, rq);
@@ -3472,6 +3483,10 @@ static void cfq_put_request(struct request *rq)
 		rq->elevator_private = NULL;
 		rq->elevator_private2 = NULL;
 
+		/* Put down rq reference on cfqg */
+		cfq_put_cfqg(RQ_CFQG(rq));
+		rq->elevator_private3 = NULL;
+
 		cfq_put_queue(cfqq);
 	}
 }
@@ -3560,6 +3575,7 @@ new_queue:
 
 	rq->elevator_private = cic;
 	rq->elevator_private2 = cfqq;
+	rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
 	return 0;
 
 queue_fail:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d483c494672a..5cf17a49ce38 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -186,11 +186,12 @@ struct request {
 	};
 
 	/*
-	 * two pointers are available for the IO schedulers, if they need
+	 * Three pointers are available for the IO schedulers, if they need
 	 * more they have to dynamically allocate it.
 	 */
 	void *elevator_private;
 	void *elevator_private2;
+	void *elevator_private3;
 
 	struct gendisk *rq_disk;
 	unsigned long start_time;
-- 
cgit v1.2.3


From 6b4517a7913a09d3259bb1d21c9cb300f12294bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 7 Apr 2010 18:53:59 +0900
Subject: block: implement bd_claiming and claiming block

Currently, device claiming for exclusive open is done after low level
open - disk->fops->open() - has completed successfully.  This means
that exclusive open attempts while a device is already exclusively
open will fail only after disk->fops->open() is called.

cdrom driver issues commands during open() which means that O_EXCL
open attempt can unintentionally inject commands to in-progress
command stream for burning thus disturbing burning process.  In most
cases, this doesn't cause problems because the first command to be
issued is TUR which most devices can process in the middle of burning.
However, depending on how a device replies to TUR during burning,
cdrom driver may end up issuing further commands.

This can't be resolved trivially by moving bd_claim() before doing
actual open() because that means an open attempt which will end up
failing could interfere other legit O_EXCL open attempts.
ie. unconfirmed open attempts can fail others.

This patch resolves the problem by introducing claiming block which is
started by bd_start_claiming() and terminated either by bd_claim() or
bd_abort_claiming().  bd_claim() from inside a claiming block is
guaranteed to succeed and once a claiming block is started, other
bd_start_claiming() or bd_claim() attempts block till the current
claiming block is terminated.

bd_claim() can still be used standalone although now it always
synchronizes against claiming blocks, so the existing users will keep
working without any change.

blkdev_open() and open_bdev_exclusive() are converted to use claiming
blocks so that exclusive open attempts from these functions don't
interfere with the existing exclusive open.

This problem was discovered while investigating bko#15403.

  https://bugzilla.kernel.org/show_bug.cgi?id=15403

The burning problem itself can be resolved by updating userspace
probing tools to always open w/ O_EXCL.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Matthias-Christian Ott <ott@mirix.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c     | 198 ++++++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/fs.h |   1 +
 2 files changed, 175 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e59440c7e1cf..ea8385ea58ab 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -693,12 +693,145 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
 		return true;	 /* is a partition of an un-held device */
 }
 
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+			       struct block_device *whole, void *holder)
+{
+retry:
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder))
+		return -EBUSY;
+
+	/* if someone else is claiming, wait for it to finish */
+	if (whole->bd_claiming && whole->bd_claiming != holder) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		spin_lock(&bdev_lock);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	return 0;
+}
+
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming().  If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder)
+{
+	struct gendisk *disk;
+	struct block_device *whole;
+	int partno, err;
+
+	might_sleep();
+
+	/*
+	 * @bdev might not have been initialized properly yet, look up
+	 * and grab the outer block device the hard way.
+	 */
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		return ERR_PTR(-ENXIO);
+
+	whole = bdget_disk(disk, 0);
+	put_disk(disk);
+	if (!whole)
+		return ERR_PTR(-ENOMEM);
+
+	/* prepare to claim, if successful, mark claiming in progress */
+	spin_lock(&bdev_lock);
+
+	err = bd_prepare_to_claim(bdev, whole, holder);
+	if (err == 0) {
+		whole->bd_claiming = holder;
+		spin_unlock(&bdev_lock);
+		return whole;
+	} else {
+		spin_unlock(&bdev_lock);
+		bdput(whole);
+		return ERR_PTR(err);
+	}
+}
+
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+
+	spin_unlock(&bdev_lock);
+	bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming().  Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */
+}
+
 /**
  * bd_claim - claim a block device
  * @bdev: block device to claim
  * @holder: holder trying to claim @bdev
  *
- * Try to claim @bdev.
+ * Try to claim @bdev which must have been opened successfully.  This
+ * function may be called with or without preceding
+ * blk_start_claiming().  In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
  *
  * RETURNS:
  * 0 if successful, -EBUSY if @bdev is already claimed.
@@ -706,11 +839,14 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
 int bd_claim(struct block_device *bdev, void *holder)
 {
 	struct block_device *whole = bdev->bd_contains;
-	int res = -EBUSY;
+	int res;
+
+	might_sleep();
 
 	spin_lock(&bdev_lock);
 
-	if (bd_may_claim(bdev, whole, holder)) {
+	res = bd_prepare_to_claim(bdev, whole, holder);
+	if (res == 0) {
 		/* note that for a whole device bd_holders
 		 * will be incremented twice, and bd_holder will
 		 * be set to bd_claim before being set to holder
@@ -719,10 +855,13 @@ int bd_claim(struct block_device *bdev, void *holder)
 		whole->bd_holder = bd_claim;
 		bdev->bd_holders++;
 		bdev->bd_holder = holder;
-		res = 0;
 	}
 
-	spin_unlock(&bdev_lock);
+	if (whole->bd_claiming)
+		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
+	else
+		spin_unlock(&bdev_lock);
+
 	return res;
 }
 EXPORT_SYMBOL(bd_claim);
@@ -1338,6 +1477,7 @@ EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
+	struct block_device *whole = NULL;
 	struct block_device *bdev;
 	int res;
 
@@ -1360,22 +1500,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 		return -ENOMEM;
 
+	if (filp->f_mode & FMODE_EXCL) {
+		whole = bd_start_claiming(bdev, filp);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
 	res = blkdev_get(bdev, filp->f_mode);
-	if (res)
-		return res;
 
-	if (filp->f_mode & FMODE_EXCL) {
-		res = bd_claim(bdev, filp);
-		if (res)
-			goto out_blkdev_put;
+	if (whole) {
+		if (res == 0)
+			BUG_ON(bd_claim(bdev, filp) != 0);
+		else
+			bd_abort_claiming(whole, filp);
 	}
 
-	return 0;
-
- out_blkdev_put:
-	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
@@ -1586,27 +1729,34 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-	struct block_device *bdev;
-	int error = 0;
+	struct block_device *bdev, *whole;
+	int error;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
+	whole = bd_start_claiming(bdev, holder);
+	if (IS_ERR(whole)) {
+		bdput(bdev);
+		return whole;
+	}
+
 	error = blkdev_get(bdev, mode);
 	if (error)
-		return ERR_PTR(error);
+		goto out_abort_claiming;
+
 	error = -EACCES;
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto blkdev_put;
-	error = bd_claim(bdev, holder);
-	if (error)
-		goto blkdev_put;
+		goto out_blkdev_put;
 
+	BUG_ON(bd_claim(bdev, holder) != 0);
 	return bdev;
-	
-blkdev_put:
+
+out_blkdev_put:
 	blkdev_put(bdev, mode);
+out_abort_claiming:
+	bd_abort_claiming(whole, holder);
 	return ERR_PTR(error);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39d57bc6cc71..31ee31be51e9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -651,6 +651,7 @@ struct block_device {
 	int			bd_openers;
 	struct mutex		bd_mutex;	/* open/close mutex */
 	struct list_head	bd_inodes;
+	void *			bd_claiming;
 	void *			bd_holder;
 	int			bd_holders;
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From fbd9b09a177a481eda256447c881f014f29034fe Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 28 Apr 2010 17:55:06 +0400
Subject: blkdev: generalize flags for blkdev_issue_fn functions

The patch just convert all blkdev_issue_xxx function to common
set of flags. Wait/allocation semantics preserved.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c                | 20 +++++++++++---------
 block/ioctl.c                      |  2 +-
 drivers/block/drbd/drbd_int.h      |  3 ++-
 drivers/block/drbd/drbd_receiver.c |  3 ++-
 fs/block_dev.c                     |  3 ++-
 fs/btrfs/extent-tree.c             |  2 +-
 fs/ext3/fsync.c                    |  3 ++-
 fs/ext4/fsync.c                    |  6 ++++--
 fs/gfs2/rgrp.c                     |  5 +++--
 fs/jbd2/checkpoint.c               |  3 ++-
 fs/jbd2/commit.c                   |  6 ++++--
 fs/reiserfs/file.c                 |  3 ++-
 fs/xfs/linux-2.6/xfs_super.c       |  3 ++-
 include/linux/blkdev.h             | 18 +++++++++++-------
 mm/swapfile.c                      |  9 ++++++---
 15 files changed, 55 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677f..cf14311b98fc 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -293,19 +293,22 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
  *    wish to.
  */
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+		sector_t *error_sector, unsigned long flags)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
-	int ret;
+	int ret = 0;
 
 	if (bdev->bd_disk == NULL)
 		return -ENXIO;
@@ -314,7 +317,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	if (!q)
 		return -ENXIO;
 
-	bio = bio_alloc(GFP_KERNEL, 0);
+	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_empty_barrier;
 	bio->bi_private = &wait;
 	bio->bi_bdev = bdev;
@@ -330,7 +333,6 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	if (error_sector)
 		*error_sector = bio->bi_sector;
 
-	ret = 0;
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -362,17 +364,17 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	DISCARD_FL_* flags to control behaviour
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a discard request for the sectors in question.
  */
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, int flags)
+		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q = bdev_get_queue(bdev);
-	int type = flags & DISCARD_FL_BARRIER ?
+	int type = flags & BLKDEV_IFL_BARRIER ?
 		DISCARD_BARRIER : DISCARD_NOBARRIER;
 	struct bio *bio;
 	struct page *page;
@@ -395,7 +397,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
-		if (flags & DISCARD_FL_WAIT)
+		if (flags & BLKDEV_IFL_WAIT)
 			bio->bi_private = &wait;
 
 		/*
@@ -426,7 +428,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio_get(bio);
 		submit_bio(type, bio);
 
-		if (flags & DISCARD_FL_WAIT)
+		if (flags & BLKDEV_IFL_WAIT)
 			wait_for_completion(&wait);
 
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
diff --git a/block/ioctl.c b/block/ioctl.c
index 8905d2a2a717..e8eb679f2f9b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
 	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
-				    DISCARD_FL_WAIT);
+				    BLKDEV_IFL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e5e86a781820..d6f1ae342b1d 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2251,7 +2251,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
 	if (test_bit(MD_NO_BARRIER, &mdev->flags))
 		return;
 
-	r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (r) {
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index ed9f1de24a71..54f56ea8a786 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -945,7 +945,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 	int rv;
 
 	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
-		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
 		if (rv) {
 			dev_err(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ea8385ea58ab..dd769304382e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -413,7 +413,8 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	if (error)
 		return error;
 	
-	error = blkdev_issue_flush(bdev, NULL);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL,
+				(BLKDEV_IFL_WAIT));
 	if (error == -EOPNOTSUPP)
 		error = 0;
 	return error;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			     DISCARD_FL_BARRIER);
+			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..9492f6003ef9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -91,7 +91,8 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 * storage
 	 */
 	if (test_opt(inode->i_sb, BARRIER))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 out:
 	return ret;
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		if (ext4_should_writeback_data(inode) &&
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
 		jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	return ret;
 }
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..bf011dc63471 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    DISCARD_FL_BARRIER);
+							    BLKDEV_IFL_WAIT |
+							    BLKDEV_IFL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 DISCARD_FL_BARRIER);
+					 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 	}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
 		if (err)
 			__jbd2_journal_abort_hard(journal);
 		if (journal->j_flags & JBD2_BARRIER)
-			blkdev_issue_flush(journal->j_dev, NULL);
+			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	}
 
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
+			BLKDEV_IFL_WAIT);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 52e06b487ced..2b177c778ba7 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -725,7 +725,8 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, NULL);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 }
 
 STATIC void
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5cf17a49ce38..59b9aed0ee7d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -998,12 +998,16 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 		return NULL;
 	return bqt->tag_index[tag];
 }
-
-extern int blkdev_issue_flush(struct block_device *, sector_t *);
-#define DISCARD_FL_WAIT		0x01	/* wait for completion */
-#define DISCARD_FL_BARRIER	0x02	/* issue DISCARD_BARRIER request */
-extern int blkdev_issue_discard(struct block_device *, sector_t sector,
-		sector_t nr_sects, gfp_t, int flags);
+enum{
+	BLKDEV_WAIT,	/* wait for completion */
+	BLKDEV_BARRIER,	/*issue request with barrier */
+};
+#define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
+#define BLKDEV_IFL_BARRIER	(1 << BLKDEV_BARRIER)
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
+			unsigned long);
+extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 
 static inline int sb_issue_discard(struct super_block *sb,
 				   sector_t block, sector_t nr_blocks)
@@ -1011,7 +1015,7 @@ static inline int sb_issue_discard(struct super_block *sb,
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
 	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
-				    DISCARD_FL_BARRIER);
+				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc7..eb086e0f4dcc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (err)
 			return err;
 		cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (err)
 			break;
 
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+							BLKDEV_IFL_BARRIER))
 				break;
 		}
 
-- 
cgit v1.2.3


From 3f14d792f9a8fede64ce918dbb517f934497a4f8 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 28 Apr 2010 17:55:09 +0400
Subject: blkdev: add blkdev_issue_zeroout helper function

- Add bio_batch helper primitive. This is rather generic primitive
  for submitting/waiting a complex request which consists of several
  bios.
- blkdev_issue_zeroout() generate number of zero filed write bios.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-lib.c        | 118 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   3 +-
 2 files changed, 120 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 0dc438812d81..886c3f9e1be4 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -112,3 +112,121 @@ out:
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
+
+struct bio_batch
+{
+	atomic_t 		done;
+	unsigned long 		flags;
+	struct completion 	*wait;
+	bio_end_io_t		*end_io;
+};
+
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+	struct bio_batch *bb = bio->bi_private;
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bb->flags);
+		else
+			clear_bit(BIO_UPTODATE, &bb->flags);
+	}
+	if (bb) {
+		if (bb->end_io)
+			bb->end_io(bio, err);
+		atomic_inc(&bb->done);
+		complete(bb->wait);
+	}
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_zeroout generate number of zero filed write bios
+ * @bdev:	blockdev to issue
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to write
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *  Generate and issue number of bios with zerofiled pages.
+ *  Send barrier at the beginning and at the end if requested. This guarantie
+ *  correct request ordering. Empty barrier allow us to avoid post queue flush.
+ */
+
+int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+	int ret = 0;
+	struct bio *bio;
+	struct bio_batch bb;
+	unsigned int sz, issued = 0;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	atomic_set(&bb.done, 0);
+	bb.flags = 1 << BIO_UPTODATE;
+	bb.wait = &wait;
+	bb.end_io = NULL;
+
+	if (flags & BLKDEV_IFL_BARRIER) {
+		/* issue async barrier before the data */
+		ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
+		if (ret)
+			return ret;
+	}
+submit:
+	while (nr_sects != 0) {
+		bio = bio_alloc(gfp_mask,
+				min(nr_sects, (sector_t)BIO_MAX_PAGES));
+		if (!bio)
+			break;
+
+		bio->bi_sector = sector;
+		bio->bi_bdev   = bdev;
+		bio->bi_end_io = bio_batch_end_io;
+		if (flags & BLKDEV_IFL_WAIT)
+			bio->bi_private = &bb;
+
+		while(nr_sects != 0) {
+			sz = min(PAGE_SIZE >> 9 , nr_sects);
+			if (sz == 0)
+				/* bio has maximum size possible */
+				break;
+			ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+			nr_sects -= ret >> 9;
+			sector += ret >> 9;
+			if (ret < (sz << 9))
+				break;
+		}
+		issued++;
+		submit_bio(WRITE, bio);
+	}
+	/*
+	 * When all data bios are in flight. Send final barrier if requeted.
+	 */
+	if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
+		ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
+					flags & BLKDEV_IFL_WAIT);
+
+
+	if (flags & BLKDEV_IFL_WAIT)
+		/* Wait for bios in-flight */
+		while ( issued != atomic_read(&bb.done))
+			wait_for_completion(&wait);
+
+	if (!test_bit(BIO_UPTODATE, &bb.flags))
+		/* One of bios in the batch was completed with error.*/
+		ret = -EIO;
+
+	if (ret)
+		goto out;
+
+	if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if (nr_sects != 0)
+		goto submit;
+out:
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 59b9aed0ee7d..3ac2bd2fc485 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1008,7 +1008,8 @@ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
 			unsigned long);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
-
+extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 static inline int sb_issue_discard(struct super_block *sb,
 				   sector_t block, sector_t nr_blocks)
 {
-- 
cgit v1.2.3


From 01effb0dc1451fad55925873ffbfb88fa4eadce0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 11 May 2010 08:57:42 +0200
Subject: block: allow initialization of previously allocated request_queue

blk_init_queue() allocates the request_queue structure and then
initializes it as needed (request_fn, elevator, etc).

Split initialization out to blk_init_allocated_queue_node.
Introduce blk_init_allocated_queue wrapper function to model existing
blk_init_queue and blk_init_queue_node interfaces.

Export elv_register_queue to allow a newly added elevator to be
registered with sysfs.  Export elv_unregister_queue for symmetry.

These changes allow DM to initialize a device's request_queue with more
precision.  In particular, DM no longer unconditionally initializes a
full request_queue (elevator et al).  It only does so for a
request-based DM device.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 18 +++++++++++++++++-
 block/elevator.c       |  2 ++
 include/linux/blkdev.h |  5 +++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index e9a5ae25db8c..3bc5579d6f54 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -572,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
 	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
 
+	return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+struct request_queue *
+blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
+			 spinlock_t *lock)
+{
+	return blk_init_allocated_queue_node(q, rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_allocated_queue);
+
+struct request_queue *
+blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
+			      spinlock_t *lock, int node_id)
+{
 	if (!q)
 		return NULL;
 
@@ -605,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	blk_put_queue(q);
 	return NULL;
 }
-EXPORT_SYMBOL(blk_init_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue_node);
 
 int blk_get_queue(struct request_queue *q)
 {
diff --git a/block/elevator.c b/block/elevator.c
index 5e734592bb40..6df2b5056b51 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -930,6 +930,7 @@ int elv_register_queue(struct request_queue *q)
 	}
 	return error;
 }
+EXPORT_SYMBOL(elv_register_queue);
 
 static void __elv_unregister_queue(struct elevator_queue *e)
 {
@@ -942,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q)
 	if (q)
 		__elv_unregister_queue(q->elevator);
 }
+EXPORT_SYMBOL(elv_unregister_queue);
 
 void elv_register(struct elevator_type *e)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3ac2bd2fc485..346fd4856733 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -921,7 +921,12 @@ extern void blk_abort_queue(struct request_queue *);
  */
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 					spinlock_t *lock, int node_id);
+extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *,
+							   request_fn_proc *,
+							   spinlock_t *, int node_id);
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
+extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
+						      request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
-- 
cgit v1.2.3


From e913fc825dc685a444cb4c1d0f9d32f372f59861 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 17 May 2010 12:55:07 +0200
Subject: writeback: fix WB_SYNC_NONE writeback from umount

When umount calls sync_filesystem(), we first do a WB_SYNC_NONE
writeback to kick off writeback of pending dirty inodes, then follow
that up with a WB_SYNC_ALL to wait for it. Since umount already holds
the sb s_umount mutex, WB_SYNC_NONE ends up doing nothing and all
writeback happens as WB_SYNC_ALL. This can greatly slow down umount,
since WB_SYNC_ALL writeback is a data integrity operation and thus
a bigger hammer than simple WB_SYNC_NONE. For barrier aware file systems
it's a lot slower.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           | 48 ++++++++++++++++++++++++++++++++++-----------
 fs/sync.c                   |  2 +-
 include/linux/backing-dev.h |  2 +-
 include/linux/writeback.h   | 10 ++++++++++
 mm/page-writeback.c         |  4 ++--
 5 files changed, 51 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 760dc8d0b4ff..67db89786e7d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_args {
 	int for_kupdate:1;
 	int range_cyclic:1;
 	int for_background:1;
+	int sb_pinned:1;
 };
 
 /*
@@ -230,6 +231,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
+		/*
+		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+		 * lets make it explicitly clear.
+		 */
+		.sb_pinned	= 1,
 	};
 	struct bdi_work work;
 
@@ -245,21 +251,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+			 long nr_pages, int sb_locked)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
+		.sb_pinned	= sb_locked,
 	};
 
 	/*
@@ -577,7 +585,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return SB_NOT_PINNED;
 	}
@@ -751,6 +759,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate		= args->for_kupdate,
 		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
+		.sb_pinned		= args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -1193,6 +1202,18 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
+
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1204,17 +1225,22 @@ static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	__writeback_inodes_sb(sb, 0);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
+/**
+ * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+	__writeback_inodes_sb(sb, 1);
+}
+
 /**
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * @sb: the superblock
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..de6a44192832 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 		sync_inodes_sb(sb);
 	else
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb_locked(sb);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7534979d83bd..ff8bac63213f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -106,7 +106,7 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-				long nr_pages);
+				long nr_pages, int sb_locked);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index eb38a2c645f6..47e1c686cb02 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -65,6 +65,15 @@ struct writeback_control {
 	 * so we use a single control to update them
 	 */
 	unsigned no_nrwrite_index_update:1;
+
+	/*
+	 * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
+	 * the writeback code will pin the sb for the caller. However,
+	 * for eg umount, the caller does WB_SYNC_NONE but already has
+	 * the sb pinned. If the below is set, caller already has the
+	 * sb pinned.
+	 */
+	unsigned sb_pinned:1;
 };
 
 /*
@@ -73,6 +82,7 @@ struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_locked(struct super_block *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d0f2b3765f8d..53b2fcf2d283 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, NULL, 0);
+		bdi_start_writeback(bdi, NULL, 0, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -705,7 +705,7 @@ void laptop_mode_timer_fn(unsigned long data)
 	 */
 
 	if (bdi_has_dirty_io(&q->backing_dev_info))
-		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
+		bdi_start_writeback(&q->backing_dev_info, NULL, 0, nr_pages);
 }
 
 /*
-- 
cgit v1.2.3


From 6495d2c6d04f4c45411fdb1b40527c24015f39d6 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 24 Mar 2010 16:07:04 +0100
Subject: drbd: Implemented the --assume-clean option for drbdsetup resize

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 11 +++++++++--
 include/linux/drbd.h         |  3 ++-
 include/linux/drbd_nl.h      |  1 +
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6f7933376a11..19b9a2851e7b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1479,6 +1479,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 	int retcode = NO_ERROR;
 	int ldsc = 0; /* local disk size changed */
 	enum determine_dev_size dd;
+	enum dds_flags ddsf;
 
 	memset(&rs, 0, sizeof(struct resize));
 	if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
@@ -1502,13 +1503,19 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		goto fail;
 	}
 
+	if (rs.no_resync && mdev->agreed_pro_version < 93) {
+		retcode = ERR_NEED_APV_93;
+		goto fail;
+	}
+
 	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
 		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
 		ldsc = 1;
 	}
 
 	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
-	dd = drbd_determin_dev_size(mdev, rs.resize_force ? DDSF_FORCED : 0);
+	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+	dd = drbd_determin_dev_size(mdev, ddsf);
 	drbd_md_sync(mdev);
 	put_ldev(mdev);
 	if (dd == dev_size_error) {
@@ -1521,7 +1528,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			set_bit(RESIZE_PENDING, &mdev->flags);
 
 		drbd_send_uuids(mdev);
-		drbd_send_sizes(mdev, 1, 0);
+		drbd_send_sizes(mdev, 1, ddsf);
 	}
 
  fail:
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 4341b1a97a34..7944dd36ee3e 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -56,7 +56,7 @@ extern const char *drbd_buildtag(void);
 #define REL_VERSION "8.3.7"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 92
+#define PRO_VERSION_MAX 93
 
 
 enum drbd_io_error_p {
@@ -139,6 +139,7 @@ enum drbd_ret_codes {
 	ERR_DATA_NOT_CURRENT	= 150,
 	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
 	ERR_PERM		= 152,
+	ERR_NEED_APV_93		= 153,
 
 	/* insert new ones above this line */
 	AFTER_LAST_ERR_CODE
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index f7431a4ca608..de055c07c2ba 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -71,6 +71,7 @@ NL_PACKET(disconnect, 6, )
 NL_PACKET(resize, 7,
 	NL_INT64(		29,	T_MAY_IGNORE,	resize_size)
 	NL_BIT(			68,	T_MAY_IGNORE,	resize_force)
+	NL_BIT(			69,	T_MANDATORY,	no_resync)
 )
 
 NL_PACKET(syncer_conf, 8,
-- 
cgit v1.2.3


From 67c7ddd055c794f0d8e9466ca2d6b5cc0b73d4df Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 4 May 2010 11:12:00 +0200
Subject: drbd: Four new configuration settings for resync speed control

To reasonably control resync speed over drbd-proxy connections,
drbd has to measure the current delay of packets transmitted over
the (possibly congested) data socket vs the meta-data socket.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c |  4 ++++
 include/linux/drbd_limits.h  | 16 ++++++++++++++++
 include/linux/drbd_nl.h      |  4 ++++
 3 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6cb703686b28..93d150661f4b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1555,6 +1555,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		sc.rate       = DRBD_RATE_DEF;
 		sc.after      = DRBD_AFTER_DEF;
 		sc.al_extents = DRBD_AL_EXTENTS_DEF;
+		sc.dp_volume  = DRBD_DP_VOLUME_DEF;
+		sc.dp_interval = DRBD_DP_INTERVAL_DEF;
+		sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
+		sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
 	} else
 		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 51f47a586ad8..440b42e38e89 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -133,5 +133,21 @@
 #define DRBD_MAX_BIO_BVECS_MAX 128
 #define DRBD_MAX_BIO_BVECS_DEF 0
 
+#define DRBD_DP_VOLUME_MIN 4
+#define DRBD_DP_VOLUME_MAX 1048576
+#define DRBD_DP_VOLUME_DEF 16384
+
+#define DRBD_DP_INTERVAL_MIN 1
+#define DRBD_DP_INTERVAL_MAX 600
+#define DRBD_DP_INTERVAL_DEF 5
+
+#define DRBD_RS_THROTTLE_TH_MIN 1
+#define DRBD_RS_THROTTLE_TH_MAX 600
+#define DRBD_RS_THROTTLE_TH_DEF 20
+
+#define DRBD_RS_HOLD_OFF_TH_MIN 1
+#define DRBD_RS_HOLD_OFF_TH_MAX 6000
+#define DRBD_RS_HOLD_OFF_TH_DEF 100
+
 #undef RANGE
 #endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index de055c07c2ba..ce77a746fc9d 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -78,6 +78,10 @@ NL_PACKET(syncer_conf, 8,
 	NL_INTEGER(	30,	T_MAY_IGNORE,	rate)
 	NL_INTEGER(	31,	T_MAY_IGNORE,	after)
 	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
+	NL_INTEGER(     71,	T_MAY_IGNORE,	dp_volume)
+	NL_INTEGER(     72,	T_MAY_IGNORE,	dp_interval)
+	NL_INTEGER(     73,	T_MAY_IGNORE,	throttle_th)
+	NL_INTEGER(     74,	T_MAY_IGNORE,	hold_off_th)
 	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
 	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
 	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
-- 
cgit v1.2.3


From 6423104b6a1e6f0c18be60e8c33f02d263331d5e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 21 May 2010 20:00:35 +0200
Subject: writeback: fixups for !dirty_writeback_centisecs

Commit 69b62d01 fixed up most of the places where we would enter
busy schedule() spins when disabling the periodic background
writeback. This fixes up the sb timer so that it doesn't get
hammered on with the delay disabled, and ensures that it gets
rearmed if needed when /proc/sys/vm/dirty_writeback_centisecs
gets modified.

bdi_forker_task() also needs to check for !dirty_writeback_centisecs
and use schedule() appropriately, fix that up too.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/backing-dev.h |  1 +
 mm/backing-dev.c            | 15 ++++++++++-----
 mm/page-writeback.c         |  1 +
 3 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index ff8bac63213f..e6e0cb5437e6 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -109,6 +109,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 				long nr_pages, int sb_locked);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
+void bdi_arm_supers_timer(void);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 707d0dc6da0f..660a87a22511 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -48,7 +48,6 @@ static struct timer_list sync_supers_timer;
 
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
 
 static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 
@@ -252,7 +251,7 @@ static int __init default_bdi_init(void)
 
 	init_timer(&sync_supers_timer);
 	setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
-	arm_supers_timer();
+	bdi_arm_supers_timer();
 
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
@@ -374,10 +373,13 @@ static int bdi_sync_supers(void *unused)
 	return 0;
 }
 
-static void arm_supers_timer(void)
+void bdi_arm_supers_timer(void)
 {
 	unsigned long next;
 
+	if (!dirty_writeback_interval)
+		return;
+
 	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
 	mod_timer(&sync_supers_timer, round_jiffies_up(next));
 }
@@ -385,7 +387,7 @@ static void arm_supers_timer(void)
 static void sync_supers_timer_fn(unsigned long unused)
 {
 	wake_up_process(sync_supers_tsk);
-	arm_supers_timer();
+	bdi_arm_supers_timer();
 }
 
 static int bdi_forker_task(void *ptr)
@@ -428,7 +430,10 @@ static int bdi_forker_task(void *ptr)
 
 			spin_unlock_bh(&bdi_lock);
 			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
-			schedule_timeout(wait);
+			if (wait)
+				schedule_timeout(wait);
+			else
+				schedule();
 			try_to_freeze();
 			continue;
 		}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 53b2fcf2d283..0d7bbe859550 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -690,6 +690,7 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);
+	bdi_arm_supers_timer();
 	return 0;
 }
 
-- 
cgit v1.2.3


From c3e33e043f5e9c583aa59d5591a614b2a8243d3a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 15 May 2010 20:09:29 +0200
Subject: block,ide: simplify bdops->set_capacity() to
 ->unlock_native_capacity()

bdops->set_capacity() is unnecessarily generic.  All that's required
is a simple one way notification to lower level driver telling it to
try to unlock native capacity.  There's no reason to pass in target
capacity or return the new capacity.  The former is always the
inherent native capacity and the latter can be handled via the usual
device resize / revalidation path.  In fact, the current API is always
used that way.

Replace ->set_capacity() with ->unlock_native_capacity() which take
only @disk and doesn't return anything.  IDE which is the only current
user of the API is converted accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-disk.c | 40 ++++++++++++++++------------------------
 drivers/ide/ide-gd.c   | 11 ++++-------
 fs/partitions/check.c  |  4 ++--
 include/linux/blkdev.h |  3 +--
 include/linux/ide.h    |  2 +-
 5 files changed, 24 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 3b128dce9c3a..33d65039cce9 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -407,32 +407,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive)
 	return 0;
 }
 
-static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity)
+static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
 {
-	u64 set = min(capacity, drive->probed_capacity);
 	u16 *id = drive->id;
 	int lba48 = ata_id_lba48_enabled(id);
 
 	if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
 	    ata_id_hpa_enabled(id) == 0)
-		goto out;
+		return;
 
 	/*
 	 * according to the spec the SET MAX ADDRESS command shall be
 	 * immediately preceded by a READ NATIVE MAX ADDRESS command
 	 */
-	capacity = ide_disk_hpa_get_native_capacity(drive, lba48);
-	if (capacity == 0)
-		goto out;
-
-	set = ide_disk_hpa_set_capacity(drive, set, lba48);
-	if (set) {
-		/* needed for ->resume to disable HPA */
-		drive->dev_flags |= IDE_DFLAG_NOHPA;
-		return set;
-	}
-out:
-	return drive->capacity64;
+	if (!ide_disk_hpa_get_native_capacity(drive, lba48))
+		return;
+
+	if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
+		drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
 }
 
 static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
@@ -783,13 +775,13 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
 }
 
 const struct ide_disk_ops ide_ata_disk_ops = {
-	.check		= ide_disk_check,
-	.set_capacity	= ide_disk_set_capacity,
-	.get_capacity	= ide_disk_get_capacity,
-	.setup		= ide_disk_setup,
-	.flush		= ide_disk_flush,
-	.init_media	= ide_disk_init_media,
-	.set_doorlock	= ide_disk_set_doorlock,
-	.do_request	= ide_do_rw_disk,
-	.ioctl		= ide_disk_ioctl,
+	.check			= ide_disk_check,
+	.unlock_native_capacity	= ide_disk_unlock_native_capacity,
+	.get_capacity		= ide_disk_get_capacity,
+	.setup			= ide_disk_setup,
+	.flush			= ide_disk_flush,
+	.init_media		= ide_disk_init_media,
+	.set_doorlock		= ide_disk_set_doorlock,
+	.do_request		= ide_do_rw_disk,
+	.ioctl			= ide_disk_ioctl,
 };
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index c32d83996ae1..c102d23d9b38 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -288,17 +288,14 @@ static int ide_gd_media_changed(struct gendisk *disk)
 	return ret;
 }
 
-static unsigned long long ide_gd_set_capacity(struct gendisk *disk,
-					      unsigned long long capacity)
+static void ide_gd_unlock_native_capacity(struct gendisk *disk)
 {
 	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
 	const struct ide_disk_ops *disk_ops = drive->disk_ops;
 
-	if (disk_ops->set_capacity)
-		return disk_ops->set_capacity(drive, capacity);
-
-	return drive->capacity64;
+	if (disk_ops->unlock_native_capacity)
+		disk_ops->unlock_native_capacity(drive);
 }
 
 static int ide_gd_revalidate_disk(struct gendisk *disk)
@@ -329,7 +326,7 @@ static const struct block_device_operations ide_gd_ops = {
 	.locked_ioctl		= ide_gd_ioctl,
 	.getgeo			= ide_gd_getgeo,
 	.media_changed		= ide_gd_media_changed,
-	.set_capacity		= ide_gd_set_capacity,
+	.unlock_native_capacity	= ide_gd_unlock_native_capacity,
 	.revalidate_disk	= ide_gd_revalidate_disk
 };
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8f01df354f04..4f1fee0355ad 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -601,10 +601,10 @@ rescan:
 			       "%s: p%d size %llu exceeds device capacity, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
+			if (bdops->unlock_native_capacity &&
 			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
 				printk(KERN_CONT "enabling native capacity\n");
-				bdops->set_capacity(disk, ~0ULL);
+				bdops->unlock_native_capacity(disk);
 				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
 				/* free state and restart */
 				kfree(state);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 346fd4856733..be411c12ebbe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1330,8 +1330,7 @@ struct block_device_operations {
 	int (*direct_access) (struct block_device *, sector_t,
 						void **, unsigned long *);
 	int (*media_changed) (struct gendisk *);
-	unsigned long long (*set_capacity) (struct gendisk *,
-						unsigned long long);
+	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
 	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	struct module *owner;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 3239d1c10acb..b6d448048ae2 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -362,7 +362,7 @@ struct ide_drive_s;
 struct ide_disk_ops {
 	int		(*check)(struct ide_drive_s *, const char *);
 	int		(*get_capacity)(struct ide_drive_s *);
-	u64		(*set_capacity)(struct ide_drive_s *, u64);
+	void		(*unlock_native_capacity)(struct ide_drive_s *);
 	void		(*setup)(struct ide_drive_s *);
 	void		(*flush)(struct ide_drive_s *);
 	int		(*init_media)(struct ide_drive_s *, struct gendisk *);
-- 
cgit v1.2.3


From c2c4986eddaa7dc3d036cb2bfa5c8c5f1f2492a0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 20 May 2010 09:18:47 +0200
Subject: writeback: fix problem with !CONFIG_BLOCK compilation

When CONFIG_BLOCK isn't enabled:

mm/page-writeback.c: In function 'laptop_mode_timer_fn':
mm/page-writeback.c:708: error: dereferencing pointer to incomplete type
mm/page-writeback.c:709: error: dereferencing pointer to incomplete type

Fix this by essentially eliminating the laptop sync handlers when
CONFIG_BLOCK isn't set, as most are only used from the block layer code.
The exception is laptop_sync_completion() which is used from sys_sync(),
make that an empty declaration in that case.

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/super.c                | 1 +
 include/linux/writeback.h | 4 ++++
 mm/page-writeback.c       | 2 ++
 3 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index dc72491a19f9..1527e6a0ee35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 47e1c686cb02..cc97d6caf2b3 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -106,10 +106,14 @@ static inline void inode_sync_wait(struct inode *inode)
 /*
  * mm/page-writeback.c
  */
+#ifdef CONFIG_BLOCK
 void laptop_io_completion(struct backing_dev_info *info);
 void laptop_sync_completion(void);
 void laptop_mode_sync(struct work_struct *work);
 void laptop_mode_timer_fn(unsigned long data);
+#else
+static inline void laptop_sync_completion(void) { }
+#endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 
 /* These are exported to sysctl. */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0d7bbe859550..9886424e1864 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -694,6 +694,7 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	return 0;
 }
 
+#ifdef CONFIG_BLOCK
 void laptop_mode_timer_fn(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *)data;
@@ -735,6 +736,7 @@ void laptop_sync_completion(void)
 
 	rcu_read_unlock();
 }
+#endif
 
 /*
  * If ratelimit_pages is too high then we can get into dirty-data overload
-- 
cgit v1.2.3


From 3d42b3612891baecf709d93f28655a6882a65d41 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 20 May 2010 12:14:54 +0200
Subject: drbd: This is now equivalent to drbd release 8.3.8rc1

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/drbd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 7944dd36ee3e..68530521ad00 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,10 +53,10 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.7"
+#define REL_VERSION "8.3.8rc1"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 93
+#define PRO_VERSION_MAX 94
 
 
 enum drbd_io_error_p {
-- 
cgit v1.2.3


From 35f3d14dbbc58447c61e38a162ea10add6b31dc7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 20 May 2010 10:43:18 +0200
Subject: pipe: add support for shrinking and growing pipes

This patch adds F_GETPIPE_SZ and F_SETPIPE_SZ fcntl() actions for
growing and shrinking the size of a pipe and adjusts pipe.c and splice.c
(and relay and network splice) usage to work with these larger (or smaller)
pipes.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fcntl.c                |   5 ++
 fs/pipe.c                 | 107 ++++++++++++++++++++++++++++----
 fs/splice.c               | 151 ++++++++++++++++++++++++++++++++--------------
 include/linux/fcntl.h     |   6 ++
 include/linux/pipe_fs_i.h |  11 ++--
 include/linux/splice.h    |   7 +++
 kernel/relay.c            |  15 +++--
 kernel/trace/trace.c      |  60 ++++++++++--------
 net/core/skbuff.c         |  38 ++++++------
 9 files changed, 292 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..bcba960328fa 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
 #include <linux/dnotify.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_NOTIFY:
 		err = fcntl_dirnotify(fd, filp, arg);
 		break;
+	case F_SETPIPE_SZ:
+	case F_GETPIPE_SZ:
+		err = pipe_fcntl(filp, cmd, arg);
+		break;
 	default:
 		break;
 	}
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..054b8a6a2c7a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/log2.h>
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
@@ -390,7 +391,7 @@ redo:
 			if (!buf->len) {
 				buf->ops = NULL;
 				ops->release(pipe, buf);
-				curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
 				pipe->curbuf = curbuf;
 				pipe->nrbufs = --bufs;
 				do_wakeup = 1;
@@ -472,7 +473,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 	if (pipe->nrbufs && chars != 0) {
 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
-							(PIPE_BUFFERS-1);
+							(pipe->buffers - 1);
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		const struct pipe_buf_operations *ops = buf->ops;
 		int offset = buf->offset + buf->len;
@@ -518,8 +519,8 @@ redo1:
 			break;
 		}
 		bufs = pipe->nrbufs;
-		if (bufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+		if (bufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
 			char *src;
@@ -580,7 +581,7 @@ redo2:
 			if (!total_len)
 				break;
 		}
-		if (bufs < PIPE_BUFFERS)
+		if (bufs < pipe->buffers)
 			continue;
 		if (filp->f_flags & O_NONBLOCK) {
 			if (!ret)
@@ -640,7 +641,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			nrbufs = pipe->nrbufs;
 			while (--nrbufs >= 0) {
 				count += pipe->bufs[buf].len;
-				buf = (buf+1) & (PIPE_BUFFERS-1);
+				buf = (buf+1) & (pipe->buffers - 1);
 			}
 			mutex_unlock(&inode->i_mutex);
 
@@ -671,7 +672,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
 		/*
 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
@@ -877,25 +878,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (pipe) {
-		init_waitqueue_head(&pipe->wait);
-		pipe->r_counter = pipe->w_counter = 1;
-		pipe->inode = inode;
+		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		if (pipe->bufs) {
+			init_waitqueue_head(&pipe->wait);
+			pipe->r_counter = pipe->w_counter = 1;
+			pipe->inode = inode;
+			pipe->buffers = PIPE_DEF_BUFFERS;
+			return pipe;
+		}
+		kfree(pipe);
 	}
 
-	return pipe;
+	return NULL;
 }
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
 			buf->ops->release(pipe, buf);
 	}
 	if (pipe->tmp_page)
 		__free_page(pipe->tmp_page);
+	kfree(pipe->bufs);
 	kfree(pipe);
 }
 
@@ -1093,6 +1101,81 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 	return sys_pipe2(fildes, 0);
 }
 
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+	struct pipe_buffer *bufs;
+
+	/*
+	 * Must be a power-of-2 currently
+	 */
+	if (!is_power_of_2(arg))
+		return -EINVAL;
+
+	/*
+	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+	 * expect a lot of shrink+grow operations, just free and allocate
+	 * again like we would do for growing. If the pipe currently
+	 * contains more buffers than arg, then return busy.
+	 */
+	if (arg < pipe->nrbufs)
+		return -EBUSY;
+
+	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (unlikely(!bufs))
+		return -ENOMEM;
+
+	/*
+	 * The pipe array wraps around, so just start the new one at zero
+	 * and adjust the indexes.
+	 */
+	if (pipe->nrbufs) {
+		const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
+		const unsigned int head = pipe->nrbufs - tail;
+
+		if (head)
+			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+		if (tail)
+			memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+	}
+
+	pipe->curbuf = 0;
+	kfree(pipe->bufs);
+	pipe->bufs = bufs;
+	pipe->buffers = arg;
+	return arg;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe;
+	long ret;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+	if (!pipe)
+		return -EBADF;
+
+	mutex_lock(&pipe->inode->i_mutex);
+
+	switch (cmd) {
+	case F_SETPIPE_SZ:
+		ret = pipe_set_size(pipe, arg);
+		break;
+	case F_GETPIPE_SZ:
+		ret = pipe->buffers;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&pipe->inode->i_mutex);
+	return ret;
+}
+
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			break;
 		}
 
-		if (pipe->nrbufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+		if (pipe->nrbufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 
 			buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 
 			if (!--spd->nr_pages)
 				break;
-			if (pipe->nrbufs < PIPE_BUFFERS)
+			if (pipe->nrbufs < pipe->buffers)
 				continue;
 
 			break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 	page_cache_release(spd->pages[i]);
 }
 
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return 0;
+
+	spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+	if (spd->pages && spd->partial)
+		return 0;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+	return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+		       struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+}
+
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
 	/*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 		}
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 	}
 
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 					error = -ENOMEM;
 					break;
 				}
-				page_cache_release(pages[page_nr]);
-				pages[page_nr] = page;
+				page_cache_release(spd.pages[page_nr]);
+				spd.pages[page_nr] = page;
 			}
 			/*
 			 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
 			len = this_len;
 		}
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		loff = 0;
 		spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 }
 
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 
 		page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		len -= this_len;
 		offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 		}
 		res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 		*ppos += res;
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 
 err:
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 EXPORT_SYMBOL(default_file_splice_read);
 
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 			buf->ops = NULL;
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			if (pipe->inode)
 				sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 		if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
 
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 
 		nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 	};
+	long ret;
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 		return -EBADF;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1831,7 +1890,7 @@ retry:
 		}
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 
 		if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			input_wakeup = true;
 		} else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 		/*
 		 * Get a reference to this pipe buffer,
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 86037400a6e3..afc00af3229b 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -21,6 +21,12 @@
  */
 #define F_NOTIFY	(F_LINUX_SPECIFIC_BASE+2)
 
+/*
+ * Set and get of pipe page size array
+ */
+#define F_SETPIPE_SZ	(F_LINUX_SPECIFIC_BASE + 7)
+#define F_GETPIPE_SZ	(F_LINUX_SPECIFIC_BASE + 8)
+
 /*
  * Types of directory notifications that may be requested.
  */
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index b43a9e039059..65f4282fcbaf 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -3,7 +3,7 @@
 
 #define PIPEFS_MAGIC 0x50495045
 
-#define PIPE_BUFFERS (16)
+#define PIPE_DEF_BUFFERS	16
 
 #define PIPE_BUF_FLAG_LRU	0x01	/* page is on the LRU */
 #define PIPE_BUF_FLAG_ATOMIC	0x02	/* was atomically mapped */
@@ -44,17 +44,17 @@ struct pipe_buffer {
  **/
 struct pipe_inode_info {
 	wait_queue_head_t wait;
-	unsigned int nrbufs, curbuf;
-	struct page *tmp_page;
+	unsigned int nrbufs, curbuf, buffers;
 	unsigned int readers;
 	unsigned int writers;
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
 	struct inode *inode;
-	struct pipe_buffer bufs[PIPE_BUFFERS];
+	struct pipe_buffer *bufs;
 };
 
 /*
@@ -154,4 +154,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
 
+/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
+long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
+
 #endif
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 18e7c7c0cae6..997c3b4c212b 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *,
 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
 				      splice_direct_actor *);
 
+/*
+ * for dynamic pipe sizing
+ */
+extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *);
+extern void splice_shrink_spd(struct pipe_inode_info *,
+				struct splice_pipe_desc *);
+
 #endif
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..4268287148c1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
 	size_t read_subbuf = read_start / subbuf_size;
 	size_t padding = rbuf->padding[read_subbuf];
 	size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
 
 	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
 		return 0;
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
 
 	/*
 	 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
-	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
+	nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
 
 	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
 		}
 	}
 
+	ret = 0;
 	if (!spd.nr_pages)
-		return 0;
+		goto out;
 
 	ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
 	if (ret < 0 || ret < total_len)
-		return ret;
+		goto out;
 
         if (read_start + ret == nonpad_end)
                 ret += padding;
 
+out:
+	splice_shrink_spd(pipe, &spd);
         return ret;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a04065..7b155a0e6f31 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3269,12 +3269,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 					size_t len,
 					unsigned int flags)
 {
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages_def[PIPE_DEF_BUFFERS];
+	struct partial_page partial_def[PIPE_DEF_BUFFERS];
 	struct trace_iterator *iter = filp->private_data;
 	struct splice_pipe_desc spd = {
-		.pages		= pages,
-		.partial	= partial,
+		.pages		= pages_def,
+		.partial	= partial_def,
 		.nr_pages	= 0, /* This gets updated below. */
 		.flags		= flags,
 		.ops		= &tracing_pipe_buf_ops,
@@ -3285,6 +3285,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	size_t rem;
 	unsigned int i;
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
 	if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3315,23 +3318,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	trace_access_lock(iter->cpu_file);
 
 	/* Fill as many pages as possible. */
-	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
-		pages[i] = alloc_page(GFP_KERNEL);
-		if (!pages[i])
+	for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+		spd.pages[i] = alloc_page(GFP_KERNEL);
+		if (!spd.pages[i])
 			break;
 
 		rem = tracing_fill_pipe_page(rem, iter);
 
 		/* Copy the data into the page, so we can start over. */
 		ret = trace_seq_to_buffer(&iter->seq,
-					  page_address(pages[i]),
+					  page_address(spd.pages[i]),
 					  iter->seq.len);
 		if (ret < 0) {
-			__free_page(pages[i]);
+			__free_page(spd.pages[i]);
 			break;
 		}
-		partial[i].offset = 0;
-		partial[i].len = iter->seq.len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = iter->seq.len;
 
 		trace_seq_init(&iter->seq);
 	}
@@ -3342,12 +3345,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 
 	spd.nr_pages = i;
 
-	return splice_to_pipe(pipe, &spd);
+	ret = splice_to_pipe(pipe, &spd);
+out:
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 
 out_err:
 	mutex_unlock(&iter->mutex);
-
-	return ret;
+	goto out;
 }
 
 static ssize_t
@@ -3746,11 +3751,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			    unsigned int flags)
 {
 	struct ftrace_buffer_info *info = file->private_data;
-	struct partial_page partial[PIPE_BUFFERS];
-	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial_def[PIPE_DEF_BUFFERS];
+	struct page *pages_def[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
-		.pages		= pages,
-		.partial	= partial,
+		.pages		= pages_def,
+		.partial	= partial_def,
 		.flags		= flags,
 		.ops		= &buffer_pipe_buf_ops,
 		.spd_release	= buffer_spd_release,
@@ -3759,22 +3764,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	int entries, size, i;
 	size_t ret;
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	if (*ppos & (PAGE_SIZE - 1)) {
 		WARN_ONCE(1, "Ftrace: previous read must page-align\n");
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out;
 	}
 
 	if (len & (PAGE_SIZE - 1)) {
 		WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
-		if (len < PAGE_SIZE)
-			return -EINVAL;
+		if (len < PAGE_SIZE) {
+			ret = -EINVAL;
+			goto out;
+		}
 		len &= PAGE_MASK;
 	}
 
 	trace_access_lock(info->cpu);
 	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 
-	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
+	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3829,11 +3840,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		else
 			ret = 0;
 		/* TODO: block */
-		return ret;
+		goto out;
 	}
 
 	ret = splice_to_pipe(pipe, &spd);
-
+	splice_shrink_spd(pipe, &spd);
+out:
 	return ret;
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e060c91e..931981774b1a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1417,12 +1417,13 @@ new_page:
 /*
  * Fill page/offset/length into spd, if it can hold more pages.
  */
-static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+static inline int spd_fill_page(struct splice_pipe_desc *spd,
+				struct pipe_inode_info *pipe, struct page *page,
 				unsigned int *len, unsigned int offset,
 				struct sk_buff *skb, int linear,
 				struct sock *sk)
 {
-	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+	if (unlikely(spd->nr_pages == pipe->buffers))
 		return 1;
 
 	if (linear) {
@@ -1458,7 +1459,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
 				   unsigned int plen, unsigned int *off,
 				   unsigned int *len, struct sk_buff *skb,
 				   struct splice_pipe_desc *spd, int linear,
-				   struct sock *sk)
+				   struct sock *sk,
+				   struct pipe_inode_info *pipe)
 {
 	if (!*len)
 		return 1;
@@ -1481,7 +1483,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
 		/* the linear region may spread across several pages  */
 		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
 
-		if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk))
+		if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
 			return 1;
 
 		__segment_seek(&page, &poff, &plen, flen);
@@ -1496,9 +1498,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
  * Map linear and fragment data from the skb to spd. It reports failure if the
  * pipe is full or if we already spliced the requested length.
  */
-static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
-			     unsigned int *len, struct splice_pipe_desc *spd,
-			     struct sock *sk)
+static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+			     unsigned int *offset, unsigned int *len,
+			     struct splice_pipe_desc *spd, struct sock *sk)
 {
 	int seg;
 
@@ -1508,7 +1510,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 	if (__splice_segment(virt_to_page(skb->data),
 			     (unsigned long) skb->data & (PAGE_SIZE - 1),
 			     skb_headlen(skb),
-			     offset, len, skb, spd, 1, sk))
+			     offset, len, skb, spd, 1, sk, pipe))
 		return 1;
 
 	/*
@@ -1518,7 +1520,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
 
 		if (__splice_segment(f->page, f->page_offset, f->size,
-				     offset, len, skb, spd, 0, sk))
+				     offset, len, skb, spd, 0, sk, pipe))
 			return 1;
 	}
 
@@ -1535,8 +1537,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
 		    unsigned int flags)
 {
-	struct partial_page partial[PIPE_BUFFERS];
-	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1546,12 +1548,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 	};
 	struct sk_buff *frag_iter;
 	struct sock *sk = skb->sk;
+	int ret = 0;
+
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
 
 	/*
 	 * __skb_splice_bits() only fails if the output has no room left,
 	 * so no point in going over the frag_list for the error case.
 	 */
-	if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk))
+	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
 		goto done;
 	else if (!tlen)
 		goto done;
@@ -1562,14 +1568,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 	skb_walk_frags(skb, frag_iter) {
 		if (!tlen)
 			break;
-		if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk))
+		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
 			break;
 	}
 
 done:
 	if (spd.nr_pages) {
-		int ret;
-
 		/*
 		 * Drop the socket lock, otherwise we have reverse
 		 * locking dependencies between sk_lock and i_mutex
@@ -1582,10 +1586,10 @@ done:
 		release_sock(sk);
 		ret = splice_to_pipe(pipe, &spd);
 		lock_sock(sk);
-		return ret;
 	}
 
-	return 0;
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3


From b492e95be0ae672922f4734acf3f5d35c30be948 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 19 May 2010 21:03:16 +0200
Subject: pipe: set lower and upper limit on max pages in the pipe page array

We need at least two to guarantee proper POSIX behaviour, so
never allow a smaller limit than that.

Also expose a /proc/sys/fs/pipe-max-pages sysctl file that allows
root to define a sane upper limit. Make it default to 16 times the
default size, which is 16 pages.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/pipe.c                 | 15 +++++++++++++++
 include/linux/pipe_fs_i.h |  2 ++
 kernel/sysctl.c           |  9 +++++++++
 3 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index 054b8a6a2c7a..d79872eba09a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -19,10 +19,17 @@
 #include <linux/pagemap.h>
 #include <linux/audit.h>
 #include <linux/syscalls.h>
+#include <linux/fcntl.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
+/*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-pages
+ */
+unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+
 /*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
@@ -1162,6 +1169,14 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case F_SETPIPE_SZ:
+		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
+			return -EINVAL;
+		/*
+		 * The pipe needs to be at least 2 pages large to
+		 * guarantee POSIX behaviour.
+		 */
+		if (arg < 2)
+			return -EINVAL;
 		ret = pipe_set_size(pipe, arg);
 		break;
 	case F_GETPIPE_SZ:
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 65f4282fcbaf..16de3933c45e 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *);
 void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
+extern unsigned int pipe_max_pages;
+
 /* Drop the inode semaphore and wait for a pipe event, atomically */
 void pipe_wait(struct pipe_inode_info *pipe);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f5fc12..c649d1c5fe09 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,7 @@
 #include <linux/slow-work.h>
 #include <linux/perf_event.h>
 #include <linux/kprobes.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1423,6 +1424,14 @@ static struct ctl_table fs_table[] = {
 		.child		= binfmt_misc_table,
 	},
 #endif
+	{
+		.procname	= "pipe-max-pages",
+		.data		= &pipe_max_pages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &two,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3