From ba32929a91fe2c0628f5be62d1597b379c8d3062 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:29:58 +0900
Subject: block: make add_partition() return pointer to hd_struct

Make add_partition() return pointer to the new hd_struct on success
and ERR_PTR() value on failure.  This change will be used to fix md
autodetection bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ioctl.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/ioctl.c b/block/ioctl.c
index c832d639b6e2..d03985b04d67 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -18,7 +18,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	struct disk_part_iter piter;
 	long long start, length;
 	int partno;
-	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -61,10 +60,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			disk_part_iter_exit(&piter);
 
 			/* all seems OK */
-			err = add_partition(disk, partno, start, length,
-					    ADDPART_FLAG_NONE);
+			part = add_partition(disk, partno, start, length,
+					     ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
-			return err;
+			return IS_ERR(part) ? PTR_ERR(part) : 0;
 		case BLKPG_DEL_PARTITION:
 			part = disk_get_part(disk, partno);
 			if (!part)
-- 
cgit v1.2.3


From 561ec68e4de7947167937c49c451728e6b19e63b Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Fri, 14 Nov 2008 08:26:30 +0100
Subject: block: fix boot failure with CONFIG_DEBUG_BLOCK_EXT_DEVT=y and nash

We run into system boot failure with kernel 2.6.28-rc. We found it on a
couple of machines, including T61 notebook, nehalem machine, and another
HPC NX6325 notebook.  All the machines use FedoraCore 8 or FedoraCore 9.
With kernel prior to 2.6.28-rc, system boot doesn't fail.

I debug it and locate the root cause. Pls. see
http://bugzilla.kernel.org/show_bug.cgi?id=11899
https://bugzilla.redhat.com/show_bug.cgi?id=471517

As a matter of fact, there are 2 bugs.

1)root=/dev/sda1, system boot randomly fails. Mostly, boot for 5 times
and fails once. nash has a bug. Some of its functions misuse return
value 0.  Sometimes, 0 means timeout and no uevent available. Sometimes,
0 means nash gets an uevent, but the uevent isn't block-related (for
exmaple, usb). If by coincidence, kernel tells nash that uevents are
available, but kernel also set timeout, nash might stops collecting
other uevents in queue if current uevent isn't block-related.  I work
out a patch for nash to fix it.
http://bugzilla.kernel.org/attachment.cgi?id=18858

2) root=LABEL=/, system always can't boot. initrd init reports
switchroot fails. Here is an executation branch of nash when booting:
    (1) nash read /sys/block/sda/dev; Assume major is 8 (on my desktop)
    (2) nash query /proc/devices with the major number; It found line
	"8 sd";
    (3) nash use 'sd' to search its own probe table to find device (DISK)
	type for the device and add it to its own list;
    (4) Later on, it probes all devices in its list to get filesystem
	labels; scsi register "8 sd" always.

When major is 259, nash fails to find the device(DISK) type. I enables
CONFIG_DEBUG_BLOCK_EXT_DEVT=y when compiling kernel, so 259 is picked up
for device /dev/sda1, which causes nash to fail to find device (DISK)
type.

To fixing issue 2), I create a patch for nash and another patch for
kernel.

http://bugzilla.kernel.org/attachment.cgi?id=18859
http://bugzilla.kernel.org/attachment.cgi?id=18837

Below is the patch for kernel 2.6.28-rc4. It registers blkext, a new
block device in proc/devices.

With 2 patches on nash and 1 patch on kernel, I boot my machines for
dozens of times without failure.

Signed-off-by Zhang Yanmin <yanmin.zhang@linux.intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 4e5e7493f676..27549e470da5 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -768,6 +768,8 @@ static int __init genhd_device_init(void)
 	bdev_map = kobj_map_init(base_probe, &block_class_lock);
 	blk_dev_init();
 
+	register_blkdev(BLOCK_EXT_MAJOR, "blkext");
+
 #ifndef CONFIG_SYSFS_DEPRECATED
 	/* create top-level block dir */
 	block_depr = kobject_create_and_add("block", NULL);
-- 
cgit v1.2.3


From c26156b2534c75bb3cdedf76f6ad1340971cf5bd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 18 Nov 2008 15:07:05 +0100
Subject: block: hold extra reference to bio in blk_rq_map_user_iov()

If the size passed in is OK but we end up mapping too many segments,
we call the unmap path directly like from IO completion. But from IO
completion we have an extra reference to the bio, so this error case
goes OOPS when it attempts to free and already free bio.

Fix it by getting an extra reference to the bio before calling the
unmap failure case.

Reported-by: Petr Vandrovec <vandrove@vc.cvut.cz>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index 4849fa36161e..0f4b4b881811 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -217,6 +217,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		return PTR_ERR(bio);
 
 	if (bio->bi_size != len) {
+		/*
+		 * Grab an extra reference to this bio, as bio_unmap_user()
+		 * expects to be able to drop it twice as it happens on the
+		 * normal IO completion path
+		 */
+		bio_get(bio);
 		bio_endio(bio, 0);
 		bio_unmap_user(bio);
 		return -EINVAL;
-- 
cgit v1.2.3


From 53cc0b2948bcb8a084982e6c1f9bd7b337e0df38 Mon Sep 17 00:00:00 2001
From: Petr Vandrovec <petr@vandrovec.name>
Date: Wed, 19 Nov 2008 11:12:14 +0100
Subject: When block layer fails to map iov, it calls bio_unmap_user to undo
 mapping.  Which is good if pages were mapped - but if they were provided by
 someone else and just copied then bad things happen - pages are released once
 here, and once by caller, leading to user triggerable BUG at
 include/linux/mm.h:246.

Signed-off-by: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index 0f4b4b881811..2990447f45e9 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -224,7 +224,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		 */
 		bio_get(bio);
 		bio_endio(bio, 0);
-		bio_unmap_user(bio);
+		__blk_rq_unmap_user(bio);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From bf91db18ac2852a3ff39fe25ff56c5557c0fff78 Mon Sep 17 00:00:00 2001
From: Cheng Renquan <crquan@gmail.com>
Date: Thu, 20 Nov 2008 08:37:37 +0100
Subject: block: set disk->node_id before it's being used

disk->node_id will be refered in allocating in disk_expand_part_tbl, so we
should set it before disk->node_id is refered.

Signed-off-by: Cheng Renquan <crquan@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 27549e470da5..2f7feda61e35 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1102,6 +1102,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 			kfree(disk);
 			return NULL;
 		}
+		disk->node_id = node_id;
 		if (disk_expand_part_tbl(disk, 0)) {
 			free_part_stats(&disk->part0);
 			kfree(disk);
@@ -1116,7 +1117,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 		device_initialize(disk_to_dev(disk));
 		INIT_WORK(&disk->async_notify,
 			media_change_notify_thread);
-		disk->node_id = node_id;
 	}
 	return disk;
 }
-- 
cgit v1.2.3


From 53a08807c01989c6847bb135d8d43f61c5dfdda5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Dec 2008 12:41:26 +0100
Subject: block: internal dequeue shouldn't start timer

blkdev_dequeue_request() and elv_dequeue_request() are equivalent and
both start the timeout timer.  Barrier code dequeues the original
barrier request but doesn't passes the request itself to lower level
driver, only broken down proxy requests; however, as the original
barrier code goes through the same dequeue path and timeout timer is
started on it.  If barrier sequence takes long enough, this timer
expires but the low level driver has no idea about this request and
oops follows.

Timeout timer shouldn't have been started on the original barrier
request as it never goes through actual IO.  This patch unexports
elv_dequeue_request(), which has no external user anyway, and makes it
operate on elevator proper w/o adding the timer and make
blkdev_dequeue_request() call elv_dequeue_request() and add timer.
Internal users which don't pass the request to driver - barrier code
and end_that_request_last() - are converted to use
elv_dequeue_request().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    |  4 ++--
 block/blk-core.c       | 24 +++++++++++++++++++++++-
 block/elevator.c       |  7 -------
 include/linux/blkdev.h |  7 ++-----
 4 files changed, 27 insertions(+), 15 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 5c99ff8d2db8..6e72d661ae42 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -161,7 +161,7 @@ static inline struct request *start_ordered(struct request_queue *q,
 	/*
 	 * Prep proxy barrier request.
 	 */
-	blkdev_dequeue_request(rq);
+	elv_dequeue_request(q, rq);
 	q->orig_bar_rq = rq;
 	rq = &q->bar_rq;
 	blk_rq_init(q, rq);
@@ -219,7 +219,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 * This can happen when the queue switches to
 			 * ORDERED_NONE while this request is on it.
 			 */
-			blkdev_dequeue_request(rq);
+			elv_dequeue_request(q, rq);
 			if (__blk_end_request(rq, -EOPNOTSUPP,
 					      blk_rq_bytes(rq)))
 				BUG();
diff --git a/block/blk-core.c b/block/blk-core.c
index 10e8a64a5a5b..7a779d7c69c9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1636,6 +1636,28 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
+/**
+ * blkdev_dequeue_request - dequeue request and start timeout timer
+ * @req: request to dequeue
+ *
+ * Dequeue @req and start timeout timer on it.  This hands off the
+ * request to the driver.
+ *
+ * Block internal functions which don't want to start timer should
+ * call elv_dequeue_request().
+ */
+void blkdev_dequeue_request(struct request *req)
+{
+	elv_dequeue_request(req->q, req);
+
+	/*
+	 * We are now handing the request to the hardware, add the
+	 * timeout handler.
+	 */
+	blk_add_timer(req);
+}
+EXPORT_SYMBOL(blkdev_dequeue_request);
+
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
@@ -1774,7 +1796,7 @@ static void end_that_request_last(struct request *req, int error)
 		blk_queue_end_tag(req->q, req);
 
 	if (blk_queued_rq(req))
-		blkdev_dequeue_request(req);
+		elv_dequeue_request(req->q, req);
 
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
diff --git a/block/elevator.c b/block/elevator.c
index 9ac82dde99dd..a6951f76ba0c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -844,14 +844,7 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 	 */
 	if (blk_account_rq(rq))
 		q->in_flight++;
-
-	/*
-	 * We are now handing the request to the hardware, add the
-	 * timeout handler.
-	 */
-	blk_add_timer(rq);
 }
-EXPORT_SYMBOL(elv_dequeue_request);
 
 int elv_queue_empty(struct request_queue *q)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a135256b272c..9cc7cc5fdce1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -786,6 +786,8 @@ static inline void blk_run_address_space(struct address_space *mapping)
 		blk_run_backing_dev(mapping->backing_dev_info, NULL);
 }
 
+extern void blkdev_dequeue_request(struct request *req);
+
 /*
  * blk_end_request() and friends.
  * __blk_end_request() and end_request() must be called with
@@ -820,11 +822,6 @@ extern void blk_update_request(struct request *rq, int error,
 extern unsigned int blk_rq_bytes(struct request *rq);
 extern unsigned int blk_rq_cur_bytes(struct request *rq);
 
-static inline void blkdev_dequeue_request(struct request *req)
-{
-	elv_dequeue_request(req->q, req);
-}
-
 /*
  * Access functions for manipulating queue properties
  */
-- 
cgit v1.2.3


From 0e435ac26e3f951d83338ed3d4ab7dc0fe0055bc Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Wed, 3 Dec 2008 12:55:08 +0100
Subject: block: fix setting of max_segment_size and seg_boundary mask

Fix setting of max_segment_size and seg_boundary mask for stacked md/dm
devices.

When stacking devices (LVM over MD over SCSI) some of the request queue
parameters are not set up correctly in some cases by default, namely
max_segment_size and and seg_boundary mask.

If you create MD device over SCSI, these attributes are zeroed.

Problem become when there is over this mapping next device-mapper mapping
- queue attributes are set in DM this way:

request_queue   max_segment_size  seg_boundary_mask
SCSI                65536             0xffffffff
MD RAID1                0                      0
LVM                 65536                 -1 (64bit)

Unfortunately bio_add_page (resp.  bio_phys_segments) calculates number of
physical segments according to these parameters.

During the generic_make_request() is segment cout recalculated and can
increase bio->bi_phys_segments count over the allowed limit.  (After
bio_clone() in stack operation.)

Thi is specially problem in CCISS driver, where it produce OOPS here

    BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);

(MAXSEGENTRIES is 31 by default.)

Sometimes even this command is enough to cause oops:

  dd iflag=direct if=/dev/<vg>/<lv> of=/dev/null bs=128000 count=10

This command generates bios with 250 sectors, allocated in 32 4k-pages
(last page uses only 1024 bytes).

For LVM layer, it allocates bio with 31 segments (still OK for CCISS),
unfortunatelly on lower layer it is recalculated to 32 segments and this
violates CCISS restriction and triggers BUG_ON().

The patch tries to fix it by:

 * initializing attributes above in queue request constructor
   blk_queue_make_request()

 * make sure that blk_queue_stack_limits() inherits setting

 (DM uses its own function to set the limits because it
 blk_queue_stack_limits() was introduced later.  It should probably switch
 to use generic stack limit function too.)

 * sets the default seg_boundary value in one place (blkdev.h)

 * use this mask as default in DM (instead of -1, which differs in 64bit)

Bugs related to this:
https://bugzilla.redhat.com/show_bug.cgi?id=471639
http://bugzilla.kernel.org/show_bug.cgi?id=8672

Signed-off-by: Milan Broz <mbroz@redhat.com>
Reviewed-by: Alasdair G Kergon <agk@redhat.com>
Cc: Neil Brown <neilb@suse.de>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Mike Miller <mike.miller@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 2 +-
 block/blk-settings.c   | 4 ++++
 drivers/md/dm-table.c  | 2 +-
 include/linux/blkdev.h | 2 ++
 4 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 7a779d7c69c9..c36aa98fafa3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -592,7 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 				   1 << QUEUE_FLAG_STACKABLE);
 	q->queue_lock		= lock;
 
-	blk_queue_segment_boundary(q, 0xffffffff);
+	blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
 
 	blk_queue_make_request(q, __make_request);
 	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 41392fbe19ff..afa55e14e278 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -125,6 +125,9 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->nr_requests = BLKDEV_MAX_RQ;
 	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
 	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+	blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
+	blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
+
 	q->make_request_fn = mfn;
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -314,6 +317,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	/* zero is "infinity" */
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
 
 	t->max_phys_segments = min(t->max_phys_segments, b->max_phys_segments);
 	t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a63161aec487..04e5fd742c2c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -668,7 +668,7 @@ static void check_for_valid_limits(struct io_restrictions *rs)
 	if (!rs->max_segment_size)
 		rs->max_segment_size = MAX_SEGMENT_SIZE;
 	if (!rs->seg_boundary_mask)
-		rs->seg_boundary_mask = -1;
+		rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	if (!rs->bounce_pfn)
 		rs->bounce_pfn = -1;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9cc7cc5fdce1..6dcd30d806cd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -918,6 +918,8 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define MAX_SEGMENT_SIZE	65536
 
+#define BLK_SEG_BOUNDARY_MASK	0xFFFFFFFFUL
+
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
 static inline int queue_hardsect_size(struct request_queue *q)
-- 
cgit v1.2.3