From 6e68af666f5336254b5715dca591026b7324499a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 11 Nov 2005 05:30:27 -0600 Subject: [SCSI] Convert SCSI mid-layer to scsi_execute_async Add scsi helpers to create really large requests and convert scsi-ml to scsi_execute_async(). Per Jens's previous comments, I placed this function in scsi_lib.c. I made it follow all the queue's limits - I think I did at least :), so I removed the warning on the function header. I think the scsi_execute_* functions should eventually take a request_queue and be placed somewhere the dm-multipath hw_handler can use them if that failover code is going to stay in the kernel. That conversion patch will be sent in another mail though. Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- fs/bio.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 460554b07ff9..4d21ee3873ec 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -385,6 +385,25 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page return len; } +/** + * bio_add_pc_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block + * device limitations. The target block device must allow bio's + * smaller than PAGE_SIZE, so it is always possible to add a single + * page to an empty bio. This should only be used by REQ_PC bios. + */ +int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return __bio_add_page(q, bio, page, len, offset); +} + /** * bio_add_page - attempt to add page to bio * @bio: destination bio @@ -1228,6 +1247,7 @@ EXPORT_SYMBOL(bio_clone); EXPORT_SYMBOL(bio_phys_segments); EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); +EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); EXPORT_SYMBOL(bio_map_user); EXPORT_SYMBOL(bio_unmap_user); -- cgit v1.2.3 From defd94b75409b983f94548ea2f52ff5787ddb848 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 5 Dec 2005 02:37:06 -0600 Subject: [SCSI] separate max_sectors from max_hw_sectors - export __blk_put_request and blk_execute_rq_nowait needed for async REQ_BLOCK_PC requests - separate max_hw_sectors and max_sectors for block/scsi_ioctl.c and SG_IO bio.c helpers per Jens's last comments. Since block/scsi_ioctl.c SG_IO was already testing against max_sectors and SCSI-ml was setting max_sectors and max_hw_sectors to the same value, this does not change any scsi SG_IO behavior. It only prepares ll_rw_blk.c, scsi_ioctl.c and bio.c for when SCSI-ml begins to set a valid max_hw_sectors for all LLDs. Today, if an LLD does not set it, SCSI-ml sets it to a safe default, and some LLDs set it to an artificially low value to overcome memory and feedback issues. Note: Since we now cap max_sectors to BLK_DEF_MAX_SECTORS, which is 1024, drivers that used to call blk_queue_max_sectors with a large value of max_sectors will now see the fs requests capped to BLK_DEF_MAX_SECTORS.
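In sketch form, the rule the diffs below install - fs-originated requests merge up to max_sectors, while REQ_BLOCK_PC requests may grow to the device's real max_hw_sectors. This is a condensed, self-contained illustration whose field and function names merely mirror the kernel's; it is not the kernel code itself:

        #define BLK_DEF_MAX_SECTORS     1024

        struct queue_limits {
                unsigned short max_sectors;     /* limit for fs requests */
                unsigned short max_hw_sectors;  /* device limit, used by REQ_BLOCK_PC */
        };

        /* Mirrors what blk_queue_max_sectors() does after this patch. */
        static void set_max_sectors(struct queue_limits *q, unsigned short max_sectors)
        {
                if (max_sectors < BLK_DEF_MAX_SECTORS) {
                        q->max_hw_sectors = q->max_sectors = max_sectors;
                } else {
                        /* fs requests are capped; PC requests keep the full limit */
                        q->max_sectors = BLK_DEF_MAX_SECTORS;
                        q->max_hw_sectors = max_sectors;
                }
        }

        /* Merge and map paths pick the limit by request type, as the
         * ll_back_merge_fn()/ll_front_merge_fn() hunks below do. */
        static unsigned short request_limit(const struct queue_limits *q, int is_pc)
        {
                return is_pc ? q->max_hw_sectors : q->max_sectors;
        }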
Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- block/ll_rw_blk.c | 34 ++++++++++++++++++++++++++-------- block/scsi_ioctl.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- fs/bio.c | 20 +++++++++++--------- include/linux/blkdev.h | 3 ++- 6 files changed, 42 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index c525b5a2b598..d4beb9a89ee0 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -239,7 +239,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; - blk_queue_max_sectors(q, MAX_SECTORS); + blk_queue_max_sectors(q, SAFE_MAX_SECTORS); blk_queue_hardsect_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); @@ -555,7 +555,12 @@ void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors) printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); } - q->max_sectors = q->max_hw_sectors = max_sectors; + if (BLK_DEF_MAX_SECTORS > max_sectors) + q->max_hw_sectors = q->max_sectors = max_sectors; + else { + q->max_sectors = BLK_DEF_MAX_SECTORS; + q->max_hw_sectors = max_sectors; + } } EXPORT_SYMBOL(blk_queue_max_sectors); @@ -657,8 +662,8 @@ EXPORT_SYMBOL(blk_queue_hardsect_size); void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) { /* zero is "infinity" */ - t->max_sectors = t->max_hw_sectors = - min_not_zero(t->max_sectors,b->max_sectors); + t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); @@ -1293,9 +1298,15 @@ static inline int ll_new_hw_segment(request_queue_t *q, static int ll_back_merge_fn(request_queue_t *q, struct request *req, struct bio *bio) { + unsigned short max_sectors; int len; - if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { + if (unlikely(blk_pc_request(req))) + max_sectors = q->max_hw_sectors; + else + max_sectors = q->max_sectors; + + if (req->nr_sectors + bio_sectors(bio) > max_sectors) { req->flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -1325,9 +1336,16 @@ static int ll_back_merge_fn(request_queue_t *q, struct request *req, static int ll_front_merge_fn(request_queue_t *q, struct request *req, struct bio *bio) { + unsigned short max_sectors; int len; - if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { + if (unlikely(blk_pc_request(req))) + max_sectors = q->max_hw_sectors; + else + max_sectors = q->max_sectors; + + + if (req->nr_sectors + bio_sectors(bio) > max_sectors) { req->flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -2144,7 +2162,7 @@ int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf, struct bio *bio; int reading; - if (len > (q->max_sectors << 9)) + if (len > (q->max_hw_sectors << 9)) return -EINVAL; if (!len || !ubuf) return -EINVAL; @@ -2259,7 +2277,7 @@ int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf, { struct bio *bio; - if (len > (q->max_sectors << 9)) + if (len > (q->max_hw_sectors << 9)) return -EINVAL; if (!len || !kbuf) return -EINVAL; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 382dea7b224c..4e390dfd3157 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ 
-233,7 +233,7 @@ static int sg_io(struct file *file, request_queue_t *q, if (verify_command(file, cmd)) return -EPERM; - if (hdr->dxfer_len > (q->max_sectors << 9)) + if (hdr->dxfer_len > (q->max_hw_sectors << 9)) return -EIO; if (hdr->dxfer_len) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a6d3baa46f61..a6f2dc66c3db 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -638,7 +638,7 @@ int dm_split_args(int *argc, char ***argvp, char *input) static void check_for_valid_limits(struct io_restrictions *rs) { if (!rs->max_sectors) - rs->max_sectors = MAX_SECTORS; + rs->max_sectors = SAFE_MAX_SECTORS; if (!rs->max_phys_segments) rs->max_phys_segments = MAX_PHYS_SEGMENTS; if (!rs->max_hw_segments) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 54a72f197487..14ad2a785a34 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -462,6 +462,7 @@ int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd, req = blk_get_request(sdev->request_queue, write, gfp); if (!req) goto free_sense; + req->flags |= REQ_BLOCK_PC | REQ_QUIET; if (use_sg) err = scsi_req_map_sg(req, buffer, use_sg, bufflen, gfp); @@ -477,7 +478,6 @@ int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd, req->sense_len = 0; req->timeout = timeout; req->retries = retries; - req->flags |= REQ_BLOCK_PC | REQ_QUIET; req->end_io_data = sioc; sioc->data = privdata; diff --git a/fs/bio.c b/fs/bio.c index 4d21ee3873ec..38d3e8023a07 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -313,7 +313,8 @@ int bio_get_nr_vecs(struct block_device *bdev) } static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset) + *page, unsigned int len, unsigned int offset, + unsigned short max_sectors) { int retried_segments = 0; struct bio_vec *bvec; @@ -327,7 +328,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; - if (((bio->bi_size + len) >> 9) > q->max_sectors) + if (((bio->bi_size + len) >> 9) > max_sectors) return 0; /* @@ -401,7 +402,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(q, bio, page, len, offset); + return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); } /** @@ -420,8 +421,8 @@ int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page, int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(bdev_get_queue(bio->bi_bdev), bio, page, - len, offset); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + return __bio_add_page(q, bio, page, len, offset, q->max_sectors); } struct bio_map_data { @@ -533,7 +534,7 @@ struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr, break; } - if (__bio_add_page(q, bio, page, bytes, 0) < bytes) { + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { ret = -EINVAL; break; } @@ -647,7 +648,8 @@ static struct bio *__bio_map_user_iov(request_queue_t *q, /* * sorry... 
*/ - if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes) + if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < + bytes) break; len -= bytes; @@ -820,8 +822,8 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data, if (bytes > len) bytes = len; - if (__bio_add_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) + if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + offset) < bytes) break; data += bytes; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 509e9a03a328..a18500d196e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -702,7 +702,8 @@ extern int blkdev_issue_flush(struct block_device *, sector_t *); #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 -#define MAX_SECTORS 255 +#define SAFE_MAX_SECTORS 255 +#define BLK_DEF_MAX_SECTORS 1024 #define MAX_SEGMENT_SIZE 65536 -- cgit v1.2.3 From 4d399cae3f5ec1f59b9e88084aae09c4f00760c9 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 3 Jan 2006 13:19:13 +0100 Subject: remove pointers to the defunct UDF mailing list This patch removes pointers to the defunct UDF mailing list. Signed-off-by: Adrian Bunk --- MAINTAINERS | 1 - fs/udf/balloc.c | 5 ----- fs/udf/crc.c | 5 ----- fs/udf/dir.c | 5 ----- fs/udf/directory.c | 5 ----- fs/udf/file.c | 5 ----- fs/udf/fsync.c | 5 ----- fs/udf/ialloc.c | 5 ----- fs/udf/inode.c | 5 ----- fs/udf/lowlevel.c | 5 ----- fs/udf/misc.c | 5 ----- fs/udf/namei.c | 5 ----- fs/udf/partition.c | 5 ----- fs/udf/super.c | 5 ----- fs/udf/symlink.c | 5 ----- fs/udf/truncate.c | 5 ----- fs/udf/unicode.c | 5 ----- include/linux/udf_fs.h | 5 ----- include/linux/udf_fs_i.h | 5 ----- include/linux/udf_fs_sb.h | 5 ----- 20 files changed, 96 deletions(-) (limited to 'fs') diff --git a/MAINTAINERS b/MAINTAINERS index 6af683025ae0..bbe5f04f59ec 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2587,7 +2587,6 @@ S: Maintained UDF FILESYSTEM P: Ben Fennema M: bfennema@falcon.csc.calpoly.edu -L: linux_udf@hpesjro.fc.hp.com W: http://linux-udf.sourceforge.net S: Maintained diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b9ded26b10a9..6598a5037ac8 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -4,11 +4,6 @@ * PURPOSE * Block allocation handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/crc.c b/fs/udf/crc.c index d95c6e38a455..1b82a4adc2f7 100644 --- a/fs/udf/crc.c +++ b/fs/udf/crc.c @@ -14,11 +14,6 @@ * * AT&T gives permission for the free use of the CRC source code. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 82440b731142..f5222527fe39 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -4,11 +4,6 @@ * PURPOSE * Directory handling routines for the OSTA-UDF(tm) filesystem. 
* - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 9a61ecc5451b..fe751a2a0e47 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -4,11 +4,6 @@ * PURPOSE * Directory related functions * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/file.c b/fs/udf/file.c index 01f520c71dc1..8a388289040d 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -4,11 +4,6 @@ * PURPOSE * File handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c index 2dde6b888c2b..5887d78cde43 100644 --- a/fs/udf/fsync.c +++ b/fs/udf/fsync.c @@ -4,11 +4,6 @@ * PURPOSE * Fsync handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index a7e5d40f1ebc..c9b707b470ca 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -4,11 +4,6 @@ * PURPOSE * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b83890beaaac..4014f17d382e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -4,11 +4,6 @@ * PURPOSE * Inode handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). 
Copies of the GPL can be obtained from: diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 2da5087dfe05..084216107667 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -4,11 +4,6 @@ * PURPOSE * Low Level Device Routines for the UDF filesystem * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/misc.c b/fs/udf/misc.c index fd321f9ace83..cc8ca3254db1 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -4,11 +4,6 @@ * PURPOSE * Miscellaneous routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ac191ed7df0a..ca732e79c48b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -4,11 +4,6 @@ * PURPOSE * Inode name handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 4d36f264be0d..dabf2b841db8 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -4,11 +4,6 @@ * PURPOSE * Partition handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/super.c b/fs/udf/super.c index 15bd4f24c5b7..4a6f49adc609 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -14,11 +14,6 @@ * http://www.ecma.ch/ * http://www.iso.org/ * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 43f3051ef756..674bb40edc83 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -4,11 +4,6 @@ * PURPOSE * Symlink handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 7dc8a5572ca1..e1b0e8cfecb4 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -4,11 +4,6 @@ * PURPOSE * Truncate handling routines for the OSTA-UDF(tm) filesystem. 
* - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 5a80efd8debc..706c92e1dcc9 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -11,11 +11,6 @@ * UTF-8 is explained in the IETF RFC XXXX. * ftp://ftp.internic.net/rfc/rfcxxxx.txt * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team's mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/include/linux/udf_fs.h b/include/linux/udf_fs.h index 46e2bb945353..36c684e1b110 100644 --- a/include/linux/udf_fs.h +++ b/include/linux/udf_fs.h @@ -13,11 +13,6 @@ * http://www.osta.org/ * http://www.ecma.ch/ * http://www.iso.org/ * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/include/linux/udf_fs_i.h b/include/linux/udf_fs_i.h index 62b15a4214e6..1e7508420fcf 100644 --- a/include/linux/udf_fs_i.h +++ b/include/linux/udf_fs_i.h @@ -3,11 +3,6 @@ * * This file is intended for the Linux kernel/module. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/include/linux/udf_fs_sb.h b/include/linux/udf_fs_sb.h index 1966a6dbb4b6..b15ff2e99c91 100644 --- a/include/linux/udf_fs_sb.h +++ b/include/linux/udf_fs_sb.h @@ -3,11 +3,6 @@ * * This include file is for the Linux kernel/module. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: -- cgit v1.2.3 From 7a1119b1fc87cc347d3932b8aee051e86b32818f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 3 Jan 2006 13:21:37 +0100 Subject: fs/qnx4/bitmap.c: #if 0 qnx4_new_block() qnx4_new_block() is neither implemented nor used. Signed-off-by: Adrian Bunk Signed-off-by: Anders Larsen --- fs/qnx4/bitmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 991253927658..46efbf52cbec 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -23,10 +23,12 @@ #include #include +#if 0 int qnx4_new_block(struct super_block *sb) { return 0; } +#endif /* 0 */ static void count_bits(register const char *bmPart, register int size, int *const tf) -- cgit v1.2.3 From 4a4efbdee278b2f4ed91aad2db5c006ff754276e Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Tue, 3 Jan 2006 13:27:11 +0100 Subject: s/retreiv/retriev/g As everyone knows, the rule is: "i before e.. 
um.. always." Signed-off-by: Matt Mackall Signed-off-by: Adrian Bunk --- drivers/char/n_hdlc.c | 2 +- drivers/net/tlan.c | 4 ++-- drivers/usb/serial/ftdi_sio.h | 2 +- drivers/video/aty/radeon_base.c | 16 ++++++++-------- fs/reiserfs/xattr.c | 4 ++-- net/irda/iriap.c | 2 +- scripts/kconfig/util.c | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c index c3660d8781a4..a133a62f3d55 100644 --- a/drivers/char/n_hdlc.c +++ b/drivers/char/n_hdlc.c @@ -562,7 +562,7 @@ static void n_hdlc_tty_receive(struct tty_struct *tty, const __u8 *data, } /* end of n_hdlc_tty_receive() */ /** - * n_hdlc_tty_read - Called to retreive one frame of data (if available) + * n_hdlc_tty_read - Called to retrieve one frame of data (if available) * @tty - pointer to tty instance data * @file - pointer to open file object * @buf - pointer to returned data buffer diff --git a/drivers/net/tlan.c b/drivers/net/tlan.c index 942fae0f2130..c2506b56a186 100644 --- a/drivers/net/tlan.c +++ b/drivers/net/tlan.c @@ -2865,11 +2865,11 @@ void TLan_PhyMonitor( struct net_device *dev ) * for this device. * phy The address of the PHY to be queried. * reg The register whose contents are to be - * retreived. + * retrieved. * val A pointer to a variable to store the * retrieved value. * - * This function uses the TLAN's MII bus to retreive the contents + * This function uses the TLAN's MII bus to retrieve the contents * of a given register on a PHY. It sends the appropriate info * and then reads the 16-bit register value from the MII bus via * the TLAN SIO register. diff --git a/drivers/usb/serial/ftdi_sio.h b/drivers/usb/serial/ftdi_sio.h index 773ea3eca086..da004dd025fb 100644 --- a/drivers/usb/serial/ftdi_sio.h +++ b/drivers/usb/serial/ftdi_sio.h @@ -714,7 +714,7 @@ typedef enum { */ /* FTDI_SIO_GET_MODEM_STATUS */ -/* Retreive the current value of the modem status register */ +/* Retrieve the current value of the modem status register */ #define FTDI_SIO_GET_MODEM_STATUS_REQUEST_TYPE 0xc0 #define FTDI_SIO_GET_MODEM_STATUS_REQUEST FTDI_SIO_GET_MODEM_STATUS diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c index 4f01ccc02aa4..156db84cb363 100644 --- a/drivers/video/aty/radeon_base.c +++ b/drivers/video/aty/radeon_base.c @@ -594,7 +594,7 @@ static int __devinit radeon_probe_pll_params(struct radeonfb_info *rinfo) } /* - * Retreive PLL infos by different means (BIOS, Open Firmware, register probing...) + * Retrieve PLL infos by different means (BIOS, Open Firmware, register probing...) 
*/ static void __devinit radeon_get_pllinfo(struct radeonfb_info *rinfo) { @@ -660,17 +660,17 @@ static void __devinit radeon_get_pllinfo(struct radeonfb_info *rinfo) #ifdef CONFIG_PPC_OF /* - * Retreive PLL infos from Open Firmware first + * Retrieve PLL infos from Open Firmware first */ if (!force_measure_pll && radeon_read_xtal_OF(rinfo) == 0) { - printk(KERN_INFO "radeonfb: Retreived PLL infos from Open Firmware\n"); + printk(KERN_INFO "radeonfb: Retrieved PLL infos from Open Firmware\n"); goto found; } #endif /* CONFIG_PPC_OF */ /* * Check out if we have an X86 which gave us some PLL informations - * and if yes, retreive them + * and if yes, retrieve them */ if (!force_measure_pll && rinfo->bios_seg) { u16 pll_info_block = BIOS_IN16(rinfo->fp_bios_start + 0x30); @@ -682,7 +682,7 @@ static void __devinit radeon_get_pllinfo(struct radeonfb_info *rinfo) rinfo->pll.ppll_min = BIOS_IN32(pll_info_block + 0x12); rinfo->pll.ppll_max = BIOS_IN32(pll_info_block + 0x16); - printk(KERN_INFO "radeonfb: Retreived PLL infos from BIOS\n"); + printk(KERN_INFO "radeonfb: Retrieved PLL infos from BIOS\n"); goto found; } @@ -691,7 +691,7 @@ static void __devinit radeon_get_pllinfo(struct radeonfb_info *rinfo) * probe them */ if (radeon_probe_pll_params(rinfo) == 0) { - printk(KERN_INFO "radeonfb: Retreived PLL infos from registers\n"); + printk(KERN_INFO "radeonfb: Retrieved PLL infos from registers\n"); goto found; } @@ -702,7 +702,7 @@ static void __devinit radeon_get_pllinfo(struct radeonfb_info *rinfo) found: /* - * Some methods fail to retreive SCLK and MCLK values, we apply default + * Some methods fail to retrieve SCLK and MCLK values, we apply default * settings in this case (200Mhz). If that really happne often, we could * fetch from registers instead... */ @@ -2393,7 +2393,7 @@ static int radeonfb_pci_register (struct pci_dev *pdev, rinfo->mapped_vram/1024); /* - * Map the BIOS ROM if any and retreive PLL parameters from + * Map the BIOS ROM if any and retrieve PLL parameters from * the BIOS. We skip that on mobility chips as the real panel * values we need aren't in the ROM but in the BIOS image in * memory. This is definitely not the best meacnism though, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 72e120798677..02091eaac0b4 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -115,8 +115,8 @@ static struct dentry *__get_xa_root(struct super_block *s) } /* Returns the dentry (or NULL) referring to the root of the extended - * attribute directory tree. If it has already been retreived, it is used. - * Otherwise, we attempt to retreive it from disk. It may also return + * attribute directory tree. If it has already been retrieved, it is used. + * Otherwise, we attempt to retrieve it from disk. It may also return * a pointer-encoded error. 
*/ static inline struct dentry *get_xa_root(struct super_block *s) diff --git a/net/irda/iriap.c b/net/irda/iriap.c index b8bb78af8b8a..254f90746900 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -364,7 +364,7 @@ static void iriap_disconnect_request(struct iriap_cb *self) /* * Function iriap_getvaluebyclass (addr, name, attr) * - * Retreive all values from attribute in all objects with given class + * Retrieve all values from attribute in all objects with given class * name */ int iriap_getvaluebyclass_request(struct iriap_cb *self, diff --git a/scripts/kconfig/util.c b/scripts/kconfig/util.c index 1fa4c0b801b3..4556014f2f00 100644 --- a/scripts/kconfig/util.c +++ b/scripts/kconfig/util.c @@ -101,7 +101,7 @@ void str_printf(struct gstr *gs, const char *fmt, ...) va_end(ap); } -/* Retreive value of growable string */ +/* Retrieve value of growable string */ const char *str_get(struct gstr *gs) { return gs->s; -- cgit v1.2.3 From f4b09ebc8baa51ec8394c4173e3de9d62b2cc97a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 3 Jan 2006 13:37:51 +0100 Subject: update the email address of Randy Dunlap This patch removes all references to the bouncing address rddunlap@osdl.org and one dead web page from the kernel. Signed-off-by: Adrian Bunk Acked-by: Randy Dunlap --- Documentation/block/biodoc.txt | 2 +- Documentation/scsi/scsi_mid_low_api.txt | 2 +- MAINTAINERS | 3 +-- drivers/usb/class/usblp.c | 2 +- fs/ntfs/ChangeLog | 2 +- kernel/configs.c | 2 +- scripts/binoffset.c | 2 +- scripts/checkversion.pl | 2 +- scripts/patch-kernel | 4 ++-- 9 files changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 0fe01c805480..303c57a7fad9 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -31,7 +31,7 @@ The following people helped with review comments and inputs for this document: Christoph Hellwig Arjan van de Ven - Randy Dunlap + Randy Dunlap Andre Hedrick The following people helped with fixes/contributions to the bio patches diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt index 66565d42288f..3209b37e07ec 100644 --- a/Documentation/scsi/scsi_mid_low_api.txt +++ b/Documentation/scsi/scsi_mid_low_api.txt @@ -1433,7 +1433,7 @@ The following people have contributed to this document: Christoph Hellwig Doug Ledford Andries Brouwer - Randy Dunlap + Randy Dunlap Alan Stern diff --git a/MAINTAINERS b/MAINTAINERS index bbe5f04f59ec..79f0efa29778 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1465,7 +1465,6 @@ P: Several L: kernel-janitors@osdl.org W: http://www.kerneljanitors.org/ W: http://sf.net/projects/kernel-janitor/ -W: http://developer.osdl.org/rddunlap/kj-patches/ S: Maintained KERNEL NFSD @@ -1486,7 +1485,7 @@ KEXEC P: Eric Biederman P: Randy Dunlap M: ebiederm@xmission.com -M: rddunlap@osdl.org +M: rdunlap@xenotime.net W: http://www.xmission.com/~ebiederm/files/kexec/ L: linux-kernel@vger.kernel.org L: fastboot@osdl.org diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 357e75335f17..38f905db0d6a 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -3,7 +3,7 @@ * * Copyright (c) 1999 Michael Gee * Copyright (c) 1999 Pavel Machek - * Copyright (c) 2000 Randy Dunlap + * Copyright (c) 2000 Randy Dunlap * Copyright (c) 2000 Vojtech Pavlik # Copyright (c) 2001 Pete Zaitcev # Copyright (c) 2001 David Paschal diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 50a7749cfca1..02f44094bda9 
100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -884,7 +884,7 @@ ToDo/Notes: - Add handling for initialized_size != data_size in compressed files. - Reduce function local stack usage from 0x3d4 bytes to just noise in - fs/ntfs/upcase.c. (Randy Dunlap ) + fs/ntfs/upcase.c. (Randy Dunlap ) - Remove compiler warnings for newer gcc. - Pages are no longer kmapped by mm/filemap.c::generic_file_write() around calls to ->{prepare,commit}_write. Adapt NTFS appropriately diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af31e0a..009e1ebdcb88 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -3,7 +3,7 @@ * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz - * Copyright (C) 2002 Randy Dunlap + * Copyright (C) 2002 Randy Dunlap * Copyright (C) 2002 Al Stone * Copyright (C) 2002 Hewlett-Packard Company * diff --git a/scripts/binoffset.c b/scripts/binoffset.c index 591309d85518..1a2e39b8e3e5 100644 --- a/scripts/binoffset.c +++ b/scripts/binoffset.c @@ -1,6 +1,6 @@ /*************************************************************************** * binoffset.c - * (C) 2002 Randy Dunlap + * (C) 2002 Randy Dunlap # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by diff --git a/scripts/checkversion.pl b/scripts/checkversion.pl index df10db662eb1..9f84e562318d 100755 --- a/scripts/checkversion.pl +++ b/scripts/checkversion.pl @@ -3,7 +3,7 @@ # checkversion find uses of LINUX_VERSION_CODE, KERNEL_VERSION, or # UTS_RELEASE without including , or cases of # including that don't need it. -# Copyright (C) 2003, Randy Dunlap +# Copyright (C) 2003, Randy Dunlap $| = 1; diff --git a/scripts/patch-kernel b/scripts/patch-kernel index f2d47ca9c8fa..67e4b1868e50 100755 --- a/scripts/patch-kernel +++ b/scripts/patch-kernel @@ -45,7 +45,7 @@ # update usage message; # fix some whitespace damage; # be smarter about stopping when current version is larger than requested; -# Randy Dunlap , 2004-AUG-18. +# Randy Dunlap , 2004-AUG-18. # # Add better support for (non-incremental) 2.6.x.y patches; # If an ending version number if not specified, the script automatically @@ -56,7 +56,7 @@ # patch-kernel does not normally support reverse patching, but does so when # applying EXTRAVERSION (x.y) patches, so that moving from 2.6.11.y to 2.6.11.z # is easy and handled by the script (reverse 2.6.11.y and apply 2.6.11.z). -# Randy Dunlap , 2005-APR-08. +# Randy Dunlap , 2005-APR-08. PNAME=patch-kernel -- cgit v1.2.3 From 7063fbf2261194f72ee75afca67b3b38b554b5fa Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 15 Dec 2005 14:29:43 -0800 Subject: [PATCH] configfs: User-driven configuration filesystem Configfs, a file system for userspace-driven kernel object configuration. The OCFS2 stack makes extensive use of this for propagation of cluster configuration information into kernel. 
Signed-off-by: Joel Becker --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/configfs/configfs.txt | 434 ++++++++ .../filesystems/configfs/configfs_example.c | 474 +++++++++ MAINTAINERS | 5 + fs/Kconfig | 14 + fs/Makefile | 1 + fs/configfs/Makefile | 7 + fs/configfs/configfs_internal.h | 142 +++ fs/configfs/dir.c | 1102 ++++++++++++++++++++ fs/configfs/file.c | 360 +++++++ fs/configfs/inode.c | 162 +++ fs/configfs/item.c | 227 ++++ fs/configfs/mount.c | 159 +++ fs/configfs/symlink.c | 281 +++++ include/linux/configfs.h | 205 ++++ 15 files changed, 3575 insertions(+) create mode 100644 Documentation/filesystems/configfs/configfs.txt create mode 100644 Documentation/filesystems/configfs/configfs_example.c create mode 100644 fs/configfs/Makefile create mode 100644 fs/configfs/configfs_internal.h create mode 100644 fs/configfs/dir.c create mode 100644 fs/configfs/file.c create mode 100644 fs/configfs/inode.c create mode 100644 fs/configfs/item.c create mode 100644 fs/configfs/mount.c create mode 100644 fs/configfs/symlink.c create mode 100644 include/linux/configfs.h (limited to 'fs') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index bcfbab899b37..628f8a7adb85 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -12,6 +12,8 @@ cifs.txt - description of the CIFS filesystem coda.txt - description of the CODA filesystem. +configfs/ + - directory containing configfs documentation and example code. cramfs.txt - info on the cram filesystem for small storage (ROMs etc) devfs/ diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt new file mode 100644 index 000000000000..c4ff96b7c4e0 --- /dev/null +++ b/Documentation/filesystems/configfs/configfs.txt @@ -0,0 +1,434 @@ + +configfs - Userspace-driven kernel object configuation. + +Joel Becker + +Updated: 31 March 2005 + +Copyright (c) 2005 Oracle Corporation, + Joel Becker + + +[What is configfs?] + +configfs is a ram-based filesystem that provides the converse of +sysfs's functionality. Where sysfs is a filesystem-based view of +kernel objects, configfs is a filesystem-based manager of kernel +objects, or config_items. + +With sysfs, an object is created in kernel (for example, when a device +is discovered) and it is registered with sysfs. Its attributes then +appear in sysfs, allowing userspace to read the attributes via +readdir(3)/read(2). It may allow some attributes to be modified via +write(2). The important point is that the object is created and +destroyed in kernel, the kernel controls the lifecycle of the sysfs +representation, and sysfs is merely a window on all this. + +A configfs config_item is created via an explicit userspace operation: +mkdir(2). It is destroyed via rmdir(2). The attributes appear at +mkdir(2) time, and can be read or modified via read(2) and write(2). +As with sysfs, readdir(3) queries the list of items and/or attributes. +symlink(2) can be used to group items together. Unlike sysfs, the +lifetime of the representation is completely driven by userspace. The +kernel modules backing the items must respond to this. + +Both sysfs and configfs can and should exist together on the same +system. One is not a replacement for the other. + +[Using configfs] + +configfs can be compiled as a module or into the kernel. You can access +it by doing + + mount -t configfs none /config + +The configfs tree will be empty unless client modules are also loaded. 
+These are modules that register their item types with configfs as +subsystems. Once a client subsystem is loaded, it will appear as a +subdirectory (or more than one) under /config. Like sysfs, the +configfs tree is always there, whether mounted on /config or not. + +An item is created via mkdir(2). The item's attributes will also +appear at this time. readdir(3) can determine what the attributes are, +read(2) can query their default values, and write(2) can store new +values. Like sysfs, attributes should be ASCII text files, preferably +with only one value per file. The same efficiency caveats from sysfs +apply. Don't mix more than one attribute in one attribute file. + +Like sysfs, configfs expects write(2) to store the entire buffer at +once. When writing to configfs attributes, userspace processes should +first read the entire file, modify the portions they wish to change, and +then write the entire buffer back. Attribute files have a maximum size +of one page (PAGE_SIZE, 4096 on i386). + +When an item needs to be destroyed, remove it with rmdir(2). An +item cannot be destroyed if any other item has a link to it (via +symlink(2)). Links can be removed via unlink(2). + +[Configuring FakeNBD: an Example] + +Imagine there's a Network Block Device (NBD) driver that allows you to +access remote block devices. Call it FakeNBD. FakeNBD uses configfs +for its configuration. Obviously, there will be a nice program that +sysadmins use to configure FakeNBD, but somehow that program has to tell +the driver about it. Here's where configfs comes in. + +When the FakeNBD driver is loaded, it registers itself with configfs. +readdir(3) sees this just fine: + + # ls /config + fakenbd + +A fakenbd connection can be created with mkdir(2). The name is +arbitrary, but likely the tool will make some use of the name. Perhaps +it is a uuid or a disk name: + + # mkdir /config/fakenbd/disk1 + # ls /config/fakenbd/disk1 + target device rw + +The target attribute contains the IP address of the server FakeNBD will +connect to. The device attribute is the device on the server. +Predictably, the rw attribute determines whether the connection is +read-only or read-write. + + # echo 10.0.0.1 > /config/fakenbd/disk1/target + # echo /dev/sda1 > /config/fakenbd/disk1/device + # echo 1 > /config/fakenbd/disk1/rw + +That's it. That's all there is. Now the device is configured, via the +shell no less. + +[Coding With configfs] + +Every object in configfs is a config_item. A config_item reflects an +object in the subsystem. It has attributes that match values on that +object. configfs handles the filesystem representation of that object +and its attributes, allowing the subsystem to ignore all but the +basic show/store interaction. + +Items are created and destroyed inside a config_group. A group is a +collection of items that share the same attributes and operations. +Items are created by mkdir(2) and removed by rmdir(2), but configfs +handles that. The group has a set of operations to perform these tasks + +A subsystem is the top level of a client module. During initialization, +the client module registers the subsystem with configfs, the subsystem +appears as a directory at the top of the configfs filesystem. A +subsystem is also a config_group, and can do everything a config_group +can. 
+ +[struct config_item] + + struct config_item { + char *ci_name; + char ci_namebuf[UOBJ_NAME_LEN]; + struct kref ci_kref; + struct list_head ci_entry; + struct config_item *ci_parent; + struct config_group *ci_group; + struct config_item_type *ci_type; + struct dentry *ci_dentry; + }; + + void config_item_init(struct config_item *); + void config_item_init_type_name(struct config_item *, + const char *name, + struct config_item_type *type); + struct config_item *config_item_get(struct config_item *); + void config_item_put(struct config_item *); + +Generally, struct config_item is embedded in a container structure, a +structure that actually represents what the subsystem is doing. The +config_item portion of that structure is how the object interacts with +configfs. + +Whether statically defined in a source file or created by a parent +config_group, a config_item must have one of the _init() functions +called on it. This initializes the reference count and sets up the +appropriate fields. + +All users of a config_item should have a reference on it via +config_item_get(), and drop the reference when they are done via +config_item_put(). + +By itself, a config_item cannot do much more than appear in configfs. +Usually a subsystem wants the item to display and/or store attributes, +among other things. For that, it needs a type. + +[struct config_item_type] + + struct configfs_item_operations { + void (*release)(struct config_item *); + ssize_t (*show_attribute)(struct config_item *, + struct configfs_attribute *, + char *); + ssize_t (*store_attribute)(struct config_item *, + struct configfs_attribute *, + const char *, size_t); + int (*allow_link)(struct config_item *src, + struct config_item *target); + int (*drop_link)(struct config_item *src, + struct config_item *target); + }; + + struct config_item_type { + struct module *ct_owner; + struct configfs_item_operations *ct_item_ops; + struct configfs_group_operations *ct_group_ops; + struct configfs_attribute **ct_attrs; + }; + +The most basic function of a config_item_type is to define what +operations can be performed on a config_item. All items that have been +allocated dynamically will need to provide the ct_item_ops->release() +method. This method is called when the config_item's reference count +reaches zero. Items that wish to display an attribute need to provide +the ct_item_ops->show_attribute() method. Similarly, storing a new +attribute value uses the store_attribute() method. + +[struct configfs_attribute] + + struct configfs_attribute { + char *ca_name; + struct module *ca_owner; + mode_t ca_mode; + }; + +When a config_item wants an attribute to appear as a file in the item's +configfs directory, it must define a configfs_attribute describing it. +It then adds the attribute to the NULL-terminated array +config_item_type->ct_attrs. When the item appears in configfs, the +attribute file will appear with the configfs_attribute->ca_name +filename. configfs_attribute->ca_mode specifies the file permissions. + +If an attribute is readable and the config_item provides a +ct_item_ops->show_attribute() method, that method will be called +whenever userspace asks for a read(2) on the attribute. The converse +will happen for write(2). + +[struct config_group] + +A config_item cannot live in a vaccum. The only way one can be created +is via mkdir(2) on a config_group. This will trigger creation of a +child item. 
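Before turning to groups, it helps to see the three structures above working together. A condensed sketch follows, using the hypothetical name my_widget throughout; the canonical, complete version is configfs_example.c, added later in this same patch:

        #include <linux/kernel.h>
        #include <linux/module.h>
        #include <linux/stat.h>
        #include <linux/configfs.h>

        /* Container object: the config_item is embedded in it. */
        struct my_widget {
                struct config_item item;
                int threshold;
        };

        static inline struct my_widget *to_my_widget(struct config_item *item)
        {
                return container_of(item, struct my_widget, item);
        }

        static struct configfs_attribute my_widget_attr_threshold = {
                .ca_owner = THIS_MODULE,
                .ca_name  = "threshold",
                .ca_mode  = S_IRUGO | S_IWUSR,
        };

        /* ct_attrs must be NULL-terminated. */
        static struct configfs_attribute *my_widget_attrs[] = {
                &my_widget_attr_threshold,
                NULL,
        };

        static ssize_t my_widget_show(struct config_item *item,
                                      struct configfs_attribute *attr,
                                      char *page)
        {
                return sprintf(page, "%d\n", to_my_widget(item)->threshold);
        }

        static struct configfs_item_operations my_widget_item_ops = {
                .show_attribute = my_widget_show,
        };

        static struct config_item_type my_widget_type = {
                .ct_item_ops = &my_widget_item_ops,
                .ct_attrs    = my_widget_attrs,
                .ct_owner    = THIS_MODULE,
        };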
+ + struct config_group { + struct config_item cg_item; + struct list_head cg_children; + struct configfs_subsystem *cg_subsys; + struct config_group **default_groups; + }; + + void config_group_init(struct config_group *group); + void config_group_init_type_name(struct config_group *group, + const char *name, + struct config_item_type *type); + + +The config_group structure contains a config_item. Properly configuring +that item means that a group can behave as an item in its own right. +However, it can do more: it can create child items or groups. This is +accomplished via the group operations specified on the group's +config_item_type. + + struct configfs_group_operations { + struct config_item *(*make_item)(struct config_group *group, + const char *name); + struct config_group *(*make_group)(struct config_group *group, + const char *name); + int (*commit_item)(struct config_item *item); + void (*drop_item)(struct config_group *group, + struct config_item *item); + }; + +A group creates child items by providing the +ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. The subsystem allocates a new +config_item (or more likely, its container structure), initializes it, +and returns it to configfs. Configfs will then populate the filesystem +tree to reflect the new item. + +If the subsystem wants the child to be a group itself, the subsystem +provides ct_group_ops->make_group(). Everything else behaves the same, +using the group _init() functions on the group. + +Finally, when userspace calls rmdir(2) on the item or group, +ct_group_ops->drop_item() is called. As a config_group is also a +config_item, it is not necessary for a seperate drop_group() method. +The subsystem must config_item_put() the reference that was initialized +upon item allocation. If a subsystem has no work to do, it may omit +the ct_group_ops->drop_item() method, and configfs will call +config_item_put() on the item on behalf of the subsystem. + +IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2) +is called, configfs WILL remove the item from the filesystem tree +(assuming that it has no children to keep it busy). The subsystem is +responsible for responding to this. If the subsystem has references to +the item in other threads, the memory is safe. It may take some time +for the item to actually disappear from the subsystem's usage. But it +is gone from configfs. + +A config_group cannot be removed while it still has child items. This +is implemented in the configfs rmdir(2) code. ->drop_item() will not be +called, as the item has not been dropped. rmdir(2) will fail, as the +directory is not empty. + +[struct configfs_subsystem] + +A subsystem must register itself, ususally at module_init time. This +tells configfs to make the subsystem appear in the file tree. + + struct configfs_subsystem { + struct config_group su_group; + struct semaphore su_sem; + }; + + int configfs_register_subsystem(struct configfs_subsystem *subsys); + void configfs_unregister_subsystem(struct configfs_subsystem *subsys); + + A subsystem consists of a toplevel config_group and a semaphore. +The group is where child config_items are created. For a subsystem, +this group is usually defined statically. Before calling +configfs_register_subsystem(), the subsystem must have initialized the +group via the usual group _init() functions, and it must also have +initialized the semaphore. 
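As a concrete illustration of those requirements - group and semaphore initialized before registration - here is a sketch continuing the hypothetical my_widget example, assuming the 2.6.15-era semaphore API (init_MUTEX) that the configfs_example.c added by this patch also uses:

        static struct configfs_subsystem my_subsys = {
                .su_group = {
                        .cg_item = {
                                .ci_namebuf = "my_subsys",
                                .ci_type    = &my_widget_type,
                        },
                },
        };

        static int __init my_subsys_init(void)
        {
                int ret;

                /* Initialize the group and the semaphore before registering. */
                config_group_init(&my_subsys.su_group);
                init_MUTEX(&my_subsys.su_sem);

                ret = configfs_register_subsystem(&my_subsys);
                if (ret)
                        printk(KERN_ERR "Error %d while registering subsystem\n",
                               ret);
                return ret;
        }

        static void __exit my_subsys_exit(void)
        {
                configfs_unregister_subsystem(&my_subsys);
        }

        module_init(my_subsys_init);
        module_exit(my_subsys_exit);
        MODULE_LICENSE("GPL");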
+ When the register call returns, the subsystem is live, and it +will be visible via configfs. At that point, mkdir(2) can be called and +the subsystem must be ready for it. + +[An Example] + +The best example of these basic concepts is the simple_children +subsystem/group and the simple_child item in configfs_example.c It +shows a trivial object displaying and storing an attribute, and a simple +group creating and destroying these children. + +[Hierarchy Navigation and the Subsystem Semaphore] + +There is an extra bonus that configfs provides. The config_groups and +config_items are arranged in a hierarchy due to the fact that they +appear in a filesystem. A subsystem is NEVER to touch the filesystem +parts, but the subsystem might be interested in this hierarchy. For +this reason, the hierarchy is mirrored via the config_group->cg_children +and config_item->ci_parent structure members. + +A subsystem can navigate the cg_children list and the ci_parent pointer +to see the tree created by the subsystem. This can race with configfs' +management of the hierarchy, so configfs uses the subsystem semaphore to +protect modifications. Whenever a subsystem wants to navigate the +hierarchy, it must do so under the protection of the subsystem +semaphore. + +A subsystem will be prevented from acquiring the semaphore while a newly +allocated item has not been linked into this hierarchy. Similarly, it +will not be able to acquire the semaphore while a dropping item has not +yet been unlinked. This means that an item's ci_parent pointer will +never be NULL while the item is in configfs, and that an item will only +be in its parent's cg_children list for the same duration. This allows +a subsystem to trust ci_parent and cg_children while they hold the +semaphore. + +[Item Aggregation Via symlink(2)] + +configfs provides a simple group via the group->item parent/child +relationship. Often, however, a larger environment requires aggregation +outside of the parent/child connection. This is implemented via +symlink(2). + +A config_item may provide the ct_item_ops->allow_link() and +ct_item_ops->drop_link() methods. If the ->allow_link() method exists, +symlink(2) may be called with the config_item as the source of the link. +These links are only allowed between configfs config_items. Any +symlink(2) attempt outside the configfs filesystem will be denied. + +When symlink(2) is called, the source config_item's ->allow_link() +method is called with itself and a target item. If the source item +allows linking to target item, it returns 0. A source item may wish to +reject a link if it only wants links to a certain type of object (say, +in its own subsystem). + +When unlink(2) is called on the symbolic link, the source item is +notified via the ->drop_link() method. Like the ->drop_item() method, +this is a void function and cannot return failure. The subsystem is +responsible for responding to the change. + +A config_item cannot be removed while it links to any other item, nor +can it be removed while an item links to it. Dangling symlinks are not +allowed in configfs. + +[Automatically Created Subgroups] + +A new config_group may want to have two types of child config_items. +While this could be codified by magic names in ->make_item(), it is much +more explicit to have a method whereby userspace sees this divergence. + +Rather than have a group where some items behave differently than +others, configfs provides a method whereby one or many subgroups are +automatically created inside the parent at its creation. 
Thus, +mkdir("parent) results in "parent", "parent/subgroup1", up through +"parent/subgroupN". Items of type 1 can now be created in +"parent/subgroup1", and items of type N can be created in +"parent/subgroupN". + +These automatic subgroups, or default groups, do not preclude other +children of the parent group. If ct_group_ops->make_group() exists, +other child groups can be created on the parent group directly. + +A configfs subsystem specifies default groups by filling in the +NULL-terminated array default_groups on the config_group structure. +Each group in that array is populated in the configfs tree at the same +time as the parent group. Similarly, they are removed at the same time +as the parent. No extra notification is provided. When a ->drop_item() +method call notifies the subsystem the parent group is going away, it +also means every default group child associated with that parent group. + +As a consequence of this, default_groups cannot be removed directly via +rmdir(2). They also are not considered when rmdir(2) on the parent +group is checking for children. + +[Committable Items] + +NOTE: Committable items are currently unimplemented. + +Some config_items cannot have a valid initial state. That is, no +default values can be specified for the item's attributes such that the +item can do its work. Userspace must configure one or more attributes, +after which the subsystem can start whatever entity this item +represents. + +Consider the FakeNBD device from above. Without a target address *and* +a target device, the subsystem has no idea what block device to import. +The simple example assumes that the subsystem merely waits until all the +appropriate attributes are configured, and then connects. This will, +indeed, work, but now every attribute store must check if the attributes +are initialized. Every attribute store must fire off the connection if +that condition is met. + +Far better would be an explicit action notifying the subsystem that the +config_item is ready to go. More importantly, an explicit action allows +the subsystem to provide feedback as to whether the attibutes are +initialized in a way that makes sense. configfs provides this as +committable items. + +configfs still uses only normal filesystem operations. An item is +committed via rename(2). The item is moved from a directory where it +can be modified to a directory where it cannot. + +Any group that provides the ct_group_ops->commit_item() method has +committable items. When this group appears in configfs, mkdir(2) will +not work directly in the group. Instead, the group will have two +subdirectories: "live" and "pending". The "live" directory does not +support mkdir(2) or rmdir(2) either. It only allows rename(2). The +"pending" directory does allow mkdir(2) and rmdir(2). An item is +created in the "pending" directory. Its attributes can be modified at +will. Userspace commits the item by renaming it into the "live" +directory. At this point, the subsystem recieves the ->commit_item() +callback. If all required attributes are filled to satisfaction, the +method returns zero and the item is moved to the "live" directory. + +As rmdir(2) does not work in the "live" directory, an item must be +shutdown, or "uncommitted". Again, this is done via rename(2), this +time from the "live" directory back to the "pending" one. The subsystem +is notified by the ct_group_ops->uncommit_object() method. 
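For the default-groups mechanism described above, the wiring is just a NULL-terminated array on the parent group. A minimal sketch with hypothetical names and statically defined groups - the item types are left undetailed, and since committable items are unimplemented there is no code to show for them:

        static struct config_item_type my_parent_type;          /* details omitted */
        static struct config_item_type my_subgroup_type;        /* details omitted */

        static struct config_group my_subgroup;

        static struct config_group *my_default_groups[] = {
                &my_subgroup,
                NULL,                           /* NULL-terminated, like ct_attrs */
        };

        static struct config_group my_parent_group;

        static void my_parent_group_setup(void)
        {
                config_group_init_type_name(&my_parent_group, "parent",
                                            &my_parent_type);
                config_group_init_type_name(&my_subgroup, "subgroup1",
                                            &my_subgroup_type);
                /* subgroup1 now appears and disappears along with "parent" */
                my_parent_group.default_groups = my_default_groups;
        }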
+ + diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c new file mode 100644 index 000000000000..f3c6e4946f98 --- /dev/null +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -0,0 +1,474 @@ +/* + * vim: noexpandtab ts=8 sts=0 sw=8: + * + * configfs_example.c - This file is a demonstration module containing + * a number of configfs subsystems. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include + +#include + + + +/* + * 01-childless + * + * This first example is a childless subsystem. It cannot create + * any config_items. It just has attributes. + * + * Note that we are enclosing the configfs_subsystem inside a container. + * This is not necessary if a subsystem has no attributes directly + * on the subsystem. See the next example, 02-simple-children, for + * such a subsystem. + */ + +struct childless { + struct configfs_subsystem subsys; + int showme; + int storeme; +}; + +struct childless_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct childless *, char *); + ssize_t (*store)(struct childless *, const char *, size_t); +}; + +static inline struct childless *to_childless(struct config_item *item) +{ + return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL; +} + +static ssize_t childless_showme_read(struct childless *childless, + char *page) +{ + ssize_t pos; + + pos = sprintf(page, "%d\n", childless->showme); + childless->showme++; + + return pos; +} + +static ssize_t childless_storeme_read(struct childless *childless, + char *page) +{ + return sprintf(page, "%d\n", childless->storeme); +} + +static ssize_t childless_storeme_write(struct childless *childless, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *) page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > INT_MAX) + return -ERANGE; + + childless->storeme = tmp; + + return count; +} + +static ssize_t childless_description_read(struct childless *childless, + char *page) +{ + return sprintf(page, +"[01-childless]\n" +"\n" +"The childless subsystem is the simplest possible subsystem in\n" +"configfs. It does not support the creation of child config_items.\n" +"It only has a few attributes. 
In fact, it isn't much different\n" +"than a directory in /proc.\n"); +} + +static struct childless_attribute childless_attr_showme = { + .attr = { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO }, + .show = childless_showme_read, +}; +static struct childless_attribute childless_attr_storeme = { + .attr = { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR }, + .show = childless_storeme_read, + .store = childless_storeme_write, +}; +static struct childless_attribute childless_attr_description = { + .attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO }, + .show = childless_description_read, +}; + +static struct configfs_attribute *childless_attrs[] = { + &childless_attr_showme.attr, + &childless_attr_storeme.attr, + &childless_attr_description.attr, + NULL, +}; + +static ssize_t childless_attr_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct childless *childless = to_childless(item); + struct childless_attribute *childless_attr = + container_of(attr, struct childless_attribute, attr); + ssize_t ret = 0; + + if (childless_attr->show) + ret = childless_attr->show(childless, page); + return ret; +} + +static ssize_t childless_attr_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct childless *childless = to_childless(item); + struct childless_attribute *childless_attr = + container_of(attr, struct childless_attribute, attr); + ssize_t ret = -EINVAL; + + if (childless_attr->store) + ret = childless_attr->store(childless, page, count); + return ret; +} + +static struct configfs_item_operations childless_item_ops = { + .show_attribute = childless_attr_show, + .store_attribute = childless_attr_store, +}; + +static struct config_item_type childless_type = { + .ct_item_ops = &childless_item_ops, + .ct_attrs = childless_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct childless childless_subsys = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "01-childless", + .ci_type = &childless_type, + }, + }, + }, +}; + + +/* ----------------------------------------------------------------- */ + +/* + * 02-simple-children + * + * This example merely has a simple one-attribute child. Note that + * there is no extra attribute structure, as the child's attribute is + * known from the get-go. Also, there is no container for the + * subsystem, as it has no attributes of its own. + */ + +struct simple_child { + struct config_item item; + int storeme; +}; + +static inline struct simple_child *to_simple_child(struct config_item *item) +{ + return item ? 
container_of(item, struct simple_child, item) : NULL; +} + +static struct configfs_attribute simple_child_attr_storeme = { + .ca_owner = THIS_MODULE, + .ca_name = "storeme", + .ca_mode = S_IRUGO | S_IWUSR, +}; + +static struct configfs_attribute *simple_child_attrs[] = { + &simple_child_attr_storeme, + NULL, +}; + +static ssize_t simple_child_attr_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + ssize_t count; + struct simple_child *simple_child = to_simple_child(item); + + count = sprintf(page, "%d\n", simple_child->storeme); + + return count; +} + +static ssize_t simple_child_attr_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct simple_child *simple_child = to_simple_child(item); + unsigned long tmp; + char *p = (char *) page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > INT_MAX) + return -ERANGE; + + simple_child->storeme = tmp; + + return count; +} + +static void simple_child_release(struct config_item *item) +{ + kfree(to_simple_child(item)); +} + +static struct configfs_item_operations simple_child_item_ops = { + .release = simple_child_release, + .show_attribute = simple_child_attr_show, + .store_attribute = simple_child_attr_store, +}; + +static struct config_item_type simple_child_type = { + .ct_item_ops = &simple_child_item_ops, + .ct_attrs = simple_child_attrs, + .ct_owner = THIS_MODULE, +}; + + +static struct config_item *simple_children_make_item(struct config_group *group, const char *name) +{ + struct simple_child *simple_child; + + simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL); + if (!simple_child) + return NULL; + + memset(simple_child, 0, sizeof(struct simple_child)); + + config_item_init_type_name(&simple_child->item, name, + &simple_child_type); + + simple_child->storeme = 0; + + return &simple_child->item; +} + +static struct configfs_attribute simple_children_attr_description = { + .ca_owner = THIS_MODULE, + .ca_name = "description", + .ca_mode = S_IRUGO, +}; + +static struct configfs_attribute *simple_children_attrs[] = { + &simple_children_attr_description, + NULL, +}; + +static ssize_t simple_children_attr_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + return sprintf(page, +"[02-simple-children]\n" +"\n" +"This subsystem allows the creation of child config_items. These\n" +"items have only one attribute that is readable and writeable.\n"); +} + +static struct configfs_item_operations simple_children_item_ops = { + .show_attribute = simple_children_attr_show, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations simple_children_group_ops = { + .make_item = simple_children_make_item, +}; + +static struct config_item_type simple_children_type = { + .ct_item_ops = &simple_children_item_ops, + .ct_group_ops = &simple_children_group_ops, + .ct_attrs = simple_children_attrs, +}; + +static struct configfs_subsystem simple_children_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "02-simple-children", + .ci_type = &simple_children_type, + }, + }, +}; + + +/* ----------------------------------------------------------------- */ + +/* + * 03-group-children + * + * This example reuses the simple_children group from above. However, + * the simple_children group is not the subsystem itself, it is a + * child of the subsystem. 
Creation of a group in the subsystem creates + * a new simple_children group. That group can then have simple_child + * children of its own. + */ + +struct simple_children { + struct config_group group; +}; + +static struct config_group *group_children_make_group(struct config_group *group, const char *name) +{ + struct simple_children *simple_children; + + simple_children = kmalloc(sizeof(struct simple_children), + GFP_KERNEL); + if (!simple_children) + return NULL; + + memset(simple_children, 0, sizeof(struct simple_children)); + + config_group_init_type_name(&simple_children->group, name, + &simple_children_type); + + return &simple_children->group; +} + +static struct configfs_attribute group_children_attr_description = { + .ca_owner = THIS_MODULE, + .ca_name = "description", + .ca_mode = S_IRUGO, +}; + +static struct configfs_attribute *group_children_attrs[] = { + &group_children_attr_description, + NULL, +}; + +static ssize_t group_children_attr_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + return sprintf(page, +"[03-group-children]\n" +"\n" +"This subsystem allows the creation of child config_groups. These\n" +"groups are like the subsystem simple-children.\n"); +} + +static struct configfs_item_operations group_children_item_ops = { + .show_attribute = group_children_attr_show, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations group_children_group_ops = { + .make_group = group_children_make_group, +}; + +static struct config_item_type group_children_type = { + .ct_item_ops = &group_children_item_ops, + .ct_group_ops = &group_children_group_ops, + .ct_attrs = group_children_attrs, +}; + +static struct configfs_subsystem group_children_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "03-group-children", + .ci_type = &group_children_type, + }, + }, +}; + +/* ----------------------------------------------------------------- */ + +/* + * We're now done with our subsystem definitions. + * For convenience in this module, here's a list of them all. It + * allows the init function to easily register them. Most modules + * will only have one subsystem, and will only call register_subsystem + * on it directly. 
+ */ +static struct configfs_subsystem *example_subsys[] = { + &childless_subsys.subsys, + &simple_children_subsys, + &group_children_subsys, + NULL, +}; + +static int __init configfs_example_init(void) +{ + int ret; + int i; + struct configfs_subsystem *subsys; + + for (i = 0; example_subsys[i]; i++) { + subsys = example_subsys[i]; + + config_group_init(&subsys->su_group); + init_MUTEX(&subsys->su_sem); + ret = configfs_register_subsystem(subsys); + if (ret) { + printk(KERN_ERR "Error %d while registering subsystem %s\n", + ret, + subsys->su_group.cg_item.ci_namebuf); + goto out_unregister; + } + } + + return 0; + +out_unregister: + for (; i >= 0; i--) { + configfs_unregister_subsystem(example_subsys[i]); + } + + return ret; +} + +static void __exit configfs_example_exit(void) +{ + int i; + + for (i = 0; example_subsys[i]; i++) { + configfs_unregister_subsystem(example_subsys[i]); + } +} + +module_init(configfs_example_init); +module_exit(configfs_example_exit); +MODULE_LICENSE("GPL"); diff --git a/MAINTAINERS b/MAINTAINERS index 6af683025ae0..86ee06f43794 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -554,6 +554,11 @@ W: http://us1.samba.org/samba/Linux_CIFS_client.html T: git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git S: Supported +CONFIGFS +P: Joel Becker +M: Joel Becker +S: Supported + CIRRUS LOGIC GENERIC FBDEV DRIVER P: Jeff Garzik M: jgarzik@pobox.com diff --git a/fs/Kconfig b/fs/Kconfig index d5255e627b5f..ba1dbe2b2202 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -841,6 +841,20 @@ config RELAYFS_FS If unsure, say N. +config CONFIGFS_FS + tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + configfs is a ram-based filesystem that provides the converse + of sysfs's functionality. Where sysfs is a filesystem-based + view of kernel objects, configfs is a filesystem-based manager + of kernel objects, or config_items. + + Both sysfs and configfs can and should exist together on the + same system. One is not a replacement for the other. + + If unsure, say N. + endmenu menu "Miscellaneous filesystems" diff --git a/fs/Makefile b/fs/Makefile index 4c2655759078..ff3d48a744f5 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -101,3 +101,4 @@ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile new file mode 100644 index 000000000000..00ffb278e98c --- /dev/null +++ b/fs/configfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the configfs virtual filesystem +# + +obj-$(CONFIG_CONFIGFS_FS) += configfs.o + +configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h new file mode 100644 index 000000000000..8899d9c5f6bf --- /dev/null +++ b/fs/configfs/configfs_internal.h @@ -0,0 +1,142 @@ +/* -*- mode: c; c-basic-offset:8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs_internal.h - Internal stuff for configfs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include + +struct configfs_dirent { + atomic_t s_count; + struct list_head s_sibling; + struct list_head s_children; + struct list_head s_links; + void * s_element; + int s_type; + umode_t s_mode; + struct dentry * s_dentry; +}; + +#define CONFIGFS_ROOT 0x0001 +#define CONFIGFS_DIR 0x0002 +#define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_LINK 0x0020 +#define CONFIGFS_USET_DIR 0x0040 +#define CONFIGFS_USET_DEFAULT 0x0080 +#define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) + +extern struct vfsmount * configfs_mount; + +extern int configfs_is_root(struct config_item *item); + +extern struct inode * configfs_new_inode(mode_t mode); +extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); + +extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_make_dirent(struct configfs_dirent *, + struct dentry *, void *, umode_t, int); + +extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); +extern void configfs_hash_and_remove(struct dentry * dir, const char * name); + +extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); +extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); + +extern int configfs_pin_fs(void); +extern void configfs_release_fs(void); + +extern struct rw_semaphore configfs_rename_sem; +extern struct super_block * configfs_sb; +extern struct file_operations configfs_dir_operations; +extern struct file_operations configfs_file_operations; +extern struct file_operations bin_fops; +extern struct inode_operations configfs_dir_inode_operations; +extern struct inode_operations configfs_symlink_inode_operations; + +extern int configfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int configfs_unlink(struct inode *dir, struct dentry *dentry); + +struct configfs_symlink { + struct list_head sl_list; + struct config_item *sl_target; +}; + +extern int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry); + +static inline struct config_item * to_item(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct config_item *) sd->s_element); +} + +static inline struct configfs_attribute * to_attr(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct configfs_attribute *) sd->s_element); +} + +static inline struct config_item *configfs_get_config_item(struct dentry *dentry) +{ + struct config_item * item = NULL; + + spin_lock(&dcache_lock); + if (!d_unhashed(dentry)) { + struct configfs_dirent * sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_ITEM_LINK) { + struct configfs_symlink * sl = sd->s_element; + item = config_item_get(sl->sl_target); + } else + item = config_item_get(sd->s_element); + } + spin_unlock(&dcache_lock); + + return item; +} + +static inline void release_configfs_dirent(struct configfs_dirent * sd) +{ + if (!(sd->s_type & CONFIGFS_ROOT)) + kfree(sd); +} + +static inline struct 
configfs_dirent * configfs_get(struct configfs_dirent * sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} + +static inline void configfs_put(struct configfs_dirent * sd) +{ + WARN_ON(!atomic_read(&sd->s_count)); + if (atomic_dec_and_test(&sd->s_count)) + release_configfs_dirent(sd); +} + diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c new file mode 100644 index 000000000000..e48b539243a1 --- /dev/null +++ b/fs/configfs/dir.c @@ -0,0 +1,1102 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c - Operations for configfs directories. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#undef DEBUG + +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +DECLARE_RWSEM(configfs_rename_sem); + +static void configfs_d_iput(struct dentry * dentry, + struct inode * inode) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + + if (sd) { + BUG_ON(sd->s_dentry != dentry); + sd->s_dentry = NULL; + configfs_put(sd); + } + iput(inode); +} + +/* + * We _must_ delete our dentries on last dput, as the chain-to-parent + * behavior is required to clear the parents of default_groups. + */ +static int configfs_d_delete(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations configfs_dentry_ops = { + .d_iput = configfs_d_iput, + /* simple_delete_dentry() isn't exported */ + .d_delete = configfs_d_delete, +}; + +/* + * Allocates a new configfs_dirent and links it to the parent configfs_dirent + */ +static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd, + void * element) +{ + struct configfs_dirent * sd; + + sd = kmalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) + return NULL; + + memset(sd, 0, sizeof(*sd)); + atomic_set(&sd->s_count, 1); + INIT_LIST_HEAD(&sd->s_links); + INIT_LIST_HEAD(&sd->s_children); + list_add(&sd->s_sibling, &parent_sd->s_children); + sd->s_element = element; + + return sd; +} + +int configfs_make_dirent(struct configfs_dirent * parent_sd, + struct dentry * dentry, void * element, + umode_t mode, int type) +{ + struct configfs_dirent * sd; + + sd = configfs_new_dirent(parent_sd, element); + if (!sd) + return -ENOMEM; + + sd->s_mode = mode; + sd->s_type = type; + sd->s_dentry = dentry; + if (dentry) { + dentry->d_fsdata = configfs_get(sd); + dentry->d_op = &configfs_dentry_ops; + } + + return 0; +} + +static int init_dir(struct inode * inode) +{ + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inode->i_nlink++; + return 0; +} + +static int init_file(struct inode * inode) +{ + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; + return 0; +} + +static int init_symlink(struct inode * inode) +{ + inode->i_op = &configfs_symlink_inode_operations; + return 0; +} + +static int create_dir(struct config_item * k, struct dentry * p, + struct dentry * d) +{ + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + + error = configfs_create(d, mode, init_dir); + if (!error) { + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR); + if (!error) { + p->d_inode->i_nlink++; + (d)->d_op = &configfs_dentry_ops; + } + } + return error; +} + + +/** + * configfs_create_dir - create a directory for a config_item. + * @item: config_item we're creating directory for. + * @dentry: config_item's dentry. + */ + +static int configfs_create_dir(struct config_item * item, struct dentry *dentry) +{ + struct dentry * parent; + int error = 0; + + BUG_ON(!item); + + if (item->ci_parent) + parent = item->ci_parent->ci_dentry; + else if (configfs_mount && configfs_mount->mnt_sb) + parent = configfs_mount->mnt_sb->s_root; + else + return -EFAULT; + + error = create_dir(item,parent,dentry); + if (!error) + item->ci_dentry = dentry; + return error; +} + +int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry) +{ + int err = 0; + umode_t mode = S_IFLNK | S_IRWXUGO; + + err = configfs_create(dentry, mode, init_symlink); + if (!err) { + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, + mode, CONFIGFS_ITEM_LINK); + if (!err) + dentry->d_op = &configfs_dentry_ops; + } + return err; +} + +static void remove_dir(struct dentry * d) +{ + struct dentry * parent = dget(d->d_parent); + struct configfs_dirent * sd; + + sd = d->d_fsdata; + list_del_init(&sd->s_sibling); + configfs_put(sd); + if (d->d_inode) + simple_rmdir(parent->d_inode,d); + + pr_debug(" o %s removing done (%d)\n",d->d_name.name, + atomic_read(&d->d_count)); + + dput(parent); +} + +/** + * configfs_remove_dir - remove a config_item's directory. + * @item: config_item we're removing. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be configfs_rmdir() below, instead of calling separately. + */ + +static void configfs_remove_dir(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + + if (!dentry) + return; + + remove_dir(dentry); + /** + * Drop reference from dget() on entrance.
+ */ + dput(dentry); +} + + +/* attaches attribute's configfs_dirent to the dentry corresponding to the + * attribute file + */ +static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) +{ + struct configfs_attribute * attr = sd->s_element; + int error; + + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); + if (error) + return error; + + dentry->d_op = &configfs_dentry_ops; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + d_rehash(dentry); + + return 0; +} + +static struct dentry * configfs_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; + struct configfs_dirent * sd; + int found = 0; + int err = 0; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) { + const unsigned char * name = configfs_get_name(sd); + + if (strcmp(name, dentry->d_name.name)) + continue; + + found = 1; + err = configfs_attach_attr(sd, dentry); + break; + } + } + + if (!found) { + /* + * If it doesn't exist and it isn't a NOT_PINNED item, + * it must be negative. + */ + return simple_lookup(dir, dentry, nd); + } + + return ERR_PTR(err); +} + +/* + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes and are removed by rmdir(). We recurse, taking i_sem + * on all children that are candidates for default detach. If the + * result is clean, then configfs_detach_group() will handle dropping + * i_sem. If there is an error, the caller will clean up the i_sem + * holders via configfs_detach_rollback(). + */ +static int configfs_detach_prep(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + int ret; + + ret = -EBUSY; + if (!list_empty(&parent_sd->s_links)) + goto out; + + ret = 0; + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) + continue; + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + down(&sd->s_dentry->d_inode->i_sem); + /* Mark that we've taken i_sem */ + sd->s_type |= CONFIGFS_USET_DROPPING; + + ret = configfs_detach_prep(sd->s_dentry); + if (!ret) + continue; + } else + ret = -ENOTEMPTY; + + break; + } + +out: + return ret; +} + +/* + * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is + * set. + */ +static void configfs_detach_rollback(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + configfs_detach_rollback(sd->s_dentry); + + if (sd->s_type & CONFIGFS_USET_DROPPING) { + sd->s_type &= ~CONFIGFS_USET_DROPPING; + up(&sd->s_dentry->d_inode->i_sem); + } + } + } +} + +static void detach_attrs(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + struct configfs_dirent * parent_sd; + struct configfs_dirent * sd, * tmp; + + if (!dentry) + return; + + pr_debug("configfs %s: dropping attrs for dir\n", + dentry->d_name.name); + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + list_del_init(&sd->s_sibling); + configfs_drop_dentry(sd, dentry); + configfs_put(sd); + } + + /** + * Drop reference from dget() on entrance. 
+ */ + dput(dentry); +} + +static int populate_attrs(struct config_item *item) +{ + struct config_item_type *t = item->ci_type; + struct configfs_attribute *attr; + int error = 0; + int i; + + if (!t) + return -EINVAL; + if (t->ct_attrs) { + for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { + if ((error = configfs_create_file(item, attr))) + break; + } + } + + if (error) + detach_attrs(item); + + return error; +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry); +static void configfs_detach_group(struct config_item *item); + +static void detach_groups(struct config_group *group) +{ + struct dentry * dentry = dget(group->cg_item.ci_dentry); + struct dentry *child; + struct configfs_dirent *parent_sd; + struct configfs_dirent *sd, *tmp; + + if (!dentry) + return; + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + !(sd->s_type & CONFIGFS_USET_DEFAULT)) + continue; + + child = sd->s_dentry; + + configfs_detach_group(sd->s_element); + child->d_inode->i_flags |= S_DEAD; + + /* + * From rmdir/unregister, a configfs_detach_prep() pass + * has taken our i_sem for us. Drop it. + * From mkdir/register cleanup, there is no sem held. + */ + if (sd->s_type & CONFIGFS_USET_DROPPING) + up(&child->d_inode->i_sem); + + d_delete(child); + dput(child); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +/* + * This fakes mkdir(2) on a default_groups[] entry. It + * creates a dentry, attachs it, and then does fixup + * on the sd->s_type. + * + * We could, perhaps, tweak our parent's ->mkdir for a minute and + * try using vfs_mkdir. Just a thought. + */ +static int create_default_group(struct config_group *parent_group, + struct config_group *group) +{ + int ret; + struct qstr name; + struct configfs_dirent *sd; + /* We trust the caller holds a reference to parent */ + struct dentry *child, *parent = parent_group->cg_item.ci_dentry; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + ret = -ENOMEM; + child = d_alloc(parent, &name); + if (child) { + d_add(child, NULL); + + ret = configfs_attach_group(&parent_group->cg_item, + &group->cg_item, child); + if (!ret) { + sd = child->d_fsdata; + sd->s_type |= CONFIGFS_USET_DEFAULT; + } else { + d_delete(child); + dput(child); + } + } + + return ret; +} + +static int populate_groups(struct config_group *group) +{ + struct config_group *new_group; + struct dentry *dentry = group->cg_item.ci_dentry; + int ret = 0; + int i; + + if (group && group->default_groups) { + /* FYI, we're faking mkdir here + * I'm not sure we need this semaphore, as we're called + * from our parent's mkdir. That holds our parent's + * i_sem, so afaik lookup cannot continue through our + * parent to find us, let alone mess with our tree. + * That said, taking our i_sem is closer to mkdir + * emulation, and shouldn't hurt. */ + down(&dentry->d_inode->i_sem); + + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + + ret = create_default_group(group, new_group); + if (ret) + break; + } + + up(&dentry->d_inode->i_sem); + } + + if (ret) + detach_groups(group); + + return ret; +} + +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_sem is held. 
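+ * configfs_mkdir() and configfs_rmdir() below are examples: they take + * subsys->su_sem around their make/link and unlink/drop sequences.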
+ */ + +static void unlink_obj(struct config_item *item) +{ + struct config_group *group; + + group = item->ci_group; + if (group) { + list_del_init(&item->ci_entry); + + item->ci_group = NULL; + item->ci_parent = NULL; + config_item_put(item); + + config_group_put(group); + } +} + +static void link_obj(struct config_item *parent_item, struct config_item *item) +{ + /* Parent seems redundant with group, but it makes certain + * traversals much nicer. */ + item->ci_parent = parent_item; + item->ci_group = config_group_get(to_config_group(parent_item)); + list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + + config_item_get(item); +} + +static void unlink_group(struct config_group *group) +{ + int i; + struct config_group *new_group; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + unlink_group(new_group); + } + } + + group->cg_subsys = NULL; + unlink_obj(&group->cg_item); +} + +static void link_group(struct config_group *parent_group, struct config_group *group) +{ + int i; + struct config_group *new_group; + struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ + + link_obj(&parent_group->cg_item, &group->cg_item); + + if (parent_group->cg_subsys) + subsys = parent_group->cg_subsys; + else if (configfs_is_root(&parent_group->cg_item)) + subsys = to_configfs_subsystem(group); + else + BUG(); + group->cg_subsys = subsys; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + link_group(group, new_group); + } + } +} + +/* + * The goal is that configfs_attach_item() (and + * configfs_attach_group()) can be called from either the VFS or this + * module. That is, they assume that the items have been created, + * the dentry allocated, and the dcache is all ready to go. + * + * If they fail, they must clean up after themselves as if they + * had never been called. The caller (VFS or local function) will + * handle cleaning up the dcache bits. + * + * configfs_detach_group() and configfs_detach_item() behave similarly on + * the way out. They assume that the proper semaphores are held, they + * clean up the configfs items, and they expect their callers will + * handle the dcache bits. + */ +static int configfs_attach_item(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + + ret = configfs_create_dir(item, dentry); + if (!ret) { + ret = populate_attrs(item); + if (ret) { + configfs_remove_dir(item); + d_delete(dentry); + } + } + + return ret; +} + +static void configfs_detach_item(struct config_item *item) +{ + detach_attrs(item); + configfs_remove_dir(item); +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + struct configfs_dirent *sd; + + ret = configfs_attach_item(parent_item, item, dentry); + if (!ret) { + sd = dentry->d_fsdata; + sd->s_type |= CONFIGFS_USET_DIR; + + ret = populate_groups(to_config_group(item)); + if (ret) { + configfs_detach_item(item); + d_delete(dentry); + } + } + + return ret; +} + +static void configfs_detach_group(struct config_item *item) +{ + detach_groups(to_config_group(item)); + configfs_detach_item(item); +} + +/* + * Drop the initial reference from make_item()/make_group() + * This function assumes that reference is held on item + * and that item holds a valid reference to the parent. Also, it + * assumes the caller has validated ci_type. 
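+ * If the parent's type supplies ->drop_item(), the subsystem owns the + * final put; otherwise we drop the initial reference ourselves with + * config_item_put().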
+ */ +static void client_drop_item(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->drop_item) + type->ct_group_ops->drop_item(to_config_group(parent_item), + item); + else + config_item_put(item); +} + + +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int ret; + struct config_group *group; + struct config_item *item; + struct config_item *parent_item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct config_item_type *type; + struct module *owner; + char *name; + + if (dentry->d_parent == configfs_sb->s_root) + return -EPERM; + + sd = dentry->d_parent->d_fsdata; + if (!(sd->s_type & CONFIGFS_USET_DIR)) + return -EPERM; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!type || !type->ct_group_ops || + (!type->ct_group_ops->make_group && + !type->ct_group_ops->make_item)) { + config_item_put(parent_item); + return -EPERM; /* What lack-of-mkdir returns */ + } + + name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); + if (!name) { + config_item_put(parent_item); + return -ENOMEM; + } + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); + + down(&subsys->su_sem); + group = NULL; + item = NULL; + if (type->ct_group_ops->make_group) { + group = type->ct_group_ops->make_group(to_config_group(parent_item), name); + if (group) { + link_group(to_config_group(parent_item), group); + item = &group->cg_item; + } + } else { + item = type->ct_group_ops->make_item(to_config_group(parent_item), name); + if (item) + link_obj(parent_item, item); + } + up(&subsys->su_sem); + + kfree(name); + if (!item) { + config_item_put(parent_item); + return -ENOMEM; + } + + ret = -EINVAL; + type = item->ci_type; + if (type) { + owner = type->ct_owner; + if (try_module_get(owner)) { + if (group) { + ret = configfs_attach_group(parent_item, + item, + dentry); + } else { + ret = configfs_attach_item(parent_item, + item, + dentry); + } + + if (ret) { + down(&subsys->su_sem); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + up(&subsys->su_sem); + + config_item_put(parent_item); + module_put(owner); + } + } + } + + return ret; +} + +static int configfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct config_item *parent_item; + struct config_item *item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct module *owner = NULL; + int ret; + + if (dentry->d_parent == configfs_sb->s_root) + return -EPERM; + + sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_USET_DEFAULT) + return -EPERM; + + parent_item = configfs_get_config_item(dentry->d_parent); + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!parent_item->ci_type) { + config_item_put(parent_item); + return -EINVAL; + } + + ret = configfs_detach_prep(dentry); + if (ret) { + configfs_detach_rollback(dentry); + config_item_put(parent_item); + return ret; + } + + item = configfs_get_config_item(dentry); + + /* Drop reference from above, item already holds one. 
*/ + config_item_put(parent_item); + + if (item->ci_type) + owner = item->ci_type->ct_owner; + + if (sd->s_type & CONFIGFS_USET_DIR) { + configfs_detach_group(item); + + down(&subsys->su_sem); + unlink_group(to_config_group(item)); + } else { + configfs_detach_item(item); + + down(&subsys->su_sem); + unlink_obj(item); + } + + client_drop_item(parent_item, item); + up(&subsys->su_sem); + + /* Drop our reference from above */ + config_item_put(item); + + module_put(owner); + + return 0; +} + +struct inode_operations configfs_dir_inode_operations = { + .mkdir = configfs_mkdir, + .rmdir = configfs_rmdir, + .symlink = configfs_symlink, + .unlink = configfs_unlink, + .lookup = configfs_lookup, +}; + +#if 0 +int configfs_rename_dir(struct config_item * item, const char *new_name) +{ + int error = 0; + struct dentry * new_dentry, * parent; + + if (!strcmp(config_item_name(item), new_name)) + return -EINVAL; + + if (!item->parent) + return -EINVAL; + + down_write(&configfs_rename_sem); + parent = item->parent->dentry; + + down(&parent->d_inode->i_sem); + + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + if (!IS_ERR(new_dentry)) { + if (!new_dentry->d_inode) { + error = config_item_set_name(item, "%s", new_name); + if (!error) { + d_add(new_dentry, NULL); + d_move(item->dentry, new_dentry); + } + else + d_delete(new_dentry); + } else + error = -EEXIST; + dput(new_dentry); + } + up(&parent->d_inode->i_sem); + up_write(&configfs_rename_sem); + + return error; +} +#endif + +static int configfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + + down(&dentry->d_inode->i_sem); + file->private_data = configfs_new_dirent(parent_sd, NULL); + up(&dentry->d_inode->i_sem); + + return file->private_data ? 
0 : -ENOMEM; + +} + +static int configfs_dir_close(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_dentry; + struct configfs_dirent * cursor = file->private_data; + + down(&dentry->d_inode->i_sem); + list_del_init(&cursor->s_sibling); + up(&dentry->d_inode->i_sem); + + release_configfs_dirent(cursor); + + return 0; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct configfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + struct configfs_dirent *cursor = filp->private_data; + struct list_head *p, *q = &cursor->s_sibling; + ino_t ino; + int i = filp->f_pos; + + switch (i) { + case 0: + ino = dentry->d_inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + default: + if (filp->f_pos == 2) { + list_del(q); + list_add(q, &parent_sd->s_children); + } + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { + struct configfs_dirent *next; + const char * name; + int len; + + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (!next->s_element) + continue; + + name = configfs_get_name(next); + len = strlen(name); + if (next->s_dentry) + ino = next->s_dentry->d_inode->i_ino; + else + ino = iunique(configfs_sb, 2); + + if (filldir(dirent, name, len, filp->f_pos, ino, + dt_type(next)) < 0) + return 0; + + list_del(q); + list_add(q, p); + p = q; + filp->f_pos++; + } + } + return 0; +} + +static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) +{ + struct dentry * dentry = file->f_dentry; + + down(&dentry->d_inode->i_sem); + switch (origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + up(&file->f_dentry->d_inode->i_sem); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_dirent *cursor = file->private_data; + struct list_head *p; + loff_t n = file->f_pos - 2; + + list_del(&cursor->s_sibling); + p = sd->s_children.next; + while (n && p != &sd->s_children) { + struct configfs_dirent *next; + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (next->s_element) + n--; + p = p->next; + } + list_add_tail(&cursor->s_sibling, p); + } + } + up(&dentry->d_inode->i_sem); + return offset; +} + +struct file_operations configfs_dir_operations = { + .open = configfs_dir_open, + .release = configfs_dir_close, + .llseek = configfs_dir_lseek, + .read = generic_read_dir, + .readdir = configfs_readdir, +}; + +int configfs_register_subsystem(struct configfs_subsystem *subsys) +{ + int err; + struct config_group *group = &subsys->su_group; + struct qstr name; + struct dentry *dentry; + struct configfs_dirent *sd; + + err = configfs_pin_fs(); + if (err) + return err; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + + sd = configfs_sb->s_root->d_fsdata; + link_group(to_config_group(sd->s_element), group); + + down(&configfs_sb->s_root->d_inode->i_sem); + + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + err = -ENOMEM; + dentry = 
d_alloc(configfs_sb->s_root, &name); + if (!dentry) + goto out_release; + + d_add(dentry, NULL); + + err = configfs_attach_group(sd->s_element, &group->cg_item, + dentry); + if (!err) + dentry = NULL; + else + d_delete(dentry); + + up(&configfs_sb->s_root->d_inode->i_sem); + + if (dentry) { + dput(dentry); +out_release: + unlink_group(group); + configfs_release_fs(); + } + + return err; +} + +void configfs_unregister_subsystem(struct configfs_subsystem *subsys) +{ + struct config_group *group = &subsys->su_group; + struct dentry *dentry = group->cg_item.ci_dentry; + + if (dentry->d_parent != configfs_sb->s_root) { + printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + return; + } + + down(&configfs_sb->s_root->d_inode->i_sem); + down(&dentry->d_inode->i_sem); + if (configfs_detach_prep(dentry)) { + printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + } + configfs_detach_group(&group->cg_item); + dentry->d_inode->i_flags |= S_DEAD; + up(&dentry->d_inode->i_sem); + + d_delete(dentry); + + up(&configfs_sb->s_root->d_inode->i_sem); + + dput(dentry); + + unlink_group(group); + configfs_release_fs(); +} + +EXPORT_SYMBOL(configfs_register_subsystem); +EXPORT_SYMBOL(configfs_unregister_subsystem); diff --git a/fs/configfs/file.c b/fs/configfs/file.c new file mode 100644 index 000000000000..af1ffc9a15c0 --- /dev/null +++ b/fs/configfs/file.c @@ -0,0 +1,360 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c - operations for regular (text) files. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + + +struct configfs_buffer { + size_t count; + loff_t pos; + char * page; + struct configfs_item_operations * ops; + struct semaphore sem; + int needs_read_fill; +}; + + +/** + * fill_read_buffer - allocate and fill buffer from item. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * config_item's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read. 
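+ * The buffer is a single zeroed page, so a show() method may return at + * most PAGE_SIZE bytes; a larger count trips the BUG_ON() below.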
+ */ +static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + count = ops->show_attribute(item,attr,buffer->page); + buffer->needs_read_fill = 0; + BUG_ON(count > (ssize_t)PAGE_SIZE); + if (count >= 0) + buffer->count = count; + else + ret = count; + return ret; +} + + +/** + * flush_read_buffer - push buffer to userspace. + * @buffer: data buffer for file. + * @userbuf: user-passed buffer. + * @count: number of bytes requested. + * @ppos: file position. + * + * Copy the buffer we filled in fill_read_buffer() to userspace. + * This is done at the reader's leisure, copying and advancing + * the amount they specify each time. + * This may be called continuously until the buffer is empty. + */ +static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf, + size_t count, loff_t * ppos) +{ + int error; + + if (*ppos > buffer->count) + return 0; + + if (count > (buffer->count - *ppos)) + count = buffer->count - *ppos; + + error = copy_to_user(buf,buffer->page + *ppos,count); + if (!error) + *ppos += count; + return error ? -EFAULT : count; +} + +/** + * configfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target item is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * item's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the item has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + down(&buffer->sem); + if (buffer->needs_read_fill) { + if ((retval = fill_read_buffer(file->f_dentry,buffer))) + goto out; + } + pr_debug("%s: count = %d, ppos = %lld, buf = %s\n", + __FUNCTION__,count,*ppos,buffer->page); + retval = flush_read_buffer(buffer,buf,count,ppos); +out: + up(&buffer->sem); + return retval; +} + + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @userbuf: data from user. + * @count: number of bytes in @userbuf. + * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. + */ + +static int +fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to config_item. + * @file: file pointer. + * @buffer: data buffer for file. 
+ * + * Get the correct pointers for the config_item and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + + return ops->store_attribute(item,attr,buffer->page,count); +} + + +/** + * configfs_write_file - write an attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to configfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the config_item in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * the value you're changing, then write entire buffer back. + */ + +static ssize_t +configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + + down(&buffer->sem); + count = fill_write_buffer(buffer,buf,count); + if (count > 0) + count = flush_write_buffer(file->f_dentry,buffer,count); + if (count > 0) + *ppos += count; + up(&buffer->sem); + return count; +} + +static int check_perm(struct inode * inode, struct file * file) +{ + struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent); + struct configfs_attribute * attr = to_attr(file->f_dentry); + struct configfs_buffer * buffer; + struct configfs_item_operations * ops = NULL; + int error = 0; + + if (!item || !attr) + goto Einval; + + /* Grab the module reference for this attribute if we have one */ + if (!try_module_get(attr->ca_owner)) { + error = -ENODEV; + goto Done; + } + + if (item->ci_type) + ops = item->ci_type->ct_item_ops; + else + goto Eaccess; + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + + if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) + goto Eaccess; + + } + + /* File needs read support. + * The inode's perms must say it's ok, and we there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) + goto Eaccess; + } + + /* No error? Great, allocate a buffer for the file, and store it + * it in file->private_data for easy access. 
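+ * The buffer, and the page it wraps, are freed again in + * configfs_release().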
+ */ + buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL); + if (buffer) { + memset(buffer,0,sizeof(struct configfs_buffer)); + init_MUTEX(&buffer->sem); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + } else + error = -ENOMEM; + goto Done; + + Einval: + error = -EINVAL; + goto Done; + Eaccess: + error = -EACCES; + module_put(attr->ca_owner); + Done: + if (error && item) + config_item_put(item); + return error; +} + +static int configfs_open_file(struct inode * inode, struct file * filp) +{ + return check_perm(inode,filp); +} + +static int configfs_release(struct inode * inode, struct file * filp) +{ + struct config_item * item = to_item(filp->f_dentry->d_parent); + struct configfs_attribute * attr = to_attr(filp->f_dentry); + struct module * owner = attr->ca_owner; + struct configfs_buffer * buffer = filp->private_data; + + if (item) + config_item_put(item); + /* After this point, attr should not be accessed. */ + module_put(owner); + + if (buffer) { + if (buffer->page) + free_page((unsigned long)buffer->page); + kfree(buffer); + } + return 0; +} + +struct file_operations configfs_file_operations = { + .read = configfs_read_file, + .write = configfs_write_file, + .llseek = generic_file_llseek, + .open = configfs_open_file, + .release = configfs_release, +}; + + +int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) +{ + struct configfs_dirent * parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + down(&dir->d_inode->i_sem); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); + up(&dir->d_inode->i_sem); + + return error; +} + + +/** + * configfs_create_file - create an attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. + */ + +int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) +{ + BUG_ON(!item || !item->ci_dentry || !attr); + + return configfs_add_file(item->ci_dentry, attr, + CONFIGFS_ITEM_ATTR); +} + diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c new file mode 100644 index 000000000000..6b274c6d428f --- /dev/null +++ b/fs/configfs/inode.c @@ -0,0 +1,162 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c - basic inode and dentry operations. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see Documentation/filesystems/configfs.txt for more information. 
+ */ + +#undef DEBUG + +#include +#include +#include + +#include +#include "configfs_internal.h" + +extern struct super_block * configfs_sb; + +static struct address_space_operations configfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, + .commit_write = simple_commit_write +}; + +static struct backing_dev_info configfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, +}; + +struct inode * configfs_new_inode(mode_t mode) +{ + struct inode * inode = new_inode(configfs_sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + } + return inode; +} + +int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +{ + int error = 0; + struct inode * inode = NULL; + if (dentry) { + if (!dentry->d_inode) { + if ((inode = configfs_new_inode(mode))) { + if (dentry->d_parent && dentry->d_parent->d_inode) { + struct inode *p_inode = dentry->d_parent->d_inode; + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + } + goto Proceed; + } + else + error = -ENOMEM; + } else + error = -EEXIST; + } else + error = -ENOENT; + goto Done; + + Proceed: + if (init) + error = init(inode); + if (!error) { + d_instantiate(dentry, inode); + if (S_ISDIR(mode) || S_ISLNK(mode)) + dget(dentry); /* pin link and directory dentries in core */ + } else + iput(inode); + Done: + return error; +} + +/* + * Get the name for corresponding element represented by the given configfs_dirent + */ +const unsigned char * configfs_get_name(struct configfs_dirent *sd) +{ + struct attribute * attr; + + if (!sd || !sd->s_element) + BUG(); + + /* These always have a dentry, so use that */ + if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) + return sd->s_dentry->d_name.name; + + if (sd->s_type & CONFIGFS_ITEM_ATTR) { + attr = sd->s_element; + return attr->name; + } + return NULL; +} + + +/* + * Unhashes the dentry corresponding to given configfs_dirent + * Called with parent inode's i_sem held. 
+ */ +void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) +{ + struct dentry * dentry = sd->s_dentry; + + if (dentry) { + spin_lock(&dcache_lock); + if (!(d_unhashed(dentry) && dentry->d_inode)) { + dget_locked(dentry); + __d_drop(dentry); + spin_unlock(&dcache_lock); + simple_unlink(parent->d_inode, dentry); + } else + spin_unlock(&dcache_lock); + } +} + +void configfs_hash_and_remove(struct dentry * dir, const char * name) +{ + struct configfs_dirent * sd; + struct configfs_dirent * parent_sd = dir->d_fsdata; + + down(&dir->d_inode->i_sem); + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element) + continue; + if (!strcmp(configfs_get_name(sd), name)) { + list_del_init(&sd->s_sibling); + configfs_drop_dentry(sd, dir); + configfs_put(sd); + break; + } + } + up(&dir->d_inode->i_sem); +} + + diff --git a/fs/configfs/item.c b/fs/configfs/item.c new file mode 100644 index 000000000000..e07485ac50ad --- /dev/null +++ b/fs/configfs/item.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * item.c - library routines for handling generic config items + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on kobject: + * kobject is Copyright (c) 2002-2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see the file Documentation/filesystems/configfs.txt for + * critical information about using the config_item interface. + */ + +#include +#include +#include +#include + +#include + + +static inline struct config_item * to_item(struct list_head * entry) +{ + return container_of(entry,struct config_item,ci_entry); +} + +/* Evil kernel */ +static void config_item_release(struct kref *kref); + +/** + * config_item_init - initialize item. + * @item: item in question. + */ +void config_item_init(struct config_item * item) +{ + kref_init(&item->ci_kref); + INIT_LIST_HEAD(&item->ci_entry); +} + +/** + * config_item_set_name - Set the name of an item + * @item: item. + * @name: name. + * + * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a + * dynamically allocated string that @item->ci_name points to. + * Otherwise, use the static @item->ci_namebuf array. + */ + +int config_item_set_name(struct config_item * item, const char * fmt, ...) +{ + int error = 0; + int limit = CONFIGFS_ITEM_NAME_LEN; + int need; + va_list args; + char * name; + + /* + * First, try the static array + */ + va_start(args,fmt); + need = vsnprintf(item->ci_namebuf,limit,fmt,args); + va_end(args); + if (need < limit) + name = item->ci_namebuf; + else { + /* + * Need more space? 
Allocate it and try again + */ + limit = need + 1; + name = kmalloc(limit,GFP_KERNEL); + if (!name) { + error = -ENOMEM; + goto Done; + } + va_start(args,fmt); + need = vsnprintf(name,limit,fmt,args); + va_end(args); + + /* Still? Give up. */ + if (need >= limit) { + kfree(name); + error = -EFAULT; + goto Done; + } + } + + /* Free the old name, if necessary. */ + if (item->ci_name && item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + + /* Now, set the new name */ + item->ci_name = name; + Done: + return error; +} + +EXPORT_SYMBOL(config_item_set_name); + +void config_item_init_type_name(struct config_item *item, + const char *name, + struct config_item_type *type) +{ + config_item_set_name(item, name); + item->ci_type = type; + config_item_init(item); +} +EXPORT_SYMBOL(config_item_init_type_name); + +void config_group_init_type_name(struct config_group *group, const char *name, + struct config_item_type *type) +{ + config_item_set_name(&group->cg_item, name); + group->cg_item.ci_type = type; + config_group_init(group); +} +EXPORT_SYMBOL(config_group_init_type_name); + +struct config_item * config_item_get(struct config_item * item) +{ + if (item) + kref_get(&item->ci_kref); + return item; +} + +/** + * config_item_cleanup - free config_item resources. + * @item: item. + */ + +void config_item_cleanup(struct config_item * item) +{ + struct config_item_type * t = item->ci_type; + struct config_group * s = item->ci_group; + struct config_item * parent = item->ci_parent; + + pr_debug("config_item %s: cleaning up\n",config_item_name(item)); + if (item->ci_name != item->ci_namebuf) + kfree(item->ci_name); + item->ci_name = NULL; + if (t && t->ct_item_ops && t->ct_item_ops->release) + t->ct_item_ops->release(item); + if (s) + config_group_put(s); + if (parent) + config_item_put(parent); +} + +static void config_item_release(struct kref *kref) +{ + config_item_cleanup(container_of(kref, struct config_item, ci_kref)); +} + +/** + * config_item_put - decrement refcount for item. + * @item: item. + * + * Decrement the refcount, and if 0, call config_item_cleanup(). + */ +void config_item_put(struct config_item * item) +{ + if (item) + kref_put(&item->ci_kref, config_item_release); +} + + +/** + * config_group_init - initialize a group for use + * @k: group + */ + +void config_group_init(struct config_group *group) +{ + config_item_init(&group->cg_item); + INIT_LIST_HEAD(&group->cg_children); +} + + +/** + * config_group_find_obj - search for item in group. + * @group: group we're looking in. + * @name: item's name. + * + * Lock group via @group->cg_subsys, and iterate over @group->cg_list, + * looking for a matching config_item. If matching item is found + * take a reference and return the item. + */ + +struct config_item * config_group_find_obj(struct config_group * group, const char * name) +{ + struct list_head * entry; + struct config_item * ret = NULL; + + /* XXX LOCKING! 
*/ + list_for_each(entry,&group->cg_children) { + struct config_item * item = to_item(entry); + if (config_item_name(item) && + !strcmp(config_item_name(item), name)) { + ret = config_item_get(item); + break; + } + } + return ret; +} + + +EXPORT_SYMBOL(config_item_init); +EXPORT_SYMBOL(config_group_init); +EXPORT_SYMBOL(config_item_get); +EXPORT_SYMBOL(config_item_put); + diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c new file mode 100644 index 000000000000..1a2f6f6a4d91 --- /dev/null +++ b/fs/configfs/mount.c @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mount.c - operations for initializing and mounting configfs. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include +#include "configfs_internal.h" + +/* Random magic number */ +#define CONFIGFS_MAGIC 0x62656570 + +struct vfsmount * configfs_mount = NULL; +struct super_block * configfs_sb = NULL; +static int configfs_mnt_count = 0; + +static struct super_operations configfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, +}; + +static struct config_group configfs_root_group = { + .cg_item = { + .ci_namebuf = "root", + .ci_name = configfs_root_group.cg_item.ci_namebuf, + }, +}; + +int configfs_is_root(struct config_item *item) +{ + return item == &configfs_root_group.cg_item; +} + +static struct configfs_dirent configfs_root = { + .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), + .s_children = LIST_HEAD_INIT(configfs_root.s_children), + .s_element = &configfs_root_group.cg_item, + .s_type = CONFIGFS_ROOT, +}; + +static int configfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = CONFIGFS_MAGIC; + sb->s_op = &configfs_ops; + configfs_sb = sb; + + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + if (inode) { + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inode->i_nlink++; + } else { + pr_debug("configfs: could not get root inode\n"); + return -ENOMEM; + } + + root = d_alloc_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n",__FUNCTION__); + iput(inode); + return -ENOMEM; + } + config_group_init(&configfs_root_group); + configfs_root_group.cg_item.ci_dentry = root; + root->d_fsdata = &configfs_root; + sb->s_root = root; + return 0; +} + +static struct super_block *configfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_single(fs_type, flags, data, configfs_fill_super); +} + +static struct file_system_type configfs_fs_type = { + .owner = THIS_MODULE, + .name = "configfs", + .get_sb = configfs_get_sb, + .kill_sb = kill_litter_super, +}; + +int configfs_pin_fs(void) +{ + return simple_pin_fs("configfs", &configfs_mount, + &configfs_mnt_count); +} + +void configfs_release_fs(void) +{ + simple_release_fs(&configfs_mount, &configfs_mnt_count); +} + + +static decl_subsys(config, NULL, NULL); + +static int __init configfs_init(void) +{ + int err; + + kset_set_kset_s(&config_subsys, kernel_subsys); + err = subsystem_register(&config_subsys); + if (err) + return err; + + err = register_filesystem(&configfs_fs_type); + if (err) { + printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + subsystem_unregister(&config_subsys); + } + + return err; +} + +static void __exit configfs_exit(void) +{ + unregister_filesystem(&configfs_fs_type); + subsystem_unregister(&config_subsys); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.0.1"); +MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); + +module_init(configfs_init); +module_exit(configfs_exit); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c new file mode 100644 index 000000000000..50f5840521a9 --- /dev/null +++ b/fs/configfs/symlink.c @@ -0,0 +1,281 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.c - operations for configfs symlinks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. 
+ */ + +#include +#include +#include + +#include +#include "configfs_internal.h" + +static int item_depth(struct config_item * item) +{ + struct config_item * p = item; + int depth = 0; + do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p)); + return depth; +} + +static int item_path_length(struct config_item * item) +{ + struct config_item * p = item; + int length = 1; + do { + length += strlen(config_item_name(p)) + 1; + p = p->ci_parent; + } while (p && !configfs_is_root(p)); + return length; +} + +static void fill_item_path(struct config_item * item, char * buffer, int length) +{ + struct config_item * p; + + --length; + for (p = item; p && !configfs_is_root(p); p = p->ci_parent) { + int cur = strlen(config_item_name(p)); + + /* back up enough to print this bus id with '/' */ + length -= cur; + strncpy(buffer + length,config_item_name(p),cur); + *(buffer + --length) = '/'; + } +} + +static int create_link(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; + struct configfs_symlink *sl; + int ret; + + ret = -ENOMEM; + sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); + if (sl) { + sl->sl_target = config_item_get(item); + /* FIXME: needs a lock, I'd bet */ + list_add(&sl->sl_list, &target_sd->s_links); + ret = configfs_create_link(sl, parent_item->ci_dentry, + dentry); + if (ret) { + list_del_init(&sl->sl_list); + config_item_put(item); + kfree(sl); + } + } + + return ret; +} + + +static int get_target(const char *symname, struct nameidata *nd, + struct config_item **target) +{ + int ret; + + ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd); + if (!ret) { + if (nd->dentry->d_sb == configfs_sb) { + *target = configfs_get_config_item(nd->dentry); + if (!*target) { + ret = -ENOENT; + path_release(nd); + } + } else + ret = -EPERM; + } + + return ret; +} + + +int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + int ret; + struct nameidata nd; + struct config_item *parent_item; + struct config_item *target_item; + struct config_item_type *type; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (dentry->d_parent == configfs_sb->s_root) + goto out; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + if (!type || !type->ct_item_ops || + !type->ct_item_ops->allow_link) + goto out_put; + + ret = get_target(symname, &nd, &target_item); + if (ret) + goto out_put; + + ret = type->ct_item_ops->allow_link(parent_item, target_item); + if (!ret) + ret = create_link(parent_item, target_item, dentry); + + config_item_put(target_item); + path_release(&nd); + +out_put: + config_item_put(parent_item); + +out: + return ret; +} + +int configfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_symlink *sl; + struct config_item *parent_item; + struct config_item_type *type; + int ret; + + ret = -EPERM; /* What lack-of-symlink returns */ + if (!(sd->s_type & CONFIGFS_ITEM_LINK)) + goto out; + + if (dentry->d_parent == configfs_sb->s_root) + BUG(); + + sl = sd->s_element; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + + list_del_init(&sd->s_sibling); + configfs_drop_dentry(sd, dentry->d_parent); + dput(dentry); + configfs_put(sd); + + /* + * drop_link() must be called before + * list_del_init(&sl->sl_list), so that the order of + * drop_link(this, target) and drop_item(target) 
is preserved. + */ + if (type && type->ct_item_ops && + type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + sl->sl_target); + + /* FIXME: Needs lock */ + list_del_init(&sl->sl_list); + + /* Put reference from create_link() */ + config_item_put(sl->sl_target); + kfree(sl); + + config_item_put(parent_item); + + ret = 0; + +out: + return ret; +} + +static int configfs_get_target_path(struct config_item * item, struct config_item * target, + char *path) +{ + char * s; + int depth, size; + + depth = item_depth(item); + size = item_path_length(target) + depth * 3 - 1; + if (size > PATH_MAX) + return -ENAMETOOLONG; + + pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size); + + for (s = path; depth--; s += 3) + strcpy(s,"../"); + + fill_item_path(target, path, size); + pr_debug("%s: path = '%s'\n", __FUNCTION__, path); + + return 0; +} + +static int configfs_getlink(struct dentry *dentry, char * path) +{ + struct config_item *item, *target_item; + int error = 0; + + item = configfs_get_config_item(dentry->d_parent); + if (!item) + return -EINVAL; + + target_item = configfs_get_config_item(dentry); + if (!target_item) { + config_item_put(item); + return -EINVAL; + } + + down_read(&configfs_rename_sem); + error = configfs_get_target_path(item, target_item, path); + up_read(&configfs_rename_sem); + + config_item_put(item); + config_item_put(target_item); + return error; + +} + +static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + + if (page) { + error = configfs_getlink(dentry, (char *)page); + if (!error) { + nd_set_link(nd, (char *)page); + return (void *)page; + } + } + + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + if (cookie) { + unsigned long page = (unsigned long)cookie; + free_page(page); + } +} + +struct inode_operations configfs_symlink_inode_operations = { + .follow_link = configfs_follow_link, + .readlink = generic_readlink, + .put_link = configfs_put_link, +}; + diff --git a/include/linux/configfs.h b/include/linux/configfs.h new file mode 100644 index 000000000000..acffb8c9073a --- /dev/null +++ b/include/linux/configfs.h @@ -0,0 +1,205 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs.h - definitions for the device driver filesystem + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * Based on kobject.h: + * Copyright (c) 2002-2003 Patrick Mochel + * Copyright (c) 2002-2003 Open Source Development Labs + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. 
+ *
+ * Please read Documentation/filesystems/configfs.txt before using the
+ * configfs interface, ESPECIALLY the parts about reference counts and
+ * item destructors.
+ */
+
+#ifndef _CONFIGFS_H_
+#define _CONFIGFS_H_
+
+#ifdef __KERNEL__
+
+#include
+#include
+#include
+
+#include
+#include
+
+#define CONFIGFS_ITEM_NAME_LEN	20
+
+struct module;
+
+struct configfs_item_operations;
+struct configfs_group_operations;
+struct configfs_attribute;
+struct configfs_subsystem;
+
+struct config_item {
+	char			*ci_name;
+	char			ci_namebuf[CONFIGFS_ITEM_NAME_LEN];
+	struct kref		ci_kref;
+	struct list_head	ci_entry;
+	struct config_item	*ci_parent;
+	struct config_group	*ci_group;
+	struct config_item_type	*ci_type;
+	struct dentry		*ci_dentry;
+};
+
+extern int config_item_set_name(struct config_item *, const char *, ...);
+
+static inline char *config_item_name(struct config_item * item)
+{
+	return item->ci_name;
+}
+
+extern void config_item_init(struct config_item *);
+extern void config_item_init_type_name(struct config_item *item,
+				       const char *name,
+				       struct config_item_type *type);
+extern void config_item_cleanup(struct config_item *);
+
+extern struct config_item * config_item_get(struct config_item *);
+extern void config_item_put(struct config_item *);
+
+struct config_item_type {
+	struct module				*ct_owner;
+	struct configfs_item_operations		*ct_item_ops;
+	struct configfs_group_operations	*ct_group_ops;
+	struct configfs_attribute		**ct_attrs;
+};
+
+
+/**
+ *	group - a group of config_items of a specific type, belonging
+ *	to a specific subsystem.
+ */
+
+struct config_group {
+	struct config_item		cg_item;
+	struct list_head		cg_children;
+	struct configfs_subsystem	*cg_subsys;
+	struct config_group		**default_groups;
+};
+
+
+extern void config_group_init(struct config_group *group);
+extern void config_group_init_type_name(struct config_group *group,
+					const char *name,
+					struct config_item_type *type);
+
+
+static inline struct config_group *to_config_group(struct config_item *item)
+{
+	return item ? container_of(item,struct config_group,cg_item) : NULL;
+}
+
+static inline struct config_group *config_group_get(struct config_group *group)
+{
+	return group ? to_config_group(config_item_get(&group->cg_item)) : NULL;
+}
+
+static inline void config_group_put(struct config_group *group)
+{
+	config_item_put(&group->cg_item);
+}
+
+extern struct config_item *config_group_find_obj(struct config_group *, const char *);
+
+
+struct configfs_attribute {
+	char			*ca_name;
+	struct module		*ca_owner;
+	mode_t			ca_mode;
+};
+
+
+/*
+ * If allow_link() exists, the item can symlink(2) out to other
+ * items.  If the item is a group, it may support mkdir(2).
+ * Groups supply one of make_group() and make_item().  If the
+ * group supports make_group(), one can create group children.  If it
+ * supports make_item(), one can create config_item children.  If it has
+ * default_groups on group->default_groups, it has automatically created
+ * group children.  default_groups may coexist alongside make_group() or
+ * make_item(), but if the group wishes to have only default_groups
+ * children (disallowing mkdir(2)), it need not provide either function.
+ * If the group has commit(), it supports pending and committed (active)
+ * items.
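+ *
+ * For illustration only (an editor's sketch, not part of this patch;
+ * the widget names are hypothetical): a group that wants mkdir(2) to
+ * create plain config_item children might provide something like:
+ *
+ *	static struct config_item *widget_make_item(struct config_group *group,
+ *						    const char *name)
+ *	{
+ *		struct widget *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ *		if (!w)
+ *			return NULL;
+ *		memset(w, 0, sizeof(*w));
+ *		config_item_init_type_name(&w->w_item, name, &widget_type);
+ *		return &w->w_item;
+ *	}
+ *
+ *	static struct configfs_group_operations widget_group_ops = {
+ *		.make_item	= widget_make_item,
+ *	};
+ *
+ * where struct widget embeds a struct config_item (w_item) and
+ * widget_type is a config_item_type whose ct_item_ops release()
+ * frees the widget once its reference count drops to zero.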
+ */ +struct configfs_item_operations { + void (*release)(struct config_item *); + ssize_t (*show_attribute)(struct config_item *, struct configfs_attribute *,char *); + ssize_t (*store_attribute)(struct config_item *,struct configfs_attribute *,const char *, size_t); + int (*allow_link)(struct config_item *src, struct config_item *target); + int (*drop_link)(struct config_item *src, struct config_item *target); +}; + +struct configfs_group_operations { + struct config_item *(*make_item)(struct config_group *group, const char *name); + struct config_group *(*make_group)(struct config_group *group, const char *name); + int (*commit_item)(struct config_item *item); + void (*drop_item)(struct config_group *group, struct config_item *item); +}; + + + +/** + * Use these macros to make defining attributes easier. See include/linux/device.h + * for examples.. + */ + +#if 0 +#define __ATTR(_name,_mode,_show,_store) { \ + .attr = {.ca_name = __stringify(_name), .ca_mode = _mode, .ca_owner = THIS_MODULE }, \ + .show = _show, \ + .store = _store, \ +} + +#define __ATTR_RO(_name) { \ + .attr = { .ca_name = __stringify(_name), .ca_mode = 0444, .ca_owner = THIS_MODULE }, \ + .show = _name##_show, \ +} + +#define __ATTR_NULL { .attr = { .name = NULL } } + +#define attr_name(_attr) (_attr).attr.name +#endif + + +struct configfs_subsystem { + struct config_group su_group; + struct semaphore su_sem; +}; + +static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group) +{ + return group ? + container_of(group, struct configfs_subsystem, su_group) : + NULL; +} + +int configfs_register_subsystem(struct configfs_subsystem *subsys); +void configfs_unregister_subsystem(struct configfs_subsystem *subsys); + +#endif /* __KERNEL__ */ + +#endif /* _CONFIGFS_H_ */ -- cgit v1.2.3 From 994fc28c7b1e697ac56befe4aecabf23f0689f46 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Dec 2005 14:28:17 -0800 Subject: [PATCH] add AOP_TRUNCATED_PAGE, prepend AOP_ to WRITEPAGE_ACTIVATE readpage(), prepare_write(), and commit_write() callers are updated to understand the special return code AOP_TRUNCATED_PAGE in the style of writepage() and WRITEPAGE_ACTIVATE. AOP_TRUNCATED_PAGE tells the caller that the callee has unlocked the page and that the operation should be tried again with a new page. OCFS2 uses this to detect and work around a lock inversion in its aop methods. There should be no change in behaviour for methods that don't return AOP_TRUNCATED_PAGE. WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are made enums so that kerneldoc can be used to document their semantics. 
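[Editor's illustration, not part of the original message: the retry pattern the converted callers follow, modeled on the loop.c hunks below. mapping, aops, file, index, offset and bytes stand for whatever the caller already has in scope.]

	for (;;) {
		struct page *page = grab_cache_page(mapping, index);
		if (!page)
			return -ENOMEM;

		ret = aops->prepare_write(file, page, offset, offset + bytes);
		if (ret == AOP_TRUNCATED_PAGE) {
			/* the aop unlocked the page (it may have been
			 * truncated); drop our reference and retry */
			page_cache_release(page);
			continue;
		}
		if (ret) {
			unlock_page(page);
			page_cache_release(page);
			return ret;
		}

		/* ... fill the page between offset and offset + bytes ... */

		ret = aops->commit_write(file, page, offset, offset + bytes);
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			continue;
		}
		unlock_page(page);
		page_cache_release(page);
		return ret;
	}
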
Signed-off-by: Zach Brown --- drivers/block/loop.c | 23 +++++++++++---- drivers/block/rd.c | 4 +-- fs/mpage.c | 2 +- include/linux/fs.h | 31 ++++++++++++++++++++ include/linux/writeback.h | 6 ---- mm/filemap.c | 73 ++++++++++++++++++++++++++++++++--------------- mm/readahead.c | 15 ++++++---- mm/shmem.c | 2 +- mm/vmscan.c | 2 +- 9 files changed, 113 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 96c664af8d06..a452b13620a2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, struct address_space_operations *aops = mapping->a_ops; pgoff_t index; unsigned offset, bv_offs; - int len, ret = 0; + int len, ret; down(&mapping->host->i_sem); index = pos >> PAGE_CACHE_SHIFT; @@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, page = grab_cache_page(mapping, index); if (unlikely(!page)) goto fail; - if (unlikely(aops->prepare_write(file, page, offset, - offset + size))) + ret = aops->prepare_write(file, page, offset, + offset + size); + if (unlikely(ret)) { + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } goto unlock; + } transfer_result = lo_do_transfer(lo, WRITE, page, offset, bvec->bv_page, bv_offs, size, IV); if (unlikely(transfer_result)) { @@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, kunmap_atomic(kaddr, KM_USER0); } flush_dcache_page(page); - if (unlikely(aops->commit_write(file, page, offset, - offset + size))) + ret = aops->commit_write(file, page, offset, + offset + size); + if (unlikely(ret)) { + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } goto unlock; + } if (unlikely(transfer_result)) goto unlock; bv_offs += size; @@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, unlock_page(page); page_cache_release(page); } + ret = 0; out: up(&mapping->host->i_sem); return ret; diff --git a/drivers/block/rd.c b/drivers/block/rd.c index 68c60a5bcdab..ffd6abd6d5a0 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page, /* * ->writepage to the the blockdev's mapping has to redirty the page so that the - * VM doesn't go and steal it. We return WRITEPAGE_ACTIVATE so that the VM + * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM * won't try to (pointlessly) write the page again for a while. * * Really, these pages should not be on the LRU at all. 
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc) make_page_uptodate(page); SetPageDirty(page); if (wbc->for_reclaim) - return WRITEPAGE_ACTIVATE; + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index c5adcdddf3cc..f1d2d02bd4c8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -721,7 +721,7 @@ retry: &last_block_in_bio, &ret, wbc, page->mapping->a_ops->writepage); } - if (unlikely(ret == WRITEPAGE_ACTIVATE)) + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) unlock_page(page); if (ret || (--(wbc->nr_to_write) <= 0)) done = 1; diff --git a/include/linux/fs.h b/include/linux/fs.h index cc35b6ac778d..ed9a41a71e8b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -302,6 +302,37 @@ struct iattr { */ #include +/** + * enum positive_aop_returns - aop return codes with specific semantics + * + * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has + * completed, that the page is still locked, and + * should be considered active. The VM uses this hint + * to return the page to the active list -- it won't + * be a candidate for writeback again in the near + * future. Other callers must be careful to unlock + * the page if they get this return. Returned by + * writepage(); + * + * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has + * unlocked it and the page might have been truncated. + * The caller should back up to acquiring a new page and + * trying again. The aop will be taking reasonable + * precautions not to livelock. If the caller held a page + * reference, it should drop it before retrying. Returned + * by readpage(), prepare_write(), and commit_write(). + * + * address_space_operation functions return these large constants to indicate + * special semantics to the caller. These are much larger than the bytes in a + * page to allow for functions that return the number of bytes operated on in a + * given page. + */ + +enum positive_aop_returns { + AOP_WRITEPAGE_ACTIVATE = 0x80000, + AOP_TRUNCATED_PAGE = 0x80001, +}; + /* * oh the beauties of C type declarations. */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 343d883d69c5..64a36ba43b2f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -59,12 +59,6 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ }; -/* - * ->writepage() return values (make these much larger than a pagesize, in - * case some fs is returning number-of-bytes-written from writepage) - */ -#define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */ - /* * fs/fs-writeback.c */ diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde158..6e1d08a2b8b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -831,8 +831,13 @@ readpage: /* Start the actual read. The read will unlock the page. 
*/ error = mapping->a_ops->readpage(filp, page); - if (unlikely(error)) + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } goto readpage_error; + } if (!PageUptodate(page)) { lock_page(page); @@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; struct page *page; - int error; + int ret; - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ - error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (!error) { - error = mapping->a_ops->readpage(file, page); page_cache_release(page); - return error; - } - /* - * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first - * or we are out of memory for radix-tree nodes. - */ - page_cache_release(page); - return error == -EEXIST ? 0 : error; + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; } #define MMAP_LOTSAMISS (100) @@ -1331,10 +1334,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1358,10 +1365,14 @@ page_not_uptodate: goto success; } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1444,10 +1455,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1470,10 +1485,14 @@ page_not_uptodate: } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + continue; /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. 
*/ - unlock_page(page); - page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; @@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } if (likely(copied > 0)) { if (!status) status = copied; diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c7..8d6eeaaa6296 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, { unsigned page_idx; struct pagevec lru_pvec; - int ret = 0; + int ret; if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); @@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { - mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); + ret = mapping->a_ops->readpage(filp, page); + if (ret != AOP_TRUNCATED_PAGE) { + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + continue; + } /* else fall through to release */ } + page_cache_release(page); } pagevec_lru_add(&lru_pvec); + ret = 0; out: return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e9..d9fc277940da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -855,7 +855,7 @@ unlock: swap_free(swap); redirty: set_page_dirty(page); - return WRITEPAGE_ACTIVATE; /* Return with the page locked */ + return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ } #ifdef CONFIG_NUMA diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de6..795a050fe471 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) res = mapping->a_ops->writepage(page, &wbc); if (res < 0) handle_write_error(mapping, page, res); - if (res == WRITEPAGE_ACTIVATE) { + if (res == AOP_WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); return PAGE_ACTIVATE; } -- cgit v1.2.3 From 52fd3d6fea441835fe3a35b7280e5e128bdeca9b Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Dec 2005 14:31:23 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem Very simple printk wrapper which adds the ability to enable various sets of debug messages at run-time. Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- fs/ocfs2/cluster/masklog.c | 166 +++++++++++++++++++++++++++ fs/ocfs2/cluster/masklog.h | 275 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 441 insertions(+) create mode 100644 fs/ocfs2/cluster/masklog.c create mode 100644 fs/ocfs2/cluster/masklog.h (limited to 'fs') diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c new file mode 100644 index 000000000000..fd741cea5705 --- /dev/null +++ b/fs/ocfs2/cluster/masklog.c @@ -0,0 +1,166 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include "masklog.h" + +struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); +EXPORT_SYMBOL_GPL(mlog_and_bits); +struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK); +EXPORT_SYMBOL_GPL(mlog_not_bits); + +static ssize_t mlog_mask_show(u64 mask, char *buf) +{ + char *state; + + if (__mlog_test_u64(mask, mlog_and_bits)) + state = "allow"; + else if (__mlog_test_u64(mask, mlog_not_bits)) + state = "deny"; + else + state = "off"; + + return snprintf(buf, PAGE_SIZE, "%s\n", state); +} + +static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) +{ + if (!strnicmp(buf, "allow", 5)) { + __mlog_set_u64(mask, mlog_and_bits); + __mlog_clear_u64(mask, mlog_not_bits); + } else if (!strnicmp(buf, "deny", 4)) { + __mlog_set_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else if (!strnicmp(buf, "off", 3)) { + __mlog_clear_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else + return -EINVAL; + + return count; +} + +struct mlog_attribute { + struct attribute attr; + u64 mask; +}; + +#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) + +#define define_mask(_name) { \ + .attr = { \ + .name = #_name, \ + .mode = S_IRUGO | S_IWUSR, \ + }, \ + .mask = ML_##_name, \ +} + +static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { + define_mask(ENTRY), + define_mask(EXIT), + define_mask(TCP), + define_mask(MSG), + define_mask(SOCKET), + define_mask(HEARTBEAT), + define_mask(HB_BIO), + define_mask(DLMFS), + define_mask(DLM), + define_mask(DLM_DOMAIN), + define_mask(DLM_THREAD), + define_mask(DLM_MASTER), + define_mask(DLM_RECOVERY), + define_mask(AIO), + define_mask(JOURNAL), + define_mask(DISK_ALLOC), + define_mask(SUPER), + define_mask(FILE_IO), + define_mask(EXTENT_MAP), + define_mask(DLM_GLUE), + define_mask(BH_IO), + define_mask(UPTODATE), + define_mask(NAMEI), + define_mask(INODE), + define_mask(VOTE), + define_mask(DCACHE), + define_mask(CONN), + define_mask(QUORUM), + define_mask(EXPORT), + define_mask(ERROR), + define_mask(NOTICE), + define_mask(KTHREAD), +}; + +static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; + +static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_show(mlog_attr->mask, buf); +} + +static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, + const char *buf, size_t count) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_store(mlog_attr->mask, buf, count); +} + +static struct sysfs_ops mlog_attr_ops = { + .show = mlog_show, + .store = mlog_store, +}; + +static struct kobj_type mlog_ktype = { + .default_attrs = mlog_attr_ptrs, + .sysfs_ops = &mlog_attr_ops, +}; + +static struct kset mlog_kset = { + .kobj = {.name = "logmask", .ktype = &mlog_ktype}, +}; + +int mlog_sys_init(struct subsystem *o2cb_subsys) +{ + int i = 0; + + while (mlog_attrs[i].attr.mode) { + 
mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+		i++;
+	}
+	mlog_attr_ptrs[i] = NULL;
+
+	mlog_kset.subsys = o2cb_subsys;
+	return kset_register(&mlog_kset);
+}
+
+void mlog_sys_shutdown(void)
+{
+	kset_unregister(&mlog_kset);
+}
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
new file mode 100644
index 000000000000..f5ef5ea61a05
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.h
@@ -0,0 +1,275 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_MASKLOG_H
+#define O2CLUSTER_MASKLOG_H
+
+/*
+ * For now this is a trivial wrapper around printk() that gives the critical
+ * ability to enable sets of debugging output at run-time.  In the future this
+ * will almost certainly be redirected to relayfs so that it can pay a
+ * substantially lower heisenberg tax.
+ *
+ * Callers associate the message with a bitmask and a global bitmask is
+ * maintained with help from /proc.  If any of the bits match the message is
+ * output.
+ *
+ * We must have efficient bit tests on i386 and it seems gcc still emits crazy
+ * code for the 64bit compare.  It emits very good code for the dual unsigned
+ * long tests, though, completely avoiding tests that can never pass if the
+ * caller gives a constant bitmask that fills one of the longs with all 0s.  So
+ * the desire is to have almost all of the calls decided on by comparing just
+ * one of the longs.  This leads to having infrequently given bits that are
+ * frequently matched in the high bits.
+ *
+ * _ERROR and _NOTICE are used for messages that always go to the console and
+ * have appropriate KERN_ prefixes.  We wrap these in our function instead of
+ * just calling printk() so that this can eventually make its way through
+ * relayfs along with the debugging messages.  Everything else gets KERN_DEBUG.
+ * The inline tests and macro dance give GCC the opportunity to quite cleverly
+ * only emit the appropriate printk() when the caller passes in a constant
+ * mask, as is almost always the case.
+ *
+ * All this bitmask nonsense is hidden from the /proc interface so that Joel
+ * doesn't have an aneurism.  Reading the file gives a straightforward
+ * indication of which bits are on or off:
+ *	ENTRY off
+ *	EXIT off
+ *	TCP off
+ *	MSG off
+ *	SOCKET off
+ *	ERROR off
+ *	NOTICE on
+ *
+ * Writing changes the state of a given bit and requires a strictly formatted
+ * single write() call:
+ *
+ *	write(fd, "ENTRY on", 8);
+ *
+ * would turn the entry bit on.  "1" is also accepted in the place of "on", and
+ * "off" and "0" behave as expected.
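+ *
+ * (Editor's illustration, not part of the original patch.)  From code,
+ * each message names its mask bit and only reaches printk() while that
+ * bit is enabled, e.g.:
+ *
+ *	mlog(ML_TCP, "accepted connection from node %u\n", node_num);
+ *
+ * where node_num is just whatever the caller has in scope.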
+ *
+ * Some trivial shell can flip all the bits on or off:
+ *
+ * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
+ * cat $log_mask | (
+ *	while read bit status; do
+ *		# $1 is "on" or "off", say
+ *		echo "$bit $1" > $log_mask
+ *	done
+ * )
+ */
+
+/* for task_struct */
+#include
+
+/* bits that are frequently given and infrequently matched in the low word */
+/* NOTE: If you add a flag, you need to also update mlog.c! */
+#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
+#define ML_EXIT		0x0000000000000002ULL /* func call exit */
+#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000008ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm recovery functions */
+#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
+#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
+#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
+#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
+#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
+#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
+#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
+#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
+#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
+#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
+#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
+#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
+#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
+#define ML_CONN		0x0000000004000000ULL /* net connection management */
+#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
+#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+/* bits that are infrequently given and frequently matched in the high word */
+#define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
+#define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
+
+#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
+#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
+#ifndef MLOG_MASK_PREFIX
+#define MLOG_MASK_PREFIX 0
+#endif
+
+#define MLOG_MAX_BITS 64
+
+struct mlog_bits {
+	unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
+};
+
+extern struct mlog_bits mlog_and_bits, mlog_not_bits;
+
+#if BITS_PER_LONG == 32
+
+#define __mlog_test_u64(mask, bits)			\
+	( (u32)(mask & 0xffffffff) & bits.words[0] ||	\
+	  ((u64)(mask) >> 32) & bits.words[1] )
+#define __mlog_set_u64(mask, bits) do {			\
+	bits.words[0] |= (u32)(mask & 0xffffffff);	\
+	bits.words[1] |= (u64)(mask) >> 32;		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {		\
+	bits.words[0] &= ~((u32)(mask & 0xffffffff));	\
+	bits.words[1] &= ~((u64)(mask) >> 32);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) {				\
+	{						\
+		[0] = (u32)(mask &
0xffffffff), \ + [1] = (u64)(mask) >> 32, \ + } \ +} + +#else /* 32bit long above, 64bit long below */ + +#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) +#define __mlog_set_u64(mask, bits) do { \ + bits.words[0] |= (mask); \ +} while (0) +#define __mlog_clear_u64(mask, bits) do { \ + bits.words[0] &= ~(mask); \ +} while (0) +#define MLOG_BITS_RHS(mask) { { (mask) } } + +#endif + +/* + * smp_processor_id() "helpfully" screams when called outside preemptible + * regions in current kernels. sles doesn't have the variants that don't + * scream. just do this instead of trying to guess which we're building + * against.. *sigh*. + */ +#define __mlog_cpu_guess ({ \ + unsigned long _cpu = get_cpu(); \ + put_cpu(); \ + _cpu; \ +}) + +/* In the following two macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define __mlog_printk(level, fmt, args...) \ + printk(level "(%u,%lu):%s:%d " fmt, current->pid, \ + __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ + ##args) + +#define mlog(mask, fmt, args...) do { \ + u64 __m = MLOG_MASK_PREFIX | (mask); \ + if (__mlog_test_u64(__m, mlog_and_bits) && \ + !__mlog_test_u64(__m, mlog_not_bits)) { \ + if (__m & ML_ERROR) \ + __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ + else if (__m & ML_NOTICE) \ + __mlog_printk(KERN_NOTICE, fmt , ##args); \ + else __mlog_printk(KERN_INFO, fmt , ##args); \ + } \ +} while (0) + +#define mlog_errno(st) do { \ + int _st = (st); \ + if (_st != -ERESTARTSYS && _st != -EINTR && \ + _st != AOP_TRUNCATED_PAGE) \ + mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ +} while (0) + +#define mlog_entry(fmt, args...) do { \ + mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \ +} while (0) + +#define mlog_entry_void() do { \ + mlog(ML_ENTRY, "ENTRY:\n"); \ +} while (0) + +/* We disable this for old compilers since they don't have support for + * __builtin_types_compatible_p. + */ +#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \ + !defined(__CHECKER__) +#define mlog_exit(st) do { \ + if (__builtin_types_compatible_p(typeof(st), unsigned long)) \ + mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), signed long)) \ + mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), unsigned int) \ + || __builtin_types_compatible_p(typeof(st), unsigned short) \ + || __builtin_types_compatible_p(typeof(st), unsigned char)) \ + mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), signed int) \ + || __builtin_types_compatible_p(typeof(st), signed short) \ + || __builtin_types_compatible_p(typeof(st), signed char)) \ + mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), long long)) \ + mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ + else \ + mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \ +} while (0) +#else +#define mlog_exit(st) do { \ + mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ +} while (0) +#endif + +#define mlog_exit_ptr(ptr) do { \ + mlog(ML_EXIT, "EXIT: %p\n", ptr); \ +} while (0) + +#define mlog_exit_void() do { \ + mlog(ML_EXIT, "EXIT\n"); \ +} while (0) + +#define mlog_bug_on_msg(cond, fmt, args...) 
do { \ + if (cond) { \ + mlog(ML_ERROR, "bug expression: " #cond "\n"); \ + mlog(ML_ERROR, fmt, ##args); \ + BUG(); \ + } \ +} while (0) + +#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) +#define MLFi64 "lld" +#define MLFu64 "llu" +#define MLFx64 "llx" +#else +#define MLFi64 "ld" +#define MLFu64 "lu" +#define MLFx64 "lx" +#endif + +#include +#include +int mlog_sys_init(struct subsystem *o2cb_subsys); +void mlog_sys_shutdown(void); + +#endif /* O2CLUSTER_MASKLOG_H */ -- cgit v1.2.3 From 0c83ed8eeb28a045cdbd0b216679938aa9e665fe Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Thu, 15 Dec 2005 14:31:23 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem A simple node information service, filled and updated from userspace. The rest of the stack queries this service for simple node information. Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- fs/ocfs2/cluster/Makefile | 4 + fs/ocfs2/cluster/endian.h | 30 ++ fs/ocfs2/cluster/nodemanager.c | 791 +++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/nodemanager.h | 64 +++ fs/ocfs2/cluster/ocfs2_nodemanager.h | 39 ++ fs/ocfs2/cluster/ver.c | 42 ++ fs/ocfs2/cluster/ver.h | 31 ++ 7 files changed, 1001 insertions(+) create mode 100644 fs/ocfs2/cluster/Makefile create mode 100644 fs/ocfs2/cluster/endian.h create mode 100644 fs/ocfs2/cluster/nodemanager.c create mode 100644 fs/ocfs2/cluster/nodemanager.h create mode 100644 fs/ocfs2/cluster/ocfs2_nodemanager.h create mode 100644 fs/ocfs2/cluster/ver.c create mode 100644 fs/ocfs2/cluster/ver.h (limited to 'fs') diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile new file mode 100644 index 000000000000..cdd162f13650 --- /dev/null +++ b/fs/ocfs2/cluster/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o + +ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ + quorum.o tcp.o ver.o diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h new file mode 100644 index 000000000000..2df9082f4e35 --- /dev/null +++ b/fs/ocfs2/cluster/endian.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_CLUSTER_ENDIAN_H +#define OCFS2_CLUSTER_ENDIAN_H + +static inline void be32_add_cpu(__be32 *var, u32 val) +{ + *var = cpu_to_be32(be32_to_cpu(*var) + val); +} + +#endif /* OCFS2_CLUSTER_ENDIAN_H */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c new file mode 100644 index 000000000000..5fd60c105913 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.c @@ -0,0 +1,791 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include "endian.h" +#include "tcp.h" +#include "nodemanager.h" +#include "heartbeat.h" +#include "masklog.h" +#include "sys.h" +#include "ver.h" + +/* for now we operate under the assertion that there can be only one + * cluster active at a time. Changing this will require trickling + * cluster references throughout where nodes are looked up */ +static struct o2nm_cluster *o2nm_single_cluster = NULL; + +#define OCFS2_MAX_HB_CTL_PATH 256 +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; + +static ctl_table ocfs2_nm_table[] = { + { + .ctl_name = 1, + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { .ctl_name = 0 } +}; + +static ctl_table ocfs2_mod_table[] = { + { + .ctl_name = KERN_OCFS2_NM, + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_kern_table[] = { + { + .ctl_name = KERN_OCFS2, + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_root_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ocfs2_table_header = NULL; + +const char *o2nm_get_hb_ctl_path(void) +{ + return ocfs2_hb_ctl_path; +} +EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); + +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + /* this bitmap is part of a hack for disk bitmap.. will go eventually. 
- zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +struct o2nm_node *o2nm_get_node_by_num(u8 node_num) +{ + struct o2nm_node *node = NULL; + + if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL) + goto out; + + read_lock(&o2nm_single_cluster->cl_nodes_lock); + node = o2nm_single_cluster->cl_nodes[node_num]; + if (node) + config_item_get(&node->nd_item); + read_unlock(&o2nm_single_cluster->cl_nodes_lock); +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_num); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes) +{ + struct o2nm_cluster *cluster = o2nm_single_cluster; + + BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); + + if (cluster == NULL) + return -EINVAL; + + read_lock(&cluster->cl_nodes_lock); + memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + read_unlock(&cluster->cl_nodes_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(o2nm_configured_node_map); + +static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster, + __be32 ip_needle, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; + struct rb_node *parent = NULL; + struct o2nm_node *node, *ret = NULL; + + while (*p) { + parent = *p; + node = rb_entry(parent, struct o2nm_node, nd_ip_node); + + if (memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)) < 0) + p = &(*p)->rb_left; + else if (memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)) > 0) + p = &(*p)->rb_right; + else { + ret = node; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr) +{ + struct o2nm_node *node = NULL; + struct o2nm_cluster *cluster = o2nm_single_cluster; + + if (cluster == NULL) + goto out; + + read_lock(&cluster->cl_nodes_lock); + node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); + if (node) + config_item_get(&node->nd_item); + read_unlock(&cluster->cl_nodes_lock); + +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip); + +void o2nm_node_put(struct o2nm_node *node) +{ + config_item_put(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_put); + +void o2nm_node_get(struct o2nm_node *node) +{ + config_item_get(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_get); + +u8 o2nm_this_node(void) +{ + u8 node_num = O2NM_MAX_NODES; + + if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local) + node_num = o2nm_single_cluster->cl_local_node; + + return node_num; +} +EXPORT_SYMBOL_GPL(o2nm_this_node); + +/* node configfs bits */ + +static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item) +{ + return item ? + container_of(to_config_group(item), struct o2nm_cluster, + cl_group) + : NULL; +} + +static struct o2nm_node *to_o2nm_node(struct config_item *item) +{ + return item ? 
container_of(item, struct o2nm_node, nd_item) : NULL; +} + +static void o2nm_node_release(struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + kfree(node); +} + +static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_num); +} + +static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) +{ + /* through the first node_set .parent + * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); +} + +enum { + O2NM_NODE_ATTR_NUM = 0, + O2NM_NODE_ATTR_PORT, + O2NM_NODE_ATTR_ADDRESS, + O2NM_NODE_ATTR_LOCAL, +}; + +static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp >= O2NM_MAX_NODES) + return -ERANGE; + + /* once we're in the cl_nodes tree networking can look us up by + * node number and try to use our address and port attributes + * to connect to this node.. make sure that they've been set + * before writing the node attribute? */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + write_lock(&cluster->cl_nodes_lock); + if (cluster->cl_nodes[tmp]) + p = NULL; + else { + cluster->cl_nodes[tmp] = node; + node->nd_num = tmp; + set_bit(tmp, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + if (p == NULL) + return -EEXIST; + + return count; +} +static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); +} + +static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, + const char *page, size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u16)-1) + return -ERANGE; + + node->nd_ipv4_port = htons(tmp); + + return count; +} + +static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address)); +} + +static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, + const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + int ret, i; + struct rb_node **p, *parent; + unsigned int octets[4]; + __be32 ipv4_addr = 0; + + ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], + &octets[1], &octets[0]); + if (ret != 4) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(octets); i++) { + if (octets[i] > 255) + return -ERANGE; + be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); + } + + ret = 0; + write_lock(&cluster->cl_nodes_lock); + if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&node->nd_ip_node, parent, p); + rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); + } + write_unlock(&cluster->cl_nodes_lock); + if (ret) + return ret; + + memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); + + return count; +} + +static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_local); +} + +static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char 
*page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + ssize_t ret; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + tmp = !!tmp; /* boolean of whether this node wants to be local */ + + /* setting local turns on networking rx for now so we require having + * set everything else first */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + /* the only failure case is trying to set a new local node + * when a different one is already set */ + if (tmp && tmp == cluster->cl_has_local && + cluster->cl_local_node != node->nd_num) + return -EBUSY; + + /* bring up the rx thread if we're setting the new local node. */ + if (tmp && !cluster->cl_has_local) { + ret = o2net_start_listening(node); + if (ret) + return ret; + } + + if (!tmp && cluster->cl_has_local && + cluster->cl_local_node == node->nd_num) { + o2net_stop_listening(node); + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + } + + node->nd_local = tmp; + if (node->nd_local) { + cluster->cl_has_local = tmp; + cluster->cl_local_node = node->nd_num; + } + + return count; +} + +struct o2nm_node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_node *, char *); + ssize_t (*store)(struct o2nm_node *, const char *, size_t); +}; + +static struct o2nm_node_attribute o2nm_node_attr_num = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "num", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_num_read, + .store = o2nm_node_num_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_port", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_port_read, + .store = o2nm_node_ipv4_port_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_address", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_address_read, + .store = o2nm_node_ipv4_address_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_local_read, + .store = o2nm_node_local_write, +}; + +static struct configfs_attribute *o2nm_node_attrs[] = { + [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, + [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, + [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, + [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + NULL, +}; + +static int o2nm_attr_index(struct configfs_attribute *attr) +{ + int i; + for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { + if (attr == o2nm_node_attrs[i]) + return i; + } + BUG(); + return 0; +} + +static ssize_t o2nm_node_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret = 0; + + if (o2nm_node_attr->show) + ret = o2nm_node_attr->show(node, page); + return ret; +} + +static ssize_t o2nm_node_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + 
container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret; + int attr_index = o2nm_attr_index(attr); + + if (o2nm_node_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + if (test_bit(attr_index, &node->nd_set_attributes)) + return -EBUSY; + + ret = o2nm_node_attr->store(node, page, count); + if (ret < count) + goto out; + + set_bit(attr_index, &node->nd_set_attributes); +out: + return ret; +} + +static struct configfs_item_operations o2nm_node_item_ops = { + .release = o2nm_node_release, + .show_attribute = o2nm_node_show, + .store_attribute = o2nm_node_store, +}; + +static struct config_item_type o2nm_node_type = { + .ct_item_ops = &o2nm_node_item_ops, + .ct_attrs = o2nm_node_attrs, + .ct_owner = THIS_MODULE, +}; + +/* node set */ + +struct o2nm_node_group { + struct config_group ns_group; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) +{ + return group ? + container_of(group, struct o2nm_node_group, ns_group) + : NULL; +} +#endif + +static struct config_item *o2nm_node_group_make_item(struct config_group *group, + const char *name) +{ + struct o2nm_node *node = NULL; + struct config_item *ret = NULL; + + if (strlen(name) > O2NM_MAX_NAME_LEN) + goto out; /* ENAMETOOLONG */ + + node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL); + if (node == NULL) + goto out; /* ENOMEM */ + + strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ + config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); + spin_lock_init(&node->nd_lock); + + ret = &node->nd_item; + +out: + if (ret == NULL) + kfree(node); + + return ret; +} + +static void o2nm_node_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); + + o2net_disconnect_node(node); + + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } + + /* XXX call into net to stop this node from trading messages */ + + write_lock(&cluster->cl_nodes_lock); + + /* XXX sloppy */ + if (node->nd_ipv4_address) + rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); + + /* nd_num might be 0 if the node number hasn't been set.. */ + if (cluster->cl_nodes[node->nd_num] == node) { + cluster->cl_nodes[node->nd_num] = NULL; + clear_bit(node->nd_num, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_node_group_group_ops = { + .make_item = o2nm_node_group_make_item, + .drop_item = o2nm_node_group_drop_item, +}; + +static struct config_item_type o2nm_node_group_type = { + .ct_group_ops = &o2nm_node_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster */ + +static void o2nm_cluster_release(struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + + kfree(cluster->cl_group.default_groups); + kfree(cluster); +} + +static struct configfs_item_operations o2nm_cluster_item_ops = { + .release = o2nm_cluster_release, +}; + +static struct config_item_type o2nm_cluster_type = { + .ct_item_ops = &o2nm_cluster_item_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster set */ + +struct o2nm_cluster_group { + struct configfs_subsystem cs_subsys; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group) +{ + return group ? 
+ container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys) + : NULL; +} +#endif + +static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, + const char *name) +{ + struct o2nm_cluster *cluster = NULL; + struct o2nm_node_group *ns = NULL; + struct config_group *o2hb_group = NULL, *ret = NULL; + void *defs = NULL; + + /* this runs under the parent dir's i_sem; there can be only + * one caller in here at a time */ + if (o2nm_single_cluster) + goto out; /* ENOSPC */ + + cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL); + ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL); + defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); + o2hb_group = o2hb_alloc_hb_set(); + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + goto out; + + config_group_init_type_name(&cluster->cl_group, name, + &o2nm_cluster_type); + config_group_init_type_name(&ns->ns_group, "node", + &o2nm_node_group_type); + + cluster->cl_group.default_groups = defs; + cluster->cl_group.default_groups[0] = &ns->ns_group; + cluster->cl_group.default_groups[1] = o2hb_group; + cluster->cl_group.default_groups[2] = NULL; + rwlock_init(&cluster->cl_nodes_lock); + cluster->cl_node_ip_tree = RB_ROOT; + + ret = &cluster->cl_group; + o2nm_single_cluster = cluster; + +out: + if (ret == NULL) { + kfree(cluster); + kfree(ns); + o2hb_free_hb_set(o2hb_group); + kfree(defs); + } + + return ret; +} + +static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + int i; + struct config_item *killme; + + BUG_ON(o2nm_single_cluster != cluster); + o2nm_single_cluster = NULL; + + for (i = 0; cluster->cl_group.default_groups[i]; i++) { + killme = &cluster->cl_group.default_groups[i]->cg_item; + cluster->cl_group.default_groups[i] = NULL; + config_item_put(killme); + } + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_cluster_group_group_ops = { + .make_group = o2nm_cluster_group_make_group, + .drop_item = o2nm_cluster_group_drop_item, +}; + +static struct config_item_type o2nm_cluster_group_type = { + .ct_group_ops = &o2nm_cluster_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct o2nm_cluster_group o2nm_cluster_group = { + .cs_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "cluster", + .ci_type = &o2nm_cluster_group_type, + }, + }, + }, +}; + +static void __exit exit_o2nm(void) +{ + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); + + /* XXX sync with hb callbacks and shut down hb? */ + o2net_unregister_hb_callbacks(); + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); + o2cb_sys_shutdown(); + + o2net_exit(); +} + +static int __init init_o2nm(void) +{ + int ret = -1; + + cluster_print_version(); + + o2hb_init(); + o2net_init(); + + ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0); + if (!ocfs2_table_header) { + printk(KERN_ERR "nodemanager: unable to register sysctl\n"); + ret = -ENOMEM; /* or something. 
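+ * (register_sysctl_table() returns NULL when the table header cannot
+ * be allocated, so -ENOMEM is the natural guess; the second argument,
+ * 0, is its insert_at_head flag.)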
*/ + goto out; + } + + ret = o2net_register_hb_callbacks(); + if (ret) + goto out_sysctl; + + config_group_init(&o2nm_cluster_group.cs_subsys.su_group); + init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); + ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); + if (ret) { + printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); + goto out_callbacks; + } + + ret = o2cb_sys_init(); + if (!ret) + goto out; + + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); +out_callbacks: + o2net_unregister_hb_callbacks(); +out_sysctl: + unregister_sysctl_table(ocfs2_table_header); +out: + return ret; +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_o2nm) +module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h new file mode 100644 index 000000000000..fce8033c310f --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.h @@ -0,0 +1,64 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * nodemanager.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_NODEMANAGER_H +#define O2CLUSTER_NODEMANAGER_H + +#include "ocfs2_nodemanager.h" + +/* This totally doesn't belong here. */ +#include <linux/configfs.h> +#include <linux/rbtree.h> + +#define KERN_OCFS2 988 +#define KERN_OCFS2_NM 1 + +const char *o2nm_get_hb_ctl_path(void); + +struct o2nm_node { + spinlock_t nd_lock; + struct config_item nd_item; + char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */ + __u8 nd_num; + /* only one address per node, as attributes, for now. */ + __be32 nd_ipv4_address; + __be16 nd_ipv4_port; + struct rb_node nd_ip_node; + /* there can be only one local node for now */ + int nd_local; + + unsigned long nd_set_attributes; +}; + +u8 o2nm_this_node(void); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes); +struct o2nm_node *o2nm_get_node_by_num(u8 node_num); +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); +void o2nm_node_get(struct o2nm_node *node); +void o2nm_node_put(struct o2nm_node *node); + +#endif /* O2CLUSTER_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h new file mode 100644 index 000000000000..5b9854bad571 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_nodemanager.h + * + * Header describing the interface between userspace and the kernel + * for the ocfs2_nodemanager module. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef _OCFS2_NODEMANAGER_H +#define _OCFS2_NODEMANAGER_H + +#define O2NM_API_VERSION 5 + +#define O2NM_MAX_NODES 255 +#define O2NM_INVALID_NODE_NUM 255 + +/* host name, group name, cluster name all 64 bytes */ +#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN + +#endif /* _OCFS2_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c new file mode 100644 index 000000000000..7286c48bb30d --- /dev/null +++ b/fs/ocfs2/cluster/ver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include "ver.h" + +#define CLUSTER_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION + +void cluster_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h new file mode 100644 index 000000000000..32554c3382c2 --- /dev/null +++ b/fs/ocfs2/cluster/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef O2CLUSTER_VER_H +#define O2CLUSTER_VER_H + +void cluster_print_version(void); + +#endif /* O2CLUSTER_VER_H */ -- cgit v1.2.3 From a7f6a5fb4bde142b622706e2006ba33f793e13ed Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 15 Dec 2005 14:31:23 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem Disk based heartbeat. Configured and started from userspace, the kernel component handles I/O submission and event generation via callback mechanism. Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- fs/ocfs2/cluster/heartbeat.c | 1797 ++++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/heartbeat.h | 82 ++ fs/ocfs2/cluster/ocfs2_heartbeat.h | 37 + 3 files changed, 1916 insertions(+) create mode 100644 fs/ocfs2/cluster/heartbeat.c create mode 100644 fs/ocfs2/cluster/heartbeat.h create mode 100644 fs/ocfs2/cluster/ocfs2_heartbeat.h (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c new file mode 100644 index 000000000000..7307ba528913 --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.c @@ -0,0 +1,1797 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#include "quorum.h" + +#include "masklog.h" + + +/* + * The first heartbeat pass had one global thread that would serialize all hb + * callback calls. This global serializing sem should only be removed once + * we've made sure that all callees can deal with being called concurrently + * from multiple hb region threads. + */ +static DECLARE_RWSEM(o2hb_callback_sem); + +/* + * multiple hb threads are watching multiple regions. A node is live + * whenever any of the threads sees activity from the node in its region. + */ +static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; +static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; +static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +static LIST_HEAD(o2hb_node_events); +static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); + +static LIST_HEAD(o2hb_all_regions); + +static struct o2hb_callback { + struct list_head list; +} o2hb_callbacks[O2HB_NUM_CB]; + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); + +#define O2HB_DEFAULT_BLOCK_BITS 9 + +unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; + +/* Only sets a new threshold if there are no active regions. 
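+ *
+ * (Worked example, with a made-up value: echoing "31" into the
+ * heartbeat group's dead_threshold file shown later in this patch
+ * arrives here via o2hb_heartbeat_group_threshold_store() as
+ * o2hb_dead_threshold_set(31), and only sticks while
+ * o2hb_all_regions is empty.)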
+ * + * No locking or otherwise interesting code is required for reading + * o2hb_dead_threshold as it can't change once regions are active and + * it's not interesting to anyone until then anyway. */ +static void o2hb_dead_threshold_set(unsigned int threshold) +{ + if (threshold > O2HB_MIN_DEAD_THRESHOLD) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) + o2hb_dead_threshold = threshold; + spin_unlock(&o2hb_live_lock); + } +} + +struct o2hb_node_event { + struct list_head hn_item; + enum o2hb_callback_type hn_event_type; + struct o2nm_node *hn_node; + int hn_node_num; +}; + +struct o2hb_disk_slot { + struct o2hb_disk_heartbeat_block *ds_raw_block; + u8 ds_node_num; + u64 ds_last_time; + u64 ds_last_generation; + u16 ds_equal_samples; + u16 ds_changed_samples; + struct list_head ds_live_item; +}; + +/* each thread owns a region.. when we're asked to tear down the region + * we ask the thread to stop, which cleans up the region */ +struct o2hb_region { + struct config_item hr_item; + + struct list_head hr_all_item; + unsigned hr_unclean_stop:1; + + /* protected by the hr_callback_sem */ + struct task_struct *hr_task; + + unsigned int hr_blocks; + unsigned long long hr_start_block; + + unsigned int hr_block_bits; + unsigned int hr_block_bytes; + + unsigned int hr_slots_per_page; + unsigned int hr_num_pages; + + struct page **hr_slot_data; + struct block_device *hr_bdev; + struct o2hb_disk_slot *hr_slots; + + /* let the person setting up hb wait for it to return until it + * has reached a 'steady' state. This will be fixed when we have + * a more complete api that doesn't lead to this sort of fragility. */ + atomic_t hr_steady_iterations; + + char hr_dev_name[BDEVNAME_SIZE]; + + unsigned int hr_timeout_ms; + + /* randomized as the region goes up and down so that a node + * recognizes a node going up and down in one iteration */ + u64 hr_generation; + + struct work_struct hr_write_timeout_work; + unsigned long hr_last_timeout_start; + + /* Used during o2hb_check_slot to hold a copy of the block + * being checked because we temporarily have to zero out the + * crc field. */ + struct o2hb_disk_heartbeat_block *hr_tmp_block; +}; + +struct o2hb_bio_wait_ctxt { + atomic_t wc_num_reqs; + struct completion wc_io_complete; +}; + +static void o2hb_write_timeout(void *arg) +{ + struct o2hb_region *reg = arg; + + mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " + "milliseconds\n", reg->hr_dev_name, + jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + o2quo_disk_timeout(); +} + +static void o2hb_arm_write_timeout(struct o2hb_region *reg) +{ + mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + + cancel_delayed_work(&reg->hr_write_timeout_work); + reg->hr_last_timeout_start = jiffies; + schedule_delayed_work(&reg->hr_write_timeout_work, + msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); +} + +static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +{ + cancel_delayed_work(&reg->hr_write_timeout_work); + flush_scheduled_work(); +} + +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, + unsigned int num_ios) +{ + atomic_set(&wc->wc_num_reqs, num_ios); + init_completion(&wc->wc_io_complete); +} + +/* Used in error paths too */ +static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, + unsigned int num) +{ + /* sadly atomic_sub_and_test() isn't available on all platforms. 
The + * good news is that the fast path only completes one at a time */ + while(num--) { + if (atomic_dec_and_test(&wc->wc_num_reqs)) { + BUG_ON(num > 0); + complete(&wc->wc_io_complete); + } + } +} + +static void o2hb_wait_on_io(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc) +{ + struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; + + blk_run_address_space(mapping); + + wait_for_completion(&wc->wc_io_complete); +} + +static int o2hb_bio_end_io(struct bio *bio, + unsigned int bytes_done, + int error) +{ + struct o2hb_bio_wait_ctxt *wc = bio->bi_private; + + if (error) + mlog(ML_ERROR, "IO Error %d\n", error); + + if (bio->bi_size) + return 1; + + o2hb_bio_wait_dec(wc, 1); + return 0; +} + +/* Setup a Bio to cover I/O against num_slots slots starting at + * start_slot. */ +static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc, + unsigned int start_slot, + unsigned int num_slots) +{ + int i, nr_vecs, len, first_page, last_page; + unsigned int vec_len, vec_start; + unsigned int bits = reg->hr_block_bits; + unsigned int spp = reg->hr_slots_per_page; + struct bio *bio; + struct page *page; + + nr_vecs = (num_slots + spp - 1) / spp; + + /* Testing has shown this allocation to take long enough under + * GFP_KERNEL that the local node can get fenced. It would be + * nicest if we could pre-allocate these bios and avoid this + * all together. */ + bio = bio_alloc(GFP_ATOMIC, nr_vecs); + if (!bio) { + mlog(ML_ERROR, "Could not alloc slots BIO!\n"); + bio = ERR_PTR(-ENOMEM); + goto bail; + } + + /* Must put everything in 512 byte sectors for the bio... */ + bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); + bio->bi_bdev = reg->hr_bdev; + bio->bi_private = wc; + bio->bi_end_io = o2hb_bio_end_io; + + first_page = start_slot / spp; + last_page = first_page + nr_vecs; + vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; + for(i = first_page; i < last_page; i++) { + page = reg->hr_slot_data[i]; + + vec_len = PAGE_CACHE_SIZE; + /* last page might be short */ + if (((i + 1) * spp) > (start_slot + num_slots)) + vec_len = ((num_slots + start_slot) % spp) << bits; + vec_len -= vec_start; + + mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", + i, vec_len, vec_start); + + len = bio_add_page(bio, page, vec_len, vec_start); + if (len != vec_len) { + bio_put(bio); + bio = ERR_PTR(-EIO); + + mlog(ML_ERROR, "Error adding page to bio i = %d, " + "vec_len = %u, len = %d\n, start = %u\n", + i, vec_len, len, vec_start); + goto bail; + } + + vec_start = 0; + } + +bail: + return bio; +} + +/* + * Compute the maximum number of sectors the bdev can handle in one bio, + * as a power of two. + * + * Stolen from oracleasm, thanks Joel! + */ +static int compute_max_sectors(struct block_device *bdev) +{ + int max_pages, max_sectors, pow_two_sectors; + + struct request_queue *q; + + q = bdev_get_queue(bdev); + max_pages = q->max_sectors >> (PAGE_SHIFT - 9); + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; + if (max_pages > q->max_phys_segments) + max_pages = q->max_phys_segments; + if (max_pages > q->max_hw_segments) + max_pages = q->max_hw_segments; + max_pages--; /* Handle I/Os that straddle a page */ + + max_sectors = max_pages << (PAGE_SHIFT - 9); + + /* Why is fls() 1-based???? 
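+ * (fls() returns the 1-based index of the most significant set bit,
+ * so 1 << (fls(x) - 1) rounds x down to a power of two: fls(0x30) is
+ * 6, giving 1 << 5 == 0x20.)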
*/ + pow_two_sectors = 1 << (fls(max_sectors) - 1); + + return pow_two_sectors; +} + +static inline void o2hb_compute_request_limits(struct o2hb_region *reg, + unsigned int num_slots, + unsigned int *num_bios, + unsigned int *slots_per_bio) +{ + unsigned int max_sectors, io_sectors; + + max_sectors = compute_max_sectors(reg->hr_bdev); + + io_sectors = num_slots << (reg->hr_block_bits - 9); + + *num_bios = (io_sectors + max_sectors - 1) / max_sectors; + *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); + + mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This " + "device can handle %u sectors of I/O\n", io_sectors, num_slots, + max_sectors); + mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", + *num_bios, *slots_per_bio); +} + +static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int max_slots) +{ + unsigned int num_bios, slots_per_bio, start_slot, num_slots; + int i, status; + struct o2hb_bio_wait_ctxt wc; + struct bio **bios; + struct bio *bio; + + o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); + + bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); + if (!bios) { + status = -ENOMEM; + mlog_errno(status); + return status; + } + + o2hb_bio_wait_init(&wc, num_bios); + + num_slots = slots_per_bio; + for(i = 0; i < num_bios; i++) { + start_slot = i * slots_per_bio; + + /* adjust num_slots at last bio */ + if (max_slots < (start_slot + num_slots)) + num_slots = max_slots - start_slot; + + bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); + if (IS_ERR(bio)) { + o2hb_bio_wait_dec(&wc, num_bios - i); + + status = PTR_ERR(bio); + mlog_errno(status); + goto bail_and_wait; + } + bios[i] = bio; + + submit_bio(READ, bio); + } + + status = 0; + +bail_and_wait: + o2hb_wait_on_io(reg, &wc); + + if (bios) { + for(i = 0; i < num_bios; i++) + if (bios[i]) + bio_put(bios[i]); + kfree(bios); + } + + return status; +} + +static int o2hb_issue_node_write(struct o2hb_region *reg, + struct bio **write_bio, + struct o2hb_bio_wait_ctxt *write_wc) +{ + int status; + unsigned int slot; + struct bio *bio; + + o2hb_bio_wait_init(write_wc, 1); + + slot = o2nm_this_node(); + + bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail; + } + + submit_bio(WRITE, bio); + + *write_bio = bio; + status = 0; +bail: + return status; +} + +static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + __le32 old_cksum; + u32 ret; + + /* We want to compute the block crc with a 0 value in the + * hb_cksum field. Save it off here and replace after the + * crc. 
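+ * (This is the usual pattern for checksumming a block that stores its
+ * own checksum: readers recompute the crc with hb_cksum zeroed and
+ * compare it against the stored value, as o2hb_verify_crc() below
+ * does.)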
*/ + old_cksum = hb_block->hb_cksum; + hb_block->hb_cksum = 0; + + ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes); + + hb_block->hb_cksum = old_cksum; + + return ret; +} + +static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) +{ + mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, " + "cksum = 0x%x, generation 0x%"MLFx64"\n", + le64_to_cpu(hb_block->hb_seq), hb_block->hb_node, + le32_to_cpu(hb_block->hb_cksum), + le64_to_cpu(hb_block->hb_generation)); +} + +static int o2hb_verify_crc(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + u32 read, computed; + + read = le32_to_cpu(hb_block->hb_cksum); + computed = o2hb_compute_block_crc_le(reg, hb_block); + + return read == computed; +} + +/* We want to make sure that nobody is heartbeating on top of us -- + * this will help detect an invalid configuration. */ +static int o2hb_check_last_timestamp(struct o2hb_region *reg) +{ + int node_num, ret; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + + ret = 1; + slot = &reg->hr_slots[node_num]; + /* Don't check on our 1st timestamp */ + if (slot->ds_last_time) { + hb_block = slot->ds_raw_block; + + if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time) + ret = 0; + } + + return ret; +} + +static inline void o2hb_prepare_block(struct o2hb_region *reg, + u64 generation) +{ + int node_num; + u64 cputime; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + slot = &reg->hr_slots[node_num]; + + hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; + memset(hb_block, 0, reg->hr_block_bytes); + /* TODO: time stuff */ + cputime = CURRENT_TIME.tv_sec; + if (!cputime) + cputime = 1; + + hb_block->hb_seq = cpu_to_le64(cputime); + hb_block->hb_node = node_num; + hb_block->hb_generation = cpu_to_le64(generation); + + /* This step must always happen last! */ + hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, + hb_block)); + + mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n", + cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum)); +} + +static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, + struct o2nm_node *node, + int idx) +{ + struct list_head *iter; + struct o2hb_callback_func *f; + + list_for_each(iter, &hbcall->list) { + f = list_entry(iter, struct o2hb_callback_func, hc_item); + mlog(ML_HEARTBEAT, "calling funcs %p\n", f); + (f->hc_func)(node, idx, f->hc_data); + } +} + +/* Will run the list in order until we process the passed event */ +static void o2hb_run_event_list(struct o2hb_node_event *queued_event) +{ + int empty; + struct o2hb_callback *hbcall; + struct o2hb_node_event *event; + + spin_lock(&o2hb_live_lock); + empty = list_empty(&queued_event->hn_item); + spin_unlock(&o2hb_live_lock); + if (empty) + return; + + /* Holding callback sem assures we don't alter the callback + * lists when doing this, and serializes ourselves with other + * processes wanting callbacks. */ + down_write(&o2hb_callback_sem); + + spin_lock(&o2hb_live_lock); + while (!list_empty(&o2hb_node_events) + && !list_empty(&queued_event->hn_item)) { + event = list_entry(o2hb_node_events.next, + struct o2hb_node_event, + hn_item); + list_del_init(&event->hn_item); + spin_unlock(&o2hb_live_lock); + + mlog(ML_HEARTBEAT, "Node %s event for %d\n", + event->hn_event_type == O2HB_NODE_UP_CB ? 
"UP" : "DOWN", + event->hn_node_num); + + hbcall = hbcall_from_type(event->hn_event_type); + + /* We should *never* have gotten on to the list with a + * bad type... This isn't something that we should try + * to recover from. */ + BUG_ON(IS_ERR(hbcall)); + + o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num); + + spin_lock(&o2hb_live_lock); + } + spin_unlock(&o2hb_live_lock); + + up_write(&o2hb_callback_sem); +} + +static void o2hb_queue_node_event(struct o2hb_node_event *event, + enum o2hb_callback_type type, + struct o2nm_node *node, + int node_num) +{ + assert_spin_locked(&o2hb_live_lock); + + event->hn_event_type = type; + event->hn_node = node; + event->hn_node_num = node_num; + + mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", + type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num); + + list_add_tail(&event->hn_item, &o2hb_node_events); +} + +static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) +{ + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return; + + spin_lock(&o2hb_live_lock); + if (!list_empty(&slot->ds_live_item)) { + mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n", + slot->ds_node_num); + + list_del_init(&slot->ds_live_item); + + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + } + } + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + o2nm_node_put(node); +} + +static int o2hb_check_slot(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + int changed = 0, gen_changed = 0; + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; + u64 cputime; + + memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); + + /* Is this correct? Do we assume that the node doesn't exist + * if we're not configured for him? */ + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return 0; + + if (!o2hb_verify_crc(reg, hb_block)) { + /* all paths from here will drop o2hb_live_lock for + * us. */ + spin_lock(&o2hb_live_lock); + + /* Don't print an error on the console in this case - + * a freshly formatted heartbeat area will not have a + * crc set on it. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* The node is live but pushed out a bad crc. We + * consider it a transient miss but don't populate any + * other values as they may be junk. */ + mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", + slot->ds_node_num, reg->hr_dev_name); + o2hb_dump_slot(hb_block); + + slot->ds_equal_samples++; + goto fire_callbacks; + } + + /* we don't care if these wrap.. the state transitions below + * clear at the right places */ + cputime = le64_to_cpu(hb_block->hb_seq); + if (slot->ds_last_time != cputime) + slot->ds_changed_samples++; + else + slot->ds_equal_samples++; + slot->ds_last_time = cputime; + + /* The node changed heartbeat generations. We assume this to + * mean it dropped off but came back before we timed out. We + * want to consider it down for the time being but don't want + * to lose any changed_samples state we might build up to + * considering it live again. 
*/ + if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { + gen_changed = 1; + slot->ds_equal_samples = 0; + mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" " + "to 0x%"MLFx64")\n", slot->ds_node_num, + slot->ds_last_generation, + le64_to_cpu(hb_block->hb_generation)); + } + + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + + mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x " + "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n", + slot->ds_node_num, slot->ds_last_generation, + le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), + slot->ds_last_time, slot->ds_changed_samples, + slot->ds_equal_samples); + + spin_lock(&o2hb_live_lock); + +fire_callbacks: + /* dead nodes only come to life after some number of + * changes at any time during their dead time */ + if (list_empty(&slot->ds_live_item) && + slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { + mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my " + "region\n", slot->ds_node_num, slot->ds_last_generation); + + /* first on the list generates a callback */ + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + set_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, + slot->ds_node_num); + + changed = 1; + } + + list_add_tail(&slot->ds_live_item, + &o2hb_live_slots[slot->ds_node_num]); + + slot->ds_equal_samples = 0; + goto out; + } + + /* if the list is dead, we're done.. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* live nodes only go dead after enough consecutive missed + * samples.. reset the missed counter whenever we see + * activity */ + if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { + mlog(ML_HEARTBEAT, "Node %d left my region\n", + slot->ds_node_num); + + /* last off the live_slot generates a callback */ + list_del_init(&slot->ds_live_item); + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + + changed = 1; + } + + /* We don't clear this because the node is still + * actually writing new blocks. */ + if (!gen_changed) + slot->ds_changed_samples = 0; + goto out; + } + if (slot->ds_changed_samples) { + slot->ds_changed_samples = 0; + slot->ds_equal_samples = 0; + } +out: + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + o2nm_node_put(node); + return changed; +} + +/* This could be faster if we just implemented a find_last_bit, but I + * don't think the circumstances warrant it. */ +static int o2hb_highest_node(unsigned long *nodes, + int numbits) +{ + int highest, node; + + highest = numbits; + node = -1; + while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { + if (node >= numbits) + break; + + highest = node; + } + + return highest; +} + +static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +{ + int i, ret, highest_node, change = 0; + unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct bio *write_bio; + struct o2hb_bio_wait_ctxt write_wc; + + if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) + return; + + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES) { + mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); + return; + } + + /* No sense in reading the slots of nodes that don't exist + * yet. 
Of course, if the node definitions have holes in them + * then we're reading an empty slot anyway... Consider this + * best-effort. */ + ret = o2hb_read_slots(reg, highest_node + 1); + if (ret < 0) { + mlog_errno(ret); + return; + } + + /* With an up to date view of the slots, we can check that no + * other node has been improperly configured to heartbeat in + * our slot. */ + if (!o2hb_check_last_timestamp(reg)) + mlog(ML_ERROR, "Device \"%s\": another node is heartbeating " + "in our slot!\n", reg->hr_dev_name); + + /* fill in the proper info for our next heartbeat */ + o2hb_prepare_block(reg, reg->hr_generation); + + /* And fire off the write. Note that we don't wait on this I/O + * until later. */ + ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + if (ret < 0) { + mlog_errno(ret); + return; + } + + i = -1; + while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + + change |= o2hb_check_slot(reg, &reg->hr_slots[i]); + } + + /* + * We have to be sure we've advertised ourselves on disk + * before we can go to steady state. This ensures that + * people we find in our steady state have seen us. + */ + o2hb_wait_on_io(reg, &write_wc); + bio_put(write_bio); + o2hb_arm_write_timeout(reg); + + /* let the person who launched us know when things are steady */ + if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { + if (atomic_dec_and_test(&reg->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } +} + +/* Subtract b from a, storing the result in a. a *must* have a larger + * value than b. */ +static void o2hb_tv_subtract(struct timeval *a, + struct timeval *b) +{ + /* just return 0 when a is before b */ + if (a->tv_sec < b->tv_sec || + (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { + a->tv_sec = 0; + a->tv_usec = 0; + return; + } + + a->tv_sec -= b->tv_sec; + a->tv_usec -= b->tv_usec; + while ( a->tv_usec < 0 ) { + a->tv_sec--; + a->tv_usec += 1000000; + } +} + +static unsigned int o2hb_elapsed_msecs(struct timeval *start, + struct timeval *end) +{ + struct timeval res = *end; + + o2hb_tv_subtract(&res, start); + + return res.tv_sec * 1000 + res.tv_usec / 1000; +} + +/* + * we ride the region ref that the region dir holds. before the region + * dir is removed and drops its ref it will wait to tear down this + * thread. + */ +static int o2hb_thread(void *data) +{ + int i, ret; + struct o2hb_region *reg = data; + struct bio *write_bio; + struct o2hb_bio_wait_ctxt write_wc; + struct timeval before_hb, after_hb; + unsigned int elapsed_msec; + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); + + set_user_nice(current, -20); + + while (!kthread_should_stop() && !reg->hr_unclean_stop) { + /* We track the time spent inside + * o2hb_do_disk_heartbeat so that we avoid more than + * hr_timeout_ms between disk writes. On busy systems + * this should result in a heartbeat which is less + * likely to time itself out. */ + do_gettimeofday(&before_hb); + + o2hb_do_disk_heartbeat(reg); + + do_gettimeofday(&after_hb); + elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + + mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + before_hb.tv_sec, before_hb.tv_usec, + after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); + + if (elapsed_msec < reg->hr_timeout_ms) { + /* the kthread api has blocked signals for us so no + * need to record the return value. 
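+ * (Worked example with made-up numbers: if hr_timeout_ms is 2000 and
+ * the heartbeat pass above took 300ms, we sleep for the remaining
+ * 1700ms, keeping successive disk writes roughly 2000ms apart.)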
*/ + msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); + } + } + + o2hb_disarm_write_timeout(reg); + + /* unclean stop is only used in very bad situations */ + for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) + o2hb_shutdown_slot(&reg->hr_slots[i]); + + /* Explicit down notification - avoid forcing the other nodes + * to timeout on this region when we could just as easily + * write a clear generation - thus indicating to them that + * this node has left this region. + * + * XXX: Should we skip this on unclean_stop? */ + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + if (ret == 0) { + o2hb_wait_on_io(reg, &write_wc); + bio_put(write_bio); + } else { + mlog_errno(ret); + } + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); + + return 0; +} + +void o2hb_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) + INIT_LIST_HEAD(&o2hb_callbacks[i].list); + + for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) + INIT_LIST_HEAD(&o2hb_live_slots[i]); + + INIT_LIST_HEAD(&o2hb_node_events); + + memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); +} + +/* if we're already in a callback then we're already serialized by the sem */ +static void o2hb_fill_node_map_from_callback(unsigned long *map, + unsigned bytes) +{ + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memcpy(map, &o2hb_live_node_bitmap, bytes); +} + +/* + * get a map of all nodes that are heartbeating in any regions + */ +void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +{ + /* callers want to serialize this map and callbacks so that they + * can trust that they don't miss nodes coming to the party */ + down_read(&o2hb_callback_sem); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(map, bytes); + spin_unlock(&o2hb_live_lock); + up_read(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_fill_node_map); + +/* + * heartbeat configfs bits. The heartbeat set is a default set under + * the cluster set in nodemanager.c. + */ + +static struct o2hb_region *to_o2hb_region(struct config_item *item) +{ + return item ? container_of(item, struct o2hb_region, hr_item) : NULL; +} + +/* drop_item only drops its ref after killing the thread, nothing should + * be using the region anymore. this has to clean up any state that + * attributes might have built up. */ +static void o2hb_region_release(struct config_item *item) +{ + int i; + struct page *page; + struct o2hb_region *reg = to_o2hb_region(item); + + if (reg->hr_tmp_block) + kfree(reg->hr_tmp_block); + + if (reg->hr_slot_data) { + for (i = 0; i < reg->hr_num_pages; i++) { + page = reg->hr_slot_data[i]; + if (page) + __free_page(page); + } + kfree(reg->hr_slot_data); + } + + if (reg->hr_bdev) + blkdev_put(reg->hr_bdev); + + if (reg->hr_slots) + kfree(reg->hr_slots); + + spin_lock(&o2hb_live_lock); + list_del(&reg->hr_all_item); + spin_unlock(&o2hb_live_lock); + + kfree(reg); +} + +static int o2hb_read_block_input(struct o2hb_region *reg, + const char *page, + size_t count, + unsigned long *ret_bytes, + unsigned int *ret_bits) +{ + unsigned long bytes; + char *p = (char *)page; + + bytes = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* Heartbeat and fs min / max block sizes are the same. 
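+ * (So the accepted values are exactly 512, 1024, 2048 and 4096: the
+ * range check below enforces the bounds, and hweight16(bytes) == 1
+ * rejects anything that isn't a power of two.)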
*/ + if (bytes > 4096 || bytes < 512) + return -ERANGE; + if (hweight16(bytes) != 1) + return -EINVAL; + + if (ret_bytes) + *ret_bytes = bytes; + if (ret_bits) + *ret_bits = ffs(bytes) - 1; + + return 0; +} + +static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%u\n", reg->hr_block_bytes); +} + +static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + int status; + unsigned long block_bytes; + unsigned int block_bits; + + if (reg->hr_bdev) + return -EINVAL; + + status = o2hb_read_block_input(reg, page, count, + &block_bytes, &block_bits); + if (status) + return status; + + reg->hr_block_bytes = (unsigned int)block_bytes; + reg->hr_block_bits = block_bits; + + return count; +} + +static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%llu\n", reg->hr_start_block); +} + +static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoull(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + reg->hr_start_block = tmp; + + return count; +} + +static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%d\n", reg->hr_blocks); +} + +static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > O2NM_MAX_NODES || tmp == 0) + return -ERANGE; + + reg->hr_blocks = (unsigned int)tmp; + + return count; +} + +static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, + char *page) +{ + unsigned int ret = 0; + + if (reg->hr_bdev) + ret = sprintf(page, "%s\n", reg->hr_dev_name); + + return ret; +} + +static void o2hb_init_region_params(struct o2hb_region *reg) +{ + reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; + + mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", + reg->hr_start_block, reg->hr_blocks); + mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n", + reg->hr_block_bytes, reg->hr_block_bits); + mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms); + mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold); +} + +static int o2hb_map_slot_data(struct o2hb_region *reg) +{ + int i, j; + unsigned int last_slot; + unsigned int spp = reg->hr_slots_per_page; + struct page *page; + char *raw; + struct o2hb_disk_slot *slot; + + reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); + if (reg->hr_tmp_block == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slots = kcalloc(reg->hr_blocks, + sizeof(struct o2hb_disk_slot), GFP_KERNEL); + if (reg->hr_slots == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_blocks; i++) { + slot = &reg->hr_slots[i]; + slot->ds_node_num = i; + INIT_LIST_HEAD(&slot->ds_live_item); + slot->ds_raw_block = NULL; + } + + reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; + mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " + "at %u blocks per page\n", + reg->hr_num_pages, reg->hr_blocks, spp); + + reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), + GFP_KERNEL); + if (!reg->hr_slot_data) 
{ + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_num_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slot_data[i] = page; + + last_slot = i * spp; + raw = page_address(page); + for (j = 0; + (j < spp) && ((j + last_slot) < reg->hr_blocks); + j++) { + BUG_ON((j + last_slot) >= reg->hr_blocks); + + slot = &reg->hr_slots[j + last_slot]; + slot->ds_raw_block = + (struct o2hb_disk_heartbeat_block *) raw; + + raw += reg->hr_block_bytes; + } + } + + return 0; +} + +/* Read in all the slots available and populate the tracking + * structures so that we can start with a baseline idea of what's + * there. */ +static int o2hb_populate_slot_data(struct o2hb_region *reg) +{ + int ret, i; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + mlog_entry_void(); + + ret = o2hb_read_slots(reg, reg->hr_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We only want to get an idea of the values initially in each + * slot, so we do no verification - o2hb_check_slot will + * actually determine if each configured slot is valid and + * whether any values have changed. */ + for(i = 0; i < reg->hr_blocks; i++) { + slot = &reg->hr_slots[i]; + hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; + + /* Only fill the values that o2hb_check_slot uses to + * determine changing slots */ + slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + } + +out: + mlog_exit(ret); + return ret; +} + +/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + long fd; + int sectsize; + char *p = (char *)page; + struct file *filp = NULL; + struct inode *inode = NULL; + ssize_t ret = -EINVAL; + + if (reg->hr_bdev) + goto out; + + /* We can't heartbeat without having had our node number + * configured yet. */ + if (o2nm_this_node() == O2NM_MAX_NODES) + goto out; + + fd = simple_strtol(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + goto out; + + if (fd < 0 || fd >= INT_MAX) + goto out; + + filp = fget(fd); + if (filp == NULL) + goto out; + + if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || + reg->hr_block_bytes == 0) + goto out; + + inode = igrab(filp->f_mapping->host); + if (inode == NULL) + goto out; + + if (!S_ISBLK(inode->i_mode)) + goto out; + + reg->hr_bdev = I_BDEV(filp->f_mapping->host); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0); + if (ret) { + reg->hr_bdev = NULL; + goto out; + } + inode = NULL; + + bdevname(reg->hr_bdev, reg->hr_dev_name); + + sectsize = bdev_hardsect_size(reg->hr_bdev); + if (sectsize != reg->hr_block_bytes) { + mlog(ML_ERROR, + "blocksize %u incorrect for device, expected %d", + reg->hr_block_bytes, sectsize); + ret = -EINVAL; + goto out; + } + + o2hb_init_region_params(reg); + + /* Generation of zero is invalid */ + do { + get_random_bytes(&reg->hr_generation, + sizeof(reg->hr_generation)); + } while (reg->hr_generation == 0); + + ret = o2hb_map_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = o2hb_populate_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg); + + /* + * A node is considered live after it has beat LIVE_THRESHOLD + * times. We're not steady until we've given them a chance + * _after_ our first read. 
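+ * (hr_steady_iterations starts at O2HB_LIVE_THRESHOLD + 1; each pass
+ * of o2hb_do_disk_heartbeat() that sees no membership change
+ * decrements it, and the write below sleeps on o2hb_steady_queue
+ * until it reaches zero.)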
+ */ + atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1); + + reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s", + reg->hr_item.ci_name); + if (IS_ERR(reg->hr_task)) { + ret = PTR_ERR(reg->hr_task); + mlog_errno(ret); + reg->hr_task = NULL; + goto out; + } + + ret = wait_event_interruptible(o2hb_steady_queue, + atomic_read(&reg->hr_steady_iterations) == 0); + if (ret) { + kthread_stop(reg->hr_task); + reg->hr_task = NULL; + goto out; + } + + ret = count; +out: + if (filp) + fput(filp); + if (inode) + iput(inode); + if (ret < 0) { + if (reg->hr_bdev) { + blkdev_put(reg->hr_bdev); + reg->hr_bdev = NULL; + } + } + return ret; +} + +struct o2hb_region_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_region *, char *); + ssize_t (*store)(struct o2hb_region *, const char *, size_t); +}; + +static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "block_bytes", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_block_bytes_read, + .store = o2hb_region_block_bytes_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_start_block = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "start_block", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_start_block_read, + .store = o2hb_region_start_block_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_blocks = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "blocks", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_blocks_read, + .store = o2hb_region_blocks_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_dev = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dev", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_dev_read, + .store = o2hb_region_dev_write, +}; + +static struct configfs_attribute *o2hb_region_attrs[] = { + &o2hb_region_attr_block_bytes.attr, + &o2hb_region_attr_start_block.attr, + &o2hb_region_attr_blocks.attr, + &o2hb_region_attr_dev.attr, + NULL, +}; + +static ssize_t o2hb_region_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = 0; + + if (o2hb_region_attr->show) + ret = o2hb_region_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_region_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_region_attr->store) + ret = o2hb_region_attr->store(reg, page, count); + return ret; +} + +static struct configfs_item_operations o2hb_region_item_ops = { + .release = o2hb_region_release, + .show_attribute = o2hb_region_show, + .store_attribute = o2hb_region_store, +}; + +static struct config_item_type o2hb_region_type = { + .ct_item_ops = &o2hb_region_item_ops, + .ct_attrs = o2hb_region_attrs, + .ct_owner = THIS_MODULE, +}; + +/* heartbeat set */ + +struct o2hb_heartbeat_group { + struct config_group hs_group; + /* some stuff? */ +}; + +static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) +{ + return group ? 
+ container_of(group, struct o2hb_heartbeat_group, hs_group) + : NULL; +} + +static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name) +{ + struct o2hb_region *reg = NULL; + struct config_item *ret = NULL; + + reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL); + if (reg == NULL) + goto out; /* ENOMEM */ + + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + + ret = ®->hr_item; + + spin_lock(&o2hb_live_lock); + list_add_tail(®->hr_all_item, &o2hb_all_regions); + spin_unlock(&o2hb_live_lock); +out: + if (ret == NULL) + kfree(reg); + + return ret; +} + +static void o2hb_heartbeat_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2hb_region *reg = to_o2hb_region(item); + + /* stop the thread when the user removes the region dir */ + if (reg->hr_task) { + kthread_stop(reg->hr_task); + reg->hr_task = NULL; + } + + config_item_put(item); +} + +struct o2hb_heartbeat_group_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_heartbeat_group *, char *); + ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); +}; + +static ssize_t o2hb_heartbeat_group_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = 0; + + if (o2hb_heartbeat_group_attr->show) + ret = o2hb_heartbeat_group_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_heartbeat_group_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_heartbeat_group_attr->store) + ret = o2hb_heartbeat_group_attr->store(reg, page, count); + return ret; +} + +static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%u\n", o2hb_dead_threshold); +} + +static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* this will validate ranges for us. 
*/ + o2hb_dead_threshold_set((unsigned int) tmp); + + return count; +} + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dead_threshold", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_threshold_show, + .store = o2hb_heartbeat_group_threshold_store, +}; + +static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { + &o2hb_heartbeat_group_attr_threshold.attr, + NULL, +}; + +static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { + .show_attribute = o2hb_heartbeat_group_show, + .store_attribute = o2hb_heartbeat_group_store, +}; + +static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { + .make_item = o2hb_heartbeat_group_make_item, + .drop_item = o2hb_heartbeat_group_drop_item, +}; + +static struct config_item_type o2hb_heartbeat_group_type = { + .ct_group_ops = &o2hb_heartbeat_group_group_ops, + .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_attrs = o2hb_heartbeat_group_attrs, + .ct_owner = THIS_MODULE, +}; + +/* this is just here to avoid touching group in heartbeat.h which the + * entire damn world #includes */ +struct config_group *o2hb_alloc_hb_set(void) +{ + struct o2hb_heartbeat_group *hs = NULL; + struct config_group *ret = NULL; + + hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); + if (hs == NULL) + goto out; + + config_group_init_type_name(&hs->hs_group, "heartbeat", + &o2hb_heartbeat_group_type); + + ret = &hs->hs_group; +out: + if (ret == NULL) + kfree(hs); + return ret; +} + +void o2hb_free_hb_set(struct config_group *group) +{ + struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group); + kfree(hs); +} + +/* hb callback registration and issueing */ + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) +{ + if (type == O2HB_NUM_CB) + return ERR_PTR(-EINVAL); + + return &o2hb_callbacks[type]; +} + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority) +{ + INIT_LIST_HEAD(&hc->hc_item); + hc->hc_func = func; + hc->hc_data = data; + hc->hc_priority = priority; + hc->hc_type = type; + hc->hc_magic = O2HB_CB_MAGIC; +} +EXPORT_SYMBOL_GPL(o2hb_setup_callback); + +int o2hb_register_callback(struct o2hb_callback_func *hc) +{ + struct o2hb_callback_func *tmp; + struct list_head *iter; + struct o2hb_callback *hbcall; + int ret; + + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + BUG_ON(!list_empty(&hc->hc_item)); + + hbcall = hbcall_from_type(hc->hc_type); + if (IS_ERR(hbcall)) { + ret = PTR_ERR(hbcall); + goto out; + } + + down_write(&o2hb_callback_sem); + + list_for_each(iter, &hbcall->list) { + tmp = list_entry(iter, struct o2hb_callback_func, hc_item); + if (hc->hc_priority < tmp->hc_priority) { + list_add_tail(&hc->hc_item, iter); + break; + } + } + if (list_empty(&hc->hc_item)) + list_add_tail(&hc->hc_item, &hbcall->list); + + up_write(&o2hb_callback_sem); + ret = 0; +out: + mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", + ret, __builtin_return_address(0), hc); + return ret; +} +EXPORT_SYMBOL_GPL(o2hb_register_callback); + +int o2hb_unregister_callback(struct o2hb_callback_func *hc) +{ + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + + mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", + __builtin_return_address(0), hc); + + if (list_empty(&hc->hc_item)) + return 0; + + down_write(&o2hb_callback_sem); + + list_del_init(&hc->hc_item); + + up_write(&o2hb_callback_sem); + + return 0; +} 
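These three entry points -- o2hb_setup_callback(), o2hb_register_callback() and o2hb_unregister_callback() -- are exported just below and make up the whole consumer-facing heartbeat callback API. A minimal sketch of how a caller might use it, written against the prototypes in heartbeat.h further down; the handler name and body here are hypothetical illustrations, not part of this patch:

	/* hypothetical consumer reacting to a node leaving the cluster */
	static void my_node_down(struct o2nm_node *node, int node_num,
				 void *data)
	{
		printk(KERN_INFO "node %d stopped heartbeating\n", node_num);
	}

	static struct o2hb_callback_func my_down_cb;

	static int my_attach(void)
	{
		/* lower hc_priority values sort earlier in the
		 * O2HB_NODE_DOWN_CB list */
		o2hb_setup_callback(&my_down_cb, O2HB_NODE_DOWN_CB,
				    my_node_down, NULL, 0);
		return o2hb_register_callback(&my_down_cb);
	}

Once o2hb_setup_callback() has initialized hc_item and hc_magic, a later o2hb_unregister_callback(&my_down_cb) is safe even if registration never happened, since an empty hc_item list returns 0 early.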
+EXPORT_SYMBOL_GPL(o2hb_unregister_callback); + +int o2hb_check_node_heartbeating(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); + +int o2hb_check_node_heartbeating_from_callback(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); + +/* Makes sure our local node is configured with a node number, and is + * heartbeating. */ +int o2hb_check_local_node_heartbeating(void) +{ + u8 node_num; + + /* if this node was set then we have networking */ + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + mlog(ML_HEARTBEAT, "this node has not been configured.\n"); + return 0; + } + + return o2hb_check_node_heartbeating(node_num); +} +EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); + +/* + * this is just a hack until we get the plumbing which flips file systems + * read only and drops the hb ref instead of killing the node dead. + */ +void o2hb_stop_all_regions(void) +{ + struct o2hb_region *reg; + + mlog(ML_ERROR, "stopping heartbeat on all active regions.\n"); + + spin_lock(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) + reg->hr_unclean_stop = 1; + + spin_unlock(&o2hb_live_lock); +} +EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h new file mode 100644 index 000000000000..cac6223206a9 --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_HEARTBEAT_H +#define O2CLUSTER_HEARTBEAT_H + +#include "ocfs2_heartbeat.h" + +#define O2HB_REGION_TIMEOUT_MS 2000 + +/* number of changes to be seen as live */ +#define O2HB_LIVE_THRESHOLD 2 +/* number of equal samples to be seen as dead */ +extern unsigned int o2hb_dead_threshold; +#define O2HB_DEFAULT_DEAD_THRESHOLD 7 +/* Otherwise MAX_WRITE_TIMEOUT will be zero... 
*/ +#define O2HB_MIN_DEAD_THRESHOLD 2 +#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) + +#define O2HB_CB_MAGIC 0x51d1e4ec + +/* callback stuff */ +enum o2hb_callback_type { + O2HB_NODE_DOWN_CB = 0, + O2HB_NODE_UP_CB, + O2HB_NUM_CB +}; + +struct o2nm_node; +typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *); + +struct o2hb_callback_func { + u32 hc_magic; + struct list_head hc_item; + o2hb_cb_func *hc_func; + void *hc_data; + int hc_priority; + enum o2hb_callback_type hc_type; +}; + +struct config_group *o2hb_alloc_hb_set(void); +void o2hb_free_hb_set(struct config_group *group); + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority); +int o2hb_register_callback(struct o2hb_callback_func *hc); +int o2hb_unregister_callback(struct o2hb_callback_func *hc); +void o2hb_fill_node_map(unsigned long *map, + unsigned bytes); +void o2hb_init(void); +int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_from_callback(u8 node_num); +int o2hb_check_local_node_heartbeating(void); +void o2hb_stop_all_regions(void); + +#endif /* O2CLUSTER_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h new file mode 100644 index 000000000000..94096069cb43 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -0,0 +1,37 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_heartbeat.h + * + * On-disk structures for ocfs2_heartbeat + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_HEARTBEAT_H +#define _OCFS2_HEARTBEAT_H + +struct o2hb_disk_heartbeat_block { + __le64 hb_seq; + __u8 hb_node; + __u8 hb_pad1[3]; + __le32 hb_cksum; + __le64 hb_generation; +}; + +#endif /* _OCFS2_HEARTBEAT_H */ -- cgit v1.2.3 From 98211489d4147e41b11703e4245846d60b3acce4 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Dec 2005 14:31:23 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem Node messaging via tcp. Used by the dlm and the file system for point to point communication between nodes. 
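The exported API is small: consumers register a handler for a (msg_type, key) pair and send with o2net_send_message() or o2net_send_message_vec(), both defined in tcp.c below. A rough usage sketch; the message type, key and payload struct are made-up placeholders rather than values from this patch, and the handler signature is inferred from the nh_func call site in tcp.c:

	#define MY_MSG_TYPE	1		/* hypothetical */
	#define MY_KEY		0x666c7578	/* hypothetical */

	struct my_payload {
		__be32 value;
	};

	static int my_handler(struct o2net_msg *msg, u32 len, void *data)
	{
		/* runs from o2net's rx work; the int returned here
		 * travels back to the sender as its *status */
		return 0;
	}

	static LIST_HEAD(my_unreg_list);

	static int my_ping(u8 target_node)
	{
		struct my_payload p = { .value = cpu_to_be32(42) };
		int status = 0, ret;

		ret = o2net_register_handler(MY_MSG_TYPE, MY_KEY, sizeof(p),
					     my_handler, NULL, &my_unreg_list);
		if (ret)
			return ret;

		/* blocks until the remote handler has run or the socket
		 * dies; sending to our own node number is -ELOOP */
		ret = o2net_send_message(MY_MSG_TYPE, MY_KEY, &p, sizeof(p),
					 target_node, &status);
		return ret ? ret : status;
	}

Teardown is a single o2net_unregister_handler_list(&my_unreg_list).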
Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- fs/ocfs2/cluster/quorum.c | 315 +++++++ fs/ocfs2/cluster/quorum.h | 36 + fs/ocfs2/cluster/sys.c | 124 +++ fs/ocfs2/cluster/sys.h | 33 + fs/ocfs2/cluster/tcp.c | 1829 +++++++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/tcp.h | 113 +++ fs/ocfs2/cluster/tcp_internal.h | 174 ++++ 7 files changed, 2624 insertions(+) create mode 100644 fs/ocfs2/cluster/quorum.c create mode 100644 fs/ocfs2/cluster/quorum.h create mode 100644 fs/ocfs2/cluster/sys.c create mode 100644 fs/ocfs2/cluster/sys.h create mode 100644 fs/ocfs2/cluster/tcp.c create mode 100644 fs/ocfs2/cluster/tcp.h create mode 100644 fs/ocfs2/cluster/tcp_internal.h (limited to 'fs') diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c new file mode 100644 index 000000000000..7bba98fbfc15 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.c @@ -0,0 +1,315 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* This quorum hack is only here until we transition to some more rational + * approach that is driven from userspace. Honest. No foolin'. + * + * Imagine two nodes lose network connectivity to each other but they're still + * up and operating in every other way. Presumably a network timeout indicates + * that a node is broken and should be recovered. They can't both recover each + * other and both carry on without serialising their access to the file system. + * They need to decide who is authoritative. Now extend that problem to + * arbitrary groups of nodes losing connectivity between each other. + * + * So we declare that a node which has given up on connecting to a majority + * of nodes who are still heartbeating will fence itself. + * + * There are huge opportunities for races here. After we give up on a node's + * connection we need to wait long enough to give heartbeat an opportunity + * to declare the node as truly dead. We also need to be careful with the + * race between when we see a node start heartbeating and when we connect + * to it. + * + * So nodes that are in this transtion put a hold on the quorum decision + * with a counter. As they fall out of this transition they drop the count + * and if they're the last, they fire off the decision. 
+ */ +#include +#include +#include + +#include "heartbeat.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_QUORUM +#include "masklog.h" +#include "quorum.h" + +static struct o2quo_state { + spinlock_t qs_lock; + struct work_struct qs_work; + int qs_pending; + int qs_heartbeating; + unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_connected; + unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_holds; + unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +} o2quo_state; + +/* this is horribly heavy-handed. It should instead flip the file + * system RO and call some userspace script. */ +static void o2quo_fence_self(void) +{ + /* panic spins with interrupts enabled. with preempt + * threads can still schedule, etc, etc */ + o2hb_stop_all_regions(); + panic("ocfs2 is very sorry to be fencing this system by panicking\n"); +} + +/* Indicate that a timeout occurred on a heartbeat region write. The + * other nodes in the cluster may consider us dead at that time so we + * want to "fence" ourselves so that we don't scribble on the disk + * after they think they've recovered us. This can't solve all + * problems related to writeout after recovery but this hack can at + * least close some of those gaps. When we have real fencing, this can + * go away as our node would be fenced externally before other nodes + * begin recovery. */ +void o2quo_disk_timeout(void) +{ + o2quo_fence_self(); +} + +static void o2quo_make_decision(void *arg) +{ + int quorum; + int lowest_hb, lowest_reachable = 0, fence = 0; + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); + if (lowest_hb != O2NM_MAX_NODES) + lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); + + mlog(0, "heartbeating: %d, connected: %d, " + "lowest: %d (%sreachable)\n", qs->qs_heartbeating, + qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); + + if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || + qs->qs_heartbeating == 1) + goto out; + + if (qs->qs_heartbeating & 1) { + /* the odd numbered cluster case is straightforward -- + * if we can't talk to the majority we're hosed */ + quorum = (qs->qs_heartbeating + 1)/2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + } else { + /* the even numbered cluster adds the possibility of each half + * of the cluster being able to talk amongst themselves..
in + * that case we're hosed if we can't talk to the group that has + * the lowest numbered node */ + quorum = qs->qs_heartbeating / 2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + else if ((qs->qs_connected == quorum) && + !lowest_reachable) { + mlog(ML_ERROR, "fencing this node because it is " + "connected to a half-quorum of %u out of %u " + "nodes which doesn't include the lowest active " + "node %u\n", quorum, qs->qs_heartbeating, + lowest_hb); + fence = 1; + } + } + +out: + spin_unlock(&qs->qs_lock); + if (fence) + o2quo_fence_self(); +} + +static void o2quo_set_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (!test_and_set_bit(node, qs->qs_hold_bm)) { + qs->qs_holds++; + mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, + "node %u\n", node); + mlog(0, "node %u, %d total\n", node, qs->qs_holds); + } +} + +static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (test_and_clear_bit(node, qs->qs_hold_bm)) { + mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); + if (--qs->qs_holds == 0) { + if (qs->qs_pending) { + qs->qs_pending = 0; + schedule_work(&qs->qs_work); + } + } + mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", + node, qs->qs_holds); + } +} + +/* as a node comes up we delay the quorum decision until we know the fate of + * the connection. the hold will be dropped in conn_up or hb_down. it might be + * perpetuated by conn_err until hb_down. if we already have a conn, we might + * be dropping a hold that conn_up got. */ +void o2quo_hb_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating++; + mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); + set_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + if (!test_bit(node, qs->qs_conn_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* hb going down releases any holds we might have had due to this node from + * conn_up, conn_err, or hb_up */ +void o2quo_hb_down(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating--; + mlog_bug_on_msg(qs->qs_heartbeating < 0, + "node %u, %d heartbeating\n", + node, qs->qs_heartbeating); + mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); + clear_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* this tells us that we've decided that the node is still heartbeating + * even though we've lost its conn. it must only be called after conn_err + * and indicates that we must now make a quorum decision in the future, + * though we might be doing so after waiting for holds to drain. Here + * we'll be dropping the hold from conn_err. */ +void o2quo_hb_still_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + mlog(0, "node %u\n", node); + + qs->qs_pending = 1; + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* This is analogous to hb_up. as a node's connection comes up we delay the + * quorum decision until we see it heartbeating.
the hold will be dropped in + * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if + * it's already heartbeating we might be dropping a hold that conn_up got. + * */ +void o2quo_conn_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_connected++; + mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); + set_bit(node, qs->qs_conn_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (!test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* we've decided that we won't ever be connecting to the node again. if it's + * still heartbeating we grab a hold that will delay decisions until either the + * node stops heartbeating from hb_down or the caller decides that the node is + * still up and calls still_up */ +void o2quo_conn_err(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + if (test_bit(node, qs->qs_conn_bm)) { + qs->qs_connected--; + mlog_bug_on_msg(qs->qs_connected < 0, + "node %u, connected %d\n", + node, qs->qs_connected); + + clear_bit(node, qs->qs_conn_bm); + } + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +void o2quo_init(void) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock_init(&qs->qs_lock); + INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL); +} + +void o2quo_exit(void) +{ + flush_scheduled_work(); +} diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h new file mode 100644 index 000000000000..6649cc6f67c9 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_QUORUM_H +#define O2CLUSTER_QUORUM_H + +void o2quo_init(void); +void o2quo_exit(void); + +void o2quo_hb_up(u8 node); +void o2quo_hb_down(u8 node); +void o2quo_hb_still_up(u8 node); +void o2quo_conn_up(u8 node); +void o2quo_conn_err(u8 node); +void o2quo_disk_timeout(void); + +#endif /* O2CLUSTER_QUORUM_H */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c new file mode 100644 index 000000000000..f1e99461bb0d --- /dev/null +++ b/fs/ocfs2/cluster/sys.c @@ -0,0 +1,124 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.c + * + * OCFS2 cluster sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include + +#include "ocfs2_nodemanager.h" +#include "masklog.h" +#include "sys.h" + +struct o2cb_attribute { + struct attribute attr; + ssize_t (*show)(char *buf); + ssize_t (*store)(const char *buf, size_t count); +}; + +#define O2CB_ATTR(_name, _mode, _show, _store) \ +struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store) + +#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset) +#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr) + +static ssize_t o2cb_interface_revision_show(char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); +} + +O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); + +static struct attribute *o2cb_attrs[] = { + &o2cb_attr_interface_revision.attr, + NULL, +}; + +static ssize_t +o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); +static ssize_t +o2cb_store(struct kobject * kobj, struct attribute * attr, + const char * buffer, size_t count); +static struct sysfs_ops o2cb_sysfs_ops = { + .show = o2cb_show, + .store = o2cb_store, +}; + +static struct kobj_type o2cb_subsys_type = { + .default_attrs = o2cb_attrs, + .sysfs_ops = &o2cb_sysfs_ops, +}; + +/* gives us o2cb_subsys */ +decl_subsys(o2cb, NULL, NULL); + +static ssize_t +o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) +{ + struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); + struct subsystem *sbs = to_o2cb_subsys(kobj); + + BUG_ON(sbs != &o2cb_subsys); + + if (o2cb_attr->show) + return o2cb_attr->show(buffer); + return -EIO; +} + +static ssize_t +o2cb_store(struct kobject * kobj, struct attribute * attr, + const char * buffer, size_t count) +{ + struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); + struct subsystem *sbs = to_o2cb_subsys(kobj); + + BUG_ON(sbs != &o2cb_subsys); + + if (o2cb_attr->store) + return o2cb_attr->store(buffer, count); + return -EIO; +} + +void o2cb_sys_shutdown(void) +{ + mlog_sys_shutdown(); + subsystem_unregister(&o2cb_subsys); +} + +int o2cb_sys_init(void) +{ + int ret; + + o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type; + ret = subsystem_register(&o2cb_subsys); + if (ret) + return ret; + + ret = mlog_sys_init(&o2cb_subsys); + if (ret) + subsystem_unregister(&o2cb_subsys); + return ret; +} diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h new file mode 100644 index 000000000000..d66b8ab0045e --- /dev/null +++ b/fs/ocfs2/cluster/sys.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.h + * + * Function prototypes for o2cb sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_SYS_H +#define O2CLUSTER_SYS_H + +void o2cb_sys_shutdown(void); +int o2cb_sys_init(void); + +#endif /* O2CLUSTER_SYS_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c new file mode 100644 index 000000000000..35d92c01a972 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.c @@ -0,0 +1,1829 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * ---- + * + * Callers for this were originally written against a very simple synchronous + * API. This implementation reflects those simple callers. Some day I'm sure + * we'll need to move to a more robust posting/callback mechanism. + * + * Transmit calls pass in kernel virtual addresses and block copying this into + * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting + * for a failed socket to time out. TX callers can also pass in a pointer to an + * 'int' which gets filled with an errno off the wire in response to the + * message they send. + * + * Handlers for unsolicited messages are registered. Each socket has a page + * that incoming data is copied into. First the header, then the data. + * Handlers are called from only one thread with a reference to this per-socket + * page. This page is destroyed after the handler call, so it can't be + * referenced beyond the call. Handlers may block but are discouraged from + * doing so. + * + * Any framing errors (bad magic, large payload lengths) close a connection. + * + * Our sock_container holds the state we associate with a socket. Its current + * framing state is held there as well as the refcounting we do around when it + * is safe to tear down the socket. The socket is only finally torn down from + * the container when the container loses all of its references -- so as long + * as you hold a ref on the container you can trust that the socket is valid + * for use with kernel socket APIs.
+ * + * Connections are initiated between a pair of nodes when the node with the + * higher node number gets a heartbeat callback which indicates that the lower + * numbered node has started heartbeating. The lower numbered node is passive + * and only accepts the connection if the higher numbered node is heartbeating. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" +#include "quorum.h" + +#include "tcp_internal.h" + +/* + * The linux network stack isn't sparse endian clean.. It has macros like + * ntohs() which perform the endian checks and structs like sockaddr_in + * which aren't annotated. So __force is found here to get the build + * clean. When they emerge from the dark ages and annotate the code + * we can remove these. + */ + +#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" +#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ + NIPQUAD(sc->sc_node->nd_ipv4_address), \ + ntohs(sc->sc_node->nd_ipv4_port) + +/* + * In the following two log macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define msglog(hdr, fmt, args...) do { \ + typeof(hdr) __hdr = (hdr); \ + mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ + "key %08x num %u] " fmt, \ + be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ + be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ + be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ + be32_to_cpu(__hdr->msg_num) , ##args); \ +} while (0) + +#define sclog(sc, fmt, args...) do { \ + typeof(sc) __sc = (sc); \ + mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ + "pg_off %zu] " fmt, __sc, \ + atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ + ##args); \ +} while (0) + +static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; +static struct rb_root o2net_handler_tree = RB_ROOT; + +static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; + +/* XXX someday we'll need better accounting */ +static struct socket *o2net_listen_sock = NULL; + +/* + * listen work is only queued by the listening socket callbacks on the + * o2net_wq. teardown detaches the callbacks before destroying the workqueue. + * quorum work is queued as sock containers are shutdown.. stop_listening + * tears down all the node's sock containers, preventing future shutdowns + * and queued quroum work, before canceling delayed quorum work and + * destroying the work queue. 
+ */ +static struct workqueue_struct *o2net_wq; +static struct work_struct o2net_listen_work; + +static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; +#define O2NET_HB_PRI 0x1 + +static struct o2net_handshake *o2net_hand; +static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; + +static int o2net_sys_err_translations[O2NET_ERR_MAX] = + {[O2NET_ERR_NONE] = 0, + [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, + [O2NET_ERR_OVERFLOW] = -EOVERFLOW, + [O2NET_ERR_DIED] = -EHOSTDOWN,}; + +/* can't quite avoid *all* internal declarations :/ */ +static void o2net_sc_connect_completed(void *arg); +static void o2net_rx_until_empty(void *arg); +static void o2net_shutdown_sc(void *arg); +static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_sc_send_keep_req(void *arg); +static void o2net_idle_timer(unsigned long data); +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); + +static inline int o2net_sys_err_to_errno(enum o2net_system_error err) +{ + int trans; + BUG_ON(err >= O2NET_ERR_MAX); + trans = o2net_sys_err_translations[err]; + + /* Just in case we mess up the translation table above */ + BUG_ON(err != O2NET_ERR_NONE && trans == 0); + return trans; +} + +static struct o2net_node * o2net_nn_from_num(u8 node_num) +{ + BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); + return &o2net_nodes[node_num]; +} + +static u8 o2net_num_from_nn(struct o2net_node *nn) +{ + BUG_ON(nn == NULL); + return nn - o2net_nodes; +} + +/* ------------------------------------------------------------ */ + +static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +{ + int ret = 0; + + do { + if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { + ret = -EAGAIN; + break; + } + spin_lock(&nn->nn_lock); + ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); + if (ret == 0) + list_add_tail(&nsw->ns_node_item, + &nn->nn_status_list); + spin_unlock(&nn->nn_lock); + } while (ret == -EAGAIN); + + if (ret == 0) { + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + } + + return ret; +} + +static void o2net_complete_nsw_locked(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + assert_spin_locked(&nn->nn_lock); + + if (!list_empty(&nsw->ns_node_item)) { + list_del_init(&nsw->ns_node_item); + nsw->ns_sys_status = sys_status; + nsw->ns_status = status; + idr_remove(&nn->nn_status_idr, nsw->ns_id); + wake_up(&nsw->ns_wq); + } +} + +static void o2net_complete_nsw(struct o2net_node *nn, + struct o2net_status_wait *nsw, + u64 id, enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + if (nsw == NULL) { + if (id > INT_MAX) + goto out; + + nsw = idr_find(&nn->nn_status_idr, id); + if (nsw == NULL) + goto out; + } + + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + +out: + spin_unlock(&nn->nn_lock); + return; +} + +static void o2net_complete_nodes_nsw(struct o2net_node *nn) +{ + struct list_head *iter, *tmp; + unsigned int num_kills = 0; + struct o2net_status_wait *nsw; + + assert_spin_locked(&nn->nn_lock); + + list_for_each_safe(iter, tmp, &nn->nn_status_list) { + nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); + o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); + num_kills++; + } + + mlog(0, "completed %d messages for node %u\n", num_kills, + o2net_num_from_nn(nn)); +} + +static int o2net_nsw_completed(struct o2net_node *nn, + struct o2net_status_wait *nsw) +{ + int completed; + spin_lock(&nn->nn_lock); + 
completed = list_empty(&nsw->ns_node_item); + spin_unlock(&nn->nn_lock); + return completed; +} + +/* ------------------------------------------------------------ */ + +static void sc_kref_release(struct kref *kref) +{ + struct o2net_sock_container *sc = container_of(kref, + struct o2net_sock_container, sc_kref); + sclog(sc, "releasing\n"); + + if (sc->sc_sock) { + sock_release(sc->sc_sock); + sc->sc_sock = NULL; + } + + o2nm_node_put(sc->sc_node); + sc->sc_node = NULL; + + kfree(sc); +} + +static void sc_put(struct o2net_sock_container *sc) +{ + sclog(sc, "put\n"); + kref_put(&sc->sc_kref, sc_kref_release); +} +static void sc_get(struct o2net_sock_container *sc) +{ + sclog(sc, "get\n"); + kref_get(&sc->sc_kref); +} +static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) +{ + struct o2net_sock_container *sc, *ret = NULL; + struct page *page = NULL; + + page = alloc_page(GFP_NOFS); + sc = kcalloc(1, sizeof(*sc), GFP_NOFS); + if (sc == NULL || page == NULL) + goto out; + + kref_init(&sc->sc_kref); + o2nm_node_get(node); + sc->sc_node = node; + + INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc); + INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc); + INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc); + INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc); + + init_timer(&sc->sc_idle_timeout); + sc->sc_idle_timeout.function = o2net_idle_timer; + sc->sc_idle_timeout.data = (unsigned long)sc; + + sclog(sc, "alloced\n"); + + ret = sc; + sc->sc_page = page; + sc = NULL; + page = NULL; + +out: + if (page) + __free_page(page); + kfree(sc); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static void o2net_sc_queue_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + sc_get(sc); + if (!queue_work(o2net_wq, work)) + sc_put(sc); +} +static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, + struct work_struct *work, + int delay) +{ + sc_get(sc); + if (!queue_delayed_work(o2net_wq, work, delay)) + sc_put(sc); +} +static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + if (cancel_delayed_work(work)) + sc_put(sc); +} + +static void o2net_set_nn_state(struct o2net_node *nn, + struct o2net_sock_container *sc, + unsigned valid, int err) +{ + int was_valid = nn->nn_sc_valid; + int was_err = nn->nn_persistent_error; + struct o2net_sock_container *old_sc = nn->nn_sc; + + assert_spin_locked(&nn->nn_lock); + + /* the node num comparison and single connect/accept path should stop + * an non-null sc from being overwritten with another */ + BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); + mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); + mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); + + /* we won't reconnect after our valid conn goes away for + * this hb iteration.. here so it shows up in the logs */ + if (was_valid && !valid && err == 0) + err = -ENOTCONN; + + mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", + o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, + nn->nn_persistent_error, err); + + nn->nn_sc = sc; + nn->nn_sc_valid = valid ? 
1 : 0; + nn->nn_persistent_error = err; + + /* mirrors o2net_tx_can_proceed() */ + if (nn->nn_persistent_error || nn->nn_sc_valid) + wake_up(&nn->nn_sc_wq); + + if (!was_err && nn->nn_persistent_error) { + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + } + + if (was_valid && !valid) { + mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n", + SC_NODEF_ARGS(old_sc)); + o2net_complete_nodes_nsw(nn); + } + + if (!was_valid && valid) { + o2quo_conn_up(o2net_num_from_nn(nn)); + /* this is a bit of a hack. we only try reconnecting + * when heartbeating starts until we get a connection. + * if that connection then dies we don't try reconnecting. + * the only way to start connecting again is to down + * heartbeat and bring it back up. */ + cancel_delayed_work(&nn->nn_connect_expired); + mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "connected to" : "accepted connection from", + SC_NODEF_ARGS(sc)); + } + + /* trigger the connecting worker func as long as we're not valid, + * it will back off if it shouldn't connect. This can be called + * from node config teardown and so needs to be careful about + * the work queue actually being up. */ + if (!valid && o2net_wq) { + unsigned long delay; + /* delay if we're within a RECONNECT_DELAY of the + * last attempt */ + delay = (nn->nn_last_connect_attempt + + msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + - jiffies; + if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + delay = 0; + mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); + queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); + } + + /* keep track of the nn's sc ref for the caller */ + if ((old_sc == NULL) && sc) + sc_get(sc); + if (old_sc && (old_sc != sc)) { + o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); + sc_put(old_sc); + } +} + +/* see o2net_register_callbacks() */ +static void o2net_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + if (sk->sk_user_data) { + struct o2net_sock_container *sc = sk->sk_user_data; + sclog(sc, "data_ready hit\n"); + do_gettimeofday(&sc->sc_tv_data_ready); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + ready = sc->sc_data_ready; + } else { + ready = sk->sk_data_ready; + } + read_unlock(&sk->sk_callback_lock); + + ready(sk, bytes); +} + +/* see o2net_register_callbacks() */ +static void o2net_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct o2net_sock_container *sc; + + read_lock(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc == NULL) { + state_change = sk->sk_state_change; + goto out; + } + + sclog(sc, "state_change to %d\n", sk->sk_state); + + state_change = sc->sc_state_change; + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +/* + * we register callbacks so we can queue work on events before calling + * the original callbacks. our callbacks are careful to test user_data + * to discover when they've raced with o2net_unregister_callbacks().
+ */ +static void o2net_register_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + write_lock_bh(&sk->sk_callback_lock); + + /* accepted sockets inherit the old listen socket data ready */ + if (sk->sk_data_ready == o2net_listen_data_ready) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + + BUG_ON(sk->sk_user_data != NULL); + sk->sk_user_data = sc; + sc_get(sc); + + sc->sc_data_ready = sk->sk_data_ready; + sc->sc_state_change = sk->sk_state_change; + sk->sk_data_ready = o2net_data_ready; + sk->sk_state_change = o2net_state_change; + + write_unlock_bh(&sk->sk_callback_lock); +} + +static int o2net_unregister_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + int ret = 0; + + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data == sc) { + ret = 1; + sk->sk_user_data = NULL; + sk->sk_data_ready = sc->sc_data_ready; + sk->sk_state_change = sc->sc_state_change; + } + write_unlock_bh(&sk->sk_callback_lock); + + return ret; +} + +/* + * this is a little helper that is called by callers who have seen a problem + * with an sc and want to detach it from the nn if someone already hasn't beat + * them to it. if an error is given then the shutdown will be persistent + * and pending transmits will be canceled. + */ +static void o2net_ensure_shutdown(struct o2net_node *nn, + struct o2net_sock_container *sc, + int err) +{ + spin_lock(&nn->nn_lock); + if (nn->nn_sc == sc) + o2net_set_nn_state(nn, NULL, 0, err); + spin_unlock(&nn->nn_lock); +} + +/* + * This work queue function performs the blocking parts of socket shutdown. A + * few paths lead here. set_nn_state will trigger this callback if it sees an + * sc detached from the nn. state_change will also trigger this callback + * directly when it sees errors. In that case we need to call set_nn_state + * ourselves as state_change couldn't get the nn_lock and call set_nn_state + * itself. 
+ */ +static void o2net_shutdown_sc(void *arg) +{ + struct o2net_sock_container *sc = arg; + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + sclog(sc, "shutting down\n"); + + /* drop the callbacks ref and call shutdown only once */ + if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { + /* we shouldn't flush as we're in the thread, the + * races with pending sc work structs are harmless */ + del_timer_sync(&sc->sc_idle_timeout); + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + sc_put(sc); + sc->sc_sock->ops->shutdown(sc->sc_sock, + RCV_SHUTDOWN|SEND_SHUTDOWN); + } + + /* not fatal so failed connects before the other guy has our + * heartbeat can be retried */ + o2net_ensure_shutdown(nn, sc, 0); + sc_put(sc); +} + +/* ------------------------------------------------------------ */ + +static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, + u32 key) +{ + int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); + + if (ret == 0) + ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); + + return ret; +} + +static struct o2net_msg_handler * +o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; + struct o2net_msg_handler *nmh, *ret = NULL; + int cmp; + + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + cmp = o2net_handler_cmp(nmh, msg_type, key); + + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = nmh; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +static void o2net_handler_kref_release(struct kref *kref) +{ + struct o2net_msg_handler *nmh; + nmh = container_of(kref, struct o2net_msg_handler, nh_kref); + + kfree(nmh); +} + +static void o2net_handler_put(struct o2net_msg_handler *nmh) +{ + kref_put(&nmh->nh_kref, o2net_handler_kref_release); +} + +/* max_len is protection for the handler func. incoming messages won't + * be given to the handler if their payload is longer than the max. */ +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + struct list_head *unreg_list) +{ + struct o2net_msg_handler *nmh = NULL; + struct rb_node **p, *parent; + int ret = 0; + + if (max_len > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "max_len for message handler out of range: %u\n", + max_len); + ret = -EINVAL; + goto out; + } + + if (!msg_type) { + mlog(0, "no message type provided: %u, %p\n", msg_type, func); + ret = -EINVAL; + goto out; + + } + if (!func) { + mlog(0, "no message handler provided: %u, %p\n", + msg_type, func); + ret = -EINVAL; + goto out; + } + + nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS); + if (nmh == NULL) { + ret = -ENOMEM; + goto out; + } + + nmh->nh_func = func; + nmh->nh_func_data = data; + nmh->nh_msg_type = msg_type; + nmh->nh_max_len = max_len; + nmh->nh_key = key; + /* the tree and list get this ref.. 
they're both removed in + * unregister when this ref is dropped */ + kref_init(&nmh->nh_kref); + INIT_LIST_HEAD(&nmh->nh_unregister_item); + + write_lock(&o2net_handler_lock); + if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&nmh->nh_node, parent, p); + rb_insert_color(&nmh->nh_node, &o2net_handler_tree); + list_add_tail(&nmh->nh_unregister_item, unreg_list); + + mlog(ML_TCP, "registered handler func %p type %u key %08x\n", + func, msg_type, key); + /* we've had some trouble with handlers seemingly vanishing. */ + mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, + &parent) == NULL, + "couldn't find handler we *just* registerd " + "for type %u key %08x\n", msg_type, key); + } + write_unlock(&o2net_handler_lock); + if (ret) + goto out; + +out: + if (ret) + kfree(nmh); + + return ret; +} +EXPORT_SYMBOL_GPL(o2net_register_handler); + +void o2net_unregister_handler_list(struct list_head *list) +{ + struct list_head *pos, *n; + struct o2net_msg_handler *nmh; + + write_lock(&o2net_handler_lock); + list_for_each_safe(pos, n, list) { + nmh = list_entry(pos, struct o2net_msg_handler, + nh_unregister_item); + mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", + nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); + rb_erase(&nmh->nh_node, &o2net_handler_tree); + list_del_init(&nmh->nh_unregister_item); + kref_put(&nmh->nh_kref, o2net_handler_kref_release); + } + write_unlock(&o2net_handler_lock); +} +EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); + +static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) +{ + struct o2net_msg_handler *nmh; + + read_lock(&o2net_handler_lock); + nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); + if (nmh) + kref_get(&nmh->nh_kref); + read_unlock(&o2net_handler_lock); + + return nmh; +} + +/* ------------------------------------------------------------ */ + +static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) +{ + int ret; + mm_segment_t oldfs; + struct kvec vec = { + .iov_len = len, + .iov_base = data, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&vec, + .msg_flags = MSG_DONTWAIT, + }; + + oldfs = get_fs(); + set_fs(get_ds()); + ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); + set_fs(oldfs); + + return ret; +} + +static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, + size_t veclen, size_t total) +{ + int ret; + mm_segment_t oldfs; + struct msghdr msg = { + .msg_iov = (struct iovec *)vec, + .msg_iovlen = veclen, + }; + + if (sock == NULL) { + ret = -EINVAL; + goto out; + } + + oldfs = get_fs(); + set_fs(get_ds()); + ret = sock_sendmsg(sock, &msg, total); + set_fs(oldfs); + if (ret != total) { + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, + total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ + goto out; + } + + ret = 0; +out: + if (ret < 0) + mlog(0, "returning error: %d\n", ret); + return ret; +} + +static void o2net_sendpage(struct o2net_sock_container *sc, + void *kmalloced_virt, + size_t size) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + ssize_t ret; + + + ret = sc->sc_sock->ops->sendpage(sc->sc_sock, + virt_to_page(kmalloced_virt), + (long)kmalloced_virt & ~PAGE_MASK, + size, MSG_DONTWAIT); + if (ret != size) { + mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT + " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); + o2net_ensure_shutdown(nn, sc, 0); + } +} + +static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, 
u16 msg_type, u32 key) +{ + memset(msg, 0, sizeof(struct o2net_msg)); + msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); + msg->data_len = cpu_to_be16(data_len); + msg->msg_type = cpu_to_be16(msg_type); + msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); + msg->status = 0; + msg->key = cpu_to_be32(key); +} + +static int o2net_tx_can_proceed(struct o2net_node *nn, + struct o2net_sock_container **sc_ret, + int *error) +{ + int ret = 0; + + spin_lock(&nn->nn_lock); + if (nn->nn_persistent_error) { + ret = 1; + *sc_ret = NULL; + *error = nn->nn_persistent_error; + } else if (nn->nn_sc_valid) { + kref_get(&nn->nn_sc->sc_kref); + + ret = 1; + *sc_ret = nn->nn_sc; + *error = 0; + } + spin_unlock(&nn->nn_lock); + + return ret; +} + +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, + size_t caller_veclen, u8 target_node, int *status) +{ + int ret, error = 0; + struct o2net_msg *msg = NULL; + size_t veclen, caller_bytes = 0; + struct kvec *vec = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn = o2net_nn_from_num(target_node); + struct o2net_status_wait nsw = { + .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), + }; + + if (o2net_wq == NULL) { + mlog(0, "attempt to tx without o2netd running\n"); + ret = -ESRCH; + goto out; + } + + if (caller_veclen == 0) { + mlog(0, "bad kvec array length\n"); + ret = -EINVAL; + goto out; + } + + caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); + if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "total payload len %zu too large\n", caller_bytes); + ret = -EINVAL; + goto out; + } + + if (target_node == o2nm_this_node()) { + ret = -ELOOP; + goto out; + } + + ret = wait_event_interruptible(nn->nn_sc_wq, + o2net_tx_can_proceed(nn, &sc, &error)); + if (!ret && error) + ret = error; + if (ret) + goto out; + + veclen = caller_veclen + 1; + vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + if (vec == NULL) { + mlog(0, "failed to %zu element kvec!\n", veclen); + ret = -ENOMEM; + goto out; + } + + msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); + if (!msg) { + mlog(0, "failed to allocate a o2net_msg!\n"); + ret = -ENOMEM; + goto out; + } + + o2net_init_msg(msg, caller_bytes, msg_type, key); + + vec[0].iov_len = sizeof(struct o2net_msg); + vec[0].iov_base = msg; + memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); + + ret = o2net_prep_nsw(nn, &nsw); + if (ret) + goto out; + + msg->msg_num = cpu_to_be32(nsw.ns_id); + + /* finally, convert the message header to network byte-order + * and send */ + ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, + sizeof(struct o2net_msg) + caller_bytes); + msglog(msg, "sending returned %d\n", ret); + if (ret < 0) { + mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); + goto out; + } + + /* wait on other node's handler */ + wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + + /* Note that we avoid overwriting the callers status return + * variable if a system error was reported on the other + * side. Callers beware. 
+	 */
+	ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
+	if (status && !ret)
+		*status = nsw.ns_status;
+
+	mlog(0, "woken, returning system status %d, user status %d\n",
+	     ret, nsw.ns_status);
+out:
+	if (sc)
+		sc_put(sc);
+	if (vec)
+		kfree(vec);
+	if (msg)
+		kfree(msg);
+	o2net_complete_nsw(nn, &nsw, 0, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_send_message_vec);
+
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status)
+{
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+	return o2net_send_message_vec(msg_type, key, &vec, 1,
+				      target_node, status);
+}
+EXPORT_SYMBOL_GPL(o2net_send_message);
+
+static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
+				   enum o2net_system_error syserr, int err)
+{
+	struct kvec vec = {
+		.iov_base = hdr,
+		.iov_len = sizeof(struct o2net_msg),
+	};
+
+	BUG_ON(syserr >= O2NET_ERR_MAX);
+
+	/* leave other fields intact from the incoming message, msg_num
+	 * in particular */
+	hdr->sys_status = cpu_to_be32(syserr);
+	hdr->status = cpu_to_be32(err);
+	hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC);  // twiddle the magic
+	hdr->data_len = 0;
+
+	msglog(hdr, "about to send status magic %d\n", err);
+	/* hdr has been in host byteorder this whole time */
+	return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
+}
+
+/* this returns -errno if the header was unknown or too large, etc.
+ * after this is called the buffer is reused for the next message */
+static int o2net_process_message(struct o2net_sock_container *sc,
+				 struct o2net_msg *hdr)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	int ret = 0, handler_status;
+	enum o2net_system_error syserr;
+	struct o2net_msg_handler *nmh = NULL;
+
+	msglog(hdr, "processing message\n");
+
+	o2net_sc_postpone_idle(sc);
+
+	switch(be16_to_cpu(hdr->magic)) {
+		case O2NET_MSG_STATUS_MAGIC:
+			/* special type for returning message status */
+			o2net_complete_nsw(nn, NULL,
+					   be32_to_cpu(hdr->msg_num),
+					   be32_to_cpu(hdr->sys_status),
+					   be32_to_cpu(hdr->status));
+			goto out;
+		case O2NET_MSG_KEEP_REQ_MAGIC:
+			o2net_sendpage(sc, o2net_keep_resp,
+				       sizeof(*o2net_keep_resp));
+			goto out;
+		case O2NET_MSG_KEEP_RESP_MAGIC:
+			goto out;
+		case O2NET_MSG_MAGIC:
+			break;
+		default:
+			msglog(hdr, "bad magic\n");
+			ret = -EINVAL;
+			goto out;
+			break;
+	}
+
+	/* find a handler for it */
+	handler_status = 0;
+	nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
+				be32_to_cpu(hdr->key));
+	if (!nmh) {
+		mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
+		     be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
+		syserr = O2NET_ERR_NO_HNDLR;
+		goto out_respond;
+	}
+
+	syserr = O2NET_ERR_NONE;
+
+	if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
+		syserr = O2NET_ERR_OVERFLOW;
+
+	if (syserr != O2NET_ERR_NONE)
+		goto out_respond;
+
+	do_gettimeofday(&sc->sc_tv_func_start);
+	sc->sc_msg_key = be32_to_cpu(hdr->key);
+	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
+	handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
+					be16_to_cpu(hdr->data_len),
+					nmh->nh_func_data);
+	do_gettimeofday(&sc->sc_tv_func_stop);
+
+out_respond:
+	/* this destroys the hdr, so don't use it after this */
+	ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
+				      handler_status);
+	hdr = NULL;
+	mlog(0, "sending handler status %d, syserr %d returned %d\n",
+	     handler_status, syserr, ret);
+
+out:
+	if (nmh)
+		o2net_handler_put(nmh);
+	return ret;
+}
+
+static int o2net_check_handshake(struct o2net_sock_container *sc)
+{
+	struct o2net_handshake *hand = page_address(sc->sc_page);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
+		     "version %llu but %llu is required, disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     (unsigned long long)be64_to_cpu(hand->protocol_version),
+		     O2NET_PROTOCOL_VERSION);
+
+		/* don't bother reconnecting if it's the wrong version. */
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+
+	sc->sc_handshake_ok = 1;
+
+	spin_lock(&nn->nn_lock);
+	/* set valid and queue the idle timers only if it hasn't been
+	 * shut down already */
+	if (nn->nn_sc == sc) {
+		o2net_sc_postpone_idle(sc);
+		o2net_set_nn_state(nn, sc, 1, 0);
+	}
+	spin_unlock(&nn->nn_lock);
+
+	/* shift everything up as though it wasn't there */
+	sc->sc_page_off -= sizeof(struct o2net_handshake);
+	if (sc->sc_page_off)
+		memmove(hand, hand + 1, sc->sc_page_off);
+
+	return 0;
+}
+
+/* this demuxes the queued rx bytes into header or payload bits and calls
+ * handlers as each full message is read off the socket.  it returns -error,
+ * == 0 eof, or > 0 for progress made. */
+static int o2net_advance_rx(struct o2net_sock_container *sc)
+{
+	struct o2net_msg *hdr;
+	int ret = 0;
+	void *data;
+	size_t datalen;
+
+	sclog(sc, "receiving\n");
+	do_gettimeofday(&sc->sc_tv_advance_start);
+
+	/* do we need more header? */
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0) {
+			sc->sc_page_off += ret;
+
+			/* this working relies on the handshake being
+			 * smaller than the normal message header */
+			if (sc->sc_page_off >= sizeof(struct o2net_handshake) &&
+			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
+				ret = -EPROTO;
+				goto out;
+			}
+
+			/* only swab incoming here.. we can
+			 * only get here once as we cross from
+			 * being under to over */
+			if (sc->sc_page_off == sizeof(struct o2net_msg)) {
+				hdr = page_address(sc->sc_page);
+				if (be16_to_cpu(hdr->data_len) >
+				    O2NET_MAX_PAYLOAD_BYTES)
+					ret = -EOVERFLOW;
+			}
+		}
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		/* oof, still don't have a header */
+		goto out;
+	}
+
+	/* this was swabbed above when we first read it */
+	hdr = page_address(sc->sc_page);
+
+	msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
+
+	/* do we need more payload? */
+	if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
+		/* need more payload */
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
+			  sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0)
+			sc->sc_page_off += ret;
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
+		/* we can only get here once, the first time we read
+		 * the payload.. so set ret to progress if the handler
+		 * works out.  after calling this the message is toast */
+		ret = o2net_process_message(sc, hdr);
+		if (ret == 0)
+			ret = 1;
+		sc->sc_page_off = 0;
+	}
+
+out:
+	sclog(sc, "ret = %d\n", ret);
+	do_gettimeofday(&sc->sc_tv_advance_stop);
+	return ret;
+}
+
+/* this work func is triggered by data ready.  it reads until it can read no
+ * more.  it interprets 0, eof, as fatal.
if data_ready hits while we're doing + * our work the work struct will be marked and we'll be called again. */ +static void o2net_rx_until_empty(void *arg) +{ + struct o2net_sock_container *sc = arg; + int ret; + + do { + ret = o2net_advance_rx(sc); + } while (ret > 0); + + if (ret <= 0 && ret != -EAGAIN) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + sclog(sc, "saw error %d, closing\n", ret); + /* not permanent so read failed handshake can retry */ + o2net_ensure_shutdown(nn, sc, 0); + } + + sc_put(sc); +} + +static int o2net_set_nodelay(struct socket *sock) +{ + int ret, val = 1; + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Dear unsuspecting programmer, + * + * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level + * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will + * silently turn into SO_DEBUG. + * + * Yours, + * Keeper of hilariously fragile interfaces. + */ + ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); + + set_fs(oldfs); + return ret; +} + +/* ------------------------------------------------------------ */ + +/* called when a connect completes and after a sock is accepted. the + * rx path will see the response and mark the sc valid */ +static void o2net_sc_connect_completed(void *arg) +{ + struct o2net_sock_container *sc = arg; + + mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", + (unsigned long long)O2NET_PROTOCOL_VERSION, + (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + sc_put(sc); +} + +/* this is called as a work_struct func. */ +static void o2net_sc_send_keep_req(void *arg) +{ + struct o2net_sock_container *sc = arg; + + o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); + sc_put(sc); +} + +/* socket shutdown does a del_timer_sync against this as it tears down. + * we can't start this timer until we've got to the point in sc buildup + * where shutdown is going to be involved */ +static void o2net_idle_timer(unsigned long data) +{ + struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct timeval now; + + do_gettimeofday(&now); + + mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 " + "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); + mlog(ML_NOTICE, "here are some times that might help debug the " + "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " + "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", + sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, + now.tv_sec, now.tv_usec, + sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, + sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, + sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, + sc->sc_msg_key, sc->sc_msg_type, + sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, + sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); + + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, + O2NET_KEEPALIVE_DELAY_SECS * HZ); + do_gettimeofday(&sc->sc_tv_timer); + mod_timer(&sc->sc_idle_timeout, + jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ)); +} + +/* this work func is kicked whenever a path sets the nn state which doesn't + * have valid set. This includes seeing hb come up, losing a connection, + * having a connect attempt fail, etc. 
This centralizes the logic which decides + * if a connect attempt should be made or if we should give up and all future + * transmit attempts should fail */ +static void o2net_start_connect(void *arg) +{ + struct o2net_node *nn = arg; + struct o2net_sock_container *sc = NULL; + struct o2nm_node *node = NULL; + struct socket *sock = NULL; + struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; + int ret = 0; + + /* if we're greater we initiate tx, otherwise we accept */ + if (o2nm_this_node() <= o2net_num_from_nn(nn)) + goto out; + + /* watch for racing with tearing a node down */ + node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); + if (node == NULL) { + ret = 0; + goto out; + } + + spin_lock(&nn->nn_lock); + /* see if we already have one pending or have given up */ + if (nn->nn_sc || nn->nn_persistent_error) + arg = NULL; + spin_unlock(&nn->nn_lock); + if (arg == NULL) /* *shrug*, needed some indicator */ + goto out; + + nn->nn_last_connect_attempt = jiffies; + + sc = sc_alloc(node); + if (sc == NULL) { + mlog(0, "couldn't allocate sc\n"); + ret = -ENOMEM; + goto out; + } + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(0, "can't create socket: %d\n", ret); + goto out; + } + sc->sc_sock = sock; /* freed by sc_kref_release */ + + sock->sk->sk_allocation = GFP_ATOMIC; + + myaddr.sin_family = AF_INET; + myaddr.sin_port = (__force u16)htons(0); /* any port */ + + ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + sizeof(myaddr)); + if (ret) { + mlog(0, "bind failed: %d\n", ret); + goto out; + } + + ret = o2net_set_nodelay(sc->sc_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + o2net_register_callbacks(sc->sc_sock->sk, sc); + + spin_lock(&nn->nn_lock); + /* handshake completion will set nn->nn_sc_valid */ + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + remoteaddr.sin_family = AF_INET; + remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address; + remoteaddr.sin_port = (__force u16)node->nd_ipv4_port; + + ret = sc->sc_sock->ops->connect(sc->sc_sock, + (struct sockaddr *)&remoteaddr, + sizeof(remoteaddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) + ret = 0; + +out: + if (ret) { + mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " + "with errno %d\n", SC_NODEF_ARGS(sc), ret); + /* 0 err so that another will be queued and attempted + * from set_nn_state */ + if (sc) + o2net_ensure_shutdown(nn, sc, 0); + } + if (sc) + sc_put(sc); + if (node) + o2nm_node_put(node); + + return; +} + +static void o2net_connect_expired(void *arg) +{ + struct o2net_node *nn = arg; + + spin_lock(&nn->nn_lock); + if (!nn->nn_sc_valid) { + mlog(ML_ERROR, "no connection established with node %u after " + "%u seconds, giving up and returning errors.\n", + o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS); + + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + } + spin_unlock(&nn->nn_lock); +} + +static void o2net_still_up(void *arg) +{ + struct o2net_node *nn = arg; + + o2quo_hb_still_up(o2net_num_from_nn(nn)); +} + +/* ------------------------------------------------------------ */ + +void o2net_disconnect_node(struct o2nm_node *node) +{ + struct o2net_node *nn = o2net_nn_from_num(node->nd_num); + + /* don't reconnect until it's heartbeating again */ + spin_lock(&nn->nn_lock); + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + spin_unlock(&nn->nn_lock); + + if (o2net_wq) { + cancel_delayed_work(&nn->nn_connect_expired); + cancel_delayed_work(&nn->nn_connect_work); + 
cancel_delayed_work(&nn->nn_still_up);
+		flush_workqueue(o2net_wq);
+	}
+}
+
+static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
+				  void *data)
+{
+	o2quo_hb_down(node_num);
+
+	if (node_num != o2nm_this_node())
+		o2net_disconnect_node(node);
+}
+
+static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
+				void *data)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node_num);
+
+	o2quo_hb_up(node_num);
+
+	/* ensure an immediate connect attempt */
+	nn->nn_last_connect_attempt = jiffies -
+		(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
+
+	if (node_num != o2nm_this_node()) {
+		/* heartbeat doesn't work unless a local node number is
+		 * configured and doing so brings up the o2net_wq, so we can
+		 * use it.. */
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
+				   O2NET_IDLE_TIMEOUT_SECS * HZ);
+
+		/* believe it or not, accept and node heartbeating testing
+		 * can succeed for this node before we get here.. so
+		 * only use set_nn_state to clear the persistent error
+		 * if that hasn't already happened */
+		spin_lock(&nn->nn_lock);
+		if (nn->nn_persistent_error)
+			o2net_set_nn_state(nn, NULL, 0, 0);
+		spin_unlock(&nn->nn_lock);
+	}
+}
+
+void o2net_unregister_hb_callbacks(void)
+{
+	int ret;
+
+	ret = o2hb_unregister_callback(&o2net_hb_up);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+		     "callback!\n", ret);
+
+	ret = o2hb_unregister_callback(&o2net_hb_down);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+		     "callback!\n", ret);
+}
+
+int o2net_register_hb_callbacks(void)
+{
+	int ret;
+
+	o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
+			    o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
+	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
+			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
+
+	ret = o2hb_register_callback(&o2net_hb_up);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2net_hb_down);
+
+	if (ret)
+		o2net_unregister_hb_callbacks();
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_accept_one(struct socket *sock)
+{
+	int ret, slen;
+	struct sockaddr_in sin;
+	struct socket *new_sock = NULL;
+	struct o2nm_node *node = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn;
+
+	BUG_ON(sock == NULL);
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	new_sock->sk->sk_allocation = GFP_ATOMIC;
+
+	ret = o2net_set_nodelay(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	slen = sizeof(sin);
+	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
+				     &slen, 1);
+	if (ret < 0)
+		goto out;
+
+	node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+	if (node == NULL) {
+		mlog(ML_NOTICE, "attempt to connect from unknown node at "
+		     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (o2nm_this_node() > node->nd_num) {
+		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
+		     "numbered node '%s' at %u.%u.%u.%u:%d with num %u\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port), node->nd_num);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* this happens all the time when the other node sees our heartbeat
+	 * and
tries to connect before we see their heartbeat */ + if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { + mlog(ML_CONN, "attempt to connect from node '%s' at " + "%u.%u.%u.%u:%d but it isn't heartbeating\n", + node->nd_name, NIPQUAD(sin.sin_addr.s_addr), + ntohs((__force __be16)sin.sin_port)); + ret = -EINVAL; + goto out; + } + + nn = o2net_nn_from_num(node->nd_num); + + spin_lock(&nn->nn_lock); + if (nn->nn_sc) + ret = -EBUSY; + else + ret = 0; + spin_unlock(&nn->nn_lock); + if (ret) { + mlog(ML_NOTICE, "attempt to connect from node '%s' at " + "%u.%u.%u.%u:%d but it already has an open connection\n", + node->nd_name, NIPQUAD(sin.sin_addr.s_addr), + ntohs((__force __be16)sin.sin_port)); + goto out; + } + + sc = sc_alloc(node); + if (sc == NULL) { + ret = -ENOMEM; + goto out; + } + + sc->sc_sock = new_sock; + new_sock = NULL; + + spin_lock(&nn->nn_lock); + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + o2net_register_callbacks(sc->sc_sock->sk, sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + +out: + if (new_sock) + sock_release(new_sock); + if (node) + o2nm_node_put(node); + if (sc) + sc_put(sc); + return ret; +} + +static void o2net_accept_many(void *arg) +{ + struct socket *sock = arg; + while (o2net_accept_one(sock) == 0) + cond_resched(); +} + +static void o2net_listen_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (ready == NULL) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the acceptor has set up their + * data_ready.. we only want to queue listen work for our listening + * socket */ + if (sk->sk_state == TCP_LISTEN) { + mlog(ML_TCP, "bytes: %d\n", bytes); + queue_work(o2net_wq, &o2net_listen_work); + } + +out: + read_unlock(&sk->sk_callback_lock); + ready(sk, bytes); +} + +static int o2net_open_listening_sock(__be16 port) +{ + struct socket *sock = NULL; + int ret; + struct sockaddr_in sin = { + .sin_family = PF_INET, + .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, + .sin_port = (__force u16)port, + }; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); + goto out; + } + + sock->sk->sk_allocation = GFP_ATOMIC; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = o2net_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + o2net_listen_sock = sock; + INIT_WORK(&o2net_listen_work, o2net_accept_many, sock); + + sock->sk->sk_reuse = 1; + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + if (ret < 0) { + mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", + ntohs(port), ret); + goto out; + } + + ret = sock->ops->listen(sock, 64); + if (ret < 0) { + mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", + ntohs(port), ret); + } + +out: + if (ret) { + o2net_listen_sock = NULL; + if (sock) + sock_release(sock); + } + return ret; +} + +/* + * called from node manager when we should bring up our network listening + * socket. node manager handles all the serialization to only call this + * once and to match it with o2net_stop_listening(). note, + * o2nm_this_node() doesn't work yet as we're being called while it + * is being set up. 
+ */ +int o2net_start_listening(struct o2nm_node *node) +{ + int ret = 0; + + BUG_ON(o2net_wq != NULL); + BUG_ON(o2net_listen_sock != NULL); + + mlog(ML_KTHREAD, "starting o2net thread...\n"); + o2net_wq = create_singlethread_workqueue("o2net"); + if (o2net_wq == NULL) { + mlog(ML_ERROR, "unable to launch o2net thread\n"); + return -ENOMEM; /* ? */ + } + + ret = o2net_open_listening_sock(node->nd_ipv4_port); + if (ret) { + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + } else + o2quo_conn_up(node->nd_num); + + return ret; +} + +/* again, o2nm_this_node() doesn't work here as we're involved in + * tearing it down */ +void o2net_stop_listening(struct o2nm_node *node) +{ + struct socket *sock = o2net_listen_sock; + size_t i; + + BUG_ON(o2net_wq == NULL); + BUG_ON(o2net_listen_sock == NULL); + + /* stop the listening socket from generating work */ + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = sock->sk->sk_user_data; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2nm_node *node = o2nm_get_node_by_num(i); + if (node) { + o2net_disconnect_node(node); + o2nm_node_put(node); + } + } + + /* finish all work and tear down the work queue */ + mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + + sock_release(o2net_listen_sock); + o2net_listen_sock = NULL; + + o2quo_conn_err(node->nd_num); +} + +/* ------------------------------------------------------------ */ + +int o2net_init(void) +{ + unsigned long i; + + o2quo_init(); + + o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL); + o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); + o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + return -ENOMEM; + } + + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); + o2net_hand->connector_id = cpu_to_be64(1); + + o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); + o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2net_node *nn = o2net_nn_from_num(i); + + spin_lock_init(&nn->nn_lock); + INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn); + INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn); + INIT_WORK(&nn->nn_still_up, o2net_still_up, nn); + /* until we see hb from a node we'll return einval */ + nn->nn_persistent_error = -ENOTCONN; + init_waitqueue_head(&nn->nn_sc_wq); + idr_init(&nn->nn_status_idr); + INIT_LIST_HEAD(&nn->nn_status_list); + } + + return 0; +} + +void o2net_exit(void) +{ + o2quo_exit(); + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); +} diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h new file mode 100644 index 000000000000..a6f4585501c8 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.h @@ -0,0 +1,113 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * tcp.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_TCP_H +#define O2CLUSTER_TCP_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include +#endif +#include +#include + +struct o2net_msg +{ + __be16 magic; + __be16 data_len; + __be16 msg_type; + __be16 pad1; + __be32 sys_status; + __be32 status; + __be32 key; + __be32 msg_num; + __u8 buf[0]; +}; + +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); + +#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) + +/* TODO: figure this out.... */ +static inline int o2net_link_down(int err, struct socket *sock) +{ + if (sock) { + if (sock->sk->sk_state != TCP_ESTABLISHED && + sock->sk->sk_state != TCP_CLOSE_WAIT) + return 1; + } + + if (err >= 0) + return 0; + switch (err) { + /* ????????????????????????? */ + case -ERESTARTSYS: + case -EBADF: + /* When the server has died, an ICMP port unreachable + * message prompts ECONNREFUSED. */ + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + return 1; + } + return 0; +} + +enum { + O2NET_DRIVER_UNINITED, + O2NET_DRIVER_READY, +}; + +int o2net_init_tcp_sock(struct inode *inode); +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status); +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, + size_t veclen, u8 target_node, int *status); +int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, + struct inode *group); + +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + struct list_head *unreg_list); +void o2net_unregister_handler_list(struct list_head *list); + +struct o2nm_node; +int o2net_register_hb_callbacks(void); +void o2net_unregister_hb_callbacks(void); +int o2net_start_listening(struct o2nm_node *node); +void o2net_stop_listening(struct o2nm_node *node); +void o2net_disconnect_node(struct o2nm_node *node); + +int o2net_init(void); +void o2net_exit(void); +int o2net_proc_init(struct proc_dir_entry *parent); +void o2net_proc_exit(struct proc_dir_entry *parent); + +#endif /* O2CLUSTER_TCP_H */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h new file mode 100644 index 000000000000..ff9e2e2104c2 --- /dev/null +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -0,0 +1,174 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_TCP_INTERNAL_H
+#define O2CLUSTER_TCP_INTERNAL_H
+
+#define O2NET_MSG_MAGIC           ((u16)0xfa55)
+#define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
+#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
+
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
+
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
+#define O2NET_KEEPALIVE_DELAY_SECS	5
+#define O2NET_IDLE_TIMEOUT_SECS		10
+
+/*
+ * This version number represents quite a lot, unfortunately.  It not
+ * only represents the raw network message protocol on the wire but also
+ * locking semantics of the file system using the protocol.  It should
+ * be somewhere else, I'm sure, but right now it isn't.
+ *
+ * New in version 2:
+ * 	- full 64 bit i_size in the metadata lock lvbs
+ * 	- introduction of "rw" lock and pushing meta/data locking down
+ */
+#define O2NET_PROTOCOL_VERSION 2ULL
+struct o2net_handshake {
+	__be64	protocol_version;
+	__be64	connector_id;
+};
+
+struct o2net_node {
+	/* this is never called from int/bh */
+	spinlock_t			nn_lock;
+
+	/* set the moment an sc is allocated and a connect is started */
+	struct o2net_sock_container	*nn_sc;
+	/* _valid is only set after the handshake passes and tx can happen */
+	unsigned			nn_sc_valid:1;
+	/* if this is set tx just returns it */
+	int				nn_persistent_error;
+
+	/* threads waiting for an sc to arrive wait on the wq for generation
+	 * to increase.  it is increased when a connecting socket succeeds
+	 * or fails or when an accepted socket is attached. */
+	wait_queue_head_t		nn_sc_wq;
+
+	struct idr			nn_status_idr;
+	struct list_head		nn_status_list;
+
+	/* connects are attempted from when heartbeat comes up until either hb
+	 * goes down, the node is unconfigured, no connect attempts succeed
+	 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds.  connect_work
+	 * is queued from set_nn_state both from hb up and from itself if a
+	 * connect attempt fails and so can be self-arming.  shutdown is
+	 * careful to first mark the nn such that no connects will be attempted
+	 * before canceling delayed connect work and flushing the queue. */
+	struct work_struct		nn_connect_work;
+	unsigned long			nn_last_connect_attempt;
+
+	/* this is queued as nodes come up and is canceled when a connection is
+	 * established.  this expiring gives up on the node and errors out
+	 * transmits */
+	struct work_struct		nn_connect_expired;
+
+	/* after we give up on a socket we wait a while before deciding
+	 * that it is still heartbeating and that we should do some
+	 * quorum work */
+	struct work_struct		nn_still_up;
+};
+
+struct o2net_sock_container {
+	struct kref		sc_kref;
+	/* the next two are valid for the lifetime of the sc */
+	struct socket		*sc_sock;
+	struct o2nm_node	*sc_node;
+
+	/* all of these sc work structs hold refs on the sc while they are
+	 * queued.  they should not be able to ref a freed sc.  the teardown
+	 * race is with o2net_wq destruction in o2net_stop_listening() */
+
+	/* rx and connect work are generated from socket callbacks.  sc
+	 * shutdown removes the callbacks and then flushes the work queue */
+	struct work_struct	sc_rx_work;
+	struct work_struct	sc_connect_work;
+	/* shutdown work is triggered in two ways.  the simple way is
+	 * for a code path to call ensure_shutdown, which gets a lock, removes
+	 * the sc from the nn, and queues the work.  in this case the
+	 * work is single-shot.  the work is also queued from a sock
+	 * callback, though, and in this case the work will find the sc
+	 * still on the nn and will call ensure_shutdown itself.. this
+	 * ends up triggering the shutdown work again, though nothing
+	 * will be done in that second iteration.  so work queue teardown
+	 * has to be careful to remove the sc from the nn before waiting
+	 * on the work queue so that the shutdown work doesn't remove the
+	 * sc and rearm itself.
+	 */
+	struct work_struct	sc_shutdown_work;
+
+	struct timer_list	sc_idle_timeout;
+	struct work_struct	sc_keepalive_work;
+
+	unsigned		sc_handshake_ok:1;
+
+	struct page		*sc_page;
+	size_t			sc_page_off;
+
+	/* original handlers for the sockets */
+	void			(*sc_state_change)(struct sock *sk);
+	void			(*sc_data_ready)(struct sock *sk, int bytes);
+
+	struct timeval		sc_tv_timer;
+	struct timeval		sc_tv_data_ready;
+	struct timeval		sc_tv_advance_start;
+	struct timeval		sc_tv_advance_stop;
+	struct timeval		sc_tv_func_start;
+	struct timeval		sc_tv_func_stop;
+	u32			sc_msg_key;
+	u16			sc_msg_type;
+};
+
+struct o2net_msg_handler {
+	struct rb_node		nh_node;
+	u32			nh_max_len;
+	u32			nh_msg_type;
+	u32			nh_key;
+	o2net_msg_handler_func	*nh_func;
+	o2net_msg_handler_func	*nh_func_data;
+	struct kref		nh_kref;
+	struct list_head	nh_unregister_item;
+};
+
+enum o2net_system_error {
+	O2NET_ERR_NONE = 0,
+	O2NET_ERR_NO_HNDLR,
+	O2NET_ERR_OVERFLOW,
+	O2NET_ERR_DIED,
+	O2NET_ERR_MAX
+};
+
+struct o2net_status_wait {
+	enum o2net_system_error	ns_sys_status;
+	s32			ns_status;
+	int			ns_id;
+	wait_queue_head_t	ns_wq;
+	struct list_head	ns_node_item;
+};
+
+#endif /* O2CLUSTER_TCP_INTERNAL_H */
--
cgit v1.2.3


From 6714d8e86bf443f6f7af50f9d432025649f091f5 Mon Sep 17 00:00:00 2001
From: Kurt Hackel
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

A distributed lock manager built with the cluster file system use case
in mind.  The OCFS2 dlm exposes a VMS style API, though things have
been simplified internally.  The only lock levels implemented currently
are NLMODE, PRMODE and EXMODE.
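
A rough sketch of the calling convention exposed by dlmapi.h follows.
The domain name, key, lock name and the completion-based wait are
illustrative placeholders, not part of this patch:

	static DECLARE_COMPLETION(granted);

	static void my_ast(void *astdata)
	{
		/* the request finished; the result is in lksb->status */
		complete(&granted);
	}

	static void my_bast(void *astdata, int blocked_type)
	{
		/* another node wants a conflicting mode; release soon */
	}

	...
	struct dlm_lockstatus lksb;
	struct dlm_ctxt *dlm;
	enum dlm_status st;

	dlm = dlm_register_domain("example-domain", 0xabcd1234);
	memset(&lksb, 0, sizeof(lksb));
	st = dlmlock(dlm, LKM_EXMODE, &lksb, 0, "mylock",
		     my_ast, &lksb, my_bast);
	if (st == DLM_NORMAL)
		wait_for_completion(&granted);
	...
	dlm_unregister_domain(dlm);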
Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- fs/ocfs2/dlm/Makefile | 6 + fs/ocfs2/dlm/dlmapi.h | 214 ++++ fs/ocfs2/dlm/dlmast.c | 466 ++++++++ fs/ocfs2/dlm/dlmcommon.h | 884 +++++++++++++++ fs/ocfs2/dlm/dlmconvert.c | 530 +++++++++ fs/ocfs2/dlm/dlmconvert.h | 35 + fs/ocfs2/dlm/dlmdebug.c | 246 ++++ fs/ocfs2/dlm/dlmdebug.h | 30 + fs/ocfs2/dlm/dlmdomain.c | 1469 ++++++++++++++++++++++++ fs/ocfs2/dlm/dlmdomain.h | 36 + fs/ocfs2/dlm/dlmlock.c | 676 +++++++++++ fs/ocfs2/dlm/dlmmaster.c | 2666 ++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/dlm/dlmrecovery.c | 2132 +++++++++++++++++++++++++++++++++++ fs/ocfs2/dlm/dlmthread.c | 695 ++++++++++++ fs/ocfs2/dlm/dlmunlock.c | 672 +++++++++++ fs/ocfs2/dlm/dlmver.c | 42 + fs/ocfs2/dlm/dlmver.h | 31 + 17 files changed, 10830 insertions(+) create mode 100644 fs/ocfs2/dlm/Makefile create mode 100644 fs/ocfs2/dlm/dlmapi.h create mode 100644 fs/ocfs2/dlm/dlmast.c create mode 100644 fs/ocfs2/dlm/dlmcommon.h create mode 100644 fs/ocfs2/dlm/dlmconvert.c create mode 100644 fs/ocfs2/dlm/dlmconvert.h create mode 100644 fs/ocfs2/dlm/dlmdebug.c create mode 100644 fs/ocfs2/dlm/dlmdebug.h create mode 100644 fs/ocfs2/dlm/dlmdomain.c create mode 100644 fs/ocfs2/dlm/dlmdomain.h create mode 100644 fs/ocfs2/dlm/dlmlock.c create mode 100644 fs/ocfs2/dlm/dlmmaster.c create mode 100644 fs/ocfs2/dlm/dlmrecovery.c create mode 100644 fs/ocfs2/dlm/dlmthread.c create mode 100644 fs/ocfs2/dlm/dlmunlock.c create mode 100644 fs/ocfs2/dlm/dlmver.c create mode 100644 fs/ocfs2/dlm/dlmver.h (limited to 'fs') diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile new file mode 100644 index 000000000000..2a5274bcc8bb --- /dev/null +++ b/fs/ocfs2/dlm/Makefile @@ -0,0 +1,6 @@ +EXTRA_CFLAGS += -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o + +ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h new file mode 100644 index 000000000000..53652f51c0e1 --- /dev/null +++ b/fs/ocfs2/dlm/dlmapi.h @@ -0,0 +1,214 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmapi.h + * + * externally exported dlm interfaces + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ *
+ */
+
+#ifndef DLMAPI_H
+#define DLMAPI_H
+
+struct dlm_lock;
+struct dlm_ctxt;
+
+/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
+enum dlm_status {
+	DLM_NORMAL = 0,           /*  0: request in progress */
+	DLM_GRANTED,              /*  1: request granted */
+	DLM_DENIED,               /*  2: request denied */
+	DLM_DENIED_NOLOCKS,       /*  3: request denied, out of system resources */
+	DLM_WORKING,              /*  4: async request in progress */
+	DLM_BLOCKED,              /*  5: lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /*  6: lock request blocked by an orphan lock */
+	DLM_DENIED_GRACE_PERIOD,  /*  7: topological change in progress */
+	DLM_SYSERR,               /*  8: system error */
+	DLM_NOSUPPORT,            /*  9: unsupported */
+	DLM_CANCELGRANT,          /* 10: can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* 11: bad lockid */
+	DLM_SYNC,                 /* 12: synchronous request granted */
+	DLM_BADTYPE,              /* 13: bad resource type */
+	DLM_BADRESOURCE,          /* 14: bad resource handle */
+	DLM_MAXHANDLES,           /* 15: no more resource handles */
+	DLM_NOCLINFO,             /* 16: can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* 17: can't contact lock manager */
+	DLM_NOPURGED,             /* 18: can't contact purge daemon */
+	DLM_BADARGS,              /* 19: bad api args */
+	DLM_VOID,                 /* 20: no status */
+	DLM_NOTQUEUED,            /* 21: NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* 22: invalid resource name length */
+	DLM_CVTUNGRANT,           /* 23: attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* 24: invalid lock mode specified */
+	DLM_VALNOTVALID,          /* 25: value block has been invalidated */
+	DLM_REJECTED,             /* 26: request rejected, unrecognized client */
+	DLM_ABORT,                /* 27: blocked lock request cancelled */
+	DLM_CANCEL,               /* 28: conversion request cancelled */
+	DLM_IVRESHANDLE,          /* 29: invalid resource handle */
+	DLM_DEADLOCK,             /* 30: deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* 31: failed to allocate AST */
+	DLM_FORWARD,              /* 32: request must wait for primary's response */
+	DLM_TIMEOUT,              /* 33: timeout value for lock has expired */
+	DLM_IVGROUPID,            /* 34: invalid group specification */
+	DLM_VERS_CONFLICT,        /* 35: version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* 36: Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */
+	DLM_NO_CONTROL_DEVICE,    /* 38: Cannot set options on opened device */
+
+	DLM_RECOVERING,           /* 39: extension, allows caller to fail a lock
+				     request if it is being recovered */
+	DLM_MIGRATING,            /* 40: extension, allows caller to fail a lock
+				     request if it is being migrated */
+	DLM_MAXSTATS,             /* 41: upper limit for return code validation */
+};
+
+/* for pretty-printing dlm_status error messages */
+const char *dlm_errmsg(enum dlm_status err);
+/* for pretty-printing dlm_status error names */
+const char *dlm_errname(enum dlm_status err);
+
+/* Eventually the DLM will use standard errno values, but in the
+ * meantime this lets us track dlm errors as they bubble up.  When we
+ * bring its error reporting into line with the rest of the stack,
+ * these can just be replaced with calls to mlog_errno.
+ */
+#define dlm_error(st) do {						\
+	if ((st) != DLM_RECOVERING &&					\
+	    (st) != DLM_MIGRATING &&					\
+	    (st) != DLM_FORWARD)					\
+		mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st)));	\
+} while (0)
+
+#define DLM_LKSB_UNUSED1	0x01
+#define DLM_LKSB_PUT_LVB	0x02
+#define DLM_LKSB_GET_LVB	0x04
+#define DLM_LKSB_UNUSED2	0x08
+#define DLM_LKSB_UNUSED3	0x10
+#define DLM_LKSB_UNUSED4	0x20
+#define DLM_LKSB_UNUSED5	0x40
+#define DLM_LKSB_UNUSED6	0x80
+
+#define DLM_LVB_LEN		64
+
+/* Callers are only allowed access to the lvb and status members of
+ * this struct. */
+struct dlm_lockstatus {
+	enum dlm_status status;
+	u32 flags;
+	struct dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];
+};
+
+/* Valid lock modes. */
+#define LKM_IVMODE	(-1)	/* invalid mode */
+#define LKM_NLMODE	0	/* null lock */
+#define LKM_CRMODE	1	/* concurrent read	unsupported */
+#define LKM_CWMODE	2	/* concurrent write	unsupported */
+#define LKM_PRMODE	3	/* protected read */
+#define LKM_PWMODE	4	/* protected write	unsupported */
+#define LKM_EXMODE	5	/* exclusive */
+#define LKM_MAXMODE	5
+#define LKM_MODEMASK	0xff
+
+/* Flags passed to dlmlock and dlmunlock:
+ * reserved: flags used by the "real" dlm
+ * only a few are supported by this dlm
+ * (U) = unsupported by ocfs2 dlm */
+#define LKM_ORPHAN       0x00000010  /* this lock is orphanable (U) */
+#define LKM_PARENTABLE   0x00000020  /* this lock was orphaned (U) */
+#define LKM_BLOCK        0x00000040  /* blocking lock request (U) */
+#define LKM_LOCAL        0x00000080  /* local lock request */
+#define LKM_VALBLK       0x00000100  /* lock value block request */
+#define LKM_NOQUEUE      0x00000200  /* non-blocking request */
+#define LKM_CONVERT      0x00000400  /* conversion request */
+#define LKM_NODLCKWT     0x00000800  /* this lock won't deadlock (U) */
+#define LKM_UNLOCK       0x00001000  /* deallocate this lock */
+#define LKM_CANCEL       0x00002000  /* cancel conversion request */
+#define LKM_DEQALL       0x00004000  /* remove all locks held by proc (U) */
+#define LKM_INVVALBLK    0x00008000  /* invalidate lock value block */
+#define LKM_SYNCSTS      0x00010000  /* return synchronous status if poss (U) */
+#define LKM_TIMEOUT      0x00020000  /* lock request contains timeout (U) */
+#define LKM_SNGLDLCK     0x00040000  /* request can self-deadlock (U) */
+#define LKM_FINDLOCAL    0x00080000  /* find local lock request (U) */
+#define LKM_PROC_OWNED   0x00100000  /* owned by process, not group (U) */
+#define LKM_XID          0x00200000  /* use transaction id for deadlock (U) */
+#define LKM_XID_CONFLICT 0x00400000  /* do not allow lock inheritance (U) */
+#define LKM_FORCE        0x00800000  /* force unlock flag */
+#define LKM_REVVALBLK    0x01000000  /* temporary solution: re-validate
+					lock value block (U) */
+/* unused */
+#define LKM_UNUSED1      0x00000001  /* unused */
+#define LKM_UNUSED2      0x00000002  /* unused */
+#define LKM_UNUSED3      0x00000004  /* unused */
+#define LKM_UNUSED4      0x00000008  /* unused */
+#define LKM_UNUSED5      0x02000000  /* unused */
+#define LKM_UNUSED6      0x04000000  /* unused */
+#define LKM_UNUSED7      0x08000000  /* unused */
+
+/* ocfs2 extensions: internal only
+ * should never be used by caller */
+#define LKM_MIGRATION    0x10000000  /* extension: lockres is to be migrated
+					to another node */
+#define LKM_PUT_LVB      0x20000000  /* extension: lvb is being passed
+					should be applied to lockres */
+#define LKM_GET_LVB      0x40000000  /* extension: lvb should be copied
+					from lockres when lock is granted */
+#define LKM_RECOVERY     0x80000000  /* extension: flag for recovery lock
+					used to avoid recovery rwsem */
+
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void
(dlm_bastlockfunc_t)(void *, int); +typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status); + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, + int mode, + struct dlm_lockstatus *lksb, + int flags, + const char *name, + dlm_astlockfunc_t *ast, + void *data, + dlm_bastlockfunc_t *bast); + +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, + struct dlm_lockstatus *lksb, + int flags, + dlm_astunlockfunc_t *unlockast, + void *data); + +struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key); + +void dlm_unregister_domain(struct dlm_ctxt *dlm); + +void dlm_print_one_lock(struct dlm_lock *lockid); + +typedef void (dlm_eviction_func)(int, void *); +struct dlm_eviction_cb { + struct list_head ec_item; + dlm_eviction_func *ec_func; + void *ec_data; +}; +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data); +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb); +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb); + +#endif /* DLMAPI_H */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c new file mode 100644 index 000000000000..8d17d28ef91c --- /dev/null +++ b/fs/ocfs2/dlm/dlmast.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmast.c + * + * AST and BAST functionality for local and remote nodes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" +#include "cluster/endian.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock); +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); + +/* Should be called as an ast gets queued to see if the new + * lock level will obsolete a pending bast. + * For example, if dlm_thread queued a bast for an EX lock that + * was blocking another EX, but before sending the bast the + * lock owner downconverted to NL, the bast is now obsolete. + * Only the ast should be sent. + * This is needed because the lock and convert paths can queue + * asts out-of-band (not waiting for dlm_thread) in order to + * allow for LKM_NOQUEUE to get immediate responses. 
*/ +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&lock->spinlock); + + if (lock->ml.highest_blocked == LKM_IVMODE) + return 0; + BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); + + if (lock->bast_pending && + list_empty(&lock->bast_list)) + /* old bast already sent, ok */ + return 0; + + if (lock->ml.type == LKM_EXMODE) + /* EX blocks anything left, any bast still valid */ + return 0; + else if (lock->ml.type == LKM_NLMODE) + /* NL blocks nothing, no reason to send any bast, cancel it */ + return 1; + else if (lock->ml.highest_blocked != LKM_EXMODE) + /* PR only blocks EX */ + return 1; + + return 0; +} + +static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + + assert_spin_locked(&dlm->ast_lock); + if (!list_empty(&lock->ast_list)) { + mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", + lock->ast_pending, lock->ml.type); + BUG(); + } + BUG_ON(!list_empty(&lock->ast_list)); + if (lock->ast_pending) + mlog(0, "lock has an ast getting flushed right now\n"); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + + /* check to see if this ast obsoletes the bast */ + if (dlm_should_cancel_bast(dlm, lock)) { + struct dlm_lock_resource *res = lock->lockres; + mlog(0, "%s: cancelling bast for %.*s\n", + dlm->name, res->lockname.len, res->lockname.name); + lock->bast_pending = 0; + list_del_init(&lock->bast_list); + lock->ml.highest_blocked = LKM_IVMODE; + /* removing lock from list, remove a ref. guaranteed + * this won't be the last ref because of the get above, + * so res->spinlock will not be taken here */ + dlm_lock_put(lock); + /* free up the reserved bast that we are cancelling. + * guaranteed that this will not be the last reserved + * ast because *both* an ast and a bast were reserved + * to get to this point. 
the res->spinlock will not be + * taken here */ + dlm_lockres_release_ast(dlm, res); + } + list_add_tail(&lock->ast_list, &dlm->pending_asts); + lock->ast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_ast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + + +static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + assert_spin_locked(&dlm->ast_lock); + + BUG_ON(!list_empty(&lock->bast_list)); + if (lock->bast_pending) + mlog(0, "lock has a bast getting flushed right now\n"); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + list_add_tail(&lock->bast_list, &dlm->pending_basts); + lock->bast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_bast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct dlm_lockstatus *lksb = lock->lksb; + BUG_ON(!lksb); + + /* only updates if this node masters the lockres */ + if (res->owner == dlm->node_num) { + + spin_lock(&res->spinlock); + /* check the lksb flags for the direction */ + if (lksb->flags & DLM_LKSB_GET_LVB) { + mlog(0, "getting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? "master" : + "remote"); + memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); + } else if (lksb->flags & DLM_LKSB_PUT_LVB) { + mlog(0, "setting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? 
"master" : + "remote"); + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + } + spin_unlock(&res->spinlock); + } + + /* reset any lvb flags on the lksb */ + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); +} + +void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + dlm_astlockfunc_t *fn; + struct dlm_lockstatus *lksb; + + mlog_entry_void(); + + lksb = lock->lksb; + fn = lock->ast; + BUG_ON(lock->ml.node != dlm->node_num); + + dlm_update_lvb(dlm, res, lock); + (*fn)(lock->astdata); +} + + +int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + int ret; + struct dlm_lockstatus *lksb; + int lksbflags; + + mlog_entry_void(); + + lksb = lock->lksb; + BUG_ON(lock->ml.node == dlm->node_num); + + lksbflags = lksb->flags; + dlm_update_lvb(dlm, res, lock); + + /* lock request came from another node + * go do the ast over there */ + ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); + return ret; +} + +void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int blocked_type) +{ + dlm_bastlockfunc_t *fn = lock->bast; + + mlog_entry_void(); + BUG_ON(lock->ml.node != dlm->node_num); + + (*fn)(lock->astdata, blocked_type); +} + + + +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) +{ + int ret; + unsigned int locklen; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; + char *name; + struct list_head *iter, *head=NULL; + u64 cookie; + u32 flags; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = past->name; + locklen = past->namelen; + cookie = be64_to_cpu(past->cookie); + flags = be32_to_cpu(past->flags); + + if (locklen > DLM_LOCKID_NAME_MAX) { + ret = DLM_IVBUFLEN; + mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); + goto leave; + } + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + ret = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); + + if (past->type != DLM_AST && + past->type != DLM_BAST) { + mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", " + "name=%.*s\n", past->type, cookie, locklen, name); + ret = DLM_IVLOCKID; + goto leave; + } + + res = dlm_lookup_lockres(dlm, name, locklen); + if (!res) { + mlog(ML_ERROR, "got %sast for unknown lockres! " + "cookie=%"MLFu64", name=%.*s, namelen=%u\n", + past->type == DLM_AST ? 
"" : "b", + cookie, locklen, name, locklen); + ret = DLM_IVLOCKID; + goto leave; + } + + /* cannot get a proxy ast message if this node owns it */ + BUG_ON(res->owner == dlm->node_num); + + mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "responding with DLM_RECOVERING!\n"); + ret = DLM_RECOVERING; + goto unlock_out; + } + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "responding with DLM_MIGRATING!\n"); + ret = DLM_MIGRATING; + goto unlock_out; + } + /* try convert queue for both ast/bast */ + head = &res->converting; + lock = NULL; + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (be64_to_cpu(lock->ml.cookie) == cookie) + goto do_ast; + } + + /* if not on convert, try blocked for ast, granted for bast */ + if (past->type == DLM_AST) + head = &res->blocked; + else + head = &res->granted; + + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (be64_to_cpu(lock->ml.cookie) == cookie) + goto do_ast; + } + + mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", " + "name=%.*s, namelen=%u\n", + past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen); + + ret = DLM_NORMAL; +unlock_out: + spin_unlock(&res->spinlock); + goto leave; + +do_ast: + ret = DLM_NORMAL; + if (past->type == DLM_AST) { + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); + mlog(0, "ast: adding to granted list... type=%d, " + "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); + if (lock->ml.convert_type != LKM_IVMODE) { + lock->ml.type = lock->ml.convert_type; + lock->ml.convert_type = LKM_IVMODE; + } else { + // should already be there.... 
+ } + + lock->lksb->status = DLM_NORMAL; + + /* if we requested the lvb, fetch it into our lksb now */ + if (flags & LKM_GET_LVB) { + BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); + memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); + } + } + spin_unlock(&res->spinlock); + + if (past->type == DLM_AST) + dlm_do_local_ast(dlm, res, lock); + else + dlm_do_local_bast(dlm, res, lock, past->blocked_type); + +leave: + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + return ret; +} + + + +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int msg_type, + int blocked_type, int flags) +{ + int ret = 0; + struct dlm_proxy_ast past; + struct kvec vec[2]; + size_t veclen = 1; + int status; + + mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", + res->lockname.len, res->lockname.name, lock->ml.node, + msg_type, blocked_type); + + memset(&past, 0, sizeof(struct dlm_proxy_ast)); + past.node_idx = dlm->node_num; + past.type = msg_type; + past.blocked_type = blocked_type; + past.namelen = res->lockname.len; + memcpy(past.name, res->lockname.name, past.namelen); + past.cookie = lock->ml.cookie; + + vec[0].iov_len = sizeof(struct dlm_proxy_ast); + vec[0].iov_base = &past; + if (flags & DLM_LKSB_GET_LVB) { + mlog(0, "returning requested LVB data\n"); + be32_add_cpu(&past.flags, LKM_GET_LVB); + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, + lock->ml.node, &status); + if (ret < 0) + mlog_errno(ret); + else { + if (status == DLM_RECOVERING) { + mlog(ML_ERROR, "sent AST to node %u, it thinks this " + "node is dead!\n", lock->ml.node); + BUG(); + } else if (status == DLM_MIGRATING) { + mlog(ML_ERROR, "sent AST to node %u, it returned " + "DLM_MIGRATING!\n", lock->ml.node); + BUG(); + } else if (status != DLM_NORMAL) { + mlog(ML_ERROR, "AST to node %u returned %d!\n", + lock->ml.node, status); + /* ignore it */ + } + ret = 0; + } + return ret; +} diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h new file mode 100644 index 000000000000..3fecba0a6023 --- /dev/null +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -0,0 +1,884 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmcommon.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMCOMMON_H +#define DLMCOMMON_H + +#include + +#define DLM_HB_NODE_DOWN_PRI (0xf000000) +#define DLM_HB_NODE_UP_PRI (0x8000000) + +#define DLM_LOCKID_NAME_MAX 32 + +#define DLM_DOMAIN_NAME_MAX_LEN 255 +#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES +#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes +#define DLM_THREAD_MS 200 // flush at least every 200 ms + +#define DLM_HASH_BITS 7 +#define DLM_HASH_SIZE (1 << DLM_HASH_BITS) +#define DLM_HASH_MASK (DLM_HASH_SIZE - 1) + +enum dlm_ast_type { + DLM_AST = 0, + DLM_BAST, + DLM_ASTUNLOCK +}; + + +#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \ + LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \ + LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE) + +#define DLM_RECOVERY_LOCK_NAME "$RECOVERY" +#define DLM_RECOVERY_LOCK_NAME_LEN 9 + +static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) +{ + if (name_len == DLM_RECOVERY_LOCK_NAME_LEN && + memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0) + return 1; + return 0; +} + +#define DLM_RECO_STATE_ACTIVE 0x0001 + +struct dlm_recovery_ctxt +{ + struct list_head resources; + struct list_head received; + struct list_head node_data; + u8 new_master; + u8 dead_node; + u16 state; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + wait_queue_head_t event; +}; + +enum dlm_ctxt_state { + DLM_CTXT_NEW = 0, + DLM_CTXT_JOINED, + DLM_CTXT_IN_SHUTDOWN, + DLM_CTXT_LEAVING, +}; + +struct dlm_ctxt +{ + struct list_head list; + struct list_head *resources; + struct list_head dirty_list; + struct list_head purge_list; + struct list_head pending_asts; + struct list_head pending_basts; + unsigned int purge_count; + spinlock_t spinlock; + spinlock_t ast_lock; + char *name; + u8 node_num; + u32 key; + u8 joining_node; + wait_queue_head_t dlm_join_events; + unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct dlm_recovery_ctxt reco; + spinlock_t master_lock; + struct list_head master_list; + struct list_head mle_hb_events; + + /* these give a really vague idea of the system load */ + atomic_t local_resources; + atomic_t remote_resources; + atomic_t unknown_resources; + + /* NOTE: Next three are protected by dlm_domain_lock */ + struct kref dlm_refs; + enum dlm_ctxt_state dlm_state; + unsigned int num_joins; + + struct o2hb_callback_func dlm_hb_up; + struct o2hb_callback_func dlm_hb_down; + struct task_struct *dlm_thread_task; + struct task_struct *dlm_reco_thread_task; + wait_queue_head_t dlm_thread_wq; + wait_queue_head_t dlm_reco_thread_wq; + wait_queue_head_t ast_wq; + wait_queue_head_t migration_wq; + + struct work_struct dispatched_work; + struct list_head work_list; + spinlock_t work_lock; + struct list_head dlm_domain_handlers; + struct list_head dlm_eviction_callbacks; +}; + +/* these keventd work queue items are for less-frequently + * called functions that cannot be directly called from the + * net message handlers for some reason, usually because + * they need to send net messages of their own. 
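+ *
+ * A queueing sketch (illustrative; it assumes the caller allocated
+ * "item" itself and, as dlm_init_work_item requires, already did a
+ * dlm_grab on "dlm"):
+ *
+ *	dlm_init_work_item(dlm, item, my_work_func, NULL);
+ *	spin_lock(&dlm->work_lock);
+ *	list_add_tail(&item->list, &dlm->work_list);
+ *	spin_unlock(&dlm->work_lock);
+ *	schedule_work(&dlm->dispatched_work);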
*/ +void dlm_dispatch_work(void *data); + +struct dlm_lock_resource; +struct dlm_work_item; + +typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *); + +struct dlm_request_all_locks_priv +{ + u8 reco_master; + u8 dead_node; +}; + +struct dlm_mig_lockres_priv +{ + struct dlm_lock_resource *lockres; + u8 real_master; +}; + +struct dlm_assert_master_priv +{ + struct dlm_lock_resource *lockres; + u8 request_from; + u32 flags; + unsigned ignore_higher:1; +}; + + +struct dlm_work_item +{ + struct list_head list; + dlm_workfunc_t *func; + struct dlm_ctxt *dlm; + void *data; + union { + struct dlm_request_all_locks_priv ral; + struct dlm_mig_lockres_priv ml; + struct dlm_assert_master_priv am; + } u; +}; + +static inline void dlm_init_work_item(struct dlm_ctxt *dlm, + struct dlm_work_item *i, + dlm_workfunc_t *f, void *data) +{ + memset(i, 0, sizeof(*i)); + i->func = f; + INIT_LIST_HEAD(&i->list); + i->data = data; + i->dlm = dlm; /* must have already done a dlm_grab on this! */ +} + + + +static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, + u8 node) +{ + assert_spin_locked(&dlm->spinlock); + + dlm->joining_node = node; + wake_up(&dlm->dlm_join_events); +} + +#define DLM_LOCK_RES_UNINITED 0x00000001 +#define DLM_LOCK_RES_RECOVERING 0x00000002 +#define DLM_LOCK_RES_READY 0x00000004 +#define DLM_LOCK_RES_DIRTY 0x00000008 +#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 +#define DLM_LOCK_RES_MIGRATING 0x00000020 + +#define DLM_PURGE_INTERVAL_MS (8 * 1000) + +struct dlm_lock_resource +{ + /* WARNING: Please see the comment in dlm_init_lockres before + * adding fields here. */ + struct list_head list; + struct kref refs; + + /* please keep these next 3 in this order + * some funcs want to iterate over all lists */ + struct list_head granted; + struct list_head converting; + struct list_head blocked; + + struct list_head dirty; + struct list_head recovering; // dlm_recovery_ctxt.resources list + + /* unused lock resources have their last_used stamped and are + * put on a list for the dlm thread to run. */ + struct list_head purge; + unsigned long last_used; + + unsigned migration_pending:1; + atomic_t asts_reserved; + spinlock_t spinlock; + wait_queue_head_t wq; + u8 owner; //node which owns the lock resource, or unknown + u16 state; + struct qstr lockname; + char lvb[DLM_LVB_LEN]; +}; + +struct dlm_migratable_lock +{ + __be64 cookie; + + /* these 3 are just padding for the in-memory structure, but + * list and flags are actually used when sent over the wire */ + __be16 pad1; + u8 list; // 0=granted, 1=converting, 2=blocked + u8 flags; + + s8 type; + s8 convert_type; + s8 highest_blocked; + u8 node; +}; // 16 bytes + +struct dlm_lock +{ + struct dlm_migratable_lock ml; + + struct list_head list; + struct list_head ast_list; + struct list_head bast_list; + struct dlm_lock_resource *lockres; + spinlock_t spinlock; + struct kref lock_refs; + + // ast and bast must be callable while holding a spinlock! 
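+ // (that is, they may run in atomic context and must not sleep)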
+ dlm_astlockfunc_t *ast; + dlm_bastlockfunc_t *bast; + void *astdata; + struct dlm_lockstatus *lksb; + unsigned ast_pending:1, + bast_pending:1, + convert_pending:1, + lock_pending:1, + cancel_pending:1, + unlock_pending:1, + lksb_kernel_allocated:1; +}; + + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + + +enum dlm_lockres_list { + DLM_GRANTED_LIST = 0, + DLM_CONVERTING_LIST, + DLM_BLOCKED_LIST +}; + +static inline struct list_head * +dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) +{ + struct list_head *ret = NULL; + if (idx == DLM_GRANTED_LIST) + ret = &res->granted; + else if (idx == DLM_CONVERTING_LIST) + ret = &res->converting; + else if (idx == DLM_BLOCKED_LIST) + ret = &res->blocked; + else + BUG(); + return ret; +} + + + + +struct dlm_node_iter +{ + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int curnode; +}; + + +enum { + DLM_MASTER_REQUEST_MSG = 500, + DLM_UNUSED_MSG1, /* 501 */ + DLM_ASSERT_MASTER_MSG, /* 502 */ + DLM_CREATE_LOCK_MSG, /* 503 */ + DLM_CONVERT_LOCK_MSG, /* 504 */ + DLM_PROXY_AST_MSG, /* 505 */ + DLM_UNLOCK_LOCK_MSG, /* 506 */ + DLM_UNUSED_MSG2, /* 507 */ + DLM_MIGRATE_REQUEST_MSG, /* 508 */ + DLM_MIG_LOCKRES_MSG, /* 509 */ + DLM_QUERY_JOIN_MSG, /* 510 */ + DLM_ASSERT_JOINED_MSG, /* 511 */ + DLM_CANCEL_JOIN_MSG, /* 512 */ + DLM_EXIT_DOMAIN_MSG, /* 513 */ + DLM_MASTER_REQUERY_MSG, /* 514 */ + DLM_LOCK_REQUEST_MSG, /* 515 */ + DLM_RECO_DATA_DONE_MSG, /* 516 */ + DLM_BEGIN_RECO_MSG, /* 517 */ + DLM_FINALIZE_RECO_MSG /* 518 */ +}; + +struct dlm_reco_node_data +{ + int state; + u8 node_num; + struct list_head list; +}; + +enum { + DLM_RECO_NODE_DATA_DEAD = -1, + DLM_RECO_NODE_DATA_INIT = 0, + DLM_RECO_NODE_DATA_REQUESTING, + DLM_RECO_NODE_DATA_REQUESTED, + DLM_RECO_NODE_DATA_RECEIVING, + DLM_RECO_NODE_DATA_DONE, + DLM_RECO_NODE_DATA_FINALIZE_SENT, +}; + + +enum { + DLM_MASTER_RESP_NO = 0, + DLM_MASTER_RESP_YES, + DLM_MASTER_RESP_MAYBE, + DLM_MASTER_RESP_ERROR +}; + + +struct dlm_master_request +{ + u8 node_idx; + u8 namelen; + __be16 pad1; + __be32 flags; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 +#define DLM_ASSERT_MASTER_REQUERY 0x00000002 +#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 +struct dlm_assert_master +{ + u8 node_idx; + u8 namelen; + __be16 pad1; + __be32 flags; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_migrate_request +{ + u8 master; + u8 new_master; + u8 namelen; + u8 pad1; + __be32 pad2; + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_master_requery +{ + u8 pad1; + u8 pad2; + u8 node_idx; + u8 namelen; + __be32 pad3; + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_MRES_RECOVERY 0x01 +#define DLM_MRES_MIGRATION 0x02 +#define DLM_MRES_ALL_DONE 0x04 + +/* + * We would like to get one whole lockres into a single network + * message whenever possible. Generally speaking, there will be + * at most one dlm_lock on a lockres for each node in the cluster, + * plus (infrequently) any additional locks coming in from userdlm. 
+ * + * struct _dlm_lockres_page + * { + * dlm_migratable_lockres mres; + * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS]; + * u8 pad[DLM_MIG_LOCKRES_RESERVED]; + * }; + * + * from ../cluster/tcp.h + * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * (roughly 4080 bytes) + * and sizeof(dlm_migratable_lockres) = 112 bytes + * and sizeof(dlm_migratable_lock) = 16 bytes + * + * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and + * DLM_MIG_LOCKRES_RESERVED=128 means we have this: + * + * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) + + * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED = + * NET_MAX_PAYLOAD_BYTES + * (240 * 16) + 112 + 128 = 4080 + * + * So a lockres would need more than 240 locks before it would + * use more than one network packet to recover. Not too bad. + */ +#define DLM_MAX_MIGRATABLE_LOCKS 240 + +struct dlm_migratable_lockres +{ + u8 master; + u8 lockname_len; + u8 num_locks; // locks sent in this structure + u8 flags; + __be32 total_locks; // locks to be sent for this migration cookie + __be64 mig_cookie; // cookie for this lockres migration + // or zero if not needed + // 16 bytes + u8 lockname[DLM_LOCKID_NAME_MAX]; + // 48 bytes + u8 lvb[DLM_LVB_LEN]; + // 112 bytes + struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 +}; +#define DLM_MIG_LOCKRES_MAX_LEN \ + (sizeof(struct dlm_migratable_lockres) + \ + (sizeof(struct dlm_migratable_lock) * \ + DLM_MAX_MIGRATABLE_LOCKS) ) + +/* from above, 128 bytes + * for some undetermined future use */ +#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ + DLM_MIG_LOCKRES_MAX_LEN) + +struct dlm_create_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_convert_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) + +struct dlm_unlock_lock +{ + __be64 cookie; + + __be32 flags; + __be16 pad1; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) + +struct dlm_proxy_ast +{ + __be64 cookie; + + __be32 flags; + u8 node_idx; + u8 type; + u8 blocked_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) + +#define DLM_MOD_KEY (0x666c6172) +enum dlm_query_join_response { + JOIN_DISALLOW = 0, + JOIN_OK, + JOIN_OK_NO_MAP, +}; + +struct dlm_lock_request +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + +struct dlm_reco_data_done +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; + + /* unused for now */ + /* eventually we can use this to attempt + * lvb recovery based on each node's info */ + u8 reco_lvb[DLM_LVB_LEN]; +}; + +struct dlm_begin_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + + +struct dlm_query_join_request +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_assert_joined +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_cancel_join +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_exit_domain +{ + u8 node_idx; + u8 pad1[3]; +}; + +struct dlm_finalize_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; 
+}; + +static inline enum dlm_status +__dlm_lockres_state_to_status(struct dlm_lock_resource *res) +{ + enum dlm_status status = DLM_NORMAL; + + assert_spin_locked(&res->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) + status = DLM_RECOVERING; + else if (res->state & DLM_LOCK_RES_MIGRATING) + status = DLM_MIGRATING; + else if (res->state & DLM_LOCK_RES_IN_PROGRESS) + status = DLM_FORWARD; + + return status; +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb); +void dlm_lock_get(struct dlm_lock *lock); +void dlm_lock_put(struct dlm_lock *lock); + +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res); + +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_launch_thread(struct dlm_ctxt *dlm); +void dlm_complete_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +void dlm_wait_for_recovery(struct dlm_ctxt *dlm); + +void dlm_put(struct dlm_ctxt *dlm); +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); +int dlm_domain_fully_joined(struct dlm_ctxt *dlm); + +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); +void dlm_lockres_get(struct dlm_lock_resource *res); +void dlm_lockres_put(struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len); +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len); + +int dlm_is_host_down(int errno); +void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner); +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int flags); +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen); + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_do_local_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +int dlm_do_remote_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_do_local_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type); +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int msg_type, + int blocked_type, int flags); +static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, 
+ struct dlm_lock *lock, + int blocked_type) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, + blocked_type, 0); +} + +static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int flags) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, + 0, flags); +} + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res); +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); + +u8 dlm_nm_this_node(struct dlm_ctxt *dlm); +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); + + +int dlm_nm_init(struct dlm_ctxt *dlm); +int dlm_heartbeat_init(struct dlm_ctxt *dlm); +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); + +int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); +int dlm_finish_migration(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 old_master); +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); + +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, + u8 request_from, + u32 flags); + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, + u8 flags); +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +/* will exit holding res->spinlock, but may drop in function */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); +void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); + +/* will exit holding res->spinlock, but may drop in function */ +static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) +{ + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_MIGRATING)); +} + + +int dlm_init_mle_cache(void); +void dlm_destroy_mle_cache(void); +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +void dlm_clean_master_list(struct dlm_ctxt *dlm, + u8 dead_node); +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); + + +static inline const char * dlm_lock_mode_name(int mode) +{ + switch (mode) { + case LKM_EXMODE: + return "EX"; + case LKM_PRMODE: + return "PR"; + case LKM_NLMODE: + return "NL"; + } + return "UNKNOWN"; +} + + +static inline int dlm_lock_compatible(int existing, int request) +{ + /* NO_LOCK compatible with all */ + if (request == LKM_NLMODE || + existing 
== LKM_NLMODE) + return 1; + + /* EX incompatible with all non-NO_LOCK */ + if (request == LKM_EXMODE) + return 0; + + /* request must be PR, which is compatible with PR */ + if (existing == LKM_PRMODE) + return 1; + + return 0; +} + +static inline int dlm_lock_on_list(struct list_head *head, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, head) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + return 1; + } + return 0; +} + + +static inline enum dlm_status dlm_err_to_dlm_status(int err) +{ + enum dlm_status ret; + if (err == -ENOMEM) + ret = DLM_SYSERR; + else if (err == -ETIMEDOUT || o2net_link_down(err, NULL)) + ret = DLM_NOLOCKMGR; + else if (err == -EINVAL) + ret = DLM_BADPARAM; + else if (err == -ENAMETOOLONG) + ret = DLM_IVBUFLEN; + else + ret = DLM_BADARGS; + return ret; +} + + +static inline void dlm_node_iter_init(unsigned long *map, + struct dlm_node_iter *iter) +{ + memcpy(iter->node_map, map, sizeof(iter->node_map)); + iter->curnode = -1; +} + +static inline int dlm_node_iter_next(struct dlm_node_iter *iter) +{ + int bit; + bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + iter->curnode = bit; + return bit; +} + + + +#endif /* DLMCOMMON_H */ diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c new file mode 100644 index 000000000000..6001b22a997d --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -0,0 +1,530 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.c + * + * underlying calls for lock conversion + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +/* NOTE: __dlmconvert_master is the only function in here that + * needs a spinlock held on entry (res->spinlock) and it is the + * only one that holds a lock on exit (res->spinlock). + * All other functions in here need no locks and drop all of + * the locks that they acquire. 
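+ * (dlmconvert_master() below shows the convention: it takes
+ * res->spinlock itself, calls __dlmconvert_master(), and then
+ * drops the lock again afterwards)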
*/ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread); +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +/* + * this is only called directly by dlmlock(), and only when the + * local node is the owner of the lockres + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: see __dlmconvert_master + */ +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status; + + spin_lock(&res->spinlock); + /* we are not in a network handler, this is fine */ + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + status = __dlmconvert_master(dlm, res, lock, flags, type, + &call_ast, &kick_thread); + + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + if (status != DLM_NORMAL && status != DLM_NOTQUEUED) + dlm_error(status); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +/* performs lock conversion at the lockres master site + * locking: + * caller needs: res->spinlock + * taken: takes and drops lock->spinlock + * held on exit: res->spinlock + * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED + * call_ast: whether ast should be called for this lock + * kick_thread: whether dlm_kick_thread should be called + */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread) +{ + enum dlm_status status = DLM_NORMAL; + struct list_head *iter; + struct dlm_lock *tmplock=NULL; + + assert_spin_locked(&res->spinlock); + + mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n", + lock->ml.type, lock->ml.convert_type, type); + + spin_lock(&lock->spinlock); + + /* already converting? */ + if (lock->ml.convert_type != LKM_IVMODE) { + mlog(ML_ERROR, "attempted to convert a lock with a lock " + "conversion pending\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + /* must be on grant queue to convert */ + if (!dlm_lock_on_list(&res->granted, lock)) { + mlog(ML_ERROR, "attempted to convert a lock not on grant " + "queue\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + if (flags & LKM_VALBLK) { + switch (lock->ml.type) { + case LKM_EXMODE: + /* EX + LKM_VALBLK + convert == set lvb */ + mlog(0, "will set lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + break; + case LKM_PRMODE: + case LKM_NLMODE: + /* refetch if new level is not NL */ + if (type > LKM_NLMODE) { + mlog(0, "will fetch new value into " + "lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } else { + mlog(0, "will NOT fetch new value " + "into lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + flags &= ~(LKM_VALBLK); + } + break; + } + } + + + /* in-place downconvert? 
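+ * (modes are ordered NL < PR < EX, so converting to an equal or
+ * weaker mode never conflicts and can be granted on the spot)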
*/ + if (type <= lock->ml.type) + goto grant; + + /* upconvert from here on */ + status = DLM_NORMAL; + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + continue; + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + /* existing conversion requests take precedence */ + if (!dlm_lock_compatible(tmplock->ml.convert_type, type)) + goto switch_queues; + } + + /* fall thru to grant */ + +grant: + mlog(0, "res %.*s, granting %s lock\n", res->lockname.len, + res->lockname.name, dlm_lock_mode_name(type)); + /* immediately grant the new lock type */ + lock->lksb->status = DLM_NORMAL; + if (lock->ml.node == dlm->node_num) + mlog(0, "doing in-place convert for nonlocal lock\n"); + lock->ml.type = type; + status = DLM_NORMAL; + *call_ast = 1; + goto unlock_exit; + +switch_queues: + if (flags & LKM_NOQUEUE) { + mlog(0, "failed to convert NOQUEUE lock %.*s from " + "%d to %d...\n", res->lockname.len, res->lockname.name, + lock->ml.type, type); + status = DLM_NOTQUEUED; + goto unlock_exit; + } + mlog(0, "res %.*s, queueing...\n", res->lockname.len, + res->lockname.name); + + lock->ml.convert_type = type; + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->converting); + +unlock_exit: + spin_unlock(&lock->spinlock); + if (status == DLM_DENIED) { + __dlm_print_one_lock_resource(res); + } + if (status == DLM_NORMAL) + *kick_thread = 1; + return status; +} + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; + lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); +} + +/* messages the master site to do lock conversion + * locking: + * caller needs: none + * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS + * held on exit: none + * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node + */ +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + enum dlm_status status; + + mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, + lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "bailing out early since res is RECOVERING " + "on secondary queue\n"); + /* __dlm_print_one_lock_resource(res); */ + status = DLM_RECOVERING; + goto bail; + } + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + + if (lock->ml.convert_type != LKM_IVMODE) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "converting a remote lock that is already " + "converting! (cookie=%"MLFu64", conv=%d)\n", + lock->ml.cookie, lock->ml.convert_type); + status = DLM_DENIED; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; + /* move lock to local convert queue */ + /* do not alter lock refcount. switching lists. 
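+ * (the lock only moves from res->converting back to res->granted,
+ * so its kref stays untouched)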
*/ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->converting); + lock->convert_pending = 1; + lock->ml.convert_type = type; + + if (flags & LKM_VALBLK) { + if (lock->ml.type == LKM_EXMODE) { + flags |= LKM_PUT_LVB; + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + } else { + if (lock->ml.convert_type == LKM_NLMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + } + spin_unlock(&res->spinlock); + + /* no locks held here. + * need to wait for a reply as to whether it got queued or not. */ + status = dlm_send_remote_convert_request(dlm, res, lock, flags, type); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->convert_pending = 0; + /* if it failed, move it back to granted queue */ + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_convert(res, lock); + } +bail: + spin_unlock(&res->spinlock); + + /* TODO: should this be a wake_one? */ + /* wake up any IN_PROGRESS waiters */ + wake_up(&res->wq); + + return status; +} + +/* sends DLM_CONVERT_LOCK_MSG to master site + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, status from remote node + */ +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + struct dlm_convert_lock convert; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + + memset(&convert, 0, sizeof(struct dlm_convert_lock)); + convert.node_idx = dlm->node_num; + convert.requested_type = type; + convert.cookie = lock->ml.cookie; + convert.namelen = res->lockname.len; + convert.flags = cpu_to_be32(flags); + memcpy(convert.name, res->lockname.name, convert.namelen); + + vec[0].iov_len = sizeof(struct dlm_convert_lock); + vec[0].iov_base = &convert; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key, + vec, veclen, res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_RECOVERING) { + mlog(0, "node %u returned DLM_RECOVERING from convert " + "message!\n", res->owner); + } else if (ret == DLM_MIGRATING) { + mlog(0, "node %u returned DLM_MIGRATING from convert " + "message!\n", res->owner); + } else if (ret == DLM_FORWARD) { + mlog(0, "node %u returned DLM_FORWARD from convert " + "message!\n", res->owner); + } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) + dlm_error(ret); + } else { + mlog_errno(tmpret); + if (dlm_is_host_down(tmpret)) { + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from convert message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* handler for DLM_CONVERT_LOCK_MSG on master site + * locking: + * caller needs: none + * taken: takes and drop res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, + * status from __dlmconvert_master + */ +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct 
list_head *iter; + struct dlm_lock *lock = NULL; + struct dlm_lockstatus *lksb; + enum dlm_status status = DLM_NORMAL; + u32 flags; + int call_ast = 0, kick_thread = 0; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + if (cnv->namelen > DLM_LOCKID_NAME_MAX) { + status = DLM_IVBUFLEN; + dlm_error(status); + goto leave; + } + + flags = be32_to_cpu(cnv->flags); + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + status = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == cnv->cookie && + lock->ml.node == cnv->node_idx) { + dlm_lock_get(lock); + break; + } + lock = NULL; + } + spin_unlock(&res->spinlock); + if (!lock) { + status = DLM_IVLOCKID; + dlm_error(status); + goto leave; + } + + /* found the lock */ + lksb = lock->lksb; + + /* see if caller needed to get/put lvb */ + if (flags & LKM_PUT_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN); + } else if (flags & LKM_GET_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_GET_LVB; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status == DLM_NORMAL) { + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + status = __dlmconvert_master(dlm, res, lock, flags, + cnv->requested_type, + &call_ast, &kick_thread); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + } + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); + } + +leave: + if (!lock) + mlog(ML_ERROR, "did not find lock to convert on grant queue! " + "cookie=%"MLFu64"\n", + cnv->cookie); + else + dlm_lock_put(lock); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h new file mode 100644 index 000000000000..b2e3677df878 --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -0,0 +1,35 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMCONVERT_H +#define DLMCONVERT_H + +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +#endif diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c new file mode 100644 index 000000000000..f339fe27975a --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -0,0 +1,246 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.c + * + * debug functionality for the dlm + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdebug.h" + +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", + res->lockname.len, res->lockname.name, + res->owner, res->state); + spin_lock(&res->spinlock); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); +} + +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + struct list_head *iter2; + struct dlm_lock *lock; + + assert_spin_locked(&res->spinlock); + + mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", + res->lockname.len, res->lockname.name, + res->owner, res->state); + mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", + res->last_used, list_empty(&res->purge) ? "no" : "yes"); + mlog(ML_NOTICE, " granted queue: \n"); + list_for_each(iter2, &res->granted) { + lock = list_entry(iter2, struct dlm_lock, list); + spin_lock(&lock->spinlock); + mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " + "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, + list_empty(&lock->ast_list) ? 'y' : 'n', + lock->ast_pending ? 'y' : 'n', + list_empty(&lock->bast_list) ? 'y' : 'n', + lock->bast_pending ? 
'y' : 'n');
+ spin_unlock(&lock->spinlock);
+ }
+ mlog(ML_NOTICE, " converting queue: \n");
+ list_for_each(iter2, &res->converting) {
+ lock = list_entry(iter2, struct dlm_lock, list);
+ spin_lock(&lock->spinlock);
+ mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
+ "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
+ lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
+ list_empty(&lock->ast_list) ? 'y' : 'n',
+ lock->ast_pending ? 'y' : 'n',
+ list_empty(&lock->bast_list) ? 'y' : 'n',
+ lock->bast_pending ? 'y' : 'n');
+ spin_unlock(&lock->spinlock);
+ }
+ mlog(ML_NOTICE, " blocked queue: \n");
+ list_for_each(iter2, &res->blocked) {
+ lock = list_entry(iter2, struct dlm_lock, list);
+ spin_lock(&lock->spinlock);
+ mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
+ "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
+ lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
+ list_empty(&lock->ast_list) ? 'y' : 'n',
+ lock->ast_pending ? 'y' : 'n',
+ list_empty(&lock->bast_list) ? 'y' : 'n',
+ lock->bast_pending ? 'y' : 'n');
+ spin_unlock(&lock->spinlock);
+ }
+}
+
+void dlm_print_one_lock(struct dlm_lock *lockid)
+{
+ dlm_print_one_lock_resource(lockid->lockres);
+}
+EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
+{
+ struct dlm_lock_resource *res;
+ struct list_head *iter;
+ struct list_head *bucket;
+ int i;
+
+ if (!dlm || !dlm->name) {
+ mlog(ML_ERROR, "dlm=%p\n", dlm);
+ return;
+ }
+
+ mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
+ dlm->name, dlm->node_num, dlm->key);
+
+ spin_lock(&dlm->spinlock);
+ for (i=0; i<DLM_HASH_SIZE; i++) {
+ bucket = &(dlm->resources[i]);
+ list_for_each(iter, bucket) {
+ res = list_entry(iter, struct dlm_lock_resource, list);
+ dlm_print_one_lock_resource(res);
+ }
+ }
+ spin_unlock(&dlm->spinlock);
+}
+
+static const char *dlm_errnames[] = {
+ [DLM_NORMAL] = "DLM_NORMAL",
+ [DLM_GRANTED] = "DLM_GRANTED",
+ [DLM_DENIED] = "DLM_DENIED",
+ [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS",
+ [DLM_WORKING] = "DLM_WORKING",
+ [DLM_BLOCKED] = "DLM_BLOCKED",
+ [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN",
+ [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD",
+ [DLM_SYSERR] = "DLM_SYSERR",
+ [DLM_NOSUPPORT] = "DLM_NOSUPPORT",
+ [DLM_CANCELGRANT] = "DLM_CANCELGRANT",
+ [DLM_IVLOCKID] = "DLM_IVLOCKID",
+ [DLM_SYNC] = "DLM_SYNC",
+ [DLM_BADTYPE] = "DLM_BADTYPE",
+ [DLM_BADRESOURCE] = "DLM_BADRESOURCE",
+ [DLM_MAXHANDLES] = "DLM_MAXHANDLES",
+ [DLM_NOCLINFO] = "DLM_NOCLINFO",
+ [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR",
+ [DLM_NOPURGED] = "DLM_NOPURGED",
+ [DLM_BADARGS] = "DLM_BADARGS",
+ [DLM_VOID] = "DLM_VOID",
+ [DLM_NOTQUEUED] = "DLM_NOTQUEUED",
+ [DLM_IVBUFLEN] = "DLM_IVBUFLEN",
+ [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT",
+ [DLM_BADPARAM] = "DLM_BADPARAM",
+ [DLM_VALNOTVALID] = "DLM_VALNOTVALID",
+ [DLM_REJECTED] = "DLM_REJECTED",
+ [DLM_ABORT] = "DLM_ABORT",
+ [DLM_CANCEL] = "DLM_CANCEL",
+ [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE",
+ [DLM_DEADLOCK] = "DLM_DEADLOCK",
+ [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS",
+ [DLM_FORWARD] = "DLM_FORWARD",
+ [DLM_TIMEOUT] = "DLM_TIMEOUT",
+ [DLM_IVGROUPID] = "DLM_IVGROUPID",
+ [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT",
+ [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH",
+ [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION",
+ [DLM_NO_CONTROL_DEVICE] = "DLM_NO_CONTROL_DEVICE",
+ [DLM_RECOVERING] = "DLM_RECOVERING",
+ [DLM_MIGRATING] = "DLM_MIGRATING",
+ [DLM_MAXSTATS] = "DLM_MAXSTATS",
+};
+
+static const char *dlm_errmsgs[] = {
+ [DLM_NORMAL] = "request
in progress", + [DLM_GRANTED] = "request granted", + [DLM_DENIED] = "request denied", + [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", + [DLM_WORKING] = "async request in progress", + [DLM_BLOCKED] = "lock request blocked", + [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", + [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", + [DLM_SYSERR] = "system error", + [DLM_NOSUPPORT] = "unsupported", + [DLM_CANCELGRANT] = "can't cancel convert: already granted", + [DLM_IVLOCKID] = "bad lockid", + [DLM_SYNC] = "synchronous request granted", + [DLM_BADTYPE] = "bad resource type", + [DLM_BADRESOURCE] = "bad resource handle", + [DLM_MAXHANDLES] = "no more resource handles", + [DLM_NOCLINFO] = "can't contact cluster manager", + [DLM_NOLOCKMGR] = "can't contact lock manager", + [DLM_NOPURGED] = "can't contact purge daemon", + [DLM_BADARGS] = "bad api args", + [DLM_VOID] = "no status", + [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", + [DLM_IVBUFLEN] = "invalid resource name length", + [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", + [DLM_BADPARAM] = "invalid lock mode specified", + [DLM_VALNOTVALID] = "value block has been invalidated", + [DLM_REJECTED] = "request rejected, unrecognized client", + [DLM_ABORT] = "blocked lock request cancelled", + [DLM_CANCEL] = "conversion request cancelled", + [DLM_IVRESHANDLE] = "invalid resource handle", + [DLM_DEADLOCK] = "deadlock recovery refused this request", + [DLM_DENIED_NOASTS] = "failed to allocate AST", + [DLM_FORWARD] = "request must wait for primary's response", + [DLM_TIMEOUT] = "timeout value for lock has expired", + [DLM_IVGROUPID] = "invalid group specification", + [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", + [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", + [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", + [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", + [DLM_RECOVERING] = "lock resource being recovered", + [DLM_MIGRATING] = "lock resource being migrated", + [DLM_MAXSTATS] = "invalid error number", +}; + +const char *dlm_errmsg(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errmsgs[DLM_MAXSTATS]; + return dlm_errmsgs[err]; +} +EXPORT_SYMBOL_GPL(dlm_errmsg); + +const char *dlm_errname(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errnames[DLM_MAXSTATS]; + return dlm_errnames[err]; +} +EXPORT_SYMBOL_GPL(dlm_errname); diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 000000000000..6858510c3ccd --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDEBUG_H +#define DLMDEBUG_H + +void dlm_dump_lock_resources(struct dlm_ctxt *dlm); + +#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 000000000000..da3c22045f89 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -0,0 +1,1469 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.c + * + * defines domain join / leave apis + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmdebug.h" +#include "dlmdomain.h" + +#include "dlmver.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) +#include "cluster/masklog.h" + +/* + * + * spinlock lock ordering: if multiple locks are needed, obey this ordering: + * dlm_domain_lock + * struct dlm_ctxt->spinlock + * struct dlm_lock_resource->spinlock + * struct dlm_ctxt->master_lock + * struct dlm_ctxt->ast_lock + * dlm_master_list_entry->spinlock + * dlm_lock->spinlock + * + */ + +spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(dlm_domains); +static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); + +#define DLM_DOMAIN_BACKOFF_MS 200 + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); + +void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) +{ + list_del_init(&lockres->list); + dlm_lockres_put(lockres); +} + +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *bucket; + struct qstr *q; + + assert_spin_locked(&dlm->spinlock); + + q = &res->lockname; + q->hash = full_name_hash(q->name, q->len); + bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); + + /* get a reference for our hashtable */ + dlm_lockres_get(res); + + list_add_tail(&res->list, bucket); +} + +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + unsigned int hash; + struct list_head *iter; + struct dlm_lock_resource *tmpres=NULL; + struct list_head *bucket; + + mlog_entry("%.*s\n", len, name); + + 
assert_spin_locked(&dlm->spinlock); + + hash = full_name_hash(name, len); + + bucket = &(dlm->resources[hash & DLM_HASH_MASK]); + + /* check for pre-existing lock */ + list_for_each(iter, bucket) { + tmpres = list_entry(iter, struct dlm_lock_resource, list); + if (tmpres->lockname.len == len && + memcmp(tmpres->lockname.name, name, len) == 0) { + dlm_lockres_get(tmpres); + break; + } + + tmpres = NULL; + } + return tmpres; +} + +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + struct dlm_lock_resource *res; + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, len); + spin_unlock(&dlm->spinlock); + return res; +} + +static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) +{ + struct dlm_ctxt *tmp = NULL; + struct list_head *iter; + + assert_spin_locked(&dlm_domain_lock); + + /* tmp->name here is always NULL terminated, + * but domain may not be! */ + list_for_each(iter, &dlm_domains) { + tmp = list_entry (iter, struct dlm_ctxt, list); + if (strlen(tmp->name) == len && + memcmp(tmp->name, domain, len)==0) + break; + tmp = NULL; + } + + return tmp; +} + +/* For null terminated domain strings ONLY */ +static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) +{ + assert_spin_locked(&dlm_domain_lock); + + return __dlm_lookup_domain_full(domain, strlen(domain)); +} + + +/* returns true on one of two conditions: + * 1) the domain does not exist + * 2) the domain exists and it's state is "joined" */ +static int dlm_wait_on_domain_helper(const char *domain) +{ + int ret = 0; + struct dlm_ctxt *tmp = NULL; + + spin_lock(&dlm_domain_lock); + + tmp = __dlm_lookup_domain(domain); + if (!tmp) + ret = 1; + else if (tmp->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + return ret; +} + +static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) +{ + if (dlm->resources) + free_page((unsigned long) dlm->resources); + + if (dlm->name) + kfree(dlm->name); + + kfree(dlm); +} + +/* A little strange - this function will be called while holding + * dlm_domain_lock and is expected to be holding it on the way out. We + * will however drop and reacquire it multiple times */ +static void dlm_ctxt_release(struct kref *kref) +{ + struct dlm_ctxt *dlm; + + dlm = container_of(kref, struct dlm_ctxt, dlm_refs); + + BUG_ON(dlm->num_joins); + BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); + + /* we may still be in the list if we hit an error during join. */ + list_del_init(&dlm->list); + + spin_unlock(&dlm_domain_lock); + + mlog(0, "freeing memory from domain %s\n", dlm->name); + + wake_up(&dlm_domain_events); + + dlm_free_ctxt_mem(dlm); + + spin_lock(&dlm_domain_lock); +} + +void dlm_put(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm_domain_lock); + kref_put(&dlm->dlm_refs, dlm_ctxt_release); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_get(struct dlm_ctxt *dlm) +{ + kref_get(&dlm->dlm_refs); +} + +/* given a questionable reference to a dlm object, gets a reference if + * it can find it in the list, otherwise returns NULL in which case + * you shouldn't trust your pointer. 
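+ * Typical use in a net message handler:
+ * if (!dlm_grab(dlm))
+ * return 0;
+ * ...
+ * dlm_put(dlm);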
*/
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
+{
+ struct list_head *iter;
+ struct dlm_ctxt *target = NULL;
+
+ spin_lock(&dlm_domain_lock);
+
+ list_for_each(iter, &dlm_domains) {
+ target = list_entry(iter, struct dlm_ctxt, list);
+
+ if (target == dlm) {
+ __dlm_get(target);
+ break;
+ }
+
+ target = NULL;
+ }
+
+ spin_unlock(&dlm_domain_lock);
+
+ return target;
+}
+
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
+{
+ int ret;
+
+ spin_lock(&dlm_domain_lock);
+ ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
+ (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
+ spin_unlock(&dlm_domain_lock);
+
+ return ret;
+}
+
+static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
+{
+ dlm_unregister_domain_handlers(dlm);
+ dlm_complete_thread(dlm);
+ dlm_complete_recovery_thread(dlm);
+
+ /* We've left the domain. Now we can take ourselves out of the
+ * list and allow the kref stuff to help us free the
+ * memory. */
+ spin_lock(&dlm_domain_lock);
+ list_del_init(&dlm->list);
+ spin_unlock(&dlm_domain_lock);
+
+ /* Wake up anyone waiting for us to remove this domain */
+ wake_up(&dlm_domain_events);
+}
+
+static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+{
+ int i;
+ struct dlm_lock_resource *res;
+
+ mlog(0, "Migrating locks from domain %s\n", dlm->name);
+restart:
+ spin_lock(&dlm->spinlock);
+ for (i=0; i<DLM_HASH_SIZE; i++) {
+ while (!list_empty(&dlm->resources[i])) {
+ res = list_entry(dlm->resources[i].next,
+ struct dlm_lock_resource, list);
+ /* need reference when manually grabbing lockres */
+ dlm_lockres_get(res);
+ /* this should unhash the lockres
+ * and exit with dlm->spinlock */
+ mlog(0, "purging res=%p\n", res);
+ if (dlm_lockres_is_dirty(dlm, res)) {
+ /* HACK! this should absolutely go.
+ * need to figure out why some empty
+ * lockreses are still marked dirty */
+ mlog(ML_ERROR, "lockres %.*s dirty!\n",
+ res->lockname.len, res->lockname.name);
+
+ spin_unlock(&dlm->spinlock);
+ dlm_kick_thread(dlm, res);
+ wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+ dlm_lockres_put(res);
+ goto restart;
+ }
+ dlm_purge_lockres(dlm, res);
+ dlm_lockres_put(res);
+ }
+ }
+ spin_unlock(&dlm->spinlock);
+
+ mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+}
+
+static int dlm_no_joining_node(struct dlm_ctxt *dlm)
+{
+ int ret;
+
+ spin_lock(&dlm->spinlock);
+ ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
+ spin_unlock(&dlm->spinlock);
+
+ return ret;
+}
+
+static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
+{
+ /* Yikes, a double spinlock! I need domain_lock for the dlm
+ * state and the dlm spinlock for join state... Sorry!
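+ * (the nesting below follows the lock ordering documented at the
+ * top of this file: dlm_domain_lock before dlm->spinlock)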
*/ +again: + spin_lock(&dlm_domain_lock); + spin_lock(&dlm->spinlock); + + if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "Node %d is joining, we wait on it.\n", + dlm->joining_node); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); + goto again; + } + + dlm->dlm_state = DLM_CTXT_LEAVING; + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_print_nodes(struct dlm_ctxt *dlm) +{ + int node = -1; + + assert_spin_locked(&dlm->spinlock); + + mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); + + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + mlog(ML_NOTICE, " node %d\n", node); + } +} + +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + mlog_entry("%p %u %p", msg, len, data); + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + + mlog(0, "Node %u leaves domain %s\n", node, dlm->name); + + spin_lock(&dlm->spinlock); + clear_bit(node, dlm->domain_map); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, node, 0); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_exit_domain leave_msg; + + mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", + node, dlm->name, dlm->node_num); + + memset(&leave_msg, 0, sizeof(leave_msg)); + leave_msg.node_idx = dlm->node_num; + + status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, + &leave_msg, sizeof(leave_msg), node, + NULL); + + mlog(0, "status return %d from o2net_send_message\n", status); + + return status; +} + + +static void dlm_leave_domain(struct dlm_ctxt *dlm) +{ + int node, clear_node, status; + + /* At this point we've migrated away all our locks and won't + * accept mastership of new ones. The dlm is responsible for + * almost nothing now. We make sure not to confuse any joining + * nodes and then commence shutdown procedure. */ + + spin_lock(&dlm->spinlock); + /* Clear ourselves from the domain map */ + clear_bit(dlm->node_num, dlm->domain_map); + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + 0)) < O2NM_MAX_NODES) { + /* Drop the dlm spinlock. This is safe wrt the domain_map. + * -nodes cannot be added now as the + * query_join_handlers knows to respond with OK_NO_MAP + * -we catch the right network errors if a node is + * removed from the map while we're sending him the + * exit message. */ + spin_unlock(&dlm->spinlock); + + clear_node = 1; + + status = dlm_send_one_domain_exit(dlm, node); + if (status < 0 && + status != -ENOPROTOOPT && + status != -ENOTCONN) { + mlog(ML_NOTICE, "Error %d sending domain exit message " + "to node %d\n", status, node); + + /* Not sure what to do here but lets sleep for + * a bit in case this was a transient + * error... */ + msleep(DLM_DOMAIN_BACKOFF_MS); + clear_node = 0; + } + + spin_lock(&dlm->spinlock); + /* If we're not clearing the node bit then we intend + * to loop back around to try again. 
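+ * (find_next_bit() rescans the map from bit 0 on every pass, so
+ * a node bit left set will simply be retried)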
*/ + if (clear_node) + clear_bit(node, dlm->domain_map); + } + spin_unlock(&dlm->spinlock); +} + +int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +void dlm_unregister_domain(struct dlm_ctxt *dlm) +{ + int leave = 0; + + spin_lock(&dlm_domain_lock); + BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); + BUG_ON(!dlm->num_joins); + + dlm->num_joins--; + if (!dlm->num_joins) { + /* We mark it "in shutdown" now so new register + * requests wait until we've completely left the + * domain. Don't use DLM_CTXT_LEAVING yet as we still + * want new domain joins to communicate with us at + * least until we've completed migration of our + * resources. */ + dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; + leave = 1; + } + spin_unlock(&dlm_domain_lock); + + if (leave) { + mlog(0, "shutting down domain %s\n", dlm->name); + + /* We changed dlm state, notify the thread */ + dlm_kick_thread(dlm, NULL); + + dlm_migrate_all_locks(dlm); + dlm_mark_domain_leaving(dlm); + dlm_leave_domain(dlm); + dlm_complete_dlm_shutdown(dlm); + } + dlm_put(dlm); +} +EXPORT_SYMBOL_GPL(dlm_unregister_domain); + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_query_join_request *query; + enum dlm_query_join_response response; + struct dlm_ctxt *dlm = NULL; + + query = (struct dlm_query_join_request *) msg->buf; + + mlog(0, "node %u wants to join domain %s\n", query->node_idx, + query->domain); + + /* + * If heartbeat doesn't consider the node live, tell it + * to back off and try again. This gives heartbeat a chance + * to catch up. + */ + if (!o2hb_check_node_heartbeating(query->node_idx)) { + mlog(0, "node %u is not in our live map yet\n", + query->node_idx); + + response = JOIN_DISALLOW; + goto respond; + } + + response = JOIN_OK_NO_MAP; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + /* Once the dlm ctxt is marked as leaving then we don't want + * to be put in someone's domain map. */ + if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + spin_lock(&dlm->spinlock); + + if (dlm->dlm_state == DLM_CTXT_NEW && + dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { + /*If this is a brand new context and we + * haven't started our join process yet, then + * the other node won the race. */ + response = JOIN_OK_NO_MAP; + } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* Disallow parallel joins. */ + response = JOIN_DISALLOW; + } else { + /* Alright we're fully a part of this domain + * so we keep some state as to who's joining + * and indicate to him that needs to be fixed + * up. 
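+ * (JOIN_OK, unlike the JOIN_OK_NO_MAP cases above, tells the
+ * joining node that we are an active member of this domain)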
*/ + response = JOIN_OK; + __dlm_set_joining_node(dlm, query->node_idx); + } + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + +respond: + mlog(0, "We respond with %u\n", response); + + return response; +} + +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_assert_joined *assert; + struct dlm_ctxt *dlm = NULL; + + assert = (struct dlm_assert_joined *) msg->buf; + + mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, + assert->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); + /* XXX should we consider no dlm ctxt an error? */ + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Alright, this node has officially joined our + * domain. Set him in the map and clean up our + * leftover join state. */ + BUG_ON(dlm->joining_node != assert->node_idx); + set_bit(assert->node_idx, dlm->domain_map); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_cancel_join *cancel; + struct dlm_ctxt *dlm = NULL; + + cancel = (struct dlm_cancel_join *) msg->buf; + + mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, + cancel->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); + + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Yikes, this guy wants to cancel his join. No + * problem, we simply cleanup our join state. */ + BUG_ON(dlm->joining_node != cancel->node_idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_cancel_join cancel_msg; + + memset(&cancel_msg, 0, sizeof(cancel_msg)); + cancel_msg.node_idx = dlm->node_num; + cancel_msg.name_len = strlen(dlm->name); + memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); + + status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + &cancel_msg, sizeof(cancel_msg), node, + NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +/* map_size should be in bytes. 
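In other words, the caller passes the byte size of the whole bitmap, exactly as dlm_try_to_join_domain() does later in this file (fragment shown for the contract only):

	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int status;

	/* sizeof(map) == BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long) */
	status = dlm_send_join_cancels(dlm, map, sizeof(map));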
*/ +static int dlm_send_join_cancels(struct dlm_ctxt *dlm, + unsigned long *node_map, + unsigned int map_size) +{ + int status, tmpstat; + unsigned int node; + + if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * + sizeof(unsigned long))) { + mlog(ML_ERROR, + "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", + map_size, BITS_TO_LONGS(O2NM_MAX_NODES)); + return -EINVAL; + } + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + tmpstat = dlm_send_one_join_cancel(dlm, node); + if (tmpstat) { + mlog(ML_ERROR, "Error return %d cancelling join on " + "node %d\n", tmpstat, node); + if (!status) + status = tmpstat; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int dlm_request_join(struct dlm_ctxt *dlm, + int node, + enum dlm_query_join_response *response) +{ + int status, retval; + struct dlm_query_join_request join_msg; + + mlog(0, "querying node %d\n", node); + + memset(&join_msg, 0, sizeof(join_msg)); + join_msg.node_idx = dlm->node_num; + join_msg.name_len = strlen(dlm->name); + memcpy(join_msg.domain, dlm->name, join_msg.name_len); + + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, + sizeof(join_msg), node, &retval); + if (status < 0 && status != -ENOPROTOOPT) { + mlog_errno(status); + goto bail; + } + + /* -ENOPROTOOPT from the net code means the other side isn't + listening for our message type -- that's fine, it means + his dlm isn't up, so we can consider him a 'yes' but not + joined into the domain. */ + if (status == -ENOPROTOOPT) { + status = 0; + *response = JOIN_OK_NO_MAP; + } else if (retval == JOIN_DISALLOW || + retval == JOIN_OK || + retval == JOIN_OK_NO_MAP) { + *response = retval; + } else { + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", retval, + node); + } + + mlog(0, "status %d, node %d response is %d\n", status, node, + *response); + +bail: + return status; +} + +static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_assert_joined assert_msg; + + mlog(0, "Sending join assert to node %u\n", node); + + memset(&assert_msg, 0, sizeof(assert_msg)); + assert_msg.node_idx = dlm->node_num; + assert_msg.name_len = strlen(dlm->name); + memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); + + status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + &assert_msg, sizeof(assert_msg), node, + NULL); + if (status < 0) + mlog_errno(status); + + return status; +} + +static void dlm_send_join_asserts(struct dlm_ctxt *dlm, + unsigned long *node_map) +{ + int status, node, live; + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + do { + /* It is very important that this message be + * received so we spin until either the node + * has died or it gets the message. */ + status = dlm_send_one_join_assert(dlm, node); + + spin_lock(&dlm->spinlock); + live = test_bit(node, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + + if (status) { + mlog(ML_ERROR, "Error return %d asserting " + "join on node %d\n", status, node); + + /* give us some time between errors... 
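Earlier in this hunk, dlm_request_join() folds two kinds of "yes" into one response. A hypothetical caller fragment showing how the two are meant to be consumed (mirroring what dlm_try_to_join_domain() below actually does): only JOIN_OK voters belong in the domain map.

	enum dlm_query_join_response response;
	int status = dlm_request_join(dlm, node, &response);

	if (!status && response == JOIN_OK)
		set_bit(node, ctxt->yes_resp_map);	/* node is in the domain */
	/* JOIN_OK_NO_MAP: node consents but its dlm is not up;
	 * nothing to record */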
*/ + if (live) + msleep(DLM_DOMAIN_BACKOFF_MS); + } + } while (status && live); + } +} + +struct domain_join_ctxt { + unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +static int dlm_should_restart_join(struct dlm_ctxt *dlm, + struct domain_join_ctxt *ctxt, + enum dlm_query_join_response response) +{ + int ret; + + if (response == JOIN_DISALLOW) { + mlog(0, "Latest response of disallow -- should restart\n"); + return 1; + } + + spin_lock(&dlm->spinlock); + /* For now, we restart the process if the node maps have + * changed at all */ + ret = memcmp(ctxt->live_map, dlm->live_nodes_map, + sizeof(dlm->live_nodes_map)); + spin_unlock(&dlm->spinlock); + + if (ret) + mlog(0, "Node maps changed -- should restart\n"); + + return ret; +} + +static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) +{ + int status = 0, tmpstat, node; + struct domain_join_ctxt *ctxt; + enum dlm_query_join_response response; + + mlog_entry("%p", dlm); + + ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* group sem locking should work for us here -- we're already + * registered for heartbeat events so filling this should be + * atomic wrt getting those handlers called. */ + o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + + spin_lock(&dlm->spinlock); + memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); + + __dlm_set_joining_node(dlm, dlm->node_num); + + spin_unlock(&dlm->spinlock); + + node = -1; + while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + status = dlm_request_join(dlm, node, &response); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Ok, either we got a response or the node doesn't have a + * dlm up. */ + if (response == JOIN_OK) + set_bit(node, ctxt->yes_resp_map); + + if (dlm_should_restart_join(dlm, ctxt, response)) { + status = -EAGAIN; + goto bail; + } + } + + mlog(0, "Yay, done querying nodes!\n"); + + /* Yay, everyone agrees we can join the domain. My domain is + * comprised of all nodes who were put in the + * yes_resp_map. Copy that into our domain map and send a join + * assert message to clean up everyone else's state. */ + spin_lock(&dlm->spinlock); + memcpy(dlm->domain_map, ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + set_bit(dlm->node_num, dlm->domain_map); + spin_unlock(&dlm->spinlock); + + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); + + /* Joined state *must* be set before the joining node + * information, otherwise the query_join handler may read no + * current joiner but a state of NEW and tell joining nodes + * we're not in the domain. */ + spin_lock(&dlm_domain_lock); + dlm->dlm_state = DLM_CTXT_JOINED; + dlm->num_joins++; + spin_unlock(&dlm_domain_lock); + +bail: + spin_lock(&dlm->spinlock); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + if (!status) + __dlm_print_nodes(dlm); + spin_unlock(&dlm->spinlock); + + if (ctxt) { + /* Do we need to send a cancel message to any nodes?
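Yes: precisely the nodes that answered JOIN_OK and so recorded us as their joining node. Pulling the pieces of this file together, the join handshake looks like this (editorial summary of the handlers above, not new protocol):

	/*
	 * Join handshake, from the joining node's side:
	 *
	 *   1. DLM_QUERY_JOIN_MSG to every live node:
	 *        JOIN_OK        -> node is in the domain and consents
	 *        JOIN_OK_NO_MAP -> node consents, its dlm is not up
	 *        JOIN_DISALLOW  -> restart the whole join (-EAGAIN)
	 *   2a. on success: DLM_ASSERT_JOINED_MSG to every JOIN_OK voter
	 *   2b. on failure: DLM_CANCEL_JOIN_MSG to every JOIN_OK voter,
	 *       clearing the joining_node state they hold for us
	 */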
*/ + if (status < 0) { + tmpstat = dlm_send_join_cancels(dlm, + ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + if (tmpstat < 0) + mlog_errno(tmpstat); + } + kfree(ctxt); + } + + mlog(0, "returning %d\n", status); + return status; +} + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) +{ + o2hb_unregister_callback(&dlm->dlm_hb_up); + o2hb_unregister_callback(&dlm->dlm_hb_down); + o2net_unregister_handler_list(&dlm->dlm_domain_handlers); +} + +static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) +{ + int status; + + mlog(0, "registering handlers.\n"); + + o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, + dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + status = o2hb_register_callback(&dlm->dlm_hb_down); + if (status) + goto bail; + + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(&dlm->dlm_hb_up); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, + sizeof(struct dlm_master_request), + dlm_master_request_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, + sizeof(struct dlm_assert_master), + dlm_assert_master_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, + sizeof(struct dlm_create_lock), + dlm_create_lock_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, + DLM_CONVERT_LOCK_MAX_LEN, + dlm_convert_lock_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, + DLM_UNLOCK_LOCK_MAX_LEN, + dlm_unlock_lock_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, + DLM_PROXY_AST_MAX_LEN, + dlm_proxy_ast_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_exit_domain_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, + sizeof(struct dlm_migrate_request), + dlm_migrate_request_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, + DLM_MIG_LOCKRES_MAX_LEN, + dlm_mig_lockres_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, + sizeof(struct dlm_master_requery), + dlm_master_requery_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, + sizeof(struct dlm_lock_request), + dlm_request_all_locks_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, + sizeof(struct dlm_reco_data_done), + dlm_reco_data_done_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, + sizeof(struct dlm_begin_reco), + dlm_begin_reco_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, + sizeof(struct 
dlm_finalize_reco), + dlm_finalize_reco_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + +bail: + if (status) + dlm_unregister_domain_handlers(dlm); + + return status; +} + +static int dlm_join_domain(struct dlm_ctxt *dlm) +{ + int status; + + BUG_ON(!dlm); + + mlog(0, "Join domain %s\n", dlm->name); + + status = dlm_register_domain_handlers(dlm); + if (status) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_recovery_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + do { + unsigned int backoff; + status = dlm_try_to_join_domain(dlm); + + /* If we're racing another node to the join, then we + * need to back off temporarily and let them + * complete. */ + if (status == -EAGAIN) { + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + /* + * After you! + * No, after you! + * I insist! + * But you first! + * ... + */ + backoff = (unsigned int)(jiffies & 0x3); + backoff *= DLM_DOMAIN_BACKOFF_MS; + mlog(0, "backoff %d\n", backoff); + msleep(backoff); + } + } while (status == -EAGAIN); + + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + wake_up(&dlm_domain_events); + + if (status) { + dlm_unregister_domain_handlers(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + } + + return status; +} + +static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, + u32 key) +{ + int i; + struct dlm_ctxt *dlm = NULL; + + dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); + if (!dlm) { + mlog_errno(-ENOMEM); + goto leave; + } + + dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); + if (dlm->name == NULL) { + mlog_errno(-ENOMEM); + kfree(dlm); + dlm = NULL; + goto leave; + } + + dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); + if (!dlm->resources) { + mlog_errno(-ENOMEM); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + memset(dlm->resources, 0, PAGE_SIZE); + + for (i=0; i<DLM_HASH_SIZE; i++) + INIT_LIST_HEAD(&dlm->resources[i]); + + strcpy(dlm->name, domain); + dlm->key = key; + dlm->node_num = o2nm_this_node(); + + spin_lock_init(&dlm->spinlock); + spin_lock_init(&dlm->master_lock); + spin_lock_init(&dlm->ast_lock); + INIT_LIST_HEAD(&dlm->list); + INIT_LIST_HEAD(&dlm->dirty_list); + INIT_LIST_HEAD(&dlm->reco.resources); + INIT_LIST_HEAD(&dlm->reco.received); + INIT_LIST_HEAD(&dlm->reco.node_data); + INIT_LIST_HEAD(&dlm->purge_list); + INIT_LIST_HEAD(&dlm->dlm_domain_handlers); + dlm->reco.state = 0; + + INIT_LIST_HEAD(&dlm->pending_asts); + INIT_LIST_HEAD(&dlm->pending_basts); + + mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", + dlm->recovery_map, &(dlm->recovery_map[0])); + + memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); + memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); + memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + + dlm->dlm_thread_task = NULL; + dlm->dlm_reco_thread_task = NULL; + init_waitqueue_head(&dlm->dlm_thread_wq); + init_waitqueue_head(&dlm->dlm_reco_thread_wq); + init_waitqueue_head(&dlm->reco.event); + init_waitqueue_head(&dlm->ast_wq); + init_waitqueue_head(&dlm->migration_wq); + INIT_LIST_HEAD(&dlm->master_list); + INIT_LIST_HEAD(&dlm->mle_hb_events); + + dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; + init_waitqueue_head(&dlm->dlm_join_events); + + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + atomic_set(&dlm->local_resources, 0); + atomic_set(&dlm->remote_resources,
0); + atomic_set(&dlm->unknown_resources, 0); + + spin_lock_init(&dlm->work_lock); + INIT_LIST_HEAD(&dlm->work_list); + INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm); + + kref_init(&dlm->dlm_refs); + dlm->dlm_state = DLM_CTXT_NEW; + + INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); + + mlog(0, "context init: refcount %u\n", + atomic_read(&dlm->dlm_refs.refcount)); + +leave: + return dlm; +} + +/* + * dlm_register_domain: one-time setup per "domain" + */ +struct dlm_ctxt * dlm_register_domain(const char *domain, + u32 key) +{ + int ret; + struct dlm_ctxt *dlm = NULL; + struct dlm_ctxt *new_ctxt = NULL; + + if (strlen(domain) > O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + mlog(ML_ERROR, "domain name length too long\n"); + goto leave; + } + + if (!o2hb_check_local_node_heartbeating()) { + mlog(ML_ERROR, "the local node has not been configured, or is " + "not heartbeating\n"); + ret = -EPROTO; + goto leave; + } + + mlog(0, "register called for domain \"%s\"\n", domain); + +retry: + dlm = NULL; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + mlog_errno(ret); + goto leave; + } + + spin_lock(&dlm_domain_lock); + + dlm = __dlm_lookup_domain(domain); + if (dlm) { + if (dlm->dlm_state != DLM_CTXT_JOINED) { + spin_unlock(&dlm_domain_lock); + + mlog(0, "This ctxt is not joined yet!\n"); + wait_event_interruptible(dlm_domain_events, + dlm_wait_on_domain_helper( + domain)); + goto retry; + } + + __dlm_get(dlm); + dlm->num_joins++; + + spin_unlock(&dlm_domain_lock); + + ret = 0; + goto leave; + } + + /* doesn't exist */ + if (!new_ctxt) { + spin_unlock(&dlm_domain_lock); + + new_ctxt = dlm_alloc_ctxt(domain, key); + if (new_ctxt) + goto retry; + + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + /* a little variable switch-a-roo here... */ + dlm = new_ctxt; + new_ctxt = NULL; + + /* add the new domain */ + list_add_tail(&dlm->list, &dlm_domains); + spin_unlock(&dlm_domain_lock); + + ret = dlm_join_domain(dlm); + if (ret) { + mlog_errno(ret); + dlm_put(dlm); + goto leave; + } + + ret = 0; +leave: + if (new_ctxt) + dlm_free_ctxt_mem(new_ctxt); + + if (ret < 0) + dlm = ERR_PTR(ret); + + return dlm; +} +EXPORT_SYMBOL_GPL(dlm_register_domain); + +static LIST_HEAD(dlm_join_handlers); + +static void dlm_unregister_net_handlers(void) +{ + o2net_unregister_handler_list(&dlm_join_handlers); +} + +static int dlm_register_net_handlers(void) +{ + int status = 0; + + status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_query_join_request), + dlm_query_join_handler, + NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + sizeof(struct dlm_assert_joined), + dlm_assert_joined_handler, + NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_cancel_join), + dlm_cancel_join_handler, + NULL, &dlm_join_handlers); + +bail: + if (status < 0) + dlm_unregister_net_handlers(); + + return status; +} + +/* Domain eviction callback handling. + * + * The file system requires notification of node death *before* the + * dlm completes its recovery work, otherwise it may be able to + * acquire locks on resources requiring recovery. Since the dlm can + * evict a node from its domain *before* heartbeat fires, a similar + * mechanism is required. */ + +/* Eviction is not expected to happen often, so a per-domain lock is + * not necessary. Eviction callbacks are allowed to sleep for short + * periods of time.
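A hypothetical consumer sketch (function and variable names invented) showing how a file system would hook domain eviction using only the API defined below; the callback signature is taken from the ec_func(node_num, ec_data) call in dlm_fire_domain_eviction_callbacks():

	static void my_evict(int node_num, void *data)
	{
		/* runs before dlm recovery completes; may sleep briefly */
		printk(KERN_INFO "node %d evicted from our domain\n", node_num);
	}

	static struct dlm_eviction_cb my_cb;

	static void my_attach(struct dlm_ctxt *dlm)
	{
		dlm_setup_eviction_cb(&my_cb, my_evict, NULL);
		dlm_register_eviction_cb(dlm, &my_cb);
	}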
*/ +static DECLARE_RWSEM(dlm_callback_sem); + +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num) +{ + struct list_head *iter; + struct dlm_eviction_cb *cb; + + down_read(&dlm_callback_sem); + list_for_each(iter, &dlm->dlm_eviction_callbacks) { + cb = list_entry(iter, struct dlm_eviction_cb, ec_item); + + cb->ec_func(node_num, cb->ec_data); + } + up_read(&dlm_callback_sem); +} + +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data) +{ + INIT_LIST_HEAD(&cb->ec_item); + cb->ec_func = f; + cb->ec_data = data; +} +EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); + +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); + +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_del_init(&cb->ec_item); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); + +static int __init dlm_init(void) +{ + int status; + + dlm_print_version(); + + status = dlm_init_mle_cache(); + if (status) + return -1; + + status = dlm_register_net_handlers(); + if (status) { + dlm_destroy_mle_cache(); + return -1; + } + + return 0; +} + +static void __exit dlm_exit (void) +{ + dlm_unregister_net_handlers(); + dlm_destroy_mle_cache(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(dlm_init); +module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h new file mode 100644 index 000000000000..2f7f60bfeb3b --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDOMAIN_H +#define DLMDOMAIN_H + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +int dlm_joined(struct dlm_ctxt *dlm); +int dlm_shutting_down(struct dlm_ctxt *dlm); +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num); + +#endif diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c new file mode 100644 index 000000000000..d1a0038557a3 --- /dev/null +++ b/fs/ocfs2/dlm/dlmlock.c @@ -0,0 +1,676 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmlock.c + * + * underlying calls for lock creation + * + * Copyright (C) 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; +static u64 dlm_next_cookie = 1; + +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags); +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie); +static void dlm_lock_release(struct kref *kref); +static void dlm_lock_detach_lockres(struct dlm_lock *lock); + +/* Tell us whether we can grant a new lock request. + * locking: + * caller needs: res->spinlock + * taken: none + * held on exit: none + * returns: 1 if the lock can be granted, 0 otherwise. + */ +static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + return 1; +} + +/* performs lock creation at the lockres master site + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_NOTQUEUED + */ +static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status = DLM_NORMAL; + + mlog_entry("type=%d\n", lock->ml.type); + + spin_lock(&res->spinlock); + /* if called from dlm_create_lock_handler, need to + * ensure it will not sleep in dlm_wait_on_lockres */ + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL && + lock->ml.node != dlm->node_num) { + /* erf. state changed after lock was dropped. */ + spin_unlock(&res->spinlock); + dlm_error(status); + return status; + } + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + + if (dlm_can_grant_new_lock(res, lock)) { + mlog(0, "I can grant this lock right away\n"); + /* got it right away */ + lock->lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + + /* for the recovery lock, we can't allow the ast + * to be queued since the dlmthread is already + * frozen. 
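dlm_can_grant_new_lock() above leans entirely on dlm_lock_compatible(), which is defined elsewhere in this series. A hedged sketch of the rule it applies, assuming the usual VMS-style semantics for the three modes this DLM grants (NL compatible with everything, PR with PR, EX with nothing but NL):

	static int lock_compatible_sketch(int existing, int request)
	{
		if (existing == LKM_NLMODE || request == LKM_NLMODE)
			return 1;	/* NL never conflicts */
		return existing == LKM_PRMODE && request == LKM_PRMODE;
	}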
but the recovery lock is always locked + * with LKM_NOQUEUE so we do not need the ast in + * this special case */ + if (!dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + kick_thread = 1; + call_ast = 1; + } + } else { + /* for NOQUEUE request, unless we get the + * lock right away, return DLM_NOTQUEUED */ + if (flags & LKM_NOQUEUE) + status = DLM_NOTQUEUED; + else { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + kick_thread = 1; + } + } + + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + dlm_lockres_calc_usage(dlm, res); + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* remove from local queue if it failed */ + list_del_init(&lock->list); + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; +} + + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_DENIED, DLM_RECOVERING, or net status + */ +static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + enum dlm_status status = DLM_DENIED; + + mlog_entry("type=%d\n", lock->ml.type); + mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, + res->lockname.name, flags); + + spin_lock(&res->spinlock); + + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + /* add lock to local (secondary) queue */ + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + lock->lock_pending = 1; + spin_unlock(&res->spinlock); + + /* spec seems to say that you will get DLM_NORMAL when the lock + * has been queued, meaning we need to wait for a reply here. */ + status = dlm_send_remote_lock_request(dlm, res, lock, flags); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->lock_pending = 0; + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_lock(res, lock); + dlm_lock_put(lock); + } + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + + wake_up(&res->wq); + return status; +} + + +/* for remote lock creation. 
+ * locking: + * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, or net status + */ +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + struct dlm_create_lock create; + int tmpret, status = 0; + enum dlm_status ret; + + mlog_entry_void(); + + memset(&create, 0, sizeof(create)); + create.node_idx = dlm->node_num; + create.requested_type = lock->ml.type; + create.cookie = lock->ml.cookie; + create.namelen = res->lockname.len; + create.flags = cpu_to_be32(flags); + memcpy(create.name, res->lockname.name, create.namelen); + + tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, + sizeof(create), res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + } else { + mlog_errno(tmpret); + if (dlm_is_host_down(tmpret)) { + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from lock message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +void dlm_lock_get(struct dlm_lock *lock) +{ + kref_get(&lock->lock_refs); +} + +void dlm_lock_put(struct dlm_lock *lock) +{ + kref_put(&lock->lock_refs, dlm_lock_release); +} + +static void dlm_lock_release(struct kref *kref) +{ + struct dlm_lock *lock; + + lock = container_of(kref, struct dlm_lock, lock_refs); + + BUG_ON(!list_empty(&lock->list)); + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + + dlm_lock_detach_lockres(lock); + + if (lock->lksb_kernel_allocated) { + mlog(0, "freeing kernel-allocated lksb\n"); + kfree(lock->lksb); + } + kfree(lock); +} + +/* associate a lock with its lockres, getting a ref on the lockres */ +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res) +{ + dlm_lockres_get(res); + lock->lockres = res; +} + +/* drop ref on lockres, if there is still one associated with lock */ +static void dlm_lock_detach_lockres(struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + res = lock->lockres; + if (res) { + lock->lockres = NULL; + mlog(0, "removing lock's lockres reference\n"); + dlm_lockres_put(res); + } +} + +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie) +{ + INIT_LIST_HEAD(&newlock->list); + INIT_LIST_HEAD(&newlock->ast_list); + INIT_LIST_HEAD(&newlock->bast_list); + spin_lock_init(&newlock->spinlock); + newlock->ml.type = type; + newlock->ml.convert_type = LKM_IVMODE; + newlock->ml.highest_blocked = LKM_IVMODE; + newlock->ml.node = node; + newlock->ml.pad1 = 0; + newlock->ml.list = 0; + newlock->ml.flags = 0; + newlock->ast = NULL; + newlock->bast = NULL; + newlock->astdata = NULL; + newlock->ml.cookie = cpu_to_be64(cookie); + newlock->ast_pending = 0; + newlock->bast_pending = 0; + newlock->convert_pending = 0; + newlock->lock_pending = 0; + newlock->unlock_pending = 0; + newlock->cancel_pending = 0; + newlock->lksb_kernel_allocated = 0; + + kref_init(&newlock->lock_refs); +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb) +{ + struct dlm_lock *lock; + int kernel_allocated = 0; + + lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + if (!lock) + return NULL; + + if (!lksb) { + /* zero memory only if kernel-allocated */ + lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + if (!lksb) { +
kfree(lock); + return NULL; + } + kernel_allocated = 1; + } + + dlm_init_lock(lock, type, node, cookie); + if (kernel_allocated) + lock->lksb_kernel_allocated = 1; + lock->lksb = lksb; + lksb->lockid = lock; + return lock; +} + +/* handler for lock creation net message + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED + */ +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + enum dlm_status status = DLM_NORMAL; + char *name; + unsigned int namelen; + + BUG_ON(!dlm); + + mlog_entry_void(); + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = create->name; + namelen = create->namelen; + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX) { + dlm_error(status); + goto leave; + } + + status = DLM_SYSERR; + newlock = dlm_new_lock(create->requested_type, + create->node_idx, + be64_to_cpu(create->cookie), NULL); + if (!newlock) { + dlm_error(status); + goto leave; + } + + lksb = newlock->lksb; + + if (be32_to_cpu(create->flags) & LKM_GET_LVB) { + lksb->flags |= DLM_LKSB_GET_LVB; + mlog(0, "set DLM_LKSB_GET_LVB flag\n"); + } + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, name, namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + mlog(0, "lockres recovering/migrating/in-progress\n"); + goto leave; + } + + dlm_lock_attach_lockres(newlock, res); + + status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags)); +leave: + if (status != DLM_NORMAL) + if (newlock) + dlm_lock_put(newlock); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */ +static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) +{ + u64 tmpnode = node_num; + + /* shift single byte of node num into top 8 bits */ + tmpnode <<= 56; + + spin_lock(&dlm_cookie_lock); + *cookie = (dlm_next_cookie | tmpnode); + if (++dlm_next_cookie & 0xff00000000000000ull) { + mlog(0, "This node's cookie will now wrap!\n"); + dlm_next_cookie = 1; + } + spin_unlock(&dlm_cookie_lock); +} + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, + struct dlm_lockstatus *lksb, int flags, + const char *name, dlm_astlockfunc_t *ast, void *data, + dlm_bastlockfunc_t *bast) +{ + enum dlm_status status; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + int convert = 0, recovery = 0; + + /* yes this function is a mess. + * TODO: clean this up. 
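An aside on dlm_get_next_cookie() just above, as a worked example of the cookie layout: the node number occupies the top 8 bits and the counter the low 56, so for node 5 handing out counter value 1:

	/* (u64)5 << 56 | 1 == 0x0500000000000001 */

Cookies therefore stay unique cluster-wide with no coordination, and the wrap check resets the counter to 1 the moment an increment would spill into the node-number bits.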
lots of common code in the + * lock and convert paths, especially in the retry blocks */ + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + status = DLM_BADPARAM; + if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) { + dlm_error(status); + goto error; + } + + if (flags & ~LKM_VALID_FLAGS) { + dlm_error(status); + goto error; + } + + convert = (flags & LKM_CONVERT); + recovery = (flags & LKM_RECOVERY); + + if (recovery && + (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) { + dlm_error(status); + goto error; + } + if (convert && (flags & LKM_LOCAL)) { + mlog(ML_ERROR, "strange LOCAL convert request!\n"); + goto error; + } + + if (convert) { + /* CONVERT request */ + + /* if converting, must pass in a valid dlm_lock */ + lock = lksb->lockid; + if (!lock) { + mlog(ML_ERROR, "NULL lock pointer in convert " + "request\n"); + goto error; + } + + res = lock->lockres; + if (!res) { + mlog(ML_ERROR, "NULL lockres pointer in convert " + "request\n"); + goto error; + } + dlm_lockres_get(res); + + /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are + * static after the original lock call. convert requests will + * ensure that everything is the same, or return DLM_BADARGS. + * this means that DLM_DENIED_NOASTS will never be returned. + */ + if (lock->lksb != lksb || lock->ast != ast || + lock->bast != bast || lock->astdata != data) { + status = DLM_BADARGS; + mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lksb, ast, bast, data); + mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lock->lksb, lock->ast, + lock->bast, lock->astdata); + goto error; + } +retry_convert: + dlm_wait_for_recovery(dlm); + + if (res->owner == dlm->node_num) + status = dlmconvert_master(dlm, res, lock, flags, mode); + else + status = dlmconvert_remote(dlm, res, lock, flags, mode); + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* for now, see how this works without sleeping + * and just retry right away. I suspect the reco + * or migration will complete fast enough that + * no waiting will be necessary */ + mlog(0, "retrying convert with migration/recovery/" + "in-progress\n"); + msleep(100); + goto retry_convert; + } + } else { + u64 tmpcookie; + + /* LOCK request */ + status = DLM_BADARGS; + if (!name) { + dlm_error(status); + goto error; + } + + status = DLM_IVBUFLEN; + if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) { + dlm_error(status); + goto error; + } + + dlm_get_next_cookie(dlm->node_num, &tmpcookie); + lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb); + if (!lock) { + dlm_error(status); + goto error; + } + + if (!recovery) + dlm_wait_for_recovery(dlm); + + /* find or create the lock resource */ + res = dlm_get_lock_resource(dlm, name, flags); + if (!res) { + status = DLM_IVLOCKID; + dlm_error(status); + goto error; + } + + mlog(0, "type=%d, flags = 0x%x\n", mode, flags); + mlog(0, "creating lock: lock=%p res=%p\n", lock, res); + + dlm_lock_attach_lockres(lock, res); + lock->ast = ast; + lock->bast = bast; + lock->astdata = data; + +retry_lock: + if (flags & LKM_VALBLK) { + mlog(0, "LKM_VALBLK passed by caller\n"); + + /* LVB requests for non PR, PW or EX locks are + * ignored. 
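A hedged note on the LKM_VALBLK test that follows: it relies on the numeric ordering of the lock modes (assumed here to be the usual VMS numbering, LKM_NLMODE < LKM_CRMODE < LKM_CWMODE < LKM_PRMODE < LKM_PWMODE < LKM_EXMODE), so a single comparison covers every mode too weak to carry a value block:

	if (mode < LKM_PRMODE)		/* NL/CR/CW: no LVB possible */
		flags &= ~LKM_VALBLK;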
*/ + if (mode < LKM_PRMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + + if (res->owner == dlm->node_num) + status = dlmlock_master(dlm, res, lock, flags); + else + status = dlmlock_remote(dlm, res, lock, flags); + + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + mlog(0, "retrying lock with migration/" + "recovery/in progress\n"); + msleep(100); + dlm_wait_for_recovery(dlm); + goto retry_lock; + } + + if (status != DLM_NORMAL) { + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; + if (status != DLM_NOTQUEUED) + dlm_error(status); + goto error; + } + } + +error: + if (status != DLM_NORMAL) { + if (lock && !convert) + dlm_lock_put(lock); + // this is kind of unnecessary + lksb->status = status; + } + + /* put lockres ref from the convert path + * or from dlm_get_lock_resource */ + if (res) + dlm_lockres_put(res); + + return status; +} +EXPORT_SYMBOL_GPL(dlmlock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c new file mode 100644 index 000000000000..047279546b4f --- /dev/null +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -0,0 +1,2666 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmmod.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) +#include "cluster/masklog.h" + +enum dlm_mle_type { + DLM_MLE_BLOCK, + DLM_MLE_MASTER, + DLM_MLE_MIGRATION +}; + +struct dlm_lock_name +{ + u8 len; + u8 name[DLM_LOCKID_NAME_MAX]; +}; + +struct dlm_master_list_entry +{ + struct list_head list; + struct list_head hb_events; + struct dlm_ctxt *dlm; + spinlock_t spinlock; + wait_queue_head_t wq; + atomic_t woken; + struct kref mle_refs; + unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + u8 master; + u8 new_master; + enum dlm_mle_type type; + struct o2hb_callback_func mle_hb_up; + struct o2hb_callback_func mle_hb_down; + union { + struct dlm_lock_resource *res; + struct dlm_lock_name name; + } u; +}; + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, + unsigned int namelen, void *nodemap, + u32 flags); + +static inline int dlm_mle_equal(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res; + + if (dlm != mle->dlm) + return 0; + + if (mle->type == DLM_MLE_BLOCK || + mle->type == DLM_MLE_MIGRATION) { + if (namelen != mle->u.name.len || + memcmp(name, mle->u.name.name, namelen)!=0) + return 0; + } else { + res = mle->u.res; + if (namelen != res->lockname.len || + memcmp(res->lockname.name, name, namelen) != 0) + return 0; + } + return 1; +} + +#if 0 +/* Code here is included but defined out as it aids debugging */ + +void dlm_print_one_mle(struct dlm_master_list_entry *mle) +{ + int i = 0, refs; + char *type; + char attached; + u8 master; + unsigned int namelen; + const char *name; + struct kref *k; + + k = &mle->mle_refs; + if (mle->type == DLM_MLE_BLOCK) + type = "BLK"; + else if (mle->type == DLM_MLE_MASTER) + type = "MAS"; + else + type = "MIG"; + refs = atomic_read(&k->refcount); + master = mle->master; + attached = (list_empty(&mle->hb_events) ? 'N' : 'Y'); + + if (mle->type != DLM_MLE_MASTER) { + namelen = mle->u.name.len; + name = mle->u.name.name; + } else { + namelen = mle->u.res->lockname.len; + name = mle->u.res->lockname.name; + } + + mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", + i, type, refs, master, mle->new_master, attached, + namelen, namelen, name); +} + +static void dlm_dump_mles(struct dlm_ctxt *dlm) +{ + struct dlm_master_list_entry *mle; + struct list_head *iter; + + mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); + mlog(ML_NOTICE, " ####: type refs owner new events? 
lockname nodemap votemap respmap maybemap\n"); + spin_lock(&dlm->master_lock); + list_for_each(iter, &dlm->master_list) { + mle = list_entry(iter, struct dlm_master_list_entry, list); + dlm_print_one_mle(mle); + } + spin_unlock(&dlm->master_lock); +} + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +int dlm_dump_all_mles(const char __user *data, unsigned int len) +{ + struct list_head *iter; + struct dlm_ctxt *dlm; + + spin_lock(&dlm_domain_lock); + list_for_each(iter, &dlm_domains) { + dlm = list_entry (iter, struct dlm_ctxt, list); + mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); + dlm_dump_mles(dlm); + } + spin_unlock(&dlm_domain_lock); + return len; +} +EXPORT_SYMBOL_GPL(dlm_dump_all_mles); + +#endif /* 0 */ + + +static kmem_cache_t *dlm_mle_cache = NULL; + + +static void dlm_mle_release(struct kref *kref); +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen); +static void dlm_put_mle(struct dlm_master_list_entry *mle); +static void __dlm_put_mle(struct dlm_master_list_entry *mle); +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen); + +static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); + + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked); +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked); +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master); + +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); + + +int dlm_is_host_down(int errno) +{ + switch (errno) { + case -EBADF: + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ETIMEDOUT: + case -ECONNABORTED: + case -ENETDOWN: + case -ENETUNREACH: + case -ENETRESET: + case -ESHUTDOWN: + case -ENOPROTOOPT: + case -EINVAL: /* if returned from our tcp code, + this means there is no socket */ + return 1; + } + return 0; +} + + +/* + * MASTER LIST FUNCTIONS + */ + + +/* + * regarding master list entries and heartbeat callbacks: + * + * in order to avoid sleeping and allocation that occurs in + * heartbeat, master list entries are simply attached to the + * dlm's established heartbeat callbacks. the mle is attached + * when it is created, and since the dlm->spinlock is held at + * that time, any heartbeat event will be properly discovered + * by the mle. the mle needs to be detached from the + * dlm->mle_hb_events list as soon as heartbeat events are no + * longer useful to the mle, and before the mle is freed. + * + * as a general rule, heartbeat events are no longer needed by + * the mle once an "answer" regarding the lock master has been + * received. 
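Condensed, the lifecycle those rules imply (a restatement using the helpers defined just below, not new code):

	/* dlm_init_mle() attaches the mle under dlm->spinlock at creation;
	 * heartbeat up/down events then update mle->node_map/vote_map.
	 * Once an answer is in and the master is known: */
	dlm_mle_detach_hb_events(dlm, mle);	/* stop watching heartbeat */
	dlm_put_mle(mle);			/* only now may the mle die */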
+ */ +static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + + list_add_tail(&mle->hb_events, &dlm->mle_hb_events); +} + + +static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + if (!list_empty(&mle->hb_events)) + list_del_init(&mle->hb_events); +} + + +static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + spin_lock(&dlm->spinlock); + __dlm_mle_detach_hb_events(dlm, mle); + spin_unlock(&dlm->spinlock); +} + +/* remove from list and free */ +static void __dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + BUG_ON(!atomic_read(&mle->mle_refs.refcount)); + + kref_put(&mle->mle_refs, dlm_mle_release); +} + + +/* must not have any spinlocks coming in */ +static void dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} + +static inline void dlm_get_mle(struct dlm_master_list_entry *mle) +{ + kref_get(&mle->mle_refs); +} + +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen) +{ + assert_spin_locked(&dlm->spinlock); + + mle->dlm = dlm; + mle->type = type; + INIT_LIST_HEAD(&mle->list); + INIT_LIST_HEAD(&mle->hb_events); + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + spin_lock_init(&mle->spinlock); + init_waitqueue_head(&mle->wq); + atomic_set(&mle->woken, 0); + kref_init(&mle->mle_refs); + memset(mle->response_map, 0, sizeof(mle->response_map)); + mle->master = O2NM_MAX_NODES; + mle->new_master = O2NM_MAX_NODES; + + if (mle->type == DLM_MLE_MASTER) { + BUG_ON(!res); + mle->u.res = res; + } else if (mle->type == DLM_MLE_BLOCK) { + BUG_ON(!name); + memcpy(mle->u.name.name, name, namelen); + mle->u.name.len = namelen; + } else /* DLM_MLE_MIGRATION */ { + BUG_ON(!name); + memcpy(mle->u.name.name, name, namelen); + mle->u.name.len = namelen; + } + + /* copy off the node_map and register hb callbacks on our copy */ + memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); + memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + clear_bit(dlm->node_num, mle->vote_map); + clear_bit(dlm->node_num, mle->node_map); + + /* attach the mle to the domain node up/down events */ + __dlm_mle_attach_hb_events(dlm, mle); +} + + +/* returns 1 if found, 0 if not */ +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen) +{ + struct dlm_master_list_entry *tmpmle; + struct list_head *iter; + + assert_spin_locked(&dlm->master_lock); + + list_for_each(iter, &dlm->master_list) { + tmpmle = list_entry(iter, struct dlm_master_list_entry, list); + if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) + continue; + dlm_get_mle(tmpmle); + *mle = tmpmle; + return 1; + } + return 0; +} + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) +{ + struct dlm_master_list_entry *mle; + struct list_head *iter; + + assert_spin_locked(&dlm->spinlock); + + list_for_each(iter, &dlm->mle_hb_events) { + mle = list_entry(iter, struct dlm_master_list_entry, + hb_events); + if (node_up) + 
dlm_mle_node_up(dlm, mle, NULL, idx); + else + dlm_mle_node_down(dlm, mle, NULL, idx); + } +} + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (!test_bit(idx, mle->node_map)) + mlog(0, "node %u already removed from nodemap!\n", idx); + else + clear_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (test_bit(idx, mle->node_map)) + mlog(0, "node %u already in node map!\n", idx); + else + set_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + + +int dlm_init_mle_cache(void) +{ + dlm_mle_cache = kmem_cache_create("dlm_mle_cache", + sizeof(struct dlm_master_list_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (dlm_mle_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_mle_cache(void) +{ + if (dlm_mle_cache) + kmem_cache_destroy(dlm_mle_cache); +} + +static void dlm_mle_release(struct kref *kref) +{ + struct dlm_master_list_entry *mle; + struct dlm_ctxt *dlm; + + mlog_entry_void(); + + mle = container_of(kref, struct dlm_master_list_entry, mle_refs); + dlm = mle->dlm; + + if (mle->type != DLM_MLE_MASTER) { + mlog(0, "calling mle_release for %.*s, type %d\n", + mle->u.name.len, mle->u.name.name, mle->type); + } else { + mlog(0, "calling mle_release for %.*s, type %d\n", + mle->u.res->lockname.len, + mle->u.res->lockname.name, mle->type); + } + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + /* remove from list if not already */ + if (!list_empty(&mle->list)) + list_del_init(&mle->list); + + /* detach the mle from the domain node up/down events */ + __dlm_mle_detach_hb_events(dlm, mle); + + /* NOTE: kfree under spinlock here. + * if this is bad, we can move this to a freelist. */ + kmem_cache_free(dlm_mle_cache, mle); +} + + +/* + * LOCK RESOURCE FUNCTIONS + */ + +static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner); + + if (owner == dlm->node_num) + atomic_inc(&dlm->local_resources); + else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN) + atomic_inc(&dlm->unknown_resources); + else + atomic_inc(&dlm->remote_resources); + + res->owner = owner; +} + +void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner == res->owner) + return; + + if (res->owner == dlm->node_num) + atomic_dec(&dlm->local_resources); + else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) + atomic_dec(&dlm->unknown_resources); + else + atomic_dec(&dlm->remote_resources); + + dlm_set_lockres_owner(dlm, res, owner); +} + + +static void dlm_lockres_release(struct kref *kref) +{ + struct dlm_lock_resource *res; + + res = container_of(kref, struct dlm_lock_resource, refs); + + /* This should not happen -- all lockres' have a name + * associated with them at init time. */ + BUG_ON(!res->lockname.name); + + mlog(0, "destroying lockres %.*s\n", res->lockname.len, + res->lockname.name); + + /* By the time we're ready to blow this guy away, we shouldn't + * be on any lists. 
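That guarantee comes from the reference rule used throughout: whatever makes a lockres reachable also holds a reference to it. For example, attaching a lock pins the resource (restating dlm_lock_attach_lockres() from dlmlock.c earlier in this patch):

	dlm_lockres_get(res);	/* the lock now pins the resource */
	lock->lockres = res;

so by the time the final dlm_lockres_put() drops the refcount to zero, no lock, queue or hash bucket can still point at it, and the BUG_ON()s below are safe to assert.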
*/ + BUG_ON(!list_empty(&res->list)); + BUG_ON(!list_empty(&res->granted)); + BUG_ON(!list_empty(&res->converting)); + BUG_ON(!list_empty(&res->blocked)); + BUG_ON(!list_empty(&res->dirty)); + BUG_ON(!list_empty(&res->recovering)); + BUG_ON(!list_empty(&res->purge)); + + kfree(res->lockname.name); + + kfree(res); +} + +void dlm_lockres_get(struct dlm_lock_resource *res) +{ + kref_get(&res->refs); +} + +void dlm_lockres_put(struct dlm_lock_resource *res) +{ + kref_put(&res->refs, dlm_lockres_release); +} + +static void dlm_init_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, unsigned int namelen) +{ + char *qname; + + /* If we memset here, we lose our reference to the kmalloc'd + * res->lockname.name, so be sure to init every field + * correctly! */ + + qname = (char *) res->lockname.name; + memcpy(qname, name, namelen); + + res->lockname.len = namelen; + res->lockname.hash = full_name_hash(name, namelen); + + init_waitqueue_head(&res->wq); + spin_lock_init(&res->spinlock); + INIT_LIST_HEAD(&res->list); + INIT_LIST_HEAD(&res->granted); + INIT_LIST_HEAD(&res->converting); + INIT_LIST_HEAD(&res->blocked); + INIT_LIST_HEAD(&res->dirty); + INIT_LIST_HEAD(&res->recovering); + INIT_LIST_HEAD(&res->purge); + atomic_set(&res->asts_reserved, 0); + res->migration_pending = 0; + + kref_init(&res->refs); + + /* just for consistency */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&res->spinlock); + + res->state = DLM_LOCK_RES_IN_PROGRESS; + + res->last_used = 0; + + memset(res->lvb, 0, DLM_LVB_LEN); +} + +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res; + + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + if (!res) + return NULL; + + res->lockname.name = kmalloc(namelen, GFP_KERNEL); + if (!res->lockname.name) { + kfree(res); + return NULL; + } + + dlm_init_lockres(dlm, res, name, namelen); + return res; +} + +/* + * lookup a lock resource by name. + * may already exist in the hashtable. + * lockid is null terminated + * + * if not, allocate enough for the lockres and for + * the temporary structure used in doing the mastering. + * + * also, do a lookup in the dlm->master_list to see + * if another node has begun mastering the same lock. + * if so, there should be a block entry in there + * for this name, and we should *not* attempt to master + * the lock here. need to wait around for that node + * to assert_master (or die). + * + */ +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int flags) +{ + struct dlm_lock_resource *tmpres=NULL, *res=NULL; + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *alloc_mle = NULL; + int blocked = 0; + int ret, nodenum; + struct dlm_node_iter iter; + unsigned int namelen; + int tries = 0; + + BUG_ON(!lockid); + + namelen = strlen(lockid); + + mlog(0, "get lockres %s (len %d)\n", lockid, namelen); + +lookup: + spin_lock(&dlm->spinlock); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + if (tmpres) { + spin_unlock(&dlm->spinlock); + mlog(0, "found in hash!\n"); + if (res) + dlm_lockres_put(res); + res = tmpres; + goto leave; + } + + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(0, "allocating a new resource\n"); + /* nothing found and we need to allocate one. 
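This is the usual unlock-allocate-relookup dance: the spinlock cannot be held across a sleeping allocation, so the code allocates first and then jumps back to the lookup label in case another thread inserted the same name in the meantime. Skeleton of the pattern (restated from the surrounding code):

	spin_lock(&dlm->spinlock);
	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
	if (!tmpres && !res) {
		spin_unlock(&dlm->spinlock);
		res = dlm_new_lockres(dlm, lockid, namelen);	/* may sleep */
		goto lookup;	/* rescan: we may have raced an inserter */
	}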
*/ + alloc_mle = (struct dlm_master_list_entry *) + kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + if (!alloc_mle) + goto leave; + res = dlm_new_lockres(dlm, lockid, namelen); + if (!res) + goto leave; + goto lookup; + } + + mlog(0, "no lockres found, allocated our own: %p\n", res); + + if (flags & LKM_LOCAL) { + /* caller knows it's safe to assume it's not mastered elsewhere + * DONE! return right away */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + __dlm_insert_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* lockres still marked IN_PROGRESS */ + goto wake_waiters; + } + + /* check master list to see if another node has started mastering it */ + spin_lock(&dlm->master_lock); + + /* if we found a block, wait for lock to be mastered by another node */ + blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); + if (blocked) { + if (mle->type == DLM_MLE_MASTER) { + mlog(ML_ERROR, "master entry for nonexistent lock!\n"); + BUG(); + } else if (mle->type == DLM_MLE_MIGRATION) { + /* migration is in progress! */ + /* the good news is that we now know the + * "current" master (mle->master). */ + + spin_unlock(&dlm->master_lock); + assert_spin_locked(&dlm->spinlock); + + /* set the lockres owner and hash it */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, mle->master); + __dlm_insert_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + /* master is known, detach */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + mle = NULL; + goto wake_waiters; + } + } else { + /* go ahead and try to master lock on this node */ + mle = alloc_mle; + /* make sure this does not get freed below */ + alloc_mle = NULL; + dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); + set_bit(dlm->node_num, mle->maybe_map); + list_add(&mle->list, &dlm->master_list); + } + + /* at this point there is either a DLM_MLE_BLOCK or a + * DLM_MLE_MASTER on the master list, so it's safe to add the + * lockres to the hashtable. anyone who finds the lock will + * still have to wait on the IN_PROGRESS. */ + + /* finally add the lockres to its hash bucket */ + __dlm_insert_lockres(dlm, res); + /* get an extra ref on the mle in case this is a BLOCK + * if so, the creator of the BLOCK may try to put the last + * ref at this time in the assert master handler, so we + * need an extra one to keep from a bad ptr deref. */ + dlm_get_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* must wait for lock to be mastered elsewhere */ + if (blocked) + goto wait; + +redo_request: + ret = -EINVAL; + dlm_node_iter_init(mle->vote_map, &iter); + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = dlm_do_master_request(mle, nodenum); + if (ret < 0) + mlog_errno(ret); + if (mle->master != O2NM_MAX_NODES) { + /* found a master ! 
*/ + break; + } + } + +wait: + /* keep going until the response map includes all nodes */ + ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); + if (ret < 0) { + mlog(0, "%s:%.*s: node map changed, redo the " + "master request now, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + if (++tries > 20) { + mlog(ML_ERROR, "%s:%.*s: spinning on " + "dlm_wait_for_lock_mastery, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + dlm_print_one_lock_resource(res); + /* dlm_print_one_mle(mle); */ + tries = 0; + } + goto redo_request; + } + + mlog(0, "lockres mastered by %u\n", res->owner); + /* make sure we never continue without this */ + BUG_ON(res->owner == O2NM_MAX_NODES); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + /* put the extra ref */ + dlm_put_mle(mle); + +wake_waiters: + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + +leave: + /* need to free the unused mle */ + if (alloc_mle) + kmem_cache_free(dlm_mle_cache, alloc_mle); + + return res; +} + + +#define DLM_MASTERY_TIMEOUT_MS 5000 + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked) +{ + u8 m; + int ret, bit; + int map_changed, voting_done; + int assert, sleep; + +recheck: + ret = 0; + assert = 0; + + /* check if another node has already become the owner */ + spin_lock(&res->spinlock); + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + goto leave; + } + spin_unlock(&res->spinlock); + + spin_lock(&mle->spinlock); + m = mle->master; + map_changed = (memcmp(mle->vote_map, mle->node_map, + sizeof(mle->vote_map)) != 0); + voting_done = (memcmp(mle->vote_map, mle->response_map, + sizeof(mle->vote_map)) == 0); + + /* restart if we hit any errors */ + if (map_changed) { + int b; + mlog(0, "%s: %.*s: node map changed, restarting\n", + dlm->name, res->lockname.len, res->lockname.name); + ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); + b = (mle->type == DLM_MLE_BLOCK); + if ((*blocked && !b) || (!*blocked && b)) { + mlog(0, "%s:%.*s: status change: old=%d new=%d\n", + dlm->name, res->lockname.len, res->lockname.name, + *blocked, b); + *blocked = b; + } + spin_unlock(&mle->spinlock); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + mlog(0, "%s:%.*s: restart lock mastery succeeded, " + "rechecking now\n", dlm->name, res->lockname.len, + res->lockname.name); + goto recheck; + } + + if (m != O2NM_MAX_NODES) { + /* another node has done an assert! + * all done! */ + sleep = 0; + } else { + sleep = 1; + /* have all nodes responded? */ + if (voting_done && !*blocked) { + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (dlm->node_num <= bit) { + /* my node number is lowest. + * now tell other nodes that I am + * mastering this. 
*/ + mle->master = dlm->node_num; + assert = 1; + sleep = 0; + } + /* if voting is done, but we have not received + * an assert master yet, we must sleep */ + } + } + + spin_unlock(&mle->spinlock); + + /* sleep if we haven't finished voting yet */ + if (sleep) { + unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); + + /* + if (atomic_read(&mle->mle_refs.refcount) < 2) + mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, + atomic_read(&mle->mle_refs.refcount), + res->lockname.len, res->lockname.name); + */ + atomic_set(&mle->woken, 0); + (void)wait_event_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + timeo); + if (res->owner == O2NM_MAX_NODES) { + mlog(0, "waiting again\n"); + goto recheck; + } + mlog(0, "done waiting, master is %u\n", res->owner); + ret = 0; + goto leave; + } + + ret = 0; /* done */ + if (assert) { + m = dlm->node_num; + mlog(0, "about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, m); + ret = dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, mle->vote_map, 0); + if (ret) { + /* This is a failure in the network path, + * not in the response to the assert_master + * (any nonzero response is a BUG on this node). + * Most likely a socket just got disconnected + * due to node death. */ + mlog_errno(ret); + } + /* no longer need to restart lock mastery. + * all living nodes have been contacted. */ + ret = 0; + } + + /* set the lockres owner */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, m); + spin_unlock(&res->spinlock); + +leave: + return ret; +} + +struct dlm_bitmap_diff_iter +{ + int curnode; + unsigned long *orig_bm; + unsigned long *cur_bm; + unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +enum dlm_node_state_change +{ + NODE_DOWN = -1, + NODE_NO_CHANGE = 0, + NODE_UP +}; + +static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, + unsigned long *orig_bm, + unsigned long *cur_bm) +{ + unsigned long p1, p2; + int i; + + iter->curnode = -1; + iter->orig_bm = orig_bm; + iter->cur_bm = cur_bm; + + for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { + p1 = *(iter->orig_bm + i); + p2 = *(iter->cur_bm + i); + iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); + } +} + +static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, + enum dlm_node_state_change *state) +{ + int bit; + + if (iter->curnode >= O2NM_MAX_NODES) + return -ENOENT; + + bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, + iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + + /* if it was there in the original then this node died */ + if (test_bit(bit, iter->orig_bm)) + *state = NODE_DOWN; + else + *state = NODE_UP; + + iter->curnode = bit; + return bit; +} + + +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked) +{ + struct dlm_bitmap_diff_iter bdi; + enum dlm_node_state_change sc; + int node; + int ret = 0; + + mlog(0, "something happened such that the " + "master process may need to be restarted!\n"); + + assert_spin_locked(&mle->spinlock); + + dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + while (node >= 0) { + if (sc == NODE_UP) { + /* a node came up. easy. might not even need + * to talk to it if its node number is higher + * or if we are already blocked. */ + mlog(0, "node up! 
%d\n", node); + if (blocked) + goto next; + + if (node > dlm->node_num) { + mlog(0, "node > this node. skipping.\n"); + goto next; + } + + /* redo the master request, but only for the new node */ + mlog(0, "sending request to new node\n"); + clear_bit(node, mle->response_map); + set_bit(node, mle->vote_map); + } else { + mlog(ML_ERROR, "node down! %d\n", node); + + /* if the node wasn't involved in mastery skip it, + * but clear it out from the maps so that it will + * not affect mastery of this lockres */ + clear_bit(node, mle->response_map); + clear_bit(node, mle->vote_map); + if (!test_bit(node, mle->maybe_map)) + goto next; + + /* if we're already blocked on lock mastery, and the + * dead node wasn't the expected master, or there is + * another node in the maybe_map, keep waiting */ + if (blocked) { + int lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, 0); + + /* act like it was never there */ + clear_bit(node, mle->maybe_map); + + if (node != lowest) + goto next; + + mlog(ML_ERROR, "expected master %u died while " + "this node was blocked waiting on it!\n", + node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "still blocked. waiting " + "on %u now\n", lowest); + goto next; + } + + /* mle is an MLE_BLOCK, but there is now + * nothing left to block on. we need to return + * all the way back out and try again with + * an MLE_MASTER. dlm_do_local_recovery_cleanup + * has already run, so the mle refcount is ok */ + mlog(0, "no longer blocking. we can " + "try to master this here\n"); + mle->type = DLM_MLE_MASTER; + memset(mle->maybe_map, 0, + sizeof(mle->maybe_map)); + memset(mle->response_map, 0, + sizeof(mle->maybe_map)); + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + mle->u.res = res; + set_bit(dlm->node_num, mle->maybe_map); + + ret = -EAGAIN; + goto next; + } + + clear_bit(node, mle->maybe_map); + if (node > dlm->node_num) + goto next; + + mlog(0, "dead node in map!\n"); + /* yuck. go back and re-contact all nodes + * in the vote_map, removing this node. */ + memset(mle->response_map, 0, + sizeof(mle->response_map)); + } + ret = -EAGAIN; +next: + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + } + return ret; +} + + +/* + * DLM_MASTER_REQUEST_MSG + * + * returns: 0 on success, + * -errno on a network error + * + * on error, the caller should assume the target node is "dead" + * + */ + +static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) +{ + struct dlm_ctxt *dlm = mle->dlm; + struct dlm_master_request request; + int ret, response=0, resend; + + memset(&request, 0, sizeof(request)); + request.node_idx = dlm->node_num; + + BUG_ON(mle->type == DLM_MLE_MIGRATION); + + if (mle->type != DLM_MLE_MASTER) { + request.namelen = mle->u.name.len; + memcpy(request.name, mle->u.name.name, request.namelen); + } else { + request.namelen = mle->u.res->lockname.len; + memcpy(request.name, mle->u.res->lockname.name, + request.namelen); + } + +again: + ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, + sizeof(request), to, &response); + if (ret < 0) { + if (ret == -ESRCH) { + /* should never happen */ + mlog(ML_ERROR, "TCP stack not ready!\n"); + BUG(); + } else if (ret == -EINVAL) { + mlog(ML_ERROR, "bad args passed to o2net!\n"); + BUG(); + } else if (ret == -ENOMEM) { + mlog(ML_ERROR, "out of memory while trying to send " + "network message! 
retrying\n"); + /* this is totally crude */ + msleep(50); + goto again; + } else if (!dlm_is_host_down(ret)) { + /* not a network error. bad. */ + mlog_errno(ret); + mlog(ML_ERROR, "unhandled error!"); + BUG(); + } + /* all other errors should be network errors, + * and likely indicate node death */ + mlog(ML_ERROR, "link to %d went down!\n", to); + goto out; + } + + ret = 0; + resend = 0; + spin_lock(&mle->spinlock); + switch (response) { + case DLM_MASTER_RESP_YES: + set_bit(to, mle->response_map); + mlog(0, "node %u is the master, response=YES\n", to); + mle->master = to; + break; + case DLM_MASTER_RESP_NO: + mlog(0, "node %u not master, response=NO\n", to); + set_bit(to, mle->response_map); + break; + case DLM_MASTER_RESP_MAYBE: + mlog(0, "node %u not master, response=MAYBE\n", to); + set_bit(to, mle->response_map); + set_bit(to, mle->maybe_map); + break; + case DLM_MASTER_RESP_ERROR: + mlog(0, "node %u hit an error, resending\n", to); + resend = 1; + response = 0; + break; + default: + mlog(ML_ERROR, "bad response! %u\n", response); + BUG(); + } + spin_unlock(&mle->spinlock); + if (resend) { + /* this is also totally crude */ + msleep(50); + goto again; + } + +out: + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! + */ +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) +{ + u8 response = DLM_MASTER_RESP_MAYBE; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res; + struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; + char *name; + unsigned int namelen; + int found, ret; + int set_maybe; + + if (!dlm_grab(dlm)) + return DLM_MASTER_RESP_NO; + + if (!dlm_domain_fully_joined(dlm)) { + response = DLM_MASTER_RESP_NO; + goto send_response; + } + + name = request->name; + namelen = request->namelen; + + if (namelen > DLM_LOCKID_NAME_MAX) { + response = DLM_IVBUFLEN; + goto send_response; + } + +way_up_top: + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen); + if (res) { + spin_unlock(&dlm->spinlock); + + /* take care of the easy cases up front */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " + "being recovered\n"); + response = DLM_MASTER_RESP_ERROR; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + if (res->owner == dlm->node_num) { + u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP; + spin_unlock(&res->spinlock); + // mlog(0, "this node is the master\n"); + response = DLM_MASTER_RESP_YES; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + + /* this node is the owner. + * there is some extra work that needs to + * happen now. the requesting node has + * caused all nodes up to this one to + * create mles. this node now needs to + * go back and clean those up. 
*/ + mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", + dlm->node_num, res->lockname.len, res->lockname.name); + ret = dlm_dispatch_assert_master(dlm, res, 1, + request->node_idx, + flags); + if (ret < 0) { + mlog(ML_ERROR, "failed to dispatch assert " + "master work\n"); + response = DLM_MASTER_RESP_ERROR; + } + goto send_response; + } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + // mlog(0, "node %u is the master\n", res->owner); + response = DLM_MASTER_RESP_NO; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* ok, there is no owner. either this node is + * being blocked, or it is actively trying to + * master this lock. */ + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "lock with no owner should be " + "in-progress!\n"); + BUG(); + } + + // mlog(0, "lockres is in progress...\n"); + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + mlog(ML_ERROR, "no mle found for this lock!\n"); + BUG(); + } + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) { + // mlog(0, "this node is waiting for " + // "lockres to be mastered\n"); + response = DLM_MASTER_RESP_NO; + } else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "node %u is master, but trying to migrate to " + "node %u.\n", tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + mlog(ML_ERROR, "no owner on lockres, but this " + "node is trying to migrate it to %u?!\n", + tmpmle->new_master); + BUG(); + } else { + /* the real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } + } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { + set_maybe = 0; + if (tmpmle->master == dlm->node_num) + response = DLM_MASTER_RESP_YES; + else + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "this node is attempting to " + // "master lockres\n"); + response = DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + + spin_unlock(&dlm->master_lock); + spin_unlock(&res->spinlock); + + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* + * lockres doesn't exist on this node + * if there is an MLE_BLOCK, return NO + * if there is an MLE_MASTER, return MAYBE + * otherwise, add an MLE_BLOCK, return NO + */ + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + /* this lockid has never been seen on this node yet */ + // mlog(0, "no mle found\n"); + if (!mle) { + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + mle = (struct dlm_master_list_entry *) + kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + if (!mle) { + // bad bad bad... this sucks. 
+ response = DLM_MASTER_RESP_ERROR; + goto send_response; + } + spin_lock(&dlm->spinlock); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, + name, namelen); + spin_unlock(&dlm->spinlock); + goto way_up_top; + } + + // mlog(0, "this is second time thru, already allocated, " + // "add the block.\n"); + set_bit(request->node_idx, mle->maybe_map); + list_add(&mle->list, &dlm->master_list); + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "mle was found\n"); + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) + response = DLM_MASTER_RESP_NO; + else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "migration mle was found (%u->%u)\n", + tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no lockres, but migration mle " + "says that this node is master!\n"); + BUG(); + } + /* real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } else { + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + set_maybe = 0; + } else + response = DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (found) { + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + } +send_response: + dlm_put(dlm); + return response; +} + +/* + * DLM_ASSERT_MASTER_MSG + */ + + +/* + * NOTE: this can be used for debugging + * can periodically run all locks owned by this node + * and re-assert across the cluster... + */ +static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, + unsigned int namelen, void *nodemap, + u32 flags) +{ + struct dlm_assert_master assert; + int to, tmpret; + struct dlm_node_iter iter; + int ret = 0; + + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + /* note that if this nodemap is empty, it returns 0 */ + dlm_node_iter_init(nodemap, &iter); + while ((to = dlm_node_iter_next(&iter)) >= 0) { + int r = 0; + mlog(0, "sending assert master to %d (%.*s)\n", to, + namelen, lockname); + memset(&assert, 0, sizeof(assert)); + assert.node_idx = dlm->node_num; + assert.namelen = namelen; + memcpy(assert.name, lockname, namelen); + assert.flags = cpu_to_be32(flags); + + tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, + &assert, sizeof(assert), to, &r); + if (tmpret < 0) { + mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + if (!dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "unhandled error!\n"); + BUG(); + } + /* a node died. finish out the rest of the nodes. */ + mlog(ML_ERROR, "link to %d went down!\n", to); + /* any nonzero status return will do */ + ret = tmpret; + } else if (r < 0) { + /* ok, something horribly messed. kill thyself. */ + mlog(ML_ERROR,"during assert master of %.*s to %u, " + "got %d.\n", namelen, lockname, to, r); + dlm_dump_lock_resources(dlm); + BUG(); + } + } + + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! 
+ */ +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_list_entry *mle = NULL; + struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + u32 flags; + + if (!dlm_grab(dlm)) + return 0; + + name = assert->name; + namelen = assert->namelen; + flags = be32_to_cpu(assert->flags); + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + + spin_lock(&dlm->spinlock); + + if (flags) + mlog(0, "assert_master with flags: %u\n", flags); + + /* find the MLE */ + spin_lock(&dlm->master_lock); + if (!dlm_find_mle(dlm, &mle, name, namelen)) { + /* not an error, could be master just re-asserting */ + mlog(0, "just got an assert_master from %u, but no " + "MLE for it! (%.*s)\n", assert->node_idx, + namelen, name); + } else { + int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* not necessarily an error, though less likely. + * could be master just re-asserting. */ + mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + "is asserting! (%.*s)\n", assert->node_idx, + namelen, name); + } else if (bit != assert->node_idx) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "master %u was found, %u should " + "back off\n", assert->node_idx, bit); + } else { + /* with the fix for bug 569, a higher node + * number winning the mastery will respond + * YES to mastery requests, but this node + * had no way of knowing. let it pass. */ + mlog(ML_ERROR, "%u is the lowest node, " + "%u is asserting. (%.*s) %u must " + "have begun after %u won.\n", bit, + assert->node_idx, namelen, name, bit, + assert->node_idx); + } + } + } + spin_unlock(&dlm->master_lock); + + /* ok everything checks out with the MLE + * now check to see if there is a lockres */ + res = __dlm_lookup_lockres(dlm, name, namelen); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(ML_ERROR, "%u asserting but %.*s is " + "RECOVERING!\n", assert->node_idx, namelen, name); + goto kill; + } + if (!mle) { + if (res->owner != assert->node_idx) { + mlog(ML_ERROR, "assert_master from " + "%u, but current owner is " + "%u! (%.*s)\n", + assert->node_idx, res->owner, + namelen, name); + goto kill; + } + } else if (mle->type != DLM_MLE_MIGRATION) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* owner is just re-asserting */ + if (res->owner == assert->node_idx) { + mlog(0, "owner %u re-asserting on " + "lock %.*s\n", assert->node_idx, + namelen, name); + goto ok; + } + mlog(ML_ERROR, "got assert_master from " + "node %u, but %u is the owner! " + "(%.*s)\n", assert->node_idx, + res->owner, namelen, name); + goto kill; + } + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "got assert from %u, but lock " + "with no owner should be " + "in-progress! (%.*s)\n", + assert->node_idx, + namelen, name); + goto kill; + } + } else /* mle->type == DLM_MLE_MIGRATION */ { + /* should only be getting an assert from new master */ + if (assert->node_idx != mle->new_master) { + mlog(ML_ERROR, "got assert from %u, but " + "new master is %u, and old master " + "was %u (%.*s)\n", + assert->node_idx, mle->new_master, + mle->master, namelen, name); + goto kill; + } + + } +ok: + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + // mlog(0, "woo! 
got an assert_master from node %u!\n", + // assert->node_idx); + if (mle) { + int extra_ref; + + spin_lock(&mle->spinlock); + extra_ref = !!(mle->type == DLM_MLE_BLOCK + || mle->type == DLM_MLE_MIGRATION); + mle->master = assert->node_idx; + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + spin_unlock(&mle->spinlock); + + if (mle->type == DLM_MLE_MIGRATION && res) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + spin_unlock(&res->spinlock); + } + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + + if (extra_ref) { + /* the assert master message now balances the extra + * ref given by the master / migration request message. + * if this is the last put, it will be removed + * from the list. */ + dlm_put_mle(mle); + } + } + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + return 0; + +kill: + /* kill the caller! */ + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + dlm_dump_lock_resources(dlm); + dlm_put(dlm); + return -EINVAL; +} + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, u8 request_from, u32 flags) +{ + struct dlm_work_item *item; + item = kcalloc(1, sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + + /* queue up work for dlm_assert_master_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); + item->u.am.lockres = res; /* already have a ref */ + /* can optionally ignore node numbers higher than this node */ + item->u.am.ignore_higher = ignore_higher; + item->u.am.request_from = request_from; + item->u.am.flags = flags; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + schedule_work(&dlm->dispatched_work); + return 0; +} + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm = data; + int ret = 0; + struct dlm_lock_resource *res; + unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int ignore_higher; + int bit; + u8 request_from; + u32 flags; + + dlm = item->dlm; + res = item->u.am.lockres; + ignore_higher = item->u.am.ignore_higher; + request_from = item->u.am.request_from; + flags = item->u.am.flags; + + spin_lock(&dlm->spinlock); + memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + spin_unlock(&dlm->spinlock); + + clear_bit(dlm->node_num, nodemap); + if (ignore_higher) { + /* if is this just to clear up mles for nodes below + * this node, do not send the message to the original + * caller or any node number higher than this */ + clear_bit(request_from, nodemap); + bit = dlm->node_num; + while (1) { + bit = find_next_bit(nodemap, O2NM_MAX_NODES, + bit+1); + if (bit >= O2NM_MAX_NODES) + break; + clear_bit(bit, nodemap); + } + } + + /* this call now finishes out the nodemap + * even if one or more nodes die */ + mlog(0, "worker about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, dlm->node_num); + ret = dlm_do_assert_master(dlm, res->lockname.name, + 
res->lockname.len, + nodemap, flags); + if (ret < 0) { + /* no need to restart, we are done */ + mlog_errno(ret); + } + + dlm_lockres_put(res); + + mlog(0, "finished with dlm_assert_master_worker\n"); +} + + +/* + * DLM_MIGRATE_LOCKRES + */ + + +int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 target) +{ + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *oldmle = NULL; + struct dlm_migratable_lockres *mres = NULL; + int ret = -EINVAL; + const char *name; + unsigned int namelen; + int mle_added = 0; + struct list_head *queue, *iter; + int i; + struct dlm_lock *lock; + int empty = 1; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = res->lockname.name; + namelen = res->lockname.len; + + mlog(0, "migrating %.*s to %u\n", namelen, name, target); + + /* + * ensure this lockres is a proper candidate for migration + */ + spin_lock(&res->spinlock); + if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "cannot migrate lockres with unknown owner!\n"); + spin_unlock(&res->spinlock); + goto leave; + } + if (res->owner != dlm->node_num) { + mlog(0, "cannot migrate lockres this node doesn't own!\n"); + spin_unlock(&res->spinlock); + goto leave; + } + mlog(0, "checking queues...\n"); + queue = &res->granted; + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + empty = 0; + if (lock->ml.node == dlm->node_num) { + mlog(0, "found a lock owned by this node " + "still on the %s queue! will not " + "migrate this lockres\n", + i==0 ? "granted" : + (i==1 ? "converting" : "blocked")); + spin_unlock(&res->spinlock); + ret = -ENOTEMPTY; + goto leave; + } + } + queue++; + } + mlog(0, "all locks on this lockres are nonlocal. continuing\n"); + spin_unlock(&res->spinlock); + + /* no work to do */ + if (empty) { + mlog(0, "no locks were found on this lockres! 
done!\n"); + ret = 0; + goto leave; + } + + /* + * preallocate up front + * if this fails, abort + */ + + ret = -ENOMEM; + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + if (!mres) { + mlog_errno(ret); + goto leave; + } + + mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, + GFP_KERNEL); + if (!mle) { + mlog_errno(ret); + goto leave; + } + ret = 0; + + /* + * find a node to migrate the lockres to + */ + + mlog(0, "picking a migration node\n"); + spin_lock(&dlm->spinlock); + /* pick a new node */ + if (!test_bit(target, dlm->domain_map) || + target >= O2NM_MAX_NODES) { + target = dlm_pick_migration_target(dlm, res); + } + mlog(0, "node %u chosen for migration\n", target); + + if (target >= O2NM_MAX_NODES || + !test_bit(target, dlm->domain_map)) { + /* target chosen is not alive */ + ret = -EINVAL; + } + + if (ret) { + spin_unlock(&dlm->spinlock); + goto fail; + } + + mlog(0, "continuing with target = %u\n", target); + + /* + * clear any existing master requests and + * add the migration mle to the list + */ + spin_lock(&dlm->master_lock); + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, + namelen, target, dlm->node_num); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (ret == -EEXIST) { + mlog(0, "another process is already migrating it\n"); + goto fail; + } + mle_added = 1; + + /* + * set the MIGRATING flag and flush asts + * if we fail after this we need to re-dirty the lockres + */ + if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { + mlog(ML_ERROR, "tried to migrate %.*s to %u, but " + "the target went down.\n", res->lockname.len, + res->lockname.name, target); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + ret = -EINVAL; + } + +fail: + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (ret < 0) { + if (mle_added) { + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + } else if (mle) { + kmem_cache_free(dlm_mle_cache, mle); + } + goto leave; + } + + /* + * at this point, we have a migration target, an mle + * in the master list, and the MIGRATING flag set on + * the lockres + */ + + + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + * also, make sure that all callers of dlm_get_mle + * take both dlm->spinlock and dlm->master_lock */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + dlm_get_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* notify new node and send all lock state */ + /* call send_one_lockres with migration flag. + * this serves as notice to the target node that a + * migration is starting. */ + ret = dlm_send_one_lockres(dlm, res, mres, target, + DLM_MRES_MIGRATION); + + if (ret < 0) { + mlog(0, "migration to node %u failed with %d\n", + target, ret); + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle(mle); + goto leave; + } + + /* at this point, the target sends a message to all nodes, + * (using dlm_do_migrate_request). this node is skipped since + * we had to put an mle in the list to begin the process. this + * node now waits for target to do an assert master. this node + * will be the last one notified, ensuring that the migration + * is complete everywhere. 
if the target dies while this is
+ * going on, some nodes could potentially see the target as the
+ * master, so it is important that my recovery finds the migration
+ * mle and sets the master to UNKNOWN. */
+
+
+	/* wait for new node to assert master */
+	while (1) {
+		ret = wait_event_interruptible_timeout(mle->wq,
+					(atomic_read(&mle->woken) == 1),
+					msecs_to_jiffies(5000));
+
+		if (ret >= 0) {
+			if (atomic_read(&mle->woken) == 1 ||
+			    res->owner == target)
+				break;
+
+			mlog(0, "timed out during migration\n");
+		}
+		if (ret == -ERESTARTSYS) {
+			/* migration failed, detach and clean up mle */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			dlm_put_mle(mle);
+			goto leave;
+		}
+		/* TODO: if node died: stop, clean up, return error */
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, target);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	dlm_remove_nonlocal_locks(dlm, res);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	ret = 0;
+
+	dlm_lockres_calc_usage(dlm, res);
+
+leave:
+	/* re-dirty the lockres if we failed */
+	if (ret < 0)
+		dlm_kick_thread(dlm, res);
+
+	/* TODO: cleanup */
+	if (mres)
+		free_page((unsigned long)mres);
+
+	dlm_put(dlm);
+
+	mlog(0, "returning %d\n", ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
+
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	int ret;
+	spin_lock(&dlm->ast_lock);
+	spin_lock(&lock->spinlock);
+	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&dlm->ast_lock);
+	return ret;
+}
+
+static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     u8 mig_target)
+{
+	int can_proceed;
+	spin_lock(&res->spinlock);
+	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
+	spin_unlock(&res->spinlock);
+
+	/* target has died, so make the caller break out of the
+	 * wait_event, but caller must recheck the domain_map */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(mig_target, dlm->domain_map))
+		can_proceed = 1;
+	spin_unlock(&dlm->spinlock);
+	return can_proceed;
+}
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	int ret;
+	spin_lock(&res->spinlock);
+	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
+	spin_unlock(&res->spinlock);
+	return ret;
+}
+
+
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 target)
+{
+	int ret = 0;
+
+	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
+	     res->lockname.len, res->lockname.name, dlm->node_num,
+	     target);
+	/* need to set MIGRATING flag on lockres. this is done by
+	 * ensuring that all asts have been flushed for this lockres. */
+	spin_lock(&res->spinlock);
+	BUG_ON(res->migration_pending);
+	res->migration_pending = 1;
+	/* strategy is to reserve an extra ast then release
+	 * it below, letting the release do all of the work */
+	__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
+	/* now flush all the pending asts.. hang out for a bit */
+	dlm_kick_thread(dlm, res);
+	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+	dlm_lockres_release_ast(dlm, res);
+
+	mlog(0, "about to wait on migration_wq, dirty=%s\n",
+	     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+	/* if the extra ref we just put was the final one, this
+	 * will pass thru immediately.
otherwise, we need to wait + * for the last ast to finish. */ +again: + ret = wait_event_interruptible_timeout(dlm->migration_wq, + dlm_migration_can_proceed(dlm, res, target), + msecs_to_jiffies(1000)); + if (ret < 0) { + mlog(0, "woken again: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } else { + mlog(0, "all is well: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } + if (!dlm_migration_can_proceed(dlm, res, target)) { + mlog(0, "trying again...\n"); + goto again; + } + + /* did the target go down or die? */ + spin_lock(&dlm->spinlock); + if (!test_bit(target, dlm->domain_map)) { + mlog(ML_ERROR, "aha. migration target %u just went down\n", + target); + ret = -EHOSTDOWN; + } + spin_unlock(&dlm->spinlock); + + /* + * at this point: + * + * o the DLM_LOCK_RES_MIGRATING flag is set + * o there are no pending asts on this lockres + * o all processes trying to reserve an ast on this + * lockres must wait for the MIGRATING flag to clear + */ + return ret; +} + +/* last step in the migration process. + * original master calls this to free all of the dlm_lock + * structures that used to be for other nodes. */ +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *iter, *iter2; + struct list_head *queue = &res->granted; + int i; + struct dlm_lock *lock; + + assert_spin_locked(&res->spinlock); + + BUG_ON(res->owner == dlm->node_num); + + for (i=0; i<3; i++) { + list_for_each_safe(iter, iter2, queue) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node != dlm->node_num) { + mlog(0, "putting lock for node %u\n", + lock->ml.node); + /* be extra careful */ + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + queue++; + } +} + +/* for now this is not too intelligent. we will + * need stats to make this do the right thing. + * this just finds the first lock on one of the + * queues and uses that node as the target. */ +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue = &res->granted; + struct list_head *iter; + struct dlm_lock *lock; + int nodenum; + + assert_spin_locked(&dlm->spinlock); + + spin_lock(&res->spinlock); + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + /* up to the caller to make sure this node + * is alive */ + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node != dlm->node_num) { + spin_unlock(&res->spinlock); + return lock->ml.node; + } + } + queue++; + } + spin_unlock(&res->spinlock); + mlog(0, "have not found a suitable target yet! checking domain map\n"); + + /* ok now we're getting desperate. pick anyone alive. */ + nodenum = -1; + while (1) { + nodenum = find_next_bit(dlm->domain_map, + O2NM_MAX_NODES, nodenum+1); + mlog(0, "found %d in domain map\n", nodenum); + if (nodenum >= O2NM_MAX_NODES) + break; + if (nodenum != dlm->node_num) { + mlog(0, "picking %d\n", nodenum); + return nodenum; + } + } + + mlog(0, "giving up. 
no master to migrate to\n"); + return DLM_LOCK_RES_OWNER_UNKNOWN; +} + + + +/* this is called by the new master once all lockres + * data has been received */ +static int dlm_do_migrate_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 master, u8 new_master, + struct dlm_node_iter *iter) +{ + struct dlm_migrate_request migrate; + int ret, status = 0; + int nodenum; + + memset(&migrate, 0, sizeof(migrate)); + migrate.namelen = res->lockname.len; + memcpy(migrate.name, res->lockname.name, migrate.namelen); + migrate.new_master = new_master; + migrate.master = master; + + ret = 0; + + /* send message to all nodes, except the master and myself */ + while ((nodenum = dlm_node_iter_next(iter)) >= 0) { + if (nodenum == master || + nodenum == new_master) + continue; + + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, + &migrate, sizeof(migrate), nodenum, + &status); + if (ret < 0) + mlog_errno(ret); + else if (status < 0) { + mlog(0, "migrate request (node %u) returned %d!\n", + nodenum, status); + ret = status; + } + } + + if (ret < 0) + mlog_errno(ret); + + mlog(0, "returning ret=%d\n", ret); + return ret; +} + + +/* if there is an existing mle for this lockres, we now know who the master is. + * (the one who sent us *this* message) we can clear it up right away. + * since the process that put the mle on the list still has a reference to it, + * we can unhash it now, set the master and wake the process. as a result, + * we will have no mle in the list to start with. now we can add an mle for + * the migration and this should be the only one found for those scanning the + * list. */ +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; + const char *name; + unsigned int namelen; + int ret = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = migrate->name; + namelen = migrate->namelen; + + /* preallocate.. if this fails, abort */ + mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, + GFP_KERNEL); + + if (!mle) { + ret = -ENOMEM; + goto leave; + } + + /* check for pre-existing lock */ + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen); + spin_lock(&dlm->master_lock); + + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + /* if all is working ok, this can only mean that we got + * a migrate request from a node that we now see as + * dead. what can we do here? drop it to the floor? */ + spin_unlock(&res->spinlock); + mlog(ML_ERROR, "Got a migrate request, but the " + "lockres is marked as recovering!"); + kmem_cache_free(dlm_mle_cache, mle); + ret = -EINVAL; /* need a better solution */ + goto unlock; + } + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + } + + /* ignore status. only nonzero status would BUG. 
*/
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
+				    name, namelen,
+				    migrate->new_master,
+				    migrate->master);
+
+unlock:
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (res)
+		dlm_lockres_put(res);
+leave:
+	dlm_put(dlm);
+	return ret;
+}
+
+/* must be holding dlm->spinlock and dlm->master_lock
+ * when adding a migration mle, we can clear any other mles
+ * in the master list because we know with certainty that
+ * the master is "master". so we remove any old mle from
+ * the list after setting its master field, and then add
+ * the new migration mle. this way we can hold with the rule
+ * of having only one mle for a given lock name at all times. */
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master)
+{
+	int found;
+	int ret = 0;
+
+	*oldmle = NULL;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* caller is responsible for any ref taken here on oldmle */
+	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
+	if (found) {
+		struct dlm_master_list_entry *tmp = *oldmle;
+		spin_lock(&tmp->spinlock);
+		if (tmp->type == DLM_MLE_MIGRATION) {
+			if (master == dlm->node_num) {
+				/* ah another process raced me to it */
+				mlog(0, "tried to migrate %.*s, but some "
+				     "process beat me to it\n",
+				     namelen, name);
+				ret = -EEXIST;
+			} else {
+				/* bad. 2 NODES are trying to migrate! */
+				mlog(ML_ERROR, "migration error mle: "
+				     "master=%u new_master=%u // request: "
+				     "master=%u new_master=%u // "
+				     "lockres=%.*s\n",
+				     tmp->master, tmp->new_master,
+				     master, new_master,
+				     namelen, name);
+				BUG();
+			}
+		} else {
+			/* this is essentially what assert_master does */
+			tmp->master = master;
+			atomic_set(&tmp->woken, 1);
+			wake_up(&tmp->wq);
+			/* remove it from the list so that only one
+			 * mle will be found */
+			list_del_init(&tmp->list);
+		}
+		spin_unlock(&tmp->spinlock);
+	}
+
+	/* now add a migration mle to the tail of the list */
+	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
+	mle->new_master = new_master;
+	mle->master = master;
+	/* do this for consistency with other mle types */
+	set_bit(new_master, mle->maybe_map);
+	list_add(&mle->list, &dlm->master_list);
+
+	return ret;
+}
+
+
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	list_for_each_safe(iter, iter2, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+
+		BUG_ON(mle->type != DLM_MLE_BLOCK &&
+		       mle->type != DLM_MLE_MASTER &&
+		       mle->type != DLM_MLE_MIGRATION);
+
+		/* MASTER mles are initiated locally. the waiting
+		 * process will notice the node map change
+		 * shortly. let that happen as normal. */
+		if (mle->type == DLM_MLE_MASTER)
+			continue;
+
+
+		/* BLOCK mles are initiated by other nodes.
+		 * need to clean up if the dead node would have
+		 * been the master.
*/ + if (mle->type == DLM_MLE_BLOCK) { + int bit; + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would " + "not have been master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* must drop the refcount by one since the + * assert_master will never arrive. this + * may result in the mle being unlinked and + * freed, but there may still be a process + * waiting in the dlmlock path which is fine. */ + mlog(ML_ERROR, "node %u was expected master\n", + dead_node); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + /* final put will take care of list removal */ + __dlm_put_mle(mle); + } + continue; + } + + /* everything else is a MIGRATION mle */ + + /* the rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or + * the new master dies. all UNKNOWN lockreses + * are sent to whichever node becomes the recovery + * master. the new master is responsible for + * determining if there is still a master for + * this lockres, or if he needs to take over + * mastery. either way, this node should expect + * another message to resolve this. */ + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* if we have reached this point, this mle needs to + * be removed from the list and freed. */ + + /* remove from the list early. NOTE: unlinking + * list_head while in list_for_each_safe */ + spin_lock(&mle->spinlock); + list_del_init(&mle->list); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + + mlog(0, "node %u died during migration from " + "%u to %u!\n", dead_node, + mle->master, mle->new_master); + /* if there is a lockres associated with this + * mle, find it and set its owner to UNKNOWN */ + res = __dlm_lookup_lockres(dlm, mle->u.name.name, + mle->u.name.len); + if (res) { + /* unfortunately if we hit this rare case, our + * lock ordering is messed. we need to drop + * the master lock so that we can take the + * lockres lock, meaning that we will have to + * restart from the head of list. 
*/ + spin_unlock(&dlm->master_lock); + + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, + DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + + /* restart */ + goto top; + } + + /* this may be the last reference */ + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); +} + + +int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 old_master) +{ + struct dlm_node_iter iter; + int ret = 0; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + clear_bit(old_master, iter.node_map); + clear_bit(dlm->node_num, iter.node_map); + spin_unlock(&dlm->spinlock); + + mlog(0, "now time to do a migrate request to other nodes\n"); + ret = dlm_do_migrate_request(dlm, res, old_master, + dlm->node_num, &iter); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + mlog(0, "doing assert master of %.*s to all except the original node\n", + res->lockname.len, res->lockname.name); + /* this call now finishes out the nodemap + * even if one or more nodes die */ + ret = dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + /* no longer need to retry. all living nodes contacted. */ + mlog_errno(ret); + ret = 0; + } + + memset(iter.node_map, 0, sizeof(iter.node_map)); + set_bit(old_master, iter.node_map); + mlog(0, "doing assert master of %.*s back to %u\n", + res->lockname.len, res->lockname.name, old_master); + ret = dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + mlog(0, "assert master to original master failed " + "with %d.\n", ret); + /* the only nonzero status here would be because of + * a dead original node. we're done. */ + ret = 0; + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, dlm->node_num); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + /* re-dirty it on the new master */ + dlm_kick_thread(dlm, res); + wake_up(&res->wq); +leave: + return ret; +} + +/* + * LOCKRES AST REFCOUNT + * this is integral to migration + */ + +/* for future intent to call an ast, reserve one ahead of time. + * this should be called only after waiting on the lockres + * with dlm_wait_on_lockres, and while still holding the + * spinlock after the call. */ +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + __dlm_print_one_lock_resource(res); + } + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + + atomic_inc(&res->asts_reserved); +} + +/* + * used to drop the reserved ast, either because it went unused, + * or because the ast/bast was actually called. + * + * also, if there is a pending migration on this lockres, + * and this was the last pending ast on the lockres, + * atomically set the MIGRATING flag before we drop the lock. + * this is how we ensure that migration can proceed with no + * asts in progress. note that it is ok if the state of the + * queues is such that a lock should be granted in the future + * or that a bast should be fired, because the new master will + * shuffle the lists on this lockres as soon as it is migrated. 
+ */
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
+{
+	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
+		return;
+
+	if (!res->migration_pending) {
+		spin_unlock(&res->spinlock);
+		return;
+	}
+
+	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+	res->migration_pending = 0;
+	res->state |= DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	wake_up(&dlm->migration_wq);
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 000000000000..0c8eb1093f00
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
+#include "cluster/masklog.h"
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
+
+static int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+static int dlm_do_recovery(struct dlm_ctxt *dlm);
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_request_all_locks(struct dlm_ctxt *dlm,
+				 u8 request_from, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master);
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lockres *mres,
+				    u8 send_to,
+				    struct dlm_lock_resource *res,
+				    int total_locks);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres);
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master);
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
+static int
dlm_send_all_done_msg(struct dlm_ctxt *dlm,
+				 u8 dead_node, u8 send_to);
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list, u8 dead_node);
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master);
+static void dlm_reco_ast(void *astdata);
+static void dlm_reco_bast(void *astdata, int blocked_type);
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
+static void dlm_request_all_locks_worker(struct dlm_work_item *item,
+					 void *data);
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+
+static u64 dlm_get_next_mig_cookie(void);
+
+static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_mig_cookie = 1;
+
+static u64 dlm_get_next_mig_cookie(void)
+{
+	u64 c;
+	spin_lock(&dlm_mig_cookie_lock);
+	c = dlm_mig_cookie;
+	if (dlm_mig_cookie == (~0ULL))
+		dlm_mig_cookie = 1;
+	else
+		dlm_mig_cookie++;
+	spin_unlock(&dlm_mig_cookie_lock);
+	return c;
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Worker function used during recovery. */
+void dlm_dispatch_work(void *data)
+{
+	struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
+	LIST_HEAD(tmp_list);
+	struct list_head *iter, *iter2;
+	struct dlm_work_item *item;
+	dlm_workfunc_t *workfunc;
+
+	spin_lock(&dlm->work_lock);
+	list_splice_init(&dlm->work_list, &tmp_list);
+	spin_unlock(&dlm->work_lock);
+
+	list_for_each_safe(iter, iter2, &tmp_list) {
+		item = list_entry(iter, struct dlm_work_item, list);
+		workfunc = item->func;
+		list_del_init(&item->list);
+
+		/* already have ref on dlm to avoid having
+		 * it disappear. just double-check. */
+		BUG_ON(item->dlm != dlm);
+
+		/* this is allowed to sleep and
+		 * call network stuff */
+		workfunc(item, item->data);
+
+		dlm_put(dlm);
+		kfree(item);
+	}
+}
+
+/*
+ * RECOVERY THREAD
+ */
+
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+{
+	/* wake the recovery thread
+	 * this will wake the reco thread in one of three places
+	 * 1) sleeping with no recovery happening
+	 * 2) sleeping with recovery mastered elsewhere
+	 * 3) recovery mastered here, waiting on reco data */
+
+	wake_up(&dlm->dlm_reco_thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm recovery thread...\n");
+
+	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
+						"dlm_reco_thread");
+	if (IS_ERR(dlm->dlm_reco_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
+		dlm->dlm_reco_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_reco_thread_task) {
+		mlog(0, "waiting for dlm recovery thread to exit\n");
+		kthread_stop(dlm->dlm_reco_thread_task);
+		dlm->dlm_reco_thread_task = NULL;
+	}
+}
+
+
+
+/*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ *    ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    that's right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks + * and moves aside those mastered by the dead guy + * 4) each of these locks should be locked until recovery is done + * 5) the new master collects up all of secondary lock queue info + * one lock at a time, forcing each node to communicate back + * before continuing + * 6) each secondary lock queue responds with the full known lock info + * 7) once the new master has run all its locks, it sends a ALLDONE! + * message to everyone + * 8) upon receiving this message, the secondary queue node unlocks + * and responds to the ALLDONE + * 9) once the new master gets responses from everyone, he unlocks + * everything and recovery for this dead node is done + *10) go back to 2) while there are still dead nodes + * + */ + + +#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) + +static int dlm_recovery_thread(void *data) +{ + int status; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + if (dlm_joined(dlm)) { + status = dlm_do_recovery(dlm); + if (status == -EAGAIN) { + /* do not sleep, recheck immediately. */ + continue; + } + if (status < 0) + mlog_errno(status); + } + + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM recovery thread\n"); + return 0; +} + +/* callers of the top-level api calls (dlmlock/dlmunlock) should + * block on the dlm->reco.event when recovery is in progress. + * the dlm recovery thread will set this state when it begins + * recovering a dead node (as the new master or not) and clear + * the state and wake as soon as all affected lock resources have + * been marked with the RECOVERY flag */ +static int dlm_in_recovery(struct dlm_ctxt *dlm) +{ + int in_recovery; + spin_lock(&dlm->spinlock); + in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + spin_unlock(&dlm->spinlock); + return in_recovery; +} + + +void dlm_wait_for_recovery(struct dlm_ctxt *dlm) +{ + wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); +} + +static void dlm_begin_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + dlm->reco.state |= DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); +} + +static void dlm_end_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); + dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); + wake_up(&dlm->reco.event); +} + +static int dlm_do_recovery(struct dlm_ctxt *dlm) +{ + int status = 0; + + spin_lock(&dlm->spinlock); + + /* check to see if the new master has died */ + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && + test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "new master %u died while recovering %u!\n", + dlm->reco.new_master, dlm->reco.dead_node); + /* unset the new_master, leave dead_node */ + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + } + + /* select a target to recover */ + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + int bit; + + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); + if (bit >= O2NM_MAX_NODES || bit < 0) + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + else + dlm->reco.dead_node = bit; + } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { + /* BUG? 
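+ * dead_node is still set, but it has since been cleared from + * the recovery map; log it, reset the target, and sleep until + * the recovery thread is kicked again.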
*/ + mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", + dlm->reco.dead_node); + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + } + + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + // mlog(0, "nothing to recover! sleeping now!\n"); + spin_unlock(&dlm->spinlock); + /* return to main thread loop and sleep. */ + return 0; + } + mlog(0, "recovery thread found node %u in the recovery map!\n", + dlm->reco.dead_node); + spin_unlock(&dlm->spinlock); + + /* take write barrier */ + /* (stops the list reshuffling thread, proxy ast handling) */ + dlm_begin_recovery(dlm); + + if (dlm->reco.new_master == dlm->node_num) + goto master_here; + + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + /* choose a new master */ + if (!dlm_pick_recovery_master(dlm)) { + /* already notified everyone. go. */ + dlm->reco.new_master = dlm->node_num; + goto master_here; + } + mlog(0, "another node will master this recovery session.\n"); + } + mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->reco.new_master, + dlm->node_num, dlm->reco.dead_node); + + /* it is safe to start everything back up here + * because all of the dead node's lock resources + * have been marked as in-recovery */ + dlm_end_recovery(dlm); + + /* sleep out in main dlm_recovery_thread loop. */ + return 0; + +master_here: + mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); + + status = dlm_remaster_locks(dlm, dlm->reco.dead_node); + if (status < 0) { + mlog(ML_ERROR, "error %d remastering locks for node %u, " + "retrying.\n", status, dlm->reco.dead_node); + } else { + /* success! see if any other nodes need recovery */ + dlm_reset_recovery(dlm); + } + dlm_end_recovery(dlm); + + /* continue and look for another dead node */ + return -EAGAIN; +} + +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) +{ + int status = 0; + struct dlm_reco_node_data *ndata; + struct list_head *iter; + int all_nodes_done; + int destroy = 0; + int pass = 0; + + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) + goto leave; + + /* safe to access the node data list without a lock, since this + * process is the only one to change the list */ + list_for_each(iter, &dlm->reco.node_data) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); + ndata->state = DLM_RECO_NODE_DATA_REQUESTING; + + mlog(0, "requesting lock info from node %u\n", + ndata->node_num); + + if (ndata->node_num == dlm->node_num) { + ndata->state = DLM_RECO_NODE_DATA_DONE; + continue; + } + + status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) + ndata->state = DLM_RECO_NODE_DATA_DEAD; + else { + destroy = 1; + goto leave; + } + } + + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + case DLM_RECO_NODE_DATA_REQUESTED: + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after requesting " + "recovery info for node %u\n", + ndata->node_num, dead_node); + // start all over + destroy = 1; + status = -EAGAIN; + goto leave; + case DLM_RECO_NODE_DATA_REQUESTING: + ndata->state = DLM_RECO_NODE_DATA_REQUESTED; + mlog(0, "now receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + mlog(0, "already receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, 
dead_node); + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "already DONE receiving recovery data " + "from node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + } + } + + mlog(0, "done requesting all lock info\n"); + + /* nodes should be sending reco data now + * just need to wait */ + + while (1) { + /* check all the nodes now to see if we are + * done, or if anyone died */ + all_nodes_done = 1; + spin_lock(&dlm_reco_state_lock); + list_for_each(iter, &dlm->reco.node_data) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + + mlog(0, "checking recovery state of node %u\n", + ndata->node_num); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(ML_ERROR, "bad ndata state for " + "node %u: state=%d\n", + ndata->node_num, ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after " + "requesting recovery info for " + "node %u\n", ndata->node_num, + dead_node); + spin_unlock(&dlm_reco_state_lock); + // start all over + destroy = 1; + status = -EAGAIN; + goto leave; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + all_nodes_done = 0; + break; + case DLM_RECO_NODE_DATA_DONE: + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, + all_nodes_done?"yes":"no"); + if (all_nodes_done) { + int ret; + + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state + * just send a finalize message to everyone and + * clean up */ + mlog(0, "all nodes are done! send finalize\n"); + ret = dlm_send_finalize_reco_message(dlm); + if (ret < 0) + mlog_errno(ret); + + spin_lock(&dlm->spinlock); + dlm_finish_local_lockres_recovery(dlm, dead_node, + dlm->node_num); + spin_unlock(&dlm->spinlock); + mlog(0, "should be done with recovery!\n"); + + mlog(0, "finishing recovery of %s at %lu, " + "dead=%u, this=%u, new=%u\n", dlm->name, + jiffies, dlm->reco.dead_node, + dlm->node_num, dlm->reco.new_master); + destroy = 1; + status = ret; + /* rescan everything marked dirty along the way */ + dlm_kick_thread(dlm, NULL); + break; + } + /* wait to be signalled, with periodic timeout + * to check for node death */ + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); + + } + +leave: + if (destroy) + dlm_destroy_recovery_area(dlm, dead_node); + + mlog_exit(status); + return status; +} + +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + int num=0; + struct dlm_reco_node_data *ndata; + + spin_lock(&dlm->spinlock); + memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + /* nodes can only be removed (by dying) after dropping + * this lock, and death will be trapped later, so this should do */ + spin_unlock(&dlm->spinlock); + + while (1) { + num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); + if (num >= O2NM_MAX_NODES) { + break; + } + BUG_ON(num == dead_node); + + ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + if (!ndata) { + dlm_destroy_recovery_area(dlm, dead_node); + return -ENOMEM; + } + ndata->node_num = num; + ndata->state = DLM_RECO_NODE_DATA_INIT; + spin_lock(&dlm_reco_state_lock); + list_add_tail(&ndata->list, &dlm->reco.node_data); + spin_unlock(&dlm_reco_state_lock); + num++; + } + + return 0; +} + +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct list_head *iter, *iter2; + struct dlm_reco_node_data 
*ndata; + LIST_HEAD(tmplist); + + spin_lock(&dlm_reco_state_lock); + list_splice_init(&dlm->reco.node_data, &tmplist); + spin_unlock(&dlm_reco_state_lock); + + list_for_each_safe(iter, iter2, &tmplist) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_del_init(&ndata->list); + kfree(ndata); + } +} + +static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, + u8 dead_node) +{ + struct dlm_lock_request lr; + enum dlm_status ret; + + mlog(0, "\n"); + + + mlog(0, "dlm_request_all_locks: dead node is %u, sending request " + "to %u\n", dead_node, request_from); + + memset(&lr, 0, sizeof(lr)); + lr.node_idx = dlm->node_num; + lr.dead_node = dead_node; + + // send message + ret = DLM_NOLOCKMGR; + ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, + &lr, sizeof(lr), request_from, NULL); + + /* negative status is handled by caller */ + if (ret < 0) + mlog_errno(ret); + + // return from here, then + // sleep until all received or error + return ret; + +} + +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; + char *buf = NULL; + struct dlm_work_item *item = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(lr->dead_node != dlm->reco.dead_node); + + item = kcalloc(1, sizeof(*item), GFP_KERNEL); + if (!item) { + dlm_put(dlm); + return -ENOMEM; + } + + /* this will get freed by dlm_request_all_locks_worker */ + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) { + kfree(item); + dlm_put(dlm); + return -ENOMEM; + } + + /* queue up work for dlm_request_all_locks_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); + item->u.ral.reco_master = lr->node_idx; + item->u.ral.dead_node = lr->dead_node; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + schedule_work(&dlm->dispatched_work); + + dlm_put(dlm); + return 0; +} + +static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_migratable_lockres *mres; + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + LIST_HEAD(resources); + struct list_head *iter; + int ret; + u8 dead_node, reco_master; + + dlm = item->dlm; + dead_node = item->u.ral.dead_node; + reco_master = item->u.ral.reco_master; + BUG_ON(dead_node != dlm->reco.dead_node); + BUG_ON(reco_master != dlm->reco.new_master); + + mres = (struct dlm_migratable_lockres *)data; + + /* lock resources should have already been moved to the + * dlm->reco.resources list. now move items from that list + * to a temp list if the dead owner matches. note that the + * whole cluster recovers only one node at a time, so we + * can safely move UNKNOWN lock resources for each recovery + * session. 
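+ * (resources with an UNKNOWN owner are sent to the recovery + * master as well; it requeries the real owner when it + * processes them.)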
*/ + dlm_move_reco_locks_to_list(dlm, &resources, dead_node); + + /* now we can begin blasting lockreses without the dlm lock */ + list_for_each(iter, &resources) { + res = list_entry (iter, struct dlm_lock_resource, recovering); + ret = dlm_send_one_lockres(dlm, res, mres, reco_master, + DLM_MRES_RECOVERY); + if (ret < 0) + mlog_errno(ret); + } + + /* move the resources back to the list */ + spin_lock(&dlm->spinlock); + list_splice_init(&resources, &dlm->reco.resources); + spin_unlock(&dlm->spinlock); + + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) + mlog_errno(ret); + + free_page((unsigned long)data); +} + + +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) +{ + int ret, tmpret; + struct dlm_reco_data_done done_msg; + + memset(&done_msg, 0, sizeof(done_msg)); + done_msg.node_idx = dlm->node_num; + done_msg.dead_node = dead_node; + mlog(0, "sending DATA DONE message to %u, " + "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, + done_msg.dead_node); + + ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, + sizeof(done_msg), send_to, &tmpret); + /* negative status is ignored by the caller */ + if (ret >= 0) + ret = tmpret; + return ret; +} + + +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; + struct list_head *iter; + struct dlm_reco_node_data *ndata = NULL; + int ret = -EINVAL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + BUG_ON(done->dead_node != dlm->reco.dead_node); + + spin_lock(&dlm_reco_state_lock); + list_for_each(iter, &dlm->reco.node_data) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + if (ndata->node_num != done->node_idx) + continue; + + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_DEAD: + case DLM_RECO_NODE_DATA_DONE: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(ML_ERROR, "bad ndata state for node %u:" + " state=%d\n", ndata->node_num, + ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(0, "node %u is DONE sending " + "recovery data!\n", + ndata->node_num); + + ndata->state = DLM_RECO_NODE_DATA_DONE; + ret = 0; + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + /* wake the recovery thread, some node is done */ + if (!ret) + dlm_kick_recovery_thread(dlm); + + if (ret < 0) + mlog(ML_ERROR, "failed to find recovery node data for node " + "%u\n", done->node_idx); + dlm_put(dlm); + + mlog(0, "leaving reco data done handler, ret=%d\n", ret); + return ret; +} + +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, + u8 dead_node) +{ + struct dlm_lock_resource *res; + struct list_head *iter, *iter2; + + spin_lock(&dlm->spinlock); + list_for_each_safe(iter, iter2, &dlm->reco.resources) { + res = list_entry (iter, struct dlm_lock_resource, recovering); + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) + continue; + if (res->owner == dead_node) { + mlog(0, "found lockres owned by dead node while " + "doing recovery for node %u. 
sending it.\n", + dead_node); + list_del_init(&res->recovering); + list_add_tail(&res->recovering, list); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "found UNKNOWN owner while doing recovery " + "for node %u. sending it.\n", dead_node); + list_del_init(&res->recovering); + list_add_tail(&res->recovering, list); + } + } + spin_unlock(&dlm->spinlock); +} + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) +{ + int total_locks = 0; + struct list_head *iter, *queue = &res->granted; + int i; + + for (i=0; i<3; i++) { + list_for_each(iter, queue) + total_locks++; + queue++; + } + return total_locks; +} + + +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks) +{ + u64 mig_cookie = be64_to_cpu(mres->mig_cookie); + int mres_total_locks = be32_to_cpu(mres->total_locks); + int sz, ret = 0, status = 0; + u8 orig_flags = mres->flags, + orig_master = mres->master; + + BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); + if (!mres->num_locks) + return 0; + + sz = sizeof(struct dlm_migratable_lockres) + + (mres->num_locks * sizeof(struct dlm_migratable_lock)); + + /* add an all-done flag if we reached the last lock */ + orig_flags = mres->flags; + BUG_ON(total_locks > mres_total_locks); + if (total_locks == mres_total_locks) + mres->flags |= DLM_MRES_ALL_DONE; + + /* send it */ + ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, + sz, send_to, &status); + if (ret < 0) { + /* XXX: negative status is not handled. + * this will end up killing this node. */ + mlog_errno(ret); + } else { + /* might get an -ENOMEM back here */ + ret = status; + if (ret < 0) { + mlog_errno(ret); + + if (ret == -EFAULT) { + mlog(ML_ERROR, "node %u told me to kill " + "myself!\n", send_to); + BUG(); + } + } + } + + /* zero and reinit the message buffer */ + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, mres_total_locks, + mig_cookie, orig_flags, orig_master); + return ret; +} + +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master) +{ + /* mres here is one full page */ + memset(mres, 0, PAGE_SIZE); + mres->lockname_len = namelen; + memcpy(mres->lockname, lockname, namelen); + mres->num_locks = 0; + mres->total_locks = cpu_to_be32(total_locks); + mres->mig_cookie = cpu_to_be64(cookie); + mres->flags = flags; + mres->master = master; +} + + +/* returns 1 if this lock fills the network structure, + * 0 otherwise */ +static int dlm_add_lock_to_array(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, int queue) +{ + struct dlm_migratable_lock *ml; + int lock_num = mres->num_locks; + + ml = &(mres->ml[lock_num]); + ml->cookie = lock->ml.cookie; + ml->type = lock->ml.type; + ml->convert_type = lock->ml.convert_type; + ml->highest_blocked = lock->ml.highest_blocked; + ml->list = queue; + if (lock->lksb) { + ml->flags = lock->lksb->flags; + /* send our current lvb */ + if (ml->type == LKM_EXMODE || + ml->type == LKM_PRMODE) { + /* if it is already set, this had better be a PR + * and it has to match */ + if (mres->lvb[0] && (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + mlog(ML_ERROR, "mismatched lvbs!\n"); + __dlm_print_one_lock_resource(lock->lockres); + BUG(); + } + memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); + } + } + ml->node = lock->ml.node; + mres->num_locks++; + /* we 
reached the max, send this network message */ + if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) + return 1; + return 0; +} + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, u8 flags) +{ + struct list_head *queue, *iter; + int total_locks, i; + u64 mig_cookie = 0; + struct dlm_lock *lock; + int ret = 0; + + BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + mlog(0, "sending to %u\n", send_to); + + total_locks = dlm_num_locks_in_lockres(res); + if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { + /* rare, but possible */ + mlog(0, "argh. lockres has %d locks. this will " + "require more than one network packet to " + "migrate\n", total_locks); + mig_cookie = dlm_get_next_mig_cookie(); + } + + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, total_locks, + mig_cookie, flags, res->owner); + + total_locks = 0; + for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + + /* add another lock. */ + total_locks++; + if (!dlm_add_lock_to_array(lock, mres, i)) + continue; + + /* this filled the lock message, + * we must send it immediately. */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, + res, total_locks); + if (ret < 0) { + // TODO + mlog(ML_ERROR, "dlm_send_mig_lockres_msg " + "returned %d, TODO\n", ret); + BUG(); + } + } + } + /* flush any remaining locks */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); + if (ret < 0) { + // TODO + mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " + "TODO\n", ret); + BUG(); + } + return ret; +} + + + +/* + * this message will contain no more than one page worth of + * recovery data, and it will work on only one lockres. + * there may be many locks in this page, and we may need to wait + * for additional packets to complete all the locks (rare, but + * possible). + */ +/* + * NOTE: the allocation error cases here are scary + * we really cannot afford to fail an alloc in recovery + * do we spin? returning an error only delays the problem really + */ + +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_migratable_lockres *mres = + (struct dlm_migratable_lockres *)msg->buf; + int ret = 0; + u8 real_master; + char *buf = NULL; + struct dlm_work_item *item = NULL; + struct dlm_lock_resource *res = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + real_master = mres->master; + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* cannot migrate a lockres with no master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + } + + mlog(0, "%s message received from node %u\n", + (mres->flags & DLM_MRES_RECOVERY) ? + "recovery" : "migration", mres->master); + if (mres->flags & DLM_MRES_ALL_DONE) + mlog(0, "all done flag. all lockres data received!\n"); + + ret = -ENOMEM; + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_KERNEL); + if (!buf || !item) + goto leave; + + /* lookup the lock to see if we have a secondary queue for this + * already... just add the locks in and this will have its owner + * and RECOVERY flag changed when it completes. 
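+ * if no lockres exists yet, one is allocated, hashed and marked + * RECOVERING/MIGRATING below, with an extra ref taken so it is + * not purged before the worker runs.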
*/ + res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + if (res) { + /* this will get a ref on res */ + /* mark it as recovering/migrating and hash it */ + spin_lock(&res->spinlock); + if (mres->flags & DLM_MRES_RECOVERY) { + res->state |= DLM_LOCK_RES_RECOVERING; + } else { + if (res->state & DLM_LOCK_RES_MIGRATING) { + /* this is at least the second + * lockres message */ + mlog(0, "lock %.*s is already migrating\n", + mres->lockname_len, + mres->lockname); + } else if (res->state & DLM_LOCK_RES_RECOVERING) { + /* caller should BUG */ + mlog(ML_ERROR, "node is attempting to migrate " + "lock %.*s, but marked as recovering!\n", + mres->lockname_len, mres->lockname); + ret = -EFAULT; + spin_unlock(&res->spinlock); + goto leave; + } + res->state |= DLM_LOCK_RES_MIGRATING; + } + spin_unlock(&res->spinlock); + } else { + /* need to allocate, just like if it was + * mastered here normally */ + res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); + if (!res) + goto leave; + + /* to match the ref that we would have gotten if + * dlm_lookup_lockres had succeeded */ + dlm_lockres_get(res); + + /* mark it as recovering/migrating and hash it */ + if (mres->flags & DLM_MRES_RECOVERY) + res->state |= DLM_LOCK_RES_RECOVERING; + else + res->state |= DLM_LOCK_RES_MIGRATING; + + spin_lock(&dlm->spinlock); + __dlm_insert_lockres(dlm, res); + spin_unlock(&dlm->spinlock); + + /* now that the new lockres is inserted, + * make it usable by other processes */ + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + + /* add an extra ref for just-allocated lockres + * otherwise the lockres will be purged immediately */ + dlm_lockres_get(res); + + } + + /* at this point we have allocated everything we need, + * and we have a hashed lockres with an extra ref and + * the proper res->state flags. */ + ret = 0; + if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* migration cannot have an unknown master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + mlog(0, "recovery has passed me a lockres with an " + "unknown owner.. will need to requery: " + "%.*s\n", mres->lockname_len, mres->lockname); + } else { + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + spin_unlock(&res->spinlock); + } + + /* queue up work for dlm_mig_lockres_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ + dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); + item->u.ml.lockres = res; /* already have a ref */ + item->u.ml.real_master = real_master; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + schedule_work(&dlm->dispatched_work); + +leave: + dlm_put(dlm); + if (ret < 0) { + if (buf) + kfree(buf); + if (item) + kfree(item); + } + + mlog_exit(ret); + return ret; +} + + +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_migratable_lockres *mres; + int ret = 0; + struct dlm_lock_resource *res; + u8 real_master; + + dlm = item->dlm; + mres = (struct dlm_migratable_lockres *)data; + + res = item->u.ml.lockres; + real_master = item->u.ml.real_master; + + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* this case is super-rare. only occurs if + * node death happens during migration. 
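+ * the owner is unknown, so every other node must be asked who + * the master is before this lockres' data can be touched.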
*/ +again: + ret = dlm_lockres_master_requery(dlm, res, &real_master); + if (ret < 0) { + mlog(0, "dlm_lockres_master_requery failure: %d\n", + ret); + goto again; + } + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lockres %.*s not claimed. " + "this node will take it.\n", + res->lockname.len, res->lockname.name); + } else { + mlog(0, "master needs to respond to sender " + "that node %u still owns %.*s\n", + real_master, res->lockname.len, + res->lockname.name); + /* cannot touch this lockres */ + goto leave; + } + } + + ret = dlm_process_recovery_data(dlm, res, mres); + if (ret < 0) + mlog(0, "dlm_process_recovery_data returned %d\n", ret); + else + mlog(0, "dlm_process_recovery_data succeeded\n"); + + if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == + (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { + ret = dlm_finish_migration(dlm, res, mres->master); + if (ret < 0) + mlog_errno(ret); + } + +leave: + kfree(data); + mlog_exit(ret); +} + + + +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + + *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; + + /* we only reach here if one of the two nodes in a + * migration died while the migration was in progress. + * at this point we need to requery the master. we + * know that the new_master got as far as creating + * an mle on at least one node, but we do not know + * if any nodes had actually cleared the mle and set + * the master to the new_master. the old master + * is supposed to set the owner to UNKNOWN in the + * event of a new_master death, so the only possible + * responses that we can get from nodes here are + * that the master is new_master, or that the master + * is UNKNOWN. + * if all nodes come back with UNKNOWN then we know + * the lock needs remastering here. + * if any node comes back with a valid master, check + * to see if that master is the one that we are + * recovering. if so, then the new_master died and + * we need to remaster this lock. if not, then the + * new_master survived and that node will respond to + * other nodes about the owner. + * if there is an owner, this node needs to dump this + * lockres and alert the sender that this lockres + * was rejected. */ + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, real_master); + if (ret < 0) { + mlog_errno(ret); + BUG(); + /* TODO: need to figure a way to restart this */ + } + if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lock master is %u\n", *real_master); + break; + } + } + return ret; +} + + +static int dlm_do_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master) +{ + int ret = -EINVAL; + struct dlm_master_requery req; + int status = DLM_LOCK_RES_OWNER_UNKNOWN; + + memset(&req, 0, sizeof(req)); + req.node_idx = dlm->node_num; + req.namelen = res->lockname.len; + memcpy(req.name, res->lockname.name, res->lockname.len); + + ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, + &req, sizeof(req), nodenum, &status); + /* XXX: negative status not handled properly here. 
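+ * a negative status returned by the remote handler would trip + * the BUG_ON(status < 0) below instead of being handled.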
*/ + if (ret < 0) + mlog_errno(ret); + else { + BUG_ON(status < 0); + BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); + *real_master = (u8) (status & 0xff); + mlog(0, "node %u responded to master requery with %u\n", + nodenum, *real_master); + ret = 0; + } + return ret; +} + + +/* this function cannot error, so unless the sending + * or receiving of the message failed, the owner can + * be trusted */ +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; + struct dlm_lock_resource *res = NULL; + int master = DLM_LOCK_RES_OWNER_UNKNOWN; + u32 flags = DLM_ASSERT_MASTER_REQUERY; + + if (!dlm_grab(dlm)) { + /* since the domain has gone away on this + * node, the proper response is UNKNOWN */ + return master; + } + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + if (res) { + spin_lock(&res->spinlock); + master = res->owner; + if (master == dlm->node_num) { + int ret = dlm_dispatch_assert_master(dlm, res, + 0, 0, flags); + if (ret < 0) { + mlog_errno(-ENOMEM); + /* retry!? */ + BUG(); + } + } + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + return master; +} + +static inline struct list_head * +dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) +{ + struct list_head *ret; + BUG_ON(list_num < 0); + BUG_ON(list_num > 2); + ret = &(res->granted); + ret += list_num; + return ret; +} +/* TODO: do ast flush business + * TODO: do MIGRATING and RECOVERING spinning + */ + +/* +* NOTE about in-flight requests during migration: +* +* Before attempting the migrate, the master has marked the lockres as +* MIGRATING and then flushed all of its pending ASTS. So any in-flight +* requests either got queued before the MIGRATING flag got set, in which +* case the lock data will reflect the change and a return message is on +* the way, or the request failed to get in before MIGRATING got set. In +* this case, the caller will be told to spin and wait for the MIGRATING +* flag to be dropped, then recheck the master. +* This holds true for the convert, cancel and unlock cases, and since lvb +* updates are tied to these same messages, it applies to lvb updates as +* well. For the lock case, there is no way a lock can be on the master +* queue and not be on the secondary queue since the lock is always added +* locally first. This means that the new target node will never be sent +* a lock that he doesn't already have on the list. +* In total, this means that the local lock is correct and should not be +* updated to match the one sent by the master. Any messages sent back +* from the master before the MIGRATING flag will bring the lock properly +* up-to-date, and the change will be ordered properly for the waiter. +* We will *not* attempt to modify the lock underneath the waiter. 
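+* In short: during migration a local lock is only re-ordered on +* its queue; its contents are never overwritten with the master's +* copy.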
+*/ + +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres) +{ + struct dlm_migratable_lock *ml; + struct list_head *queue; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + int ret = 0; + int i; + struct list_head *iter; + struct dlm_lock *lock = NULL; + + mlog(0, "running %d locks for this lockres\n", mres->num_locks); + for (i=0; i<mres->num_locks; i++) { + ml = &(mres->ml[i]); + BUG_ON(ml->highest_blocked != LKM_IVMODE); + newlock = NULL; + lksb = NULL; + + queue = dlm_list_num_to_pointer(res, ml->list); + + /* if the lock is for the local node it needs to + * be moved to the proper location within the queue. + * do not allocate a new lock structure. */ + if (ml->node == dlm->node_num) { + /* MIGRATION ONLY! */ + BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + + spin_lock(&res->spinlock); + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie != ml->cookie) + lock = NULL; + else + break; + } + + /* lock is always created locally first, and + * destroyed locally last. it must be on the list */ + if (!lock) { + mlog(ML_ERROR, "could not find local lock " + "with cookie %"MLFu64"!\n", + ml->cookie); + BUG(); + } + BUG_ON(lock->ml.node != ml->node); + + /* see NOTE above about why we do not update + * to match the master here */ + + /* move the lock to its proper place */ + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, queue); + spin_unlock(&res->spinlock); + + mlog(0, "just reordered a local lock!\n"); + continue; + } + + /* lock is for another node. */ + newlock = dlm_new_lock(ml->type, ml->node, + be64_to_cpu(ml->cookie), NULL); + if (!newlock) { + ret = -ENOMEM; + goto leave; + } + lksb = newlock->lksb; + dlm_lock_attach_lockres(newlock, res); + + if (ml->convert_type != LKM_IVMODE) { + BUG_ON(queue != &res->converting); + newlock->ml.convert_type = ml->convert_type; + } + lksb->flags |= (ml->flags & + (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + + if (mres->lvb[0]) { + if (lksb->flags & DLM_LKSB_PUT_LVB) { + /* other node was trying to update + * lvb when node died. recreate the + * lksb with the updated lvb. */ + memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + } else { + /* otherwise, the node is sending its + * most recent valid lvb info */ + BUG_ON(ml->type != LKM_EXMODE && + ml->type != LKM_PRMODE); + if (res->lvb[0] && (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + mlog(ML_ERROR, "received bad lvb!\n"); + __dlm_print_one_lock_resource(res); + BUG(); + } + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } + } + + + /* NOTE: + * wrt lock queue ordering and recovery: + * 1. order of locks on granted queue is + * meaningless. + * 2. order of locks on converting queue is + * LOST with the node death. sorry charlie. + * 3. order of locks on the blocked queue is + * also LOST. + * order of locks does not affect integrity, it + * just means that a lock request may get pushed + * back in line as a result of the node death. + * also note that for a given node the lock order + * for its secondary queue locks is preserved + * relative to each other, but clearly *not* + * preserved relative to locks from other nodes.
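+ * e.g. if the dead master's blocked queue held A1, B1, A2 (A and + * B being other nodes), recovery keeps A1 ahead of A2, but B1 may + * end up anywhere relative to the two.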
+ */ + spin_lock(&res->spinlock); + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + spin_unlock(&res->spinlock); + } + mlog(0, "done running all the locks\n"); + +leave: + if (ret < 0) { + mlog_errno(ret); + if (newlock) + dlm_lock_put(newlock); + } + + mlog_exit(ret); + return ret; +} + +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue, *iter, *iter2; + struct dlm_lock *lock; + + res->state |= DLM_LOCK_RES_RECOVERING; + if (!list_empty(&res->recovering)) + list_del_init(&res->recovering); + list_add_tail(&res->recovering, &dlm->reco.resources); + + /* find any pending locks and put them back on proper list */ + for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_safe(iter, iter2, queue) { + lock = list_entry (iter, struct dlm_lock, list); + dlm_lock_get(lock); + if (lock->convert_pending) { + /* move converting lock back to granted */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with convert pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_revert_pending_convert(res, lock); + lock->convert_pending = 0; + } else if (lock->lock_pending) { + /* remove pending lock requests completely */ + BUG_ON(i != DLM_BLOCKED_LIST); + mlog(0, "node died with lock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + /* lock will be floating until ref in + * dlmlock_remote is freed after the network + * call returns. ok for it to not be on any + * list since no ast can be called + * (the master is dead). */ + dlm_revert_pending_lock(res, lock); + lock->lock_pending = 0; + } else if (lock->unlock_pending) { + /* if an unlock was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master. note that the dlm_unlock + * call is still responsible for calling + * the unlockast. that will happen after + * the network call times out. for now, + * just move lists to prepare the new + * recovery master. */ + BUG_ON(i != DLM_GRANTED_LIST); + mlog(0, "node died with unlock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_unlock(res, lock); + lock->unlock_pending = 0; + } else if (lock->cancel_pending) { + /* if a cancel was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with cancel pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_cancel(res, lock); + lock->cancel_pending = 0; + } + dlm_lock_put(lock); + } + } +} + + + +/* removes all recovered locks from the recovery list. + * sets the res->owner to the new master. + * unsets the RECOVERY flag and wakes waiters. 
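+ * must be called with dlm->spinlock held (asserted below).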
*/ +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master) +{ + int i; + struct list_head *iter, *iter2, *bucket; + struct dlm_lock_resource *res; + + mlog_entry_void(); + + assert_spin_locked(&dlm->spinlock); + + list_for_each_safe(iter, iter2, &dlm->reco.resources) { + res = list_entry (iter, struct dlm_lock_resource, recovering); + if (res->owner == dead_node) { + list_del_init(&res->recovering); + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } + + /* this will become unnecessary eventually, but + * for now we need to run the whole hash, clear + * the RECOVERING state and set the owner + * if necessary */ + for (i=0; i<DLM_HASH_SIZE; i++) { + bucket = &(dlm->resources[i]); + list_for_each(iter, bucket) { + res = list_entry (iter, struct dlm_lock_resource, list); + if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->owner == dead_node) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, but " + "clearing state anyway\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else if (res->owner == dlm->node_num) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, " + "owner is THIS node, clearing\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else + continue; + + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } + } +} + +static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) +{ + if (local) { + if (lock->ml.type != LKM_EXMODE && + lock->ml.type != LKM_PRMODE) + return 1; + } else if (lock->ml.type == LKM_EXMODE) + return 1; + return 0; +} + +static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *iter, *queue; + struct dlm_lock *lock; + int blank_lvb = 0, local = 0; + int i; + u8 search_node; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (res->owner == dlm->node_num) + /* if this node owned the lockres, and if the dead node + * had an EX when he died, blank out the lvb */ + search_node = dead_node; + else { + /* if this is a secondary lockres, and we had no EX or PR + * locks granted, we can no longer trust the lvb */ + search_node = dlm->node_num; + local = 1; /* check local state for valid lvb */ + } + + for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == search_node) { + if (dlm_lvb_needs_invalidation(lock, local)) { + /* zero the lksb lvb and lockres lvb */ + blank_lvb = 1; + memset(lock->lksb->lvb, 0, DLM_LVB_LEN); + } + } + } + } + + if (blank_lvb) { + mlog(0, "clearing %.*s lvb, dead node %u had EX\n", + res->lockname.len, res->lockname.name, dead_node); + memset(res->lvb, 0, DLM_LVB_LEN); + } +} + +static void dlm_free_dead_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *iter, *tmpiter; + struct dlm_lock *lock; + + /* this node is the lockres master: + * 1) remove any stale locks for the dead node + * 2) if the dead node had an EX when he died, blank out the lvb + */ + assert_spin_locked(&dlm->spinlock); +
assert_spin_locked(&res->spinlock); + + /* TODO: check pending_asts, pending_basts here */ + list_for_each_safe(iter, tmpiter, &res->granted) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + list_for_each_safe(iter, tmpiter, &res->converting) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + list_for_each_safe(iter, tmpiter, &res->blocked) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + + /* do not kick thread yet */ + __dlm_dirty_lockres(dlm, res); +} + +/* if this node is the recovery master, and there are no + * locks for a given lockres owned by this node that are in + * either PR or EX mode, zero out the lvb before requesting. + * + */ + + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct list_head *iter; + struct dlm_lock_resource *res; + int i; + struct list_head *bucket; + + + /* purge any stale mles */ + dlm_clean_master_list(dlm, dead_node); + + /* + * now clean up all lock resources. there are two rules: + * + * 1) if the dead node was the master, move the lockres + * to the recovering list. set the RECOVERING flag. + * this lockres needs to be cleaned up before it can + * be used further. + * + * 2) if this node was the master, remove all locks from + * each of the lockres queues that were owned by the + * dead node. once recovery finishes, the dlm thread + * can be kicked again to see if any ASTs or BASTs + * need to be fired as a result. + */ + for (i=0; i<DLM_HASH_SIZE; i++) { + bucket = &(dlm->resources[i]); + list_for_each(iter, bucket) { + res = list_entry (iter, struct dlm_lock_resource, list); + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) + continue; + + spin_lock(&res->spinlock); + /* zero the lvb if necessary */ + dlm_revalidate_lvb(dlm, res, dead_node); + if (res->owner == dead_node) + dlm_move_lockres_to_recovery_list(dlm, res); + else if (res->owner == dlm->node_num) { + dlm_free_dead_locks(dlm, res, dead_node); + __dlm_lockres_calc_usage(dlm, res); + } + spin_unlock(&res->spinlock); + } + } + +} + +static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) +{ + assert_spin_locked(&dlm->spinlock); + + /* check to see if the node is already considered dead */ + if (!test_bit(idx, dlm->live_nodes_map)) { + mlog(0, "for domain %s, node %d is already dead. " + "another node likely did recovery already.\n", + dlm->name, idx); + return; + } + + /* check to see if we do not care about this node */ + if (!test_bit(idx, dlm->domain_map)) { + /* This also catches the case that we get a node down + * but haven't joined the domain yet. */ + mlog(0, "node %u already removed from domain!\n", idx); + return; + } + + clear_bit(idx, dlm->live_nodes_map); + + /* Clean up join state on node death. */ + if (dlm->joining_node == idx) { + mlog(0, "Clearing join state for node %u\n", idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + } + + /* make sure local cleanup occurs before the heartbeat events */ + if (!test_bit(idx, dlm->recovery_map)) + dlm_do_local_recovery_cleanup(dlm, idx); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 0); + + mlog(0, "node %u being removed from domain map!\n", idx); + clear_bit(idx, dlm->domain_map); + /* wake up migration waiters if a node goes down.
+ * perhaps later we can genericize this for other waiters. */ + wake_up(&dlm->migration_wq); + + if (test_bit(idx, dlm->recovery_map)) + mlog(0, "domain %s, node %u already added " + "to recovery map!\n", dlm->name, idx); + else + set_bit(idx, dlm->recovery_map); +} + +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + __dlm_hb_node_down(dlm, idx); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + + set_bit(idx, dlm->live_nodes_map); + + /* notify any mles attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 1); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +static void dlm_reco_ast(void *astdata) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_bast(void *astdata, int blocked_type) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) +{ + mlog(0, "unlockast for recovery lock fired!\n"); +} + + +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) +{ + enum dlm_status ret; + struct dlm_lockstatus lksb; + int status = -EINVAL; + + mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", + dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); +retry: + memset(&lksb, 0, sizeof(lksb)); + + ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, + DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); + + if (ret == DLM_NORMAL) { + mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", + dlm->name, dlm->node_num); + /* I am master, send message to all nodes saying + * that I am beginning a recovery session */ + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + + /* recovery lock is a special case. ast will not get fired, + * so just go ahead and unlock it. */ + ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret != DLM_NORMAL) { + /* this would really suck. this could only happen + * if there was a network error during the unlock + * because of node death. this means the unlock + * is actually "done" and the lock structure is + * even freed. we can continue, but only + * because this specific lock name is special. */ + mlog(0, "dlmunlock returned %d\n", ret); + } + + if (status < 0) { + mlog(0, "failed to send recovery message. " + "must retry with new node map.\n"); + goto retry; + } + } else if (ret == DLM_NOTQUEUED) { + mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", + dlm->name, dlm->node_num); + /* another node is master. 
wait on + * reco.new_master != O2NM_INVALID_NODE_NUM */ + status = -EEXIST; + } + + return status; +} + +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_begin_reco br; + int ret = 0; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog_entry("%u\n", dead_node); + + mlog(0, "dead node is %u\n", dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + clear_bit(dead_node, iter.node_map); + + memset(&br, 0, sizeof(br)); + br.node_idx = dlm->node_num; + br.dead_node = dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = 0; + if (nodenum == dead_node) { + mlog(0, "not sending begin reco to dead node " + "%u\n", dead_node); + continue; + } + if (nodenum == dlm->node_num) { + mlog(0, "not sending begin reco to self\n"); + continue; + } + + ret = -EINVAL; + mlog(0, "attempting to send begin reco msg to %d\n", + nodenum); + ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, + &br, sizeof(br), nodenum, &status); + /* negative status is handled ok by caller here */ + if (ret >= 0) + ret = status; + if (ret < 0) { + struct dlm_lock_resource *res; + mlog_errno(ret); + mlog(ML_ERROR, "begin reco of dlm %s to node %u " + " returned %d\n", dlm->name, nodenum, ret); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + break; + } + } + + return ret; +} + +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + mlog(0, "node %u wants to recover node %u\n", + br->node_idx, br->dead_node); + + dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); + + spin_lock(&dlm->spinlock); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(0, "new_master already set to %u!\n", + dlm->reco.new_master); + } + if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { + mlog(0, "dead_node already set to %u!\n", + dlm->reco.dead_node); + } + dlm->reco.new_master = br->node_idx; + dlm->reco.dead_node = br->dead_node; + if (!test_bit(br->dead_node, dlm->recovery_map)) { + mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " + "node has not yet. 
marking %u as dead\n", + br->node_idx, br->dead_node, br->dead_node); + __dlm_hb_node_down(dlm, br->dead_node); + } + spin_unlock(&dlm->spinlock); + + dlm_kick_recovery_thread(dlm); + dlm_put(dlm); + return 0; +} + +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) +{ + int ret = 0; + struct dlm_finalize_reco fr; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog(0, "finishing recovery for node %s:%u\n", + dlm->name, dlm->reco.dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + memset(&fr, 0, sizeof(fr)); + fr.node_idx = dlm->node_num; + fr.dead_node = dlm->reco.dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + if (nodenum == dlm->node_num) + continue; + ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, + &fr, sizeof(fr), nodenum, &status); + if (ret >= 0) { + ret = status; + if (dlm_is_host_down(ret)) { + /* this has no effect on this recovery + * session, so set the status to zero to + * finish out the last recovery */ + mlog(ML_ERROR, "node %u went down after this " + "node finished recovery.\n", nodenum); + ret = 0; + } + } + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + mlog(0, "node %u finalizing recovery of node %u\n", + fr->node_idx, fr->dead_node); + + spin_lock(&dlm->spinlock); + + if (dlm->reco.new_master != fr->node_idx) { + mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " + "%u is supposed to be the new master, dead=%u\n", + fr->node_idx, dlm->reco.new_master, fr->dead_node); + BUG(); + } + if (dlm->reco.dead_node != fr->dead_node) { + mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " + "node %u, but node %u is supposed to be dead\n", + fr->node_idx, fr->dead_node, dlm->reco.dead_node); + BUG(); + } + + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + + spin_unlock(&dlm->spinlock); + + dlm_reset_recovery(dlm); + + dlm_kick_recovery_thread(dlm); + dlm_put(dlm); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c new file mode 100644 index 000000000000..92cd5cd66db8 --- /dev/null +++ b/fs/ocfs2/dlm/dlmthread.c @@ -0,0 +1,695 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmthread.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/timer.h> +#include <linux/kthread.h> + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) +#include "cluster/masklog.h" + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +static int dlm_thread(void *data); + +static void dlm_flush_asts(struct dlm_ctxt *dlm); + +#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) + +/* will exit holding res->spinlock, but may drop in function */ +/* waits until flags are cleared on res->state */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + + assert_spin_locked(&res->spinlock); + + add_wait_queue(&res->wq, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (res->state & flags) { + spin_unlock(&res->spinlock); + schedule(); + spin_lock(&res->spinlock); + goto repeat; + } + remove_wait_queue(&res->wq, &wait); + current->state = TASK_RUNNING; +} + + +static int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + if (list_empty(&res->granted) && + list_empty(&res->converting) && + list_empty(&res->blocked) && + list_empty(&res->dirty)) + return 1; + return 0; +} + + +/* Call whenever you may have added or deleted something from one of + * the lockres queue's. This will figure out whether it belongs on the + * unused list or not and does the appropriate thing. */ +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (__dlm_lockres_unused(res)){ + if (list_empty(&res->purge)) { + mlog(0, "putting lockres %.*s from purge list\n", + res->lockname.len, res->lockname.name); + + res->last_used = jiffies; + list_add_tail(&res->purge, &dlm->purge_list); + dlm->purge_count++; + } + } else if (!list_empty(&res->purge)) { + mlog(0, "removing lockres %.*s from purge list\n", + res->lockname.len, res->lockname.name); + + list_del_init(&res->purge); + dlm->purge_count--; + } +} + +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + + __dlm_lockres_calc_usage(dlm, res); + + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); +} + +/* TODO: Eventual API: Called with the dlm spinlock held, may drop it + * to do migration, but will re-acquire before exit. */ +void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres) +{ + int master; + int ret; + + spin_lock(&lockres->spinlock); + master = lockres->owner == dlm->node_num; + spin_unlock(&lockres->spinlock); + + mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, + lockres->lockname.name, master); + + /* Non master is the easy case -- no migration required, just + * quit. */ + if (!master) + goto finish; + + /* Wheee! Migrate lockres here!
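+ * the dlm spinlock is dropped for the duration of the migrate + * call and re-taken before the finish label, per the Eventual + * API note above.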
+	 */
+	spin_unlock(&dlm->spinlock);
+again:
+
+	ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
+	if (ret == -ENOTEMPTY) {
+		mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
+		     lockres->lockname.len, lockres->lockname.name);
+
+		BUG();
+	} else if (ret < 0) {
+		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
+		     lockres->lockname.len, lockres->lockname.name);
+		goto again;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+finish:
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
+static void dlm_run_purge_list(struct dlm_ctxt *dlm,
+			       int purge_now)
+{
+	unsigned int run_max, unused;
+	unsigned long purge_jiffies;
+	struct dlm_lock_resource *lockres;
+
+	spin_lock(&dlm->spinlock);
+	run_max = dlm->purge_count;
+
+	while(run_max && !list_empty(&dlm->purge_list)) {
+		run_max--;
+
+		lockres = list_entry(dlm->purge_list.next,
+				     struct dlm_lock_resource, purge);
+
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting any more
+		 * refs on it -- there's no need to keep the lockres
+		 * spinlock. */
+		spin_lock(&lockres->spinlock);
+		unused = __dlm_lockres_unused(lockres);
+		spin_unlock(&lockres->spinlock);
+
+		if (!unused)
+			continue;
+
+		purge_jiffies = lockres->last_used +
+			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
+
+		/* Make sure that we want to be processing this guy at
+		 * this time. */
+		if (!purge_now && time_after(purge_jiffies, jiffies)) {
+			/* Since resources are added to the purge list
+			 * in tail order, we can stop at the first
+			 * unpurgeable resource -- anyone added after
+			 * him will have a greater last_used value */
+			break;
+		}
+
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+
+		/* This may drop and reacquire the dlm spinlock if it
+		 * has to do migration.
*/ + mlog(0, "calling dlm_purge_lockres!\n"); + dlm_purge_lockres(dlm, lockres); + mlog(0, "DONE calling dlm_purge_lockres!\n"); + + /* Avoid adding any scheduling latencies */ + cond_resched_lock(&dlm->spinlock); + } + + spin_unlock(&dlm->spinlock); +} + +static void dlm_shuffle_lists(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_lock *lock, *target; + struct list_head *iter; + struct list_head *head; + int can_grant = 1; + + //mlog(0, "res->lockname.len=%d\n", res->lockname.len); + //mlog(0, "res->lockname.name=%p\n", res->lockname.name); + //mlog(0, "shuffle res %.*s\n", res->lockname.len, + // res->lockname.name); + + /* because this function is called with the lockres + * spinlock, and because we know that it is not migrating/ + * recovering/in-progress, it is fine to reserve asts and + * basts right before queueing them all throughout */ + assert_spin_locked(&res->spinlock); + BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_IN_PROGRESS))); + +converting: + if (list_empty(&res->converting)) + goto blocked; + mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, + res->lockname.name); + + target = list_entry(res->converting.next, struct dlm_lock, list); + if (target->ml.convert_type == LKM_IVMODE) { + mlog(ML_ERROR, "%.*s: converting a lock with no " + "convert_type!\n", res->lockname.len, res->lockname.name); + BUG(); + } + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + /* queue the BAST if not already */ + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + /* update the highest_blocked if needed */ + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + + /* we can convert the lock */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "calling ast for converting lock: %.*s, have: %d, " + "granting: %d, node: %u\n", res->lockname.len, + res->lockname.name, target->ml.type, + target->ml.convert_type, target->ml.node); + + target->ml.type = target->ml.convert_type; + target->ml.convert_type = LKM_IVMODE; + list_del_init(&target->list); + list_add_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +blocked: + if (list_empty(&res->blocked)) + goto leave; + target = list_entry(res->blocked.next, struct dlm_lock, list); + + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + 
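+				/* no BAST queued against this holder yet:
+				 * reserve an ast slot and queue one so the
+				 * holder can downconvert */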
__dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + /* we can grant the blocked lock (only + * possible if converting list empty) */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " + "node: %u\n", res->lockname.len, res->lockname.name, + target->ml.type, target->ml.node); + + // target->ml.type is already correct + list_del_init(&target->list); + list_add_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +leave: + return; +} + +/* must have NO locks when calling this with res !=NULL * */ +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + mlog_entry("dlm=%p, res=%p\n", dlm, res); + if (res) { + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + } + wake_up(&dlm->dlm_thread_wq); +} + +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + mlog_entry("dlm=%p, res=%p\n", dlm, res); + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* don't shuffle secondary queues */ + if ((res->owner == dlm->node_num) && + !(res->state & DLM_LOCK_RES_DIRTY)) { + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } +} + + +/* Launch the NM thread for the mounted volume */ +int dlm_launch_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "starting dlm thread...\n"); + + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + if (IS_ERR(dlm->dlm_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_thread_task)); + dlm->dlm_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_thread_task) { + mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); + kthread_stop(dlm->dlm_thread_task); + dlm->dlm_thread_task = NULL; + } +} + +static int dlm_dirty_list_empty(struct dlm_ctxt *dlm) +{ + int empty; + + spin_lock(&dlm->spinlock); + empty = list_empty(&dlm->dirty_list); + spin_unlock(&dlm->spinlock); + + return empty; +} + +static void dlm_flush_asts(struct dlm_ctxt *dlm) +{ + int ret; + struct dlm_lock *lock; + struct dlm_lock_resource *res; + u8 hi; + + spin_lock(&dlm->ast_lock); + while (!list_empty(&dlm->pending_asts)) { + lock = list_entry(dlm->pending_asts.next, + struct dlm_lock, ast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + mlog(0, "delivering an ast for this lockres\n"); + + BUG_ON(!lock->ast_pending); + + /* remove from list (including ref) */ + list_del_init(&lock->ast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_do_remote_ast(dlm, 
res, lock); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_ast(dlm, res, lock); + + spin_lock(&dlm->ast_lock); + + /* possible that another ast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->ast_list)) { + mlog(0, "aha another ast got queued while " + "we were finishing the last one. will " + "keep the ast_pending flag set.\n"); + } else + lock->ast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. */ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + + while (!list_empty(&dlm->pending_basts)) { + lock = list_entry(dlm->pending_basts.next, + struct dlm_lock, bast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + + BUG_ON(!lock->bast_pending); + + /* get the highest blocked lock, and reset */ + spin_lock(&lock->spinlock); + BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE); + hi = lock->ml.highest_blocked; + lock->ml.highest_blocked = LKM_IVMODE; + spin_unlock(&lock->spinlock); + + /* remove from list (including ref) */ + list_del_init(&lock->bast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + mlog(0, "delivering a bast for this lockres " + "(blocked = %d\n", hi); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_send_proxy_bast(dlm, res, lock, hi); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_bast(dlm, res, lock, hi); + + spin_lock(&dlm->ast_lock); + + /* possible that another bast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->bast_list)) { + mlog(0, "aha another bast got queued while " + "we were finishing the last one. will " + "keep the bast_pending flag set.\n"); + } else + lock->bast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. */ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + wake_up(&dlm->ast_wq); + spin_unlock(&dlm->ast_lock); +} + + +#define DLM_THREAD_TIMEOUT_MS (4 * 1000) +#define DLM_THREAD_MAX_DIRTY 100 +#define DLM_THREAD_MAX_ASTS 10 + +static int dlm_thread(void *data) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + int n = DLM_THREAD_MAX_DIRTY; + + /* dlm_shutting_down is very point-in-time, but that + * doesn't matter as we'll just loop back around if we + * get false on the leading edge of a state + * transition. */ + dlm_run_purge_list(dlm, dlm_shutting_down(dlm)); + + /* We really don't want to hold dlm->spinlock while + * calling dlm_shuffle_lists on each lockres that + * needs to have its queues adjusted and AST/BASTs + * run. So let's pull each entry off the dirty_list + * and drop dlm->spinlock ASAP. Once off the list, + * res->spinlock needs to be taken again to protect + * the queues while calling dlm_shuffle_lists. 
*/ + spin_lock(&dlm->spinlock); + while (!list_empty(&dlm->dirty_list)) { + int delay = 0; + res = list_entry(dlm->dirty_list.next, + struct dlm_lock_resource, dirty); + + /* peel a lockres off, remove it from the list, + * unset the dirty flag and drop the dlm lock */ + BUG_ON(!res); + dlm_lockres_get(res); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_DIRTY; + list_del_init(&res->dirty); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + /* lockres can be re-dirtied/re-added to the + * dirty_list in this gap, but that is ok */ + + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", + res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", + res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", + res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", + res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + } + BUG_ON(res->owner != dlm->node_num); + + /* it is now ok to move lockreses in these states + * to the dirty list, assuming that they will only be + * dirty for a short while. */ + if (res->state & (DLM_LOCK_RES_IN_PROGRESS | + DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_RECOVERING)) { + /* move it to the tail and keep going */ + spin_unlock(&res->spinlock); + mlog(0, "delaying list shuffling for in-" + "progress lockres %.*s, state=%d\n", + res->lockname.len, res->lockname.name, + res->state); + delay = 1; + goto in_progress; + } + + /* at this point the lockres is not migrating/ + * recovering/in-progress. we have the lockres + * spinlock and do NOT have the dlm lock. + * safe to reserve/queue asts and run the lists. */ + + mlog(0, "calling dlm_shuffle_lists with dlm=%p, " + "res=%p\n", dlm, res); + + /* called while holding lockres lock */ + dlm_shuffle_lists(dlm, res); + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + +in_progress: + + spin_lock(&dlm->spinlock); + /* if the lock was in-progress, stick + * it on the back of the list */ + if (delay) { + spin_lock(&res->spinlock); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + } + dlm_lockres_put(res); + + /* unlikely, but we may need to give time to + * other tasks */ + if (!--n) { + mlog(0, "throttling dlm_thread\n"); + break; + } + } + + spin_unlock(&dlm->spinlock); + dlm_flush_asts(dlm); + + /* yield and continue right away if there is more work to do */ + if (!n) { + yield(); + continue; + } + + wait_event_interruptible_timeout(dlm->dlm_thread_wq, + !dlm_dirty_list_empty(dlm) || + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM thread\n"); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c new file mode 100644 index 000000000000..cec2ce1cd318 --- /dev/null +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -0,0 +1,672 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmunlock.c + * + * underlying calls for unlocking locks + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +#define DLM_UNLOCK_FREE_LOCK 0x00000001 +#define DLM_UNLOCK_CALL_AST 0x00000002 +#define DLM_UNLOCK_REMOVE_LOCK 0x00000004 +#define DLM_UNLOCK_REGRANT_LOCK 0x00000008 +#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010 + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); + +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner); + + +/* + * according to the spec: + * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf + * + * flags & LKM_CANCEL != 0: must be converting or blocked + * flags & LKM_CANCEL == 0: must be granted + * + * So to unlock a converting lock, you must first cancel the + * convert (passing LKM_CANCEL in flags), then call the unlock + * again (with no LKM_CANCEL in flags). + */ + + +/* + * locking: + * caller needs: none + * taken: res->spinlock and lock->spinlock taken and dropped + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + * all callers should have taken an extra ref on lock coming in + */ +static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast, + int master_node) +{ + enum dlm_status status; + int actions = 0; + int in_use; + u8 owner; + + mlog(0, "master_node = %d, valblk = %d\n", master_node, + flags & LKM_VALBLK); + + if (master_node) + BUG_ON(res->owner != dlm->node_num); + else + BUG_ON(res->owner == dlm->node_num); + + spin_lock(&dlm->spinlock); + /* We want to be sure that we're not freeing a lock + * that still has AST's pending... 
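+	 * (such a lock is still linked on dlm->pending_asts and will be
+	 * dereferenced by dlm_flush_asts() when it runs).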
*/ + in_use = !list_empty(&lock->ast_list); + spin_unlock(&dlm->spinlock); + if (in_use) { + mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " + "while waiting for an ast!", res->lockname.len, + res->lockname.name); + return DLM_BADPARAM; + } + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_IN_PROGRESS) { + if (master_node) { + mlog(ML_ERROR, "lockres in progress!\n"); + spin_unlock(&res->spinlock); + return DLM_FORWARD; + } + /* ok for this to sleep if not in a network handler */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + } + spin_lock(&lock->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) { + status = DLM_RECOVERING; + goto leave; + } + + + /* see above for what the spec says about + * LKM_CANCEL and the lock queue state */ + if (flags & LKM_CANCEL) + status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions); + else + status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); + + if (status != DLM_NORMAL) + goto leave; + + /* By now this has been masked out of cancel requests. */ + if (flags & LKM_VALBLK) { + /* make the final update to the lvb */ + if (master_node) + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + else + flags |= LKM_PUT_LVB; /* let the send function + * handle it. */ + } + + if (!master_node) { + owner = res->owner; + /* drop locks and send message */ + if (flags & LKM_CANCEL) + lock->cancel_pending = 1; + else + lock->unlock_pending = 1; + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, + flags, owner); + spin_lock(&res->spinlock); + spin_lock(&lock->spinlock); + /* if the master told us the lock was already granted, + * let the ast handle all of these actions */ + if (status == DLM_NORMAL && + lksb->status == DLM_CANCELGRANT) { + actions &= ~(DLM_UNLOCK_REMOVE_LOCK| + DLM_UNLOCK_REGRANT_LOCK| + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } + if (flags & LKM_CANCEL) + lock->cancel_pending = 0; + else + lock->unlock_pending = 0; + + } + + /* get an extra ref on lock. if we are just switching + * lists here, we dont want the lock to go away. */ + dlm_lock_get(lock); + + if (actions & DLM_UNLOCK_REMOVE_LOCK) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_REGRANT_LOCK) { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + } + if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) { + mlog(0, "clearing convert_type at %smaster node\n", + master_node ? "" : "non-"); + lock->ml.convert_type = LKM_IVMODE; + } + + /* remove the extra ref on lock */ + dlm_lock_put(lock); + +leave: + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + if (!dlm_lock_on_list(&res->converting, lock)) + BUG_ON(lock->ml.convert_type != LKM_IVMODE); + else + BUG_ON(lock->ml.convert_type == LKM_IVMODE); + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* let the caller's final dlm_lock_put handle the actual kfree */ + if (actions & DLM_UNLOCK_FREE_LOCK) { + /* this should always be coupled with list removal */ + BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); + mlog(0, "lock %"MLFu64" should be gone now! 
refs=%d\n", + lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_CALL_AST) + *call_ast = 1; + + /* if cancel or unlock succeeded, lvb work is done */ + if (status == DLM_NORMAL) + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); + + return status; +} + +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* leave DLM_LKSB_PUT_LVB on the lksb so any final + * update of the lvb will be sent to the new master */ + list_del_init(&lock->list); +} + +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; +} + + +static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1); +} + +static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0); +} + +/* + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + */ +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner) +{ + struct dlm_unlock_lock unlock; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + + memset(&unlock, 0, sizeof(unlock)); + unlock.node_idx = dlm->node_num; + unlock.flags = cpu_to_be32(flags); + unlock.cookie = lock->ml.cookie; + unlock.namelen = res->lockname.len; + memcpy(unlock.name, res->lockname.name, unlock.namelen); + + vec[0].iov_len = sizeof(struct dlm_unlock_lock); + vec[0].iov_base = &unlock; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key, + vec, veclen, owner, &status); + if (tmpret >= 0) { + // successfully sent and received + if (status == DLM_CANCELGRANT) + ret = DLM_NORMAL; + else if (status == DLM_FORWARD) { + mlog(0, "master was in-progress. retry\n"); + ret = DLM_FORWARD; + } else + ret = status; + lksb->status = status; + } else { + mlog_errno(tmpret); + if (dlm_is_host_down(tmpret)) { + /* NOTE: this seems strange, but it is what we want. + * when the master goes down during a cancel or + * unlock, the recovery code completes the operation + * as if the master had not died, then passes the + * updated state to the recovery master. this thread + * just needs to finish out the operation and call + * the unlockast. */ + ret = DLM_NORMAL; + } else { + /* something bad. 
this will BUG in ocfs2 */ + ret = dlm_err_to_dlm_status(tmpret); + } + lksb->status = ret; + } + + return ret; +} + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, + * return value from dlmunlock_master + */ +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + enum dlm_status status = DLM_NORMAL; + int found = 0, i; + struct dlm_lockstatus *lksb = NULL; + int ignore; + u32 flags; + struct list_head *queue; + + flags = be32_to_cpu(unlock->flags); + + if (flags & LKM_GET_LVB) { + mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n"); + return DLM_BADARGS; + } + + if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { + mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " + "request!\n"); + return DLM_BADARGS; + } + + if (unlock->namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); + return DLM_IVBUFLEN; + } + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); + + res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); + if (!res) { + /* We assume here that a no lock resource simply means + * it was migrated away and destroyed before the other + * node could detect it. */ + mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); + status = DLM_FORWARD; + goto not_found; + } + + queue=&res->granted; + found = 0; + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_RECOVERING\n"); + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MIGRATING\n"); + status = DLM_MIGRATING; + goto leave; + } + + if (res->owner != dlm->node_num) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_FORWARD -- not master\n"); + status = DLM_FORWARD; + goto leave; + } + + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == unlock->cookie && + lock->ml.node == unlock->node_idx) { + dlm_lock_get(lock); + found = 1; + break; + } + } + if (found) + break; + /* scan granted -> converting -> blocked queues */ + queue++; + } + spin_unlock(&res->spinlock); + if (!found) { + status = DLM_IVLOCKID; + goto not_found; + } + + /* lock was found on queue */ + lksb = lock->lksb; + /* unlockast only called on originating node */ + if (flags & LKM_PUT_LVB) { + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN); + } + + /* if this is in-progress, propagate the DLM_FORWARD + * all the way back out */ + status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore); + if (status == DLM_FORWARD) + mlog(0, "lockres is in progress\n"); + + if (flags & LKM_PUT_LVB) + lksb->flags &= ~DLM_LKSB_PUT_LVB; + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + +not_found: + if (!found) + mlog(ML_ERROR, "failed to find lock to unlock! 
" + "cookie=%"MLFu64"\n", + unlock->cookie); + else { + /* send the lksb->status back to the other node */ + status = lksb->status; + dlm_lock_put(lock); + } + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + if (dlm_lock_on_list(&res->blocked, lock)) { + /* cancel this outright */ + lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } else if (dlm_lock_on_list(&res->converting, lock)) { + /* cancel the request, put back on granted */ + lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK | + DLM_UNLOCK_REGRANT_LOCK | + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (dlm_lock_on_list(&res->granted, lock)) { + /* too late, already granted. DLM_CANCELGRANT */ + lksb->status = DLM_CANCELGRANT; + status = DLM_NORMAL; + *actions = DLM_UNLOCK_CALL_AST; + } else { + mlog(ML_ERROR, "lock to cancel is not on any list!\n"); + lksb->status = DLM_IVLOCKID; + status = DLM_IVLOCKID; + *actions = 0; + } + return status; +} + +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + /* unlock request */ + if (!dlm_lock_on_list(&res->granted, lock)) { + lksb->status = DLM_DENIED; + status = DLM_DENIED; + dlm_error(status); + *actions = 0; + } else { + /* unlock granted lock */ + lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_FREE_LOCK | + DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } + return status; +} + +/* there seems to be no point in doing this async + * since (even for the remote case) there is really + * no work to queue up... so just do it and fire the + * unlockast by hand when done... 
*/ +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, + int flags, dlm_astunlockfunc_t *unlockast, void *data) +{ + enum dlm_status status; + struct dlm_lock_resource *res; + struct dlm_lock *lock = NULL; + int call_ast, is_master; + + mlog_entry_void(); + + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) { + mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n"); + flags &= ~LKM_VALBLK; + } + + if (!lksb->lockid || !lksb->lockid->lockres) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + lock = lksb->lockid; + BUG_ON(!lock); + dlm_lock_get(lock); + + res = lock->lockres; + BUG_ON(!res); + dlm_lockres_get(res); +retry: + call_ast = 0; + /* need to retry up here because owner may have changed */ + mlog(0, "lock=%p res=%p\n", lock, res); + + spin_lock(&res->spinlock); + is_master = (res->owner == dlm->node_num); + spin_unlock(&res->spinlock); + + if (is_master) { + status = dlmunlock_master(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_master: returned %d, " + "call_ast is %d\n", status, call_ast); + } else { + status = dlmunlock_remote(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_remote: returned %d, " + "call_ast is %d\n", status, call_ast); + } + + if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* We want to go away for a tiny bit to allow recovery + * / migration to complete on this resource. I don't + * know of any wait queue we could sleep on as this + * may be happening on another node. Perhaps the + * proper solution is to queue up requests on the + * other end? */ + + /* do we want to yield(); ?? */ + msleep(50); + + mlog(0, "retrying unlock due to pending recovery/" + "migration/in-progress\n"); + goto retry; + } + + if (call_ast) { + mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status); + if (is_master) { + /* it is possible that there is one last bast + * pending. make sure it is flushed, then + * call the unlockast. + * not an issue if this is a mastered remotely, + * since this lock has been removed from the + * lockres queues and cannot be found. */ + dlm_kick_thread(dlm, NULL); + wait_event(dlm->ast_wq, + dlm_lock_basts_flushed(dlm, lock)); + } + (*unlockast)(data, lksb->status); + } + + if (status == DLM_NORMAL) { + mlog(0, "kicking the thread\n"); + dlm_kick_thread(dlm, res); + } else + dlm_error(status); + + dlm_lockres_calc_usage(dlm, res); + dlm_lockres_put(res); + dlm_lock_put(lock); + + mlog(0, "returning status=%d!\n", status); + return status; +} +EXPORT_SYMBOL_GPL(dlmunlock); + diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c new file mode 100644 index 000000000000..7ef2653f8f41 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include + +#include "dlmver.h" + +#define DLM_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION + +void dlm_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h new file mode 100644 index 000000000000..f674aee77a16 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef DLM_VER_H +#define DLM_VER_H + +void dlm_print_version(void); + +#endif /* DLM_VER_H */ -- cgit v1.2.3 From 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 15 Dec 2005 14:31:23 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem dlmfs: A minimal dlm userspace interface implemented via a virtual file system. Most of the OCFS2 tools make use of this to take cluster locks when doing operations on the file system. Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/dlmfs.txt | 130 +++++++ fs/ocfs2/dlm/Makefile | 4 +- fs/ocfs2/dlm/dlmfs.c | 640 +++++++++++++++++++++++++++++++++++ fs/ocfs2/dlm/dlmfsver.c | 42 +++ fs/ocfs2/dlm/dlmfsver.h | 31 ++ fs/ocfs2/dlm/userdlm.c | 658 ++++++++++++++++++++++++++++++++++++ fs/ocfs2/dlm/userdlm.h | 111 ++++++ 8 files changed, 1617 insertions(+), 1 deletion(-) create mode 100644 Documentation/filesystems/dlmfs.txt create mode 100644 fs/ocfs2/dlm/dlmfs.c create mode 100644 fs/ocfs2/dlm/dlmfsver.c create mode 100644 fs/ocfs2/dlm/dlmfsver.h create mode 100644 fs/ocfs2/dlm/userdlm.c create mode 100644 fs/ocfs2/dlm/userdlm.h (limited to 'fs') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 628f8a7adb85..d9b0a0691866 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -18,6 +18,8 @@ cramfs.txt - info on the cram filesystem for small storage (ROMs etc) devfs/ - directory containing devfs documentation. +dlmfs.txt + - info on the userspace interface to the OCFS2 DLM. 
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 fat_cvf.txt
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 000000000000..9afab845a906
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy-to-set-up, easy-to-use clustered locking in userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read, and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can
+share small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory)
+
+rmdir(2) signals dlmfs to leave the domain
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory. Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
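+
+As a concrete sketch of the above, the illustrative program below
+joins a domain, takes an exclusive lock, publishes data through the
+LVB and drops the lock. The mount point /dlm and the names "mydomain"
+and "mylock" are assumptions made for the example, not anything dlmfs
+mandates; the open flags used are described just below.
+
+	#include <errno.h>
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/stat.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		char lvb[64];	/* maximum supported LVB size */
+		int fd;
+
+		/* mkdir(2) joins the lock domain; EEXIST just means
+		 * this node has already joined it */
+		if (mkdir("/dlm/mydomain", 0755) < 0 && errno != EEXIST) {
+			perror("mkdir");
+			return 1;
+		}
+
+		/* O_RDWR requests an Exclusive lock; open(2) blocks
+		 * until the lock is granted */
+		fd = open("/dlm/mydomain/mylock", O_RDWR | O_CREAT, 0600);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+
+		/* with the lock held, publish data via the LVB */
+		memset(lvb, 0, sizeof(lvb));
+		strcpy(lvb, "hello from this node");
+		if (write(fd, lvb, sizeof(lvb)) < 0)
+			perror("write");
+
+		close(fd);	/* drops the lock */
+		return 0;
+	}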
+
+open(2) with O_CREAT to ensure the resource inode is created - dlmfs does
+not automatically create inodes for existing lock resources.
+
+Open Flag            Lock Request Type
+---------            -----------------
+O_RDONLY             Shared Read
+O_RDWR               Exclusive
+
+Open Flag            Resulting Locking Behavior
+---------            --------------------------
+O_NONBLOCK           Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes that obtain Read
+Only or higher level locks on the resource.
+
+See Also
+========
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+
+For more information on the VMS distributed locking API.
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 2a5274bcc8bb..ce3f7c29d270 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,8 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
new file mode 100644
index 000000000000..dd2d24dc25e0
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -0,0 +1,640 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#include "dlmfsver.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static struct super_operations dlmfs_ops;
+static struct file_operations dlmfs_file_operations;
+static struct inode_operations dlmfs_dir_inode_operations;
+static struct inode_operations dlmfs_root_inode_operations;
+static struct inode_operations dlmfs_file_inode_operations;
+static kmem_cache_t *dlmfs_inode_cache;
+
+struct workqueue_struct *user_dlm_worker;
+
+/*
+ * decodes a set of open flags into a valid lock level and a set of flags.
+ * returns < 0 if we have invalid flags
+ * flags which mean something to us:
+ * O_RDONLY -> PRMODE level
+ * O_WRONLY, O_RDWR -> EXMODE level
+ *
+ * O_NONBLOCK -> LKM_NOQUEUE
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	if (open_flags & (O_WRONLY|O_RDWR))
+		*level = LKM_EXMODE;
+	else
+		*level = LKM_PRMODE;
+
+	*flags = 0;
+	if (open_flags & O_NONBLOCK)
+		*flags |= LKM_NOQUEUE;
+
+	return 0;
+}
+
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int status, level, flags;
+	struct dlmfs_filp_private *fp = NULL;
+	struct dlmfs_inode_private *ip;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
+	if (status < 0)
+		goto bail;
+
+	/* We don't want to honor O_APPEND at read/write time as it
+	 * doesn't make sense for LVB writes. */
+	file->f_flags &= ~O_APPEND;
+
+	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	if (!fp) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	fp->fp_lock_level = level;
+
+	ip = DLMFS_I(inode);
+
+	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
+	if (status < 0) {
+		/* this is a strange error to return here but I want
+		 * userspace to be able to distinguish a valid lock
+		 * request from one that simply couldn't be granted.
*/ + if (flags & LKM_NOQUEUE && status == -EAGAIN) + status = -ETXTBSY; + kfree(fp); + goto bail; + } + + file->private_data = fp; +bail: + return status; +} + +static int dlmfs_file_release(struct inode *inode, + struct file *file) +{ + int level, status; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + struct dlmfs_filp_private *fp = + (struct dlmfs_filp_private *) file->private_data; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "close called on inode %lu\n", inode->i_ino); + + status = 0; + if (fp) { + level = fp->fp_lock_level; + if (level != LKM_IVMODE) + user_dlm_cluster_unlock(&ip->ip_lockres, level); + + kfree(fp); + file->private_data = NULL; + } + + return 0; +} + +static ssize_t dlmfs_file_read(struct file *filp, + char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t readlen; + char *lvb_buf; + struct inode *inode = filp->f_dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + /* don't read past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + readlen = i_size_read(inode) - *ppos; + else + readlen = count - *ppos; + + lvb_buf = kmalloc(readlen, GFP_KERNEL); + if (!lvb_buf) + return -ENOMEM; + + user_dlm_read_lvb(inode, lvb_buf, readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + + kfree(lvb_buf); + + *ppos = *ppos + readlen; + + mlog(0, "read %zd bytes\n", readlen); + return readlen; +} + +static ssize_t dlmfs_file_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t writelen; + char *lvb_buf; + struct inode *inode = filp->f_dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return -ENOSPC; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* don't write past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + writelen = i_size_read(inode) - *ppos; + else + writelen = count - *ppos; + + lvb_buf = kmalloc(writelen, GFP_KERNEL); + if (!lvb_buf) + return -ENOMEM; + + bytes_left = copy_from_user(lvb_buf, buf, writelen); + writelen -= bytes_left; + if (writelen) + user_dlm_write_lvb(inode, lvb_buf, writelen); + + kfree(lvb_buf); + + *ppos = *ppos + writelen; + mlog(0, "wrote %zd bytes\n", writelen); + return writelen; +} + +static void dlmfs_init_once(void *foo, + kmem_cache_t *cachep, + unsigned long flags) +{ + struct dlmfs_inode_private *ip = + (struct dlmfs_inode_private *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + ip->ip_dlm = NULL; + ip->ip_parent = NULL; + + inode_init_once(&ip->ip_vfs_inode); + } +} + +static struct inode *dlmfs_alloc_inode(struct super_block *sb) +{ + struct dlmfs_inode_private *ip; + + ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS); + if (!ip) + return NULL; + + return &ip->ip_vfs_inode; +} + +static void dlmfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); +} + +static void dlmfs_clear_inode(struct inode *inode) +{ + int status; + struct dlmfs_inode_private *ip; + + if (!inode) + return; + + mlog(0, "inode %lu\n", inode->i_ino); + + ip = DLMFS_I(inode); + + if (S_ISREG(inode->i_mode)) { + status = user_dlm_destroy_lock(&ip->ip_lockres); + if (status < 0) + mlog_errno(status); + 
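+		/* drop the parent reference taken via igrab() in
+		 * dlmfs_get_inode() */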
iput(ip->ip_parent); + goto clear_fields; + } + + mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); + /* we must be a directory. If required, lets unregister the + * dlm context now. */ + if (ip->ip_dlm) + user_dlm_unregister_context(ip->ip_dlm); +clear_fields: + ip->ip_parent = NULL; + ip->ip_dlm = NULL; +} + +static struct backing_dev_info dlmfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, +}; + +static struct inode *dlmfs_get_root_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + int mode = S_IFDIR | 0755; + struct dlmfs_inode_private *ip; + + if (inode) { + ip = DLMFS_I(inode); + + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_nlink++; + + inode->i_fop = &simple_dir_operations; + inode->i_op = &dlmfs_root_inode_operations; + } + + return inode; +} + +static struct inode *dlmfs_get_inode(struct inode *parent, + struct dentry *dentry, + int mode) +{ + struct super_block *sb = parent->i_sb; + struct inode * inode = new_inode(sb); + struct dlmfs_inode_private *ip; + + if (!inode) + return NULL; + + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ip = DLMFS_I(inode); + ip->ip_dlm = DLMFS_I(parent)->ip_dlm; + + switch (mode & S_IFMT) { + default: + /* for now we don't support anything other than + * directories and regular files. */ + BUG(); + break; + case S_IFREG: + inode->i_op = &dlmfs_file_inode_operations; + inode->i_fop = &dlmfs_file_operations; + + i_size_write(inode, DLM_LVB_LEN); + + user_dlm_lock_res_init(&ip->ip_lockres, dentry); + + /* released at clear_inode time, this insures that we + * get to drop the dlm reference on each lock *before* + * we call the unregister code for releasing parent + * directories. */ + ip->ip_parent = igrab(parent); + BUG_ON(!ip->ip_parent); + break; + case S_IFDIR: + inode->i_op = &dlmfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == + * 2 (for "." entry) */ + inode->i_nlink++; + break; + } + + if (parent->i_mode & S_ISGID) { + inode->i_gid = parent->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. 
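+ * No cluster lock is taken at create time; dlmfs takes the DLM lock
+ * when the resulting inode is opened.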
+ */ +/* SMP-safe */ +static int dlmfs_mkdir(struct inode * dir, + struct dentry * dentry, + int mode) +{ + int status; + struct inode *inode = NULL; + struct qstr *domain = &dentry->d_name; + struct dlmfs_inode_private *ip; + struct dlm_ctxt *dlm; + + mlog(0, "mkdir %.*s\n", domain->len, domain->name); + + /* verify that we have a proper domain */ + if (domain->len >= O2NM_MAX_NAME_LEN) { + status = -EINVAL; + mlog(ML_ERROR, "invalid domain name for directory.\n"); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ip = DLMFS_I(inode); + + dlm = user_dlm_register_context(domain); + if (IS_ERR(dlm)) { + status = PTR_ERR(dlm); + mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", + status, domain->len, domain->name); + goto bail; + } + ip->ip_dlm = dlm; + + dir->i_nlink++; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + status = 0; +bail: + if (status < 0) + iput(inode); + return status; +} + +static int dlmfs_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int status = 0; + struct inode *inode; + struct qstr *name = &dentry->d_name; + + mlog(0, "create %.*s\n", name->len, name->name); + + /* verify name is valid and doesn't contain any dlm reserved + * characters */ + if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || + name->name[0] == '$') { + status = -EINVAL; + mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, + name->name); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ +bail: + return status; +} + +static int dlmfs_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + struct inode *inode = dentry->d_inode; + + mlog(0, "unlink inode %lu\n", inode->i_ino); + + /* if there are no current holders, or none that are waiting + * to acquire a lock, this basically destroys our lockres. */ + status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); + if (status < 0) { + mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", + dentry->d_name.len, dentry->d_name.name, status); + goto bail; + } + status = simple_unlink(dir, dentry); +bail: + return status; +} + +static int dlmfs_fill_super(struct super_block * sb, + void * data, + int silent) +{ + struct inode * inode; + struct dentry * root; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = DLMFS_MAGIC; + sb->s_op = &dlmfs_ops; + inode = dlmfs_get_root_inode(sb); + if (!inode) + return -ENOMEM; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static struct file_operations dlmfs_file_operations = { + .open = dlmfs_file_open, + .release = dlmfs_file_release, + .read = dlmfs_file_read, + .write = dlmfs_file_write, +}; + +static struct inode_operations dlmfs_dir_inode_operations = { + .create = dlmfs_create, + .lookup = simple_lookup, + .unlink = dlmfs_unlink, +}; + +/* this way we can restrict mkdir to only the toplevel of the fs. 
*/ +static struct inode_operations dlmfs_root_inode_operations = { + .lookup = simple_lookup, + .mkdir = dlmfs_mkdir, + .rmdir = simple_rmdir, +}; + +static struct super_operations dlmfs_ops = { + .statfs = simple_statfs, + .alloc_inode = dlmfs_alloc_inode, + .destroy_inode = dlmfs_destroy_inode, + .clear_inode = dlmfs_clear_inode, + .drop_inode = generic_delete_inode, +}; + +static struct inode_operations dlmfs_file_inode_operations = { + .getattr = simple_getattr, +}; + +static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super); +} + +static struct file_system_type dlmfs_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2_dlmfs", + .get_sb = dlmfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static int __init init_dlmfs_fs(void) +{ + int status; + int cleanup_inode = 0, cleanup_worker = 0; + + dlmfs_print_version(); + + dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", + sizeof(struct dlmfs_inode_private), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + dlmfs_init_once, NULL); + if (!dlmfs_inode_cache) + return -ENOMEM; + cleanup_inode = 1; + + user_dlm_worker = create_singlethread_workqueue("user_dlm"); + if (!user_dlm_worker) { + status = -ENOMEM; + goto bail; + } + cleanup_worker = 1; + + status = register_filesystem(&dlmfs_fs_type); +bail: + if (status) { + if (cleanup_inode) + kmem_cache_destroy(dlmfs_inode_cache); + if (cleanup_worker) + destroy_workqueue(user_dlm_worker); + } else + printk("OCFS2 User DLM kernel interface loaded\n"); + return status; +} + +static void __exit exit_dlmfs_fs(void) +{ + unregister_filesystem(&dlmfs_fs_type); + + flush_workqueue(user_dlm_worker); + destroy_workqueue(user_dlm_worker); + + if (kmem_cache_destroy(dlmfs_inode_cache)) + printk(KERN_INFO "dlmfs_inode_cache: not all structures " + "were freed\n"); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_dlmfs_fs) +module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c new file mode 100644 index 000000000000..d2be3ad841f9 --- /dev/null +++ b/fs/ocfs2/dlm/dlmfsver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include "dlmfsver.h" + +#define DLM_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION + +void dlmfs_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h new file mode 100644 index 000000000000..f35eadbed25c --- /dev/null +++ b/fs/ocfs2/dlm/dlmfsver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef DLMFS_VER_H +#define DLMFS_VER_H + +void dlmfs_print_version(void); + +#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c new file mode 100644 index 000000000000..e1fdd288796e --- /dev/null +++ b/fs/ocfs2/dlm/userdlm.c @@ -0,0 +1,658 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. + * + * Many of the functions here are pared down versions of dlmglue.c + * functions. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA.
+ */ + +#include <linux/signal.h> + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/crc32.h> + + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" + +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + +static inline int user_check_wait_flag(struct user_lock_res *lockres, + int flag) +{ + int ret; + + spin_lock(&lockres->l_lock); + ret = lockres->l_flags & flag; + spin_unlock(&lockres->l_lock); + + return ret; +} + +static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BUSY)); +} + +static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); +} + +/* I heart container_of... */ +static inline struct dlm_ctxt * +dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return ip->ip_dlm; +} + +static struct inode * +user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return &ip->ip_vfs_inode; +} + +static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) +{ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); +} + +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_name, dlm_errmsg(_stat)); \ +} while (0) + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int user_highest_compat_lock_level(int level) +{ + int new_level = LKM_EXMODE; + + if (level == LKM_EXMODE) + new_level = LKM_NLMODE; + else if (level == LKM_PRMODE) + new_level = LKM_PRMODE; + return new_level; +} + +static void user_ast(void *opaque) +{ + struct user_lock_res *lockres = opaque; + struct dlm_lockstatus *lksb; + + mlog(0, "AST fired for lockres %s\n", lockres->l_name); + + spin_lock(&lockres->l_lock); + + lksb = &(lockres->l_lksb); + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "lksb status value of %u on lockres %s\n", + lksb->status, lockres->l_name); + spin_unlock(&lockres->l_lock); + return; + } + + /* we're downconverting.
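+ * (l_requested < l_level). If the level we are dropping to is + * compatible with what the blocking node asked for - for example an + * LKM_EXMODE -> LKM_NLMODE downconvert while another node blocks at + * LKM_EXMODE - then the BLOCKED state can be cleared here as well.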
*/ + if (lockres->l_requested < lockres->l_level) { + if (lockres->l_requested <= + user_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = LKM_NLMODE; + lockres->l_flags &= ~USER_LOCK_BLOCKED; + } + } + + lockres->l_level = lockres->l_requested; + lockres->l_requested = LKM_IVMODE; + lockres->l_flags |= USER_LOCK_ATTACHED; + lockres->l_flags &= ~USER_LOCK_BUSY; + + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + if (!igrab(inode)) + BUG(); +} + +static void user_dlm_unblock_lock(void *opaque); + +static void __user_dlm_queue_lockres(struct user_lock_res *lockres) +{ + if (!(lockres->l_flags & USER_LOCK_QUEUED)) { + user_dlm_grab_inode_ref(lockres); + + INIT_WORK(&lockres->l_work, user_dlm_unblock_lock, + lockres); + + queue_work(user_dlm_worker, &lockres->l_work); + lockres->l_flags |= USER_LOCK_QUEUED; + } +} + +static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) +{ + int queue = 0; + + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) + return; + + switch (lockres->l_blocking) { + case LKM_EXMODE: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + queue = 1; + break; + case LKM_PRMODE: + if (!lockres->l_ex_holders) + queue = 1; + break; + default: + BUG(); + } + + if (queue) + __user_dlm_queue_lockres(lockres); +} + +static void user_bast(void *opaque, int level) +{ + struct user_lock_res *lockres = opaque; + + mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n", + lockres->l_name, level); + + spin_lock(&lockres->l_lock); + lockres->l_flags |= USER_LOCK_BLOCKED; + if (level > lockres->l_blocking) + lockres->l_blocking = level; + + __user_dlm_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static void user_unlock_ast(void *opaque, enum dlm_status status) +{ + struct user_lock_res *lockres = opaque; + + mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); + + if (status != DLM_NORMAL) + mlog(ML_ERROR, "Dlm returns status %d\n", status); + + spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + lockres->l_level = LKM_IVMODE; + else { + lockres->l_requested = LKM_IVMODE; /* cancel an + * upconvert + * request. */ + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + /* we want the unblock thread to look at it again + * now. */ + __user_dlm_queue_lockres(lockres); + } + + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + iput(inode); +} + +static void user_dlm_unblock_lock(void *opaque) +{ + int new_level, status; + struct user_lock_res *lockres = (struct user_lock_res *) opaque; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + mlog(0, "processing lockres %s\n", lockres->l_name); + + spin_lock(&lockres->l_lock); + + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + + /* notice that we don't clear USER_LOCK_BLOCKED here. That's + * for user_ast to do. 
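+ * It fires once the downconvert we may issue at the bottom of this + * function completes, and wakes anyone waiting on l_event.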
*/ + lockres->l_flags &= ~USER_LOCK_QUEUED; + + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "lock is in teardown so we do nothing\n"); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_BUSY) { + mlog(0, "BUSY flag detected...\n"); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + lockres->l_flags |= USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + + status = dlmunlock(dlm, + &lockres->l_lksb, + LKM_CANCEL, + user_unlock_ast, + lockres); + if (status == DLM_CANCELGRANT) { + /* If we got this, then the ast was fired + * before we could cancel. We clean up our + * state and restart the function. */ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + } else if (status != DLM_NORMAL) + user_log_dlm_error("dlmunlock", status, lockres); + goto drop_ref; + } + + /* If there are still incompat holders, we can exit safely + * without worrying about re-queueing this lock as that will + * happen on the last call to user_dlm_cluster_unlock. */ + if ((lockres->l_blocking == LKM_EXMODE) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock(&lockres->l_lock); + mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", + lockres->l_ro_holders, lockres->l_ex_holders); + goto drop_ref; + } + + if ((lockres->l_blocking == LKM_PRMODE) + && lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(0, "can't downconvert for pr: ex = %u\n", + lockres->l_ex_holders); + goto drop_ref; + } + + /* yay, we can downconvert now. */ + new_level = user_highest_compat_lock_level(lockres->l_blocking); + lockres->l_requested = new_level; + lockres->l_flags |= USER_LOCK_BUSY; + mlog(0, "Downconvert lock from %d to %d\n", + lockres->l_level, new_level); + spin_unlock(&lockres->l_lock); + + /* need lock downconvert request now... */ + status = dlmlock(dlm, + new_level, + &lockres->l_lksb, + LKM_CONVERT|LKM_VALBLK, + lockres->l_name, + user_ast, + lockres, + user_bast); + if (status != DLM_NORMAL) { + user_log_dlm_error("dlmlock", status, lockres); + user_recover_from_dlm_error(lockres); + } + +drop_ref: + user_dlm_drop_inode_ref(lockres); +} + +static inline void user_dlm_inc_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case LKM_EXMODE: + lockres->l_ex_holders++; + break; + case LKM_PRMODE: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int +user_may_continue_on_blocked_lock(struct user_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + + return wanted <= user_highest_compat_lock_level(lockres->l_blocking); +} + +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags) +{ + int status, local_flags; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + if (level != LKM_EXMODE && + level != LKM_PRMODE) { + mlog(ML_ERROR, "lockres %s: invalid request!\n", + lockres->l_name); + status = -EINVAL; + goto bail; + } + + mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n", + lockres->l_name, + (level == LKM_EXMODE) ?
"LKM_EXMODE" : "LKM_PRMODE", + lkm_flags); + +again: + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + spin_lock(&lockres->l_lock); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if ((lockres->l_flags & USER_LOCK_BUSY) && + (level > lockres->l_level)) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + goto again; + } + + if ((lockres->l_flags & USER_LOCK_BLOCKED) && + (!user_may_continue_on_blocked_lock(lockres, level))) { + /* is the lock is currently blocked on behalf of + * another node */ + spin_unlock(&lockres->l_lock); + + user_wait_on_blocked_lock(lockres); + goto again; + } + + if (level > lockres->l_level) { + local_flags = lkm_flags | LKM_VALBLK; + if (lockres->l_level != LKM_IVMODE) + local_flags |= LKM_CONVERT; + + lockres->l_requested = level; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + BUG_ON(level == LKM_IVMODE); + BUG_ON(level == LKM_NLMODE); + + mlog(0, "lock %s, get lock from %d to level = %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + status = dlmlock(dlm, + level, + &lockres->l_lksb, + local_flags, + lockres->l_name, + user_ast, + lockres, + user_bast); + if (status != DLM_NORMAL) { + if ((lkm_flags & LKM_NOQUEUE) && + (status == DLM_NOTQUEUED)) + status = -EAGAIN; + else { + user_log_dlm_error("dlmlock", status, lockres); + status = -EINVAL; + } + user_recover_from_dlm_error(lockres); + goto bail; + } + + mlog(0, "lock %s, successfull return from dlmlock\n", + lockres->l_name); + + user_wait_on_busy_lock(lockres); + goto again; + } + + user_dlm_inc_holders(lockres, level); + spin_unlock(&lockres->l_lock); + + mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name, + (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE"); + + status = 0; +bail: + return status; +} + +static inline void user_dlm_dec_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case LKM_EXMODE: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case LKM_PRMODE: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level) +{ + if (level != LKM_EXMODE && + level != LKM_PRMODE) { + mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name); + return; + } + + mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name, + (level == LKM_EXMODE) ? 
"LKM_EXMODE" : "LKM_PRMODE"); + + spin_lock(&lockres->l_lock); + user_dlm_dec_holders(lockres, level); + __user_dlm_cond_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); +} + +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb = lockres->l_lksb.lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < LKM_EXMODE); + memcpy(lvb, val, len); + + spin_unlock(&lockres->l_lock); +} + +void user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb = lockres->l_lksb.lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < LKM_PRMODE); + memcpy(val, lvb, len); + + spin_unlock(&lockres->l_lock); +} + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry) +{ + memset(lockres, 0, sizeof(*lockres)); + + spin_lock_init(&lockres->l_lock); + init_waitqueue_head(&lockres->l_event); + lockres->l_level = LKM_IVMODE; + lockres->l_requested = LKM_IVMODE; + lockres->l_blocking = LKM_IVMODE; + + /* should have been checked before getting here. */ + BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); + + memcpy(lockres->l_name, + dentry->d_name.name, + dentry->d_name.len); +} + +int user_dlm_destroy_lock(struct user_lock_res *lockres) +{ + int status = -EBUSY; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + mlog(0, "asked to destroy %s\n", lockres->l_name); + + spin_lock(&lockres->l_lock); + while (lockres->l_flags & USER_LOCK_BUSY) { + spin_unlock(&lockres->l_lock); + + mlog(0, "lock %s is busy\n", lockres->l_name); + + user_wait_on_busy_lock(lockres); + + spin_lock(&lockres->l_lock); + } + + if (lockres->l_ro_holders || lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(0, "lock %s has holders\n", lockres->l_name); + goto bail; + } + + status = 0; + if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + spin_unlock(&lockres->l_lock); + mlog(0, "lock %s is not attached\n", lockres->l_name); + goto bail; + } + + lockres->l_flags &= ~USER_LOCK_ATTACHED; + lockres->l_flags |= USER_LOCK_BUSY; + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + spin_unlock(&lockres->l_lock); + + mlog(0, "unlocking lockres %s\n", lockres->l_name); + status = dlmunlock(dlm, + &lockres->l_lksb, + LKM_VALBLK, + user_unlock_ast, + lockres); + if (status != DLM_NORMAL) { + user_log_dlm_error("dlmunlock", status, lockres); + status = -EINVAL; + goto bail; + } + + user_wait_on_busy_lock(lockres); + + status = 0; +bail: + return status; +} + +struct dlm_ctxt *user_dlm_register_context(struct qstr *name) +{ + struct dlm_ctxt *dlm; + u32 dlm_key; + char *domain; + + domain = kmalloc(name->len + 1, GFP_KERNEL); + if (!domain) { + mlog_errno(-ENOMEM); + return ERR_PTR(-ENOMEM); + } + + dlm_key = crc32_le(0, name->name, name->len); + + snprintf(domain, name->len + 1, "%.*s", name->len, name->name); + + dlm = dlm_register_domain(domain, dlm_key); + if (IS_ERR(dlm)) + mlog_errno(PTR_ERR(dlm)); + + kfree(domain); + return dlm; +} + +void user_dlm_unregister_context(struct dlm_ctxt *dlm) +{ + dlm_unregister_domain(dlm); +} diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h new file mode 100644 index 000000000000..04178bc40b76 --- /dev/null +++ b/fs/ocfs2/dlm/userdlm.h @@ -0,0 +1,111 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.h + * + * Userspace 
dlm defines + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef USERDLM_H +#define USERDLM_H + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +/* user_lock_res->l_flags flags. */ +#define USER_LOCK_ATTACHED (0x00000001) /* have we initialized + * the lvb */ +#define USER_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently + * destroying this + * lock. */ +#define USER_LOCK_QUEUED (0x00000010) /* lock is on the + * workqueue */ +#define USER_LOCK_IN_CANCEL (0x00000020) + +struct user_lock_res { + spinlock_t l_lock; + + int l_flags; + +#define USER_DLM_LOCK_ID_MAX_LEN 32 + char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct dlm_lockstatus l_lksb; + + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct work_struct l_work; +}; + +extern struct workqueue_struct *user_dlm_worker; + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry); +int user_dlm_destroy_lock(struct user_lock_res *lockres); +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags); +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level); +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len); +void user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct dlm_ctxt *user_dlm_register_context(struct qstr *name); +void user_dlm_unregister_context(struct dlm_ctxt *dlm); + +struct dlmfs_inode_private { + struct dlm_ctxt *ip_dlm; + + struct user_lock_res ip_lockres; /* unused for directories. */ + struct inode *ip_parent; + + struct inode ip_vfs_inode; +}; + +static inline struct dlmfs_inode_private * +DLMFS_I(struct inode *inode) +{ + return container_of(inode, + struct dlmfs_inode_private, + ip_vfs_inode); +} + +struct dlmfs_filp_private { + int fp_lock_level; +}; + +#define DLMFS_MAGIC 0x76a9f425 + +#endif /* USERDLM_H */ -- cgit v1.2.3 From ccd979bdbce9fba8412beb3f1de68a9d0171b12c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 15 Dec 2005 14:31:24 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem The OCFS2 file system module.
Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/ocfs2.txt | 55 + MAINTAINERS | 9 + fs/ocfs2/Makefile | 33 + fs/ocfs2/alloc.c | 2040 ++++++++++++++++++++++++ fs/ocfs2/alloc.h | 82 + fs/ocfs2/aops.c | 643 ++++++++ fs/ocfs2/aops.h | 41 + fs/ocfs2/buffer_head_io.c | 232 +++ fs/ocfs2/buffer_head_io.h | 73 + fs/ocfs2/dcache.c | 91 ++ fs/ocfs2/dcache.h | 31 + fs/ocfs2/dir.c | 618 ++++++++ fs/ocfs2/dir.h | 54 + fs/ocfs2/dlmglue.c | 2904 +++++++++++++++++++++++++++++++++++ fs/ocfs2/dlmglue.h | 111 ++ fs/ocfs2/endian.h | 45 + fs/ocfs2/export.c | 248 +++ fs/ocfs2/export.h | 31 + fs/ocfs2/extent_map.c | 994 ++++++++++++ fs/ocfs2/extent_map.h | 46 + fs/ocfs2/file.c | 1237 +++++++++++++++ fs/ocfs2/file.h | 57 + fs/ocfs2/heartbeat.c | 378 +++++ fs/ocfs2/heartbeat.h | 67 + fs/ocfs2/inode.c | 1140 ++++++++++++++ fs/ocfs2/inode.h | 145 ++ fs/ocfs2/journal.c | 1652 ++++++++++++++++++++ fs/ocfs2/journal.h | 457 ++++++ fs/ocfs2/localalloc.c | 983 ++++++++++++ fs/ocfs2/localalloc.h | 56 + fs/ocfs2/mmap.c | 102 ++ fs/ocfs2/mmap.h | 6 + fs/ocfs2/namei.c | 2264 +++++++++++++++++++++++++++ fs/ocfs2/namei.h | 58 + fs/ocfs2/ocfs1_fs_compat.h | 109 ++ fs/ocfs2/ocfs2.h | 464 ++++++ fs/ocfs2/ocfs2_fs.h | 638 ++++++++ fs/ocfs2/ocfs2_lockid.h | 73 + fs/ocfs2/slot_map.c | 303 ++++ fs/ocfs2/slot_map.h | 66 + fs/ocfs2/suballoc.c | 1651 ++++++++++++++++++++ fs/ocfs2/suballoc.h | 132 ++ fs/ocfs2/super.c | 1733 +++++++++++++++++++++ fs/ocfs2/super.h | 44 + fs/ocfs2/symlink.c | 180 +++ fs/ocfs2/symlink.h | 42 + fs/ocfs2/sysfile.c | 131 ++ fs/ocfs2/sysfile.h | 33 + fs/ocfs2/uptodate.c | 544 +++++++ fs/ocfs2/uptodate.h | 44 + fs/ocfs2/ver.c | 43 + fs/ocfs2/ver.h | 31 + fs/ocfs2/vote.c | 1202 +++++++++++++++ fs/ocfs2/vote.h | 56 + 55 files changed, 24504 insertions(+) create mode 100644 Documentation/filesystems/ocfs2.txt create mode 100644 fs/ocfs2/Makefile create mode 100644 fs/ocfs2/alloc.c create mode 100644 fs/ocfs2/alloc.h create mode 100644 fs/ocfs2/aops.c create mode 100644 fs/ocfs2/aops.h create mode 100644 fs/ocfs2/buffer_head_io.c create mode 100644 fs/ocfs2/buffer_head_io.h create mode 100644 fs/ocfs2/dcache.c create mode 100644 fs/ocfs2/dcache.h create mode 100644 fs/ocfs2/dir.c create mode 100644 fs/ocfs2/dir.h create mode 100644 fs/ocfs2/dlmglue.c create mode 100644 fs/ocfs2/dlmglue.h create mode 100644 fs/ocfs2/endian.h create mode 100644 fs/ocfs2/export.c create mode 100644 fs/ocfs2/export.h create mode 100644 fs/ocfs2/extent_map.c create mode 100644 fs/ocfs2/extent_map.h create mode 100644 fs/ocfs2/file.c create mode 100644 fs/ocfs2/file.h create mode 100644 fs/ocfs2/heartbeat.c create mode 100644 fs/ocfs2/heartbeat.h create mode 100644 fs/ocfs2/inode.c create mode 100644 fs/ocfs2/inode.h create mode 100644 fs/ocfs2/journal.c create mode 100644 fs/ocfs2/journal.h create mode 100644 fs/ocfs2/localalloc.c create mode 100644 fs/ocfs2/localalloc.h create mode 100644 fs/ocfs2/mmap.c create mode 100644 fs/ocfs2/mmap.h create mode 100644 fs/ocfs2/namei.c create mode 100644 fs/ocfs2/namei.h create mode 100644 fs/ocfs2/ocfs1_fs_compat.h create mode 100644 fs/ocfs2/ocfs2.h create mode 100644 fs/ocfs2/ocfs2_fs.h create mode 100644 fs/ocfs2/ocfs2_lockid.h create mode 100644 fs/ocfs2/slot_map.c create mode 100644 fs/ocfs2/slot_map.h create mode 100644 fs/ocfs2/suballoc.c create mode 100644 fs/ocfs2/suballoc.h create mode 100644 fs/ocfs2/super.c create mode 100644 fs/ocfs2/super.h create mode 100644 fs/ocfs2/symlink.c create mode 100644 
fs/ocfs2/symlink.h create mode 100644 fs/ocfs2/sysfile.c create mode 100644 fs/ocfs2/sysfile.h create mode 100644 fs/ocfs2/uptodate.c create mode 100644 fs/ocfs2/uptodate.h create mode 100644 fs/ocfs2/ver.c create mode 100644 fs/ocfs2/ver.h create mode 100644 fs/ocfs2/vote.c create mode 100644 fs/ocfs2/vote.h (limited to 'fs') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index d9b0a0691866..2580ada100a0 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -36,6 +36,8 @@ ntfs.txt - info and mount options for the NTFS filesystem (Windows NT). +ocfs2.txt + - info and mount options for the OCFS2 clustered filesystem. proc.txt - info on Linux's /proc filesystem. romfs.txt - Description of the ROMFS filesystem. smbfs.txt diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt new file mode 100644 index 000000000000..f2595caf052e --- /dev/null +++ b/Documentation/filesystems/ocfs2.txt @@ -0,0 +1,55 @@ +OCFS2 filesystem +================ +OCFS2 is a general-purpose, extent-based, shared-disk cluster file +system with many similarities to ext3. It supports 64-bit inode +numbers, and has automatically extending metadata groups which may +also make it attractive for non-clustered use. + +You'll want to install the ocfs2-tools package in order to at least +get "mount.ocfs2" and "ocfs2_hb_ctl". + +Project web page: http://oss.oracle.com/projects/ocfs2 +Tools web page: http://oss.oracle.com/projects/ocfs2-tools +OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + +All code copyright 2005 Oracle except when otherwise noted. + +CREDITS: +Lots of code taken from ext3 and other projects. + +Authors in alphabetical order: +Joel Becker +Zach Brown +Mark Fasheh +Kurt Hackel +Sunil Mushran +Manish Singh + +Caveats +======= +Features which OCFS2 does not support yet: + - sparse files + - extended attributes + - shared writeable mmap + - loopback is supported, but data written will not + be cluster coherent. + - quotas + - cluster aware flock + - Directory change notification (F_NOTIFY) + - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) + - POSIX ACLs + - readpages / writepages (not user visible) + +Mount options +============= + +OCFS2 supports the following mount options: +(*) == default + +barrier=1 This enables/disables barriers: barrier=0 disables them, + barrier=1 enables them. +errors=remount-ro(*) Remount the filesystem read-only on an error. +errors=panic Panic and halt the machine if an error occurs. +intr (*) Allow signals to interrupt cluster operations. +nointr Do not allow signals to interrupt cluster + operations.
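+ +For example, to get panic-on-error with uninterruptible cluster +operations (device and mount point are illustrative): + + # mount -t ocfs2 -o errors=panic,nointr /dev/sda1 /mnt/ocfs2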
diff --git a/MAINTAINERS b/MAINTAINERS index 86ee06f43794..15888302025f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com L: linux-nvidia@lists.surfsouth.com S: Maintained +ORACLE CLUSTER FILESYSTEM 2 (OCFS2) +P: Mark Fasheh +M: mark.fasheh@oracle.com +P: Kurt Hackel +M: kurt.hackel@oracle.com +L: ocfs2-devel@oss.oracle.com +W: http://oss.oracle.com/projects/ocfs2/ +S: Supported + OLYMPIC NETWORK DRIVER P: Peter De Shrijver M: p2@ace.ulyssis.student.kuleuven.ac.be diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile new file mode 100644 index 000000000000..7d3be845a614 --- /dev/null +++ b/fs/ocfs2/Makefile @@ -0,0 +1,33 @@ +EXTRA_CFLAGS += -Ifs/ocfs2 + +EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES + +obj-$(CONFIG_OCFS2_FS) += ocfs2.o + +ocfs2-objs := \ + alloc.o \ + aops.o \ + buffer_head_io.o \ + dcache.o \ + dir.o \ + dlmglue.o \ + export.o \ + extent_map.o \ + file.o \ + heartbeat.o \ + inode.o \ + journal.o \ + localalloc.o \ + mmap.o \ + namei.o \ + slot_map.o \ + suballoc.o \ + super.o \ + symlink.o \ + sysfile.o \ + uptodate.o \ + ver.o \ + vote.o + +obj-$(CONFIG_OCFS2_FS) += cluster/ +obj-$(CONFIG_OCFS2_FS) += dlm/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 000000000000..465f797451ee --- /dev/null +++ b/fs/ocfs2/alloc.c @@ -0,0 +1,2040 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno); + +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]); + +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac); + +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh); + +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters); + +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh); + +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + unsigned int new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb); + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + return blkno == (le64_to_cpu(ext->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(ext->e_clusters))); +} + +/* + * How many free extents have we got before we need more meta data?
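+ * + * The answer is l_count - l_next_free_rec of the rightmost leaf (or of + * the dinode's own list when there is no tree). For example, a leaf + * with l_count = 252 and l_next_free_rec = 250 can take two more + * records before another branch must be added.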
+ */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe) +{ + int retval; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + retval = -EIO; + goto bail; + } + + if (fe->i_last_eb_blk) { + retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &eb_bh, OCFS2_BH_CACHED, inode); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + if (eb_bh) + brelse(eb_bh); + + mlog_exit(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_extent_block *eb; + + mlog_entry_void(); + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(osb, + handle, + meta_ac, + wanted - count, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + + status = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + /* we always use slot zero's suballocator */ + eb->h_suballoc_slot = 0; +#else + eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); +#endif + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + status = ocfs2_journal_dirty(handle, bhs[i]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + if (bhs[i]) + brelse(bhs[i]); + bhs[i] = NULL; + } + } + mlog_exit(status); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the dinode + * structure. + * + * last_eb_bh is required as we have to update its next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with e_clusters == 0.
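+ * + * For an l_tree_depth of N this allocates a chain of N new blocks, one + * per level; the depth-0 block at the bottom of the chain becomes the + * tree's new rightmost leaf.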
+ */ +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + BUG_ON(!last_eb_bh); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + /* we never add a branch to a leaf. */ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversely, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. */ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + eb_el = &eb->h_list; + + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + eb_el->l_recs[0].e_cpos = fe->i_clusters; + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the fe, or the extent block passed in).
*/ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = fe->i_clusters; + el->l_recs[i].e_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) + mlog_errno(status); + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (eb_bh) { + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + if (new_eb_bhs[i]) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. + */ +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *eb_el; + + mlog_entry_void(); + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + + eb_el = &eb->h_list; + fe = (struct ocfs2_dinode *) fe_bh->b_data; + fe_el = &fe->id2.i_list; + + status = ocfs2_journal_access(handle, inode, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the fe data into the new extent block */ + eb_el->l_tree_depth = fe_el->l_tree_depth; + eb_el->l_next_free_rec = fe_el->l_next_free_rec; + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; + eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; + eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; + } + + status = ocfs2_journal_dirty(handle, new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* update fe now */ + le16_add_cpu(&fe_el->l_tree_depth, 1); + fe_el->l_recs[0].e_cpos = 0; + fe_el->l_recs[0].e_blkno = eb->h_blkno; + fe_el->l_recs[0].e_clusters = fe->i_clusters; + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + fe_el->l_recs[i].e_cpos = 0; + fe_el->l_recs[i].e_clusters = 0; + fe_el->l_recs[i].e_blkno = 0; + } + fe_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (fe_el->l_tree_depth == cpu_to_le16(1)) + fe->i_last_eb_blk = eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + if (new_eb_bh) + 
brelse(new_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Expects the tree to already have room in the rightmost leaf for the + * extent. Updates all the extent blocks (and the dinode) on the way + * down. + */ +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters) +{ + int status, i, num_bhs = 0; + u64 next_blkno; + u16 next_free; + struct buffer_head **eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + if (el->l_tree_depth) { + /* This is another operation where we want to be + * careful about our tree updates. An error here means + * none of the previous changes we made should roll + * forward. As a result, we have to record the buffers + * for this part of the tree in an array and reserve a + * journal write to them before making any changes. */ + num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); + eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + i = 0; + while(el->l_tree_depth) { + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); + + BUG_ON(i >= num_bhs); + status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, + eb); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, inode, eb_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + el = &eb->h_list; + i++; + /* When we leave this loop, eb_bhs[num_bhs - 1] will + * hold the bottom-most leaf extent block. */ + } + BUG_ON(el->l_tree_depth); + + el = &fe->id2.i_list; + /* If we have tree depth, then the fe update is + * trivial, and we want to switch el out for the + * bottom-most leaf in order to update it with the + * actual extent data below. */ + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + /* (num_bhs - 1) to avoid the leaf */ + for(i = 0; i < (num_bhs - 1); i++) { + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + el = &eb->h_list; + + /* finally, make our actual change to the + * intermediate extent blocks. 
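+ * Each such block simply grows the e_clusters of its last used record - + * appending, say, 8 clusters bumps that record by 8 at every level + * above the leaf.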
*/ + next_free = le16_to_cpu(el->l_next_free_rec); + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + + status = ocfs2_journal_dirty(handle, eb_bhs[i]); + if (status < 0) + mlog_errno(status); + } + BUG_ON(i != (num_bhs - 1)); + /* note that the leaf block wasn't touched in + * the loop above */ + eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; + el = &eb->h_list; + BUG_ON(el->l_tree_depth); + } + + /* yay, we can finally add the actual extent now! */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) && + ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { + le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); + } else if (le16_to_cpu(el->l_next_free_rec) && + (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { + /* having an empty extent at eof is legal. */ + if (el->l_recs[i].e_cpos != fe->i_clusters) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" trailing extent is bad: " + "cpos (%u) != number of clusters (%u)", + OCFS2_I(inode)->ip_blkno, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(fe->i_clusters)); + status = -EIO; + goto bail; + } + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + } else { + /* No contiguous record, or no empty record at eof, so + * we add a new one. */ + + BUG_ON(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count)); + i = le16_to_cpu(el->l_next_free_rec); + + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + el->l_recs[i].e_cpos = fe->i_clusters; + le16_add_cpu(&el->l_next_free_rec, 1); + } + + /* + * extent_map errors are not fatal, so they are ignored outside + * of flushing the thing. + */ + status = ocfs2_extent_map_append(inode, &el->l_recs[i], + new_clusters); + if (status) { + mlog_errno(status); + ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); + } + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (fe->id2.i_list.l_tree_depth) { + status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (eb_bhs) { + for (i = 0; i < num_bhs; i++) + if (eb_bhs[i]) + brelse(eb_bhs[i]); + kfree(eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the dinode has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the dinode is full, in + * which case we return > 0 + * + * return status < 0 indicates an error.
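+ * + * The '> 0' case is what makes ocfs2_insert_extent() below grow the + * tree via ocfs2_shift_tree_depth() before adding the new branch.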
+ */ +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + mlog_entry_void(); + + *target_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty " + "extent list (next_free_rec == 0)", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent " + "list where extent # %d has no physical " + "block start", + OCFS2_I(inode)->ip_blkno, i); + status = -EIO; + goto bail; + } + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + if (lowest_bh) + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + if (!lowest_bh + && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +/* the caller needs to update fe->i_clusters */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, i, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + mlog(0, "add %u clusters starting at block %"MLFu64" to " + "inode %"MLFu64"\n", + new_clusters, start_blk, OCFS2_I(inode)->ip_blkno); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + if (el->l_tree_depth) { + /* jump to end of tree */ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_exit(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } + + /* Can we allocate without adding/shifting tree bits? 
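+ * Yes, if the rightmost leaf still has a free record slot, holds an + * empty record at eof, or if the new range is contiguous with its last + * record - exactly the cases tested below.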
*/ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) == 0 + || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) + || le32_to_cpu(el->l_recs[i].e_clusters) == 0 + || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) + goto out_add; + + mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " + "tree now.\n"); + + shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); + if (shift < 0) { + status = shift; + mlog_errno(status); + goto bail; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + /* if we hit a leaf, we'd better be empty :) */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) != + le16_to_cpu(el->l_count)); + BUG_ON(bh); + mlog(0, "ocfs2_allocate_extent: need to shift tree depth " + "(current = %u)\n", + le16_to_cpu(fe->id2.i_list.l_tree_depth)); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). */ + status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, + meta_ac, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* Special case: we have room now if we shifted from + * tree_depth 0 */ + if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + goto out_add; + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, + meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +out_add: + /* Finally, we can add clusters. */ + status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, + start_blk, new_clusters); + if (status < 0) + mlog_errno(status); + +bail: + if (bh) + brelse(bh); + + if (last_eb_bh) + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +static int ocfs2_truncate_log_append(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, + num_clusters); + + BUG_ON(!down_trylock(&tl_inode->i_sem)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = 
(struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%"MLFu64" invalid (" + "wanted %u, actual %u)\n", OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Log truncate of %u clusters starting at cluster %u to " + "%"MLFu64" (index = %d)\n", num_clusters, start_cluster, + OCFS2_I(tl_inode)->ip_blkno, index); + + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. + * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + mlog_entry_void(); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tl->tl_used = cpu_to_le16(i); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. */ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid.
*/ + if (start_blk) { + mlog(0, "free record %d, start = %u, clusters = %u\n", + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + +bail: + mlog_exit(status); + return status; +} + +/* Expects you to already be holding tl_inode->i_sem */ +static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + BUG_ON(!down_trylock(&tl_inode->i_sem)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + num_to_flush = le16_to_cpu(tl->tl_used); + mlog(0, "Flush %u records from truncate log #%"MLFu64"\n", + num_to_flush, OCFS2_I(tl_inode)->ip_blkno); + if (!num_to_flush) { + status = 0; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + + ocfs2_handle_add_inode(handle, data_alloc_inode); + status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (data_alloc_inode) + iput(data_alloc_inode); + + if (data_alloc_bh) + brelse(data_alloc_bh); + + mlog_exit(status); + return status; +} + +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + down(&tl_inode->i_sem); + status = __ocfs2_flush_truncate_log(osb); + up(&tl_inode->i_sem); + + return status; +} + +static void ocfs2_truncate_log_worker(void *data) +{ + int status; + struct ocfs2_super *osb = data; + + mlog_entry_void(); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode) { + /* We want to push off log flushes while truncates are + * still running. 
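When cancel is set, any pending work item is knocked out first, so + * the re-queue below pushes the next flush a full interval out.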
*/ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + mlog_exit(status); + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, *tl_copy is set to + * NULL. */ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + *tl_copy = NULL; + + mlog(0, "recover truncate log from slot %d\n", slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); + status = -EIO; + goto bail; + } + + if (le16_to_cpu(tl->tl_used)) { + mlog(0, "We'll have %u logs to recover\n", + le16_to_cpu(tl->tl_used)); + + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. 
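Anything past tl_used is dead space which later appends simply + * overwrite.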
*/ + tl->tl_used = 0; + + status = ocfs2_write_block(osb, tl_bh, tl_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + if (tl_bh) + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + } + + mlog_exit(status); + return status; +} + +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + struct ocfs2_journal_handle *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + return -EINVAL; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, + tl_copy->i_blkno); + + down(&tl_inode->i_sem); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + ocfs2_commit_trans(handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + up(&tl_inode->i_sem); + + mlog_exit(status); + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + if (tl_inode) { + cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } + + mlog_exit_void(); +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + + mlog_entry_void(); + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. 
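(If the lookup above failed, tl_inode and tl_bh are still NULL, + * and the shutdown path will simply do nothing.)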
*/ + INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + mlog_exit(status); + return status; +} + +/* This function will figure out whether the currently last extent + * block will be deleted, and if it will, what the new last extent + * block will be so we can update its h_next_leaf_blk field, as well + * as the dinode's i_last_eb_blk */ +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + u32 new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb) +{ + int i, status = 0; + u64 block = 0; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + + *new_last_eb = NULL; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + /* we have no tree, so of course, no last_eb. */ + if (!fe->id2.i_list.l_tree_depth) + goto bail; + + /* trunc to zero special case - this makes tree_depth = 0 + * regardless of what it is. */ + if (!new_i_clusters) + goto bail; + + eb = (struct ocfs2_extent_block *) old_last_eb->b_data; + el = &(eb->h_list); + BUG_ON(!el->l_next_free_rec); + + /* Make sure that this guy will actually be empty after we + * clear away the data. */ + if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) + goto bail; + + /* Ok, at this point, we know that last_eb will definitely + * change, so let's traverse the tree and find the second to + * last extent block. */ + el = &(fe->id2.i_list); + /* go down the tree, */ + do { + for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { + if (le32_to_cpu(el->l_recs[i].e_cpos) < + new_i_clusters) { + block = le64_to_cpu(el->l_recs[i].e_blkno); + break; + } + } + BUG_ON(i < 0); + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + } while (el->l_tree_depth); + + *new_last_eb = bh; + get_bh(*new_last_eb); + mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno)); +bail: + if (bh) + brelse(bh); + + return status; +} + +static int ocfs2_do_truncate(struct ocfs2_super *osb, + unsigned int clusters_to_del, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *old_last_eb_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_truncate_context *tc) +{ + int status, i, depth; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_block *last_eb = NULL; + struct ocfs2_extent_list *el; + struct buffer_head *eb_bh = NULL; + struct buffer_head *last_eb_bh = NULL; + u64 next_eb = 0; + u64 delete_blk = 0; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + status = ocfs2_find_new_last_ext_blk(osb, + inode, + fe, + le32_to_cpu(fe->i_clusters) - + clusters_to_del, + old_last_eb_bh, + &last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (last_eb_bh) + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + el = &(fe->id2.i_list); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 
+ clusters_to_del; + spin_unlock(&OCFS2_I(inode)->ip_lock); + le32_add_cpu(&fe->i_clusters, -clusters_to_del); + fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + /* tree depth zero, we can just delete the clusters, otherwise + * we need to record the offset of the next level extent block + * as we may overwrite it. */ + if (!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + else + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + + if (!el->l_recs[i].e_clusters) { + /* if we deleted the whole extent record, then clear + * out the other fields and update the extent + * list. For depth > 0 trees, we've already recorded + * the extent block in 'next_eb' */ + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + + depth = le16_to_cpu(el->l_tree_depth); + if (!fe->i_clusters) { + /* trunc to zero is a special case. */ + el->l_tree_depth = 0; + fe->i_last_eb_blk = 0; + } else if (last_eb) + fe->i_last_eb_blk = last_eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (last_eb) { + /* If there will be a new last extent block, then by + * definition, there cannot be any leaves to the right of + * him. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + last_eb->h_next_leaf_blk = 0; + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* if our tree depth > 0, update all the tree blocks below us. 
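Truncate only ever trims the tail of the file, so at each level + * it is the rightmost record that gets modified.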
*/ + while (depth) { + mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n", + depth, next_eb); + status = ocfs2_read_block(osb, next_eb, &eb_bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + mlog(0, "extent block %"MLFu64", before: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + /* bottom-most block requires us to delete data.*/ + if (!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + if (!el->l_recs[i].e_clusters) { + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + mlog(0, "extent block %"MLFu64", after: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!el->l_next_free_rec) { + mlog(0, "deleting this extent block.\n"); + + ocfs2_remove_from_cache(inode, eb_bh); + + BUG_ON(eb->h_suballoc_slot); + BUG_ON(el->l_recs[0].e_clusters); + BUG_ON(el->l_recs[0].e_cpos); + BUG_ON(el->l_recs[0].e_blkno); + status = ocfs2_free_extent_block(handle, + tc->tc_ext_alloc_inode, + tc->tc_ext_alloc_bh, + eb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + brelse(eb_bh); + eb_bh = NULL; + depth--; + } + + BUG_ON(!delete_blk); + status = ocfs2_truncate_log_append(osb, handle, delete_blk, + clusters_to_del); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (!status) + ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); + else + ocfs2_extent_map_drop(inode, 0); + mlog_exit(status); + return status; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. 
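Clusters are then freed in passes, at most one extent record per + * transaction, looping until i_clusters reaches the new size.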
+ * + * WARNING: This will kfree the truncate context + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc) +{ + int status, i, credits, tl_sem = 0; + u32 clusters_to_del, target_i_clusters; + u64 last_eb = 0; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + last_eb_bh = tc->tc_last_eb_bh; + tc->tc_last_eb_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (fe->id2.i_list.l_tree_depth) { + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + last_eb = le64_to_cpu(fe->i_last_eb_blk); +start: + mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " + "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", " + "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", + le32_to_cpu(fe->i_clusters), last_eb, + le64_to_cpu(fe->i_last_eb_blk), + le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); + + if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { + mlog(0, "last_eb changed!\n"); + BUG_ON(!fe->id2.i_list.l_tree_depth); + last_eb = le64_to_cpu(fe->i_last_eb_blk); + /* i_last_eb_blk may have changed, read it if + * necessary. We don't have to worry about the + * truncate to zero case here (where there becomes no + * last_eb) because we never loop back after our work + * is done. */ + if (last_eb_bh) { + brelse(last_eb_bh); + last_eb_bh = NULL; + } + + status = ocfs2_read_block(osb, last_eb, + &last_eb_bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + } + + /* by now, el will point to the extent list on the bottom most + * portion of this tree. */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) + clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); + else + clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + + le32_to_cpu(el->l_recs[i].e_cpos)) - + target_i_clusters; + + mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + + down(&tl_inode->i_sem); + tl_sem = 1; + /* ocfs2_truncate_log_needs_flush guarantees us at least one + * record is free for use. If there isn't any, we flush to get + * an empty truncate log. 
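Doing this before the transaction starts means the + * ocfs2_truncate_log_append() below can't hit -ENOSPC + * mid-transaction.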
*/ + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + fe, el); + handle = ocfs2_start_trans(osb, NULL, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) + mlog_errno(status); + + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, + last_eb_bh, handle, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + up(&tl_inode->i_sem); + tl_sem = 0; + + ocfs2_commit_trans(handle); + handle = NULL; + + BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); + if (le32_to_cpu(fe->i_clusters) > target_i_clusters) + goto start; +bail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_schedule_truncate_log_flush(osb, 1); + + if (tl_sem) + up(&tl_inode->i_sem); + + if (handle) + ocfs2_commit_trans(handle); + + if (last_eb_bh) + brelse(last_eb_bh); + + /* This will drop the ext_alloc cluster lock for us */ + ocfs2_free_truncate_context(tc); + + mlog_exit(status); + return status; +} + + +/* + * Expects the inode to already be locked. This will figure out which + * inodes need to be locked and will put them on the returned truncate + * context. + */ +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc) +{ + int status, metadata_delete; + unsigned int new_i_clusters; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh = NULL; + struct inode *ext_alloc_inode = NULL; + struct buffer_head *ext_alloc_bh = NULL; + + mlog_entry_void(); + + *tc = NULL; + + new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" + "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size); + + if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count " + "%u and size %"MLFu64" whereas struct inode has " + "cluster count %u and size %llu which caused an " + "invalid truncate to %u clusters.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->i_clusters), + le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, i_size_read(inode), + new_i_clusters); + mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); + status = -EIO; + goto bail; + } + + *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); + if (!(*tc)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + metadata_delete = 0; + if (fe->id2.i_list.l_tree_depth) { + /* If we have a tree, then the truncate may result in + * metadata deletes. 
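(Only in that case do we need to lock the extent allocator up + * front.)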
Figure this out from the + * rightmost leaf block.*/ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + + brelse(last_eb_bh); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + metadata_delete = 1; + } + + (*tc)->tc_last_eb_bh = last_eb_bh; + + if (metadata_delete) { + mlog(0, "Will have to delete metadata for this trunc. " + "locking allocator.\n"); + ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); + if (!ext_alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + down(&ext_alloc_inode->i_sem); + (*tc)->tc_ext_alloc_inode = ext_alloc_inode; + + status = ocfs2_meta_lock(ext_alloc_inode, + NULL, + &ext_alloc_bh, + 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + (*tc)->tc_ext_alloc_bh = ext_alloc_bh; + (*tc)->tc_ext_alloc_locked = 1; + } + + status = 0; +bail: + if (status < 0) { + if (*tc) + ocfs2_free_truncate_context(*tc); + *tc = NULL; + } + mlog_exit_void(); + return status; +} + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) +{ + if (tc->tc_ext_alloc_inode) { + if (tc->tc_ext_alloc_locked) + ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); + + up(&tc->tc_ext_alloc_inode->i_sem); + iput(tc->tc_ext_alloc_inode); + } + + if (tc->tc_ext_alloc_bh) + brelse(tc->tc_ext_alloc_bh); + + if (tc->tc_last_eb_bh) + brelse(tc->tc_last_eb_bh); + + kfree(tc); +} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 000000000000..12ba897743f4 --- /dev/null +++ b/fs/ocfs2/alloc.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ALLOC_H +#define OCFS2_ALLOC_H + +struct ocfs2_alloc_context; +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac); +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe); +/* how many new metadata chunks would an allocation need at maximum? */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) +{ + /* + * Rather than do all the work of determining how much we need + * (involves a ton of reads and locks), just ask for the + * maximal limit. That's a tree depth shift. 
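(That is the most metadata a single extent insert can ever need.)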
So, one block for + * level of the tree (current l_tree_depth), one block for the + * new tree_depth==0 extent_block, and one block at the new + * top-of-the tree. + */ + return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb); +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel); +int ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy); +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy); + +struct ocfs2_truncate_context { + struct inode *tc_ext_alloc_inode; + struct buffer_head *tc_ext_alloc_bh; + int tc_ext_alloc_locked; /* is it cluster locked? */ + /* these get destroyed once it's passed to ocfs2_commit_truncate. */ + struct buffer_head *tc_last_eb_bh; +}; + +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc); +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc); + +#endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 000000000000..8f4467a930a5 --- /dev/null +++ b/fs/ocfs2/aops.c @@ -0,0 +1,643 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_FILE_IO +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "symlink.h" + +#include "buffer_head_io.h" + +static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int status; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *buffer_cache_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + void *kaddr; + + mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, + (unsigned long long)iblock, bh_result, create); + + BUG_ON(ocfs2_inode_is_fast_symlink(inode)); + + if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { + mlog(ML_ERROR, "block offset > PATH_MAX: %llu", + (unsigned long long)iblock); + goto bail; + } + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + &bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + fe = (struct ocfs2_dinode *) bh->b_data; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", + fe->i_blkno, 7, fe->i_signature); + goto bail; + } + + if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(fe->i_clusters))) { + mlog(ML_ERROR, "block offset is outside the allocated size: " + "%llu\n", (unsigned long long)iblock); + goto bail; + } + + /* We don't use the page cache to create symlink data, so if + * need be, copy it over from the buffer cache. */ + if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { + u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + + iblock; + buffer_cache_bh = sb_getblk(osb->sb, blkno); + if (!buffer_cache_bh) { + mlog(ML_ERROR, "couldn't getblock for symlink!\n"); + goto bail; + } + + /* we haven't locked out transactions, so a commit + * could've happened. Since we've got a reference on + * the bh, even if it commits while we're doing the + * copy, the data is still good. */ + if (buffer_jbd(buffer_cache_bh) + && ocfs2_inode_is_new(inode)) { + kaddr = kmap_atomic(bh_result->b_page, KM_USER0); + if (!kaddr) { + mlog(ML_ERROR, "couldn't kmap!\n"); + goto bail; + } + memcpy(kaddr + (bh_result->b_size * iblock), + buffer_cache_bh->b_data, + bh_result->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh_result); + } + brelse(buffer_cache_bh); + } + + map_bh(bh_result, inode->i_sb, + le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); + + err = 0; + +bail: + if (bh) + brelse(bh); + + mlog_exit(err); + return err; +} + +static int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = 0; + u64 p_blkno, past_eof; + + mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, + (unsigned long long)iblock, bh_result, create); + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + inode, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) { + /* this always does I/O for some reason. */ + err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); + goto bail; + } + + /* this can happen if another node truncs after our extend! 
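ip_clusters gets refreshed under the cluster lock, so a block at + * or past it means the file really has shrunk under us.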
*/ + spin_lock(&OCFS2_I(inode)->ip_lock); + if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters)) + err = -EIO; + spin_unlock(&OCFS2_I(inode)->ip_lock); + if (err) + goto bail; + + err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + NULL); + if (err) { + mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " + "%"MLFu64", NULL)\n", err, inode, + (unsigned long long)iblock, p_blkno); + goto bail; + } + + map_bh(bh_result, inode->i_sb, p_blkno); + + if (bh_result->b_blocknr == 0) { + err = -EIO; + mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" " + "blkno=(%"MLFu64")\n", (unsigned long long)iblock, + p_blkno, OCFS2_I(inode)->ip_blkno); + } + + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof); + + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + +bail: + if (err < 0) + err = -EIO; + + mlog_exit(err); + return err; +} + +static int ocfs2_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + int ret, unlock = 1; + + mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); + + ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * i_size might have just been updated as we grabed the meta lock. We + * might now be discovering a truncate that hit on another node. + * block_read_full_page->get_block freaks out if it is asked to read + * beyond the end of a file, so we check here. Callers + * (generic_file_read, fault->nopage) are clever enough to check i_size + * and notice that the page they just read isn't needed. + * + * XXX sys_readahead() seems to get that wrong? + */ + if (start >= i_size_read(inode)) { + char *addr = kmap(page); + memset(addr, 0, PAGE_SIZE); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + ret = 0; + goto out_alloc; + } + + ret = ocfs2_data_lock_with_page(inode, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out_alloc; + } + + ret = block_read_full_page(page, ocfs2_get_block); + unlock = 0; + + ocfs2_data_unlock(inode, 0); +out_alloc: + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); +out: + if (unlock) + unlock_page(page); + mlog_exit(ret); + return ret; +} + +/* Note: Because we don't support holes, our allocation has + * already happened (allocation writes zeros to the file data) + * so we don't have to worry about ordered writes in + * ocfs2_writepage. + * + * ->writepage is called during the process of invalidating the page cache + * during blocked lock processing. It can't block on any cluster locks + * to during block mapping. It's relying on the fact that the block + * mapping can't have disappeared under the dirty pages that it is + * being asked to write back. + */ +static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + mlog_entry("(0x%p)\n", page); + + ret = block_write_full_page(page, ocfs2_get_block, wbc); + + mlog_exit(ret); + + return ret; +} + +/* + * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called + * from loopback. It must be able to perform its own locking around + * ocfs2_get_block(). 
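To that end, the meta lock and ip_alloc_sem are taken below + * rather than assumed to be held by the caller.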
*/ +int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + int ret; + + mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); + + ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + if (ret != 0) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + ret = block_prepare_write(page, from, to, ocfs2_get_block); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_meta_unlock(inode, 0); +out: + mlog_exit(ret); + return ret; +} + +/* Taken from ext3. We don't necessarily need the full-blown + * functionality yet, but IMHO it's better to cut and paste the whole + * thing so we can avoid introducing our own bugs (and easily pick up + * their fixes when they happen) --Mark */ +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_journal_handle *handle = NULL; + int ret = 0; + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (!handle) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + if (ocfs2_should_order_data(inode)) { + ret = walk_page_buffers(handle->k_handle, + page_buffers(page), + from, to, NULL, + ocfs2_journal_dirty_data); + if (ret < 0) + mlog_errno(ret); + } +out: + if (ret) { + if (handle) + ocfs2_commit_trans(handle); + handle = ERR_PTR(ret); + } + return handle; +} + +static int ocfs2_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + int ret, extending = 0, locklevel = 0; + loff_t new_i_size; + struct buffer_head *di_bh = NULL; + struct inode *inode = page->mapping->host; + struct ocfs2_journal_handle *handle = NULL; + + mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); + + /* NOTE: ocfs2_file_aio_write has ensured that it's safe for + * us to sample inode->i_size here without the metadata lock: + * + * 1) We're currently holding the inode alloc lock, so no + * nodes can change it underneath us. + * + * 2) We've had to take the metadata lock at least once + * already to check for extending writes, hence ensuring + * that our current copy is also up to date. 
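(When we do see an extending write, locklevel goes to 1 below so + * the meta lock is taken exclusive for the on-disk update.)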
+ */ + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > i_size_read(inode)) { + extending = 1; + locklevel = 1; + } + + ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page); + if (ret != 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_data_lock_with_page(inode, 1, page); + if (ret != 0) { + mlog_errno(ret); + goto out_unlock_meta; + } + + if (extending) { + handle = ocfs2_start_walk_page_trans(inode, page, from, to); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out_unlock_data; + } + + /* Mark our buffer early. We'd rather catch this error up here + * as opposed to after a successful commit_write which would + * require us to set back inode->i_size. */ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + /* might update i_size */ + ret = generic_commit_write(file, page, from, to); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + if (extending) { + loff_t size = (u64) i_size_read(inode); + struct ocfs2_dinode *di = + (struct ocfs2_dinode *)di_bh->b_data; + + /* ocfs2_mark_inode_dirty is too heavy to use here. */ + inode->i_blocks = ocfs2_align_bytes_to_sectors(size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di->i_size = cpu_to_le64(size); + di->i_ctime = di->i_mtime = + cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = + cpu_to_le32(inode->i_mtime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + BUG_ON(extending && (i_size_read(inode) != new_i_size)); + +out_commit: + if (handle) + ocfs2_commit_trans(handle); +out_unlock_data: + ocfs2_data_unlock(inode, 1); +out_unlock_meta: + ocfs2_meta_unlock(inode, locklevel); +out: + if (di_bh) + brelse(di_bh); + + mlog_exit(ret); + return ret; +} + +static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) +{ + sector_t status; + u64 p_blkno = 0; + int err = 0; + struct inode *inode = mapping->host; + + mlog_entry("(block = %llu)\n", (unsigned long long)block); + + /* We don't need to lock journal system files, since they aren't + * accessed concurrently from multiple nodes. + */ + if (!INODE_JOURNAL(inode)) { + err = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } + + err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, + NULL); + + if (!INODE_JOURNAL(inode)) { + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); + } + + if (err) { + mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", + (unsigned long long)block); + mlog_errno(err); + goto bail; + } + + +bail: + status = err ? 0 : p_blkno; + + mlog_exit((int)status); + + return status; +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. 
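The amount of contiguous disk we find is reported back through + * bh_result->b_size, capped to max_blocks further down.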
+ * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create) +{ + int ret; + u64 vbo_max; /* file offset, max_blocks from iblock */ + u64 p_blkno; + int contig_blocks; + unsigned char blocksize_bits; + + if (!inode || !bh_result) { + mlog(ML_ERROR, "inode or bh_result is null\n"); + return -EIO; + } + + blocksize_bits = inode->i_sb->s_blocksize_bits; + + /* This function won't even be called if the request isn't all + * nicely aligned and of the right size, so there's no need + * for us to check any of that. */ + + vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; + + spin_lock(&OCFS2_I(inode)->ip_lock); + if ((iblock + max_blocks) > + ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters)) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + ret = -EIO; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* This figures out the size of the next contiguous block, and + * our logical offset */ + ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + &contig_blocks); + if (ret) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + + map_bh(bh_result, inode->i_sb, p_blkno); + + /* make sure we don't map more than max_blocks blocks here as + that's all the kernel will handle at this point. */ + if (max_blocks < contig_blocks) + contig_blocks = max_blocks; + bh_result->b_size = contig_blocks << blocksize_bits; +bail: + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. Like the core uses + * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from + * truncation on another. + */ +static void ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private) +{ + struct inode *inode = iocb->ki_filp->f_dentry->d_inode; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + ocfs2_iocb_clear_rw_locked(iocb); + up_read(&inode->i_alloc_sem); + ocfs2_rw_unlock(inode, 0); +} + +static ssize_t ocfs2_direct_IO(int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + int ret; + + mlog_entry_void(); + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, + nr_segs, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io); + mlog_exit(ret); + return ret; +} + +struct address_space_operations ocfs2_aops = { + .readpage = ocfs2_readpage, + .writepage = ocfs2_writepage, + .prepare_write = ocfs2_prepare_write, + .commit_write = ocfs2_commit_write, + .bmap = ocfs2_bmap, + .sync_page = block_sync_page, + .direct_IO = ocfs2_direct_IO +}; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h new file mode 100644 index 000000000000..d40456d509a0 --- /dev/null +++ b/fs/ocfs2/aops.h @@ -0,0 +1,41 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_AOPS_H +#define OCFS2_AOPS_H + +int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to); + +struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to); + +/* all ocfs2_dio_end_io()'s fault */ +#define ocfs2_iocb_is_rw_locked(iocb) \ + test_bit(0, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_rw_locked(iocb) \ + set_bit(0, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_rw_locked(iocb) \ + clear_bit(0, (unsigned long *)&iocb->private) + +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c new file mode 100644 index 000000000000..d424041b38e9 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.c @@ -0,0 +1,232 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * io.c + * + * Buffer cache handling + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "inode.h" +#include "journal.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, + struct inode *inode) +{ + int ret = 0; + + mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", + (unsigned long long)bh->b_blocknr, inode); + + BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); + BUG_ON(buffer_jbd(bh)); + + /* No need to check for a soft readonly file system here. non + * journalled writes are only ever done on system files which + * can get modified during recovery even if read-only. */ + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + goto out; + } + + down(&OCFS2_I(inode)->ip_io_sem); + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. 
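We are about to issue the write ourselves, so the buffer must not + * sit on the dirty list where regular writeback could also grab it.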
*/ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (buffer_uptodate(bh)) { + ocfs2_set_buffer_uptodate(inode, bh); + } else { + /* We don't need to remove the clustered uptodate + * information for this bh as it's not marked locally + * uptodate. */ + ret = -EIO; + brelse(bh); + } + + up(&OCFS2_I(inode)->ip_io_sem); +out: + mlog_exit(ret); + return ret; +} + +int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, + struct buffer_head *bhs[], int flags, + struct inode *inode) +{ + int status = 0; + struct super_block *sb; + int i, ignore_cache = 0; + struct buffer_head *bh; + + mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n", + block, nr, flags, inode); + + if (osb == NULL || osb->sb == NULL || bhs == NULL) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr < 0) { + mlog(ML_ERROR, "asked to read %d blocks!\n", nr); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr == 0) { + mlog(ML_BH_IO, "No buffers will be read!\n"); + status = 0; + goto bail; + } + + sb = osb->sb; + + if (flags & OCFS2_BH_CACHED && !inode) + flags &= ~OCFS2_BH_CACHED; + + if (inode) + down(&OCFS2_I(inode)->ip_io_sem); + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(sb, block++); + if (bhs[i] == NULL) { + if (inode) + up(&OCFS2_I(inode)->ip_io_sem); + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + ignore_cache = 0; + + if (flags & OCFS2_BH_CACHED && + !ocfs2_buffer_uptodate(inode, bh)) { + mlog(ML_UPTODATE, + "bh (%llu), inode %"MLFu64" not uptodate\n", + (unsigned long long)bh->b_blocknr, + OCFS2_I(inode)->ip_blkno); + ignore_cache = 1; + } + + /* XXX: Can we ever get this and *not* have the cached + * flag set? */ + if (buffer_jbd(bh)) { + if (!(flags & OCFS2_BH_CACHED) || ignore_cache) + mlog(ML_BH_IO, "trying to sync read a jbd " + "managed bh (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_BH_IO, "asking me to sync read a dirty " + "buffer! (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES + mlog(ML_ERROR, "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); +#else + unlock_buffer(bh); + continue; +#endif + } + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + if (flags & OCFS2_BH_READAHEAD) + submit_bh(READA, bh); + else + submit_bh(READ, bh); + continue; + } + } + + status = 0; + + for (i = (nr - 1); i >= 0; i--) { + bh = bhs[i]; + + /* We know this can't have changed as we hold the + * inode sem. Avoid doing any work on the bh if the + * journal has it. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. Don't need to + * remove the clustered uptodate information + * for this bh as it's not marked locally + * uptodate. 
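Note that the loop keeps going so every remaining buffer is still + * waited on and released.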
*/ + status = -EIO; + brelse(bh); + bhs[i] = NULL; + continue; + } + + if (inode) + ocfs2_set_buffer_uptodate(inode, bh); + } + if (inode) + up(&OCFS2_I(inode)->ip_io_sem); + + mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, + (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); + +bail: + + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 000000000000..6ecb90937b68 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_buffer_head.h + * + * Buffer cache handling functions defined + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_BUFFER_HEAD_IO_H +#define OCFS2_BUFFER_HEAD_IO_H + +#include + +void ocfs2_end_buffer_io_sync(struct buffer_head *bh, + int uptodate); + +static inline int ocfs2_read_block(struct ocfs2_super *osb, + u64 off, + struct buffer_head **bh, + int flags, + struct inode *inode); + +int ocfs2_write_block(struct ocfs2_super *osb, + struct buffer_head *bh, + struct inode *inode); +int ocfs2_read_blocks(struct ocfs2_super *osb, + u64 block, + int nr, + struct buffer_head *bhs[], + int flags, + struct inode *inode); + + +#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ + +static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, + struct buffer_head **bh, int flags, + struct inode *inode) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_blocks(osb, off, 1, bh, + flags, inode); + +bail: + return status; +} + +#endif /* OCFS2_BUFFER_HEAD_IO_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 000000000000..bd85182e97bc --- /dev/null +++ b/fs/ocfs2/dcache.c @@ -0,0 +1,91 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.c + * + * dentry cache handling code + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_DCACHE +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "file.h" +#include "inode.h" + +static int ocfs2_dentry_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int ret = 0; /* if all else fails, just return false */ + struct ocfs2_super *osb; + + mlog_entry("(0x%p, '%.*s')\n", dentry, + dentry->d_name.len, dentry->d_name.name); + + /* Never trust a negative dentry - force a new lookup. */ + if (inode == NULL) { + mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, + dentry->d_name.name); + goto bail; + } + + osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!osb); + + if (inode != osb->root_inode) { + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + mlog(0, "inode (%"MLFu64") deleted, returning false\n", + OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (!inode->i_nlink) { + mlog(0, "Inode %"MLFu64" orphaned, returning false " + "dir = %d\n", OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; + } + } + + ret = 1; + +bail: + mlog_exit(ret); + + return ret; +} + +struct dentry_operations ocfs2_dentry_ops = { + .d_revalidate = ocfs2_dentry_revalidate, +}; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 000000000000..90072771114b --- /dev/null +++ b/fs/ocfs2/dcache.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DCACHE_H +#define OCFS2_DCACHE_H + +extern struct dentry_operations ocfs2_dentry_ops; + +#endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 000000000000..856e20ae8263 --- /dev/null +++ b/fs/ocfs2/dir.c @@ -0,0 +1,618 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c + * + * Creates, reads, walks and deletes directory-nodes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static unsigned char ocfs2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct buffer_head **new_de_bh); +/* + * ocfs2_readdir() + * + */ +int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset, blk; + int i, num, stored; + struct buffer_head * bh, * tmp; + struct ocfs2_dir_entry * de; + int err; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block * sb = inode->i_sb; + int have_disk_lock = 0; + + mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + stored = 0; + bh = NULL; + + error = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (error < 0) { + if (error != -ENOENT) + mlog_errno(error); + /* we haven't got any yet, so propagate the error. */ + stored = error; + goto bail; + } + have_disk_lock = 1; + + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < i_size_read(inode)) { + blk = (filp->f_pos) >> sb->s_blocksize_bits; + bh = ocfs2_bread(inode, blk, &err, 0); + if (!bh) { + mlog(ML_ERROR, "directory #%"MLFu64" contains a hole " + "at offset %lld\n", + OCFS2_I(inode)->ip_blkno, + filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + /* + * Do the readahead (8k) + */ + if (!offset) { + for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; + i > 0; i--) { + tmp = ocfs2_bread(inode, ++blk, &err, 1); + if (tmp) + brelse(tmp); + } + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ocfs2_dir_entry *) (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below.
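The rec_len chaining that this scan relies on can be seen in isolation. A minimal, hedged sketch of walking one directory block using only what the patch itself defines (a real walk would report the corruption instead of silently stopping):

	/* Illustrative sketch only: advance through a block's dirents by rec_len. */
	static void ocfs2_walk_dir_block_sketch(char *data, unsigned long blocksize)
	{
		unsigned long offset = 0;

		while (offset < blocksize) {
			struct ocfs2_dir_entry *de =
				(struct ocfs2_dir_entry *)(data + offset);

			/* same minimal sanity test as the revalidate loop */
			if (le16_to_cpu(de->rec_len) < OCFS2_DIR_REC_LEN(1))
				break;
			offset += le16_to_cpu(de->rec_len);
		}
	}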
*/ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < i_size_read(inode) + && offset < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + offset); + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse(bh); + goto bail; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + ino_from_blkno(sb, le64_to_cpu(de->inode)), + d_type); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse(bh); + } + + stored = 0; +bail: + if (have_disk_lock) + ocfs2_meta_unlock(inode, 0); + + mlog_exit(stored); + + return stored; +} + +/* + * NOTE: this should always be called with parent dir i_sem taken. + */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct buffer_head **dirent_bh, + struct ocfs2_dir_entry **dirent) +{ + int status = -ENOENT; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, " + "inode=%p)\n", + osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode); + + *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); + if (!*dirent_bh || !*dirent) { + status = -ENOENT; + goto leave; + } + + *blkno = le64_to_cpu((*dirent)->inode); + + status = 0; +leave: + if (status < 0) { + *dirent = NULL; + if (*dirent_bh) { + brelse(*dirent_bh); + *dirent_bh = NULL; + } + } + + mlog_exit(status); + return status; +} + +/* Check for a name within a directory. 
+ * + * Return 0 if the name does not exist + * Return -EEXIST if the directory contains the name + * + * Callers should have i_sem + a cluster lock on dir + */ +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen) +{ + int ret; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *dirent = NULL; + + mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno, + namelen, name); + + ret = -EEXIST; + dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); + if (dirent_bh) + goto bail; + + ret = 0; +bail: + if (dirent_bh) + brelse(dirent_bh); + + mlog_exit(ret); + return ret; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ocfs2_empty_dir(struct inode *inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ocfs2_dir_entry * de, * de1; + struct super_block * sb; + int err; + + sb = inode->i_sb; + if ((i_size_read(inode) < + (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) || + !(bh = ocfs2_bread(inode, 0, &err, 0))) { + mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " + "no data block\n", + OCFS2_I(inode)->ip_blkno); + return 1; + } + + de = (struct ocfs2_dir_entry *) bh->b_data; + de1 = (struct ocfs2_dir_entry *) + ((char *)de + le16_to_cpu(de->rec_len)); + if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || + !le64_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " + "no `.' or `..'\n", + OCFS2_I(inode)->ip_blkno); + brelse(bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); + while (offset < i_size_read(inode) ) { + if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { + brelse(bh); + bh = ocfs2_bread(inode, + offset >> sb->s_blocksize_bits, &err, 0); + if (!bh) { + mlog(ML_ERROR, "directory #%"MLFu64" contains " + "a hole at offset %lu\n", + OCFS2_I(inode)->ip_blkno, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + brelse(bh); + return 1; + } + if (le64_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) + ((char *)de + le16_to_cpu(de->rec_len)); + } + brelse(bh); + return 1; +} + +/* returns a bh of the 1st new block in the allocation. 
*/ +int ocfs2_do_extend_dir(struct super_block *sb, + struct ocfs2_journal_handle *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh) +{ + int status; + int extend; + u64 p_blkno; + + spin_lock(&OCFS2_I(dir)->ip_lock); + extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + if (extend) { + status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, + parent_fe_bh, handle, + data_ac, meta_ac, NULL); + BUG_ON(status == -EAGAIN); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> + (sb->s_blocksize_bits - 9)), + 1, &p_blkno, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_bh = sb_getblk(sb, p_blkno); + if (!*new_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + status = 0; +bail: + mlog_exit(status); + return status; +} + +/* assumes you already have a cluster lock on the directory. */ +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct buffer_head **new_de_bh) +{ + int status = 0; + int credits, num_free_extents; + loff_t dir_i_size; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry * de; + struct super_block *sb = osb->sb; + + mlog_entry_void(); + + dir_i_size = i_size_read(dir); + mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n", + OCFS2_I(dir)->ip_blkno, dir_i_size); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* dir->i_size is always block aligned. 
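Because dir->i_size is block aligned, the cluster-boundary test that follows is exact; restated as a hedged one-liner for illustration:

	/* Illustration: a directory needs a fresh cluster only once its i_size
	 * has caught up with its current cluster allocation. */
	int needs_alloc = (i_size_read(dir) ==
			   ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));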
*/ + spin_lock(&OCFS2_I(dir)->ip_lock); + if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { + spin_unlock(&OCFS2_I(dir)->ip_lock); + num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto bail; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, handle, + fe, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_extend_credits(sb, fe, 1); + } else { + spin_unlock(&OCFS2_I(dir)->ip_lock); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + } + + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, + data_ac, meta_ac, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(dir, new_bh); + + status = ocfs2_journal_access(handle, dir, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, sb->s_blocksize); + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(sb->s_blocksize); + status = ocfs2_journal_dirty(handle, new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + dir_i_size += dir->i_sb->s_blocksize; + i_size_write(dir, dir_i_size); + dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); + status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_de_bh = new_bh; + get_bh(*new_de_bh); +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (new_bh) + brelse(new_bh); + + mlog_exit(status); + return status; +} + +/* + * Search the dir for a good spot, extending it if necessary. The + * block containing an appropriate record is returned in ret_de_bh. 
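The search below accepts two kinds of slots: a never-used entry (inode == 0) whose record is already big enough, or a live entry whose record has enough slack beyond its own name to be split in two. The same predicate, pulled out as an illustrative helper that is not part of the patch:

	static inline int ocfs2_dirent_would_fit_sketch(struct ocfs2_dir_entry *de,
							unsigned short rec_len)
	{
		unsigned short de_rec = le16_to_cpu(de->rec_len);

		if (!le64_to_cpu(de->inode) && de_rec >= rec_len)
			return 1;	/* unused entry, large enough as-is */
		/* live entry: room for its own record plus the new one */
		return de_rec >= OCFS2_DIR_REC_LEN(de->name_len) + rec_len;
	}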
+ */ +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct buffer_head **ret_de_bh) +{ + unsigned long offset; + struct buffer_head * bh = NULL; + unsigned short rec_len; + struct ocfs2_dinode *fe; + struct ocfs2_dir_entry *de; + struct super_block *sb; + int status; + + mlog_entry_void(); + + mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n", + namelen, OCFS2_I(dir)->ip_blkno); + + BUG_ON(!S_ISDIR(dir->i_mode)); + fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); + + sb = dir->i_sb; + + if (!namelen) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + bh = ocfs2_bread(dir, 0, &status, 0); + if (!bh) { + mlog_errno(status); + goto bail; + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse(bh); + bh = NULL; + + if (i_size_read(dir) <= offset) { + status = ocfs2_extend_dir(osb, + dir, + parent_fe_bh, + &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + BUG_ON(!bh); + *ret_de_bh = bh; + get_bh(*ret_de_bh); + goto bail; + } + bh = ocfs2_bread(dir, + offset >> sb->s_blocksize_bits, + &status, + 0); + if (!bh) { + mlog_errno(status); + goto bail; + } + /* move to next block */ + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + status = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + status = -EEXIST; + goto bail; + } + if (((le64_to_cpu(de->inode) == 0) && + (le16_to_cpu(de->rec_len) >= rec_len)) || + (le16_to_cpu(de->rec_len) >= + (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. */ + *ret_de_bh = bh; + get_bh(*ret_de_bh); + status = 0; + goto bail; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); + } + + status = 0; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 000000000000..5f614ec9649c --- /dev/null +++ b/fs/ocfs2/dir.h @@ -0,0 +1,54 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA.
+ */ + +#ifndef OCFS2_DIR_H +#define OCFS2_DIR_H + +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen); +int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct buffer_head **dirent_bh, + struct ocfs2_dir_entry **dirent); +int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct buffer_head **ret_de_bh); +struct ocfs2_alloc_context; +int ocfs2_do_extend_dir(struct super_block *sb, + struct ocfs2_journal_handle *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh); +#endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 000000000000..e971ec2f8407 --- /dev/null +++ b/fs/ocfs2/dlmglue.c @@ -0,0 +1,2904 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.c + * + * Code which implements an OCFS2 specific interface to our DLM. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA.
*/ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/crc32.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> +#include <cluster/tcp.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_DLM_GLUE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "slot_map.h" +#include "super.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +struct ocfs2_mask_waiter { + struct list_head mw_item; + int mw_status; + struct completion mw_complete; + unsigned long mw_mask; + unsigned long mw_goal; +}; + +static void ocfs2_inode_ast_func(void *opaque); +static void ocfs2_inode_bast_func(void *opaque, + int level); +static void ocfs2_super_ast_func(void *opaque); +static void ocfs2_super_bast_func(void *opaque, + int level); +static void ocfs2_rename_ast_func(void *opaque); +static void ocfs2_rename_bast_func(void *opaque, + int level); + +/* so far, all locks have gotten along with the same unlock ast */ +static void ocfs2_unlock_ast_func(void *opaque, + enum dlm_status status); +static int ocfs2_do_unblock_meta(struct inode *inode, + int *requeue); +static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, + int *requeue); +typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); +static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int *requeue, + ocfs2_convert_worker_t *worker); + +struct ocfs2_lock_res_ops { + void (*ast)(void *); + void (*bast)(void *, int); + void (*unlock_ast)(void *, enum dlm_status); + int (*unblock)(struct ocfs2_lock_res *, int *); +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_inode_lock, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_meta, +}; + +static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_data, +}; + +static struct ocfs2_lock_res_ops ocfs2_super_lops = { + .ast = ocfs2_super_ast_func, + .bast = ocfs2_super_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_osb_lock, +}; + +static struct ocfs2_lock_res_ops ocfs2_rename_lops = { + .ast = ocfs2_rename_ast_func, + .bast = ocfs2_rename_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_osb_lock, +}; + +static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_META || + lockres->l_type == OCFS2_LOCK_TYPE_DATA || + lockres->l_type == OCFS2_LOCK_TYPE_RW; +} + +static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; +} + +static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; +} + +static inline struct ocfs2_super
*ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_super_lock(lockres) + && !ocfs2_is_rename_lock(lockres)); + + return (struct ocfs2_super *) lockres->l_priv; +} + +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + return (struct inode *) lockres->l_priv; +} + +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int dlm_flags); +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted); +static void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level); +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert); +#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_name, dlm_errmsg(_stat)); \ +} while (0) +static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_meta_lock_update(struct inode *inode, + struct buffer_head **bh); +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); +static inline int ocfs2_highest_compat_lock_level(int level); +static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, + struct ocfs2_lock_res *lockres, + int new_level); + +static char *ocfs2_lock_type_strings[] = { + [OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differentiate from [R]ename.. serializing writes is the + * important job it does, anyway.
*/ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", +}; + +static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ + mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); + return ocfs2_lock_type_strings[type]; +} + +static void ocfs2_build_lock_name(enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + char *name) +{ + int len; + + mlog_entry_void(); + + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); + + len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x", + ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno, + generation); + + BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); + + mlog(0, "built lock resource with name: %s\n", name); + + mlog_exit_void(); +} + +static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; + +static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, + struct ocfs2_dlm_debug *dlm_debug) +{ + mlog(0, "Add tracking for lockres %s\n", res->l_name); + + spin_lock(&ocfs2_dlm_tracking_lock); + list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) +{ + spin_lock(&ocfs2_dlm_tracking_lock); + if (!list_empty(&res->l_debug_list)) + list_del_init(&res->l_debug_list); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, + struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + struct ocfs2_lock_res_ops *ops, + void *priv) +{ + ocfs2_build_lock_name(type, blkno, generation, res->l_name); + + res->l_type = type; + res->l_ops = ops; + res->l_priv = priv; + + res->l_level = LKM_IVMODE; + res->l_requested = LKM_IVMODE; + res->l_blocking = LKM_IVMODE; + res->l_action = OCFS2_AST_INVALID; + res->l_unlock_action = OCFS2_UNLOCK_INVALID; + + res->l_flags = OCFS2_LOCK_INITIALIZED; + + ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); +} + +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) +{ + /* This also clears out the lock status block */ + memset(res, 0, sizeof(struct ocfs2_lock_res)); + spin_lock_init(&res->l_lock); + init_waitqueue_head(&res->l_event); + INIT_LIST_HEAD(&res->l_blocked_list); + INIT_LIST_HEAD(&res->l_mask_waiters); +} + +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct inode *inode) +{ + struct ocfs2_lock_res_ops *ops; + + switch(type) { + case OCFS2_LOCK_TYPE_RW: + ops = &ocfs2_inode_rw_lops; + break; + case OCFS2_LOCK_TYPE_META: + ops = &ocfs2_inode_meta_lops; + break; + case OCFS2_LOCK_TYPE_DATA: + ops = &ocfs2_inode_data_lops; + break; + default: + mlog_bug_on_msg(1, "type: %d\n", type); + ops = NULL; /* thanks, gcc */ + break; + }; + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, + OCFS2_I(inode)->ip_blkno, + inode->i_generation, ops, inode); +} + +static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Superblock lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, + OCFS2_SUPER_BLOCK_BLKNO, 0, + &ocfs2_super_lops, osb); +} + +static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Rename lockres doesn't come from a slab so we call init + * once on it manually. 
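For reference, ocfs2_build_lock_name above produces fixed-width, human-decodable names; rebuilding one by hand looks roughly like this (a sketch only -- the pad contents and the type character mapping live in headers not shown in this patch, and %016llx stands in for the patch's %016"MLFx64" wrapper):

	char name[OCFS2_LOCK_ID_MAX_LEN];
	int len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
			   ocfs2_lock_type_char(OCFS2_LOCK_TYPE_META),
			   OCFS2_LOCK_ID_PAD,
			   (unsigned long long)blkno, generation);
	/* always OCFS2_LOCK_ID_MAX_LEN - 1 characters, as the BUG_ON asserts */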
*/ + ocfs2_lock_res_init_once(res); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, + &ocfs2_rename_lops, osb); +} + +void ocfs2_lock_res_free(struct ocfs2_lock_res *res) +{ + mlog_entry_void(); + + if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) + return; + + ocfs2_remove_lockres_tracking(res); + + mlog_bug_on_msg(!list_empty(&res->l_blocked_list), + "Lockres %s is on the blocked list\n", + res->l_name); + mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), + "Lockres %s has mask waiters pending\n", + res->l_name); + mlog_bug_on_msg(spin_is_locked(&res->l_lock), + "Lockres %s is locked\n", + res->l_name); + mlog_bug_on_msg(res->l_ro_holders, + "Lockres %s has %u ro holders\n", + res->l_name, res->l_ro_holders); + mlog_bug_on_msg(res->l_ex_holders, + "Lockres %s has %u ex holders\n", + res->l_name, res->l_ex_holders); + + /* Need to clear out the lock status block for the dlm */ + memset(&res->l_lksb, 0, sizeof(res->l_lksb)); + + res->l_flags = 0UL; + mlog_exit_void(); +} + +static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, + int level) +{ + mlog_entry_void(); + + BUG_ON(!lockres); + + switch(level) { + case LKM_EXMODE: + lockres->l_ex_holders++; + break; + case LKM_PRMODE: + lockres->l_ro_holders++; + break; + default: + BUG(); + } + + mlog_exit_void(); +} + +static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, + int level) +{ + mlog_entry_void(); + + BUG_ON(!lockres); + + switch(level) { + case LKM_EXMODE: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case LKM_PRMODE: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } + mlog_exit_void(); +} + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. 
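With only those three modes, the mapping implemented by the function below can be tabulated directly:

	/*
	 *  level being blocked on    highest level others may still hold
	 *  LKM_EXMODE            ->  LKM_NLMODE  (exclusive: everyone backs off)
	 *  LKM_PRMODE            ->  LKM_PRMODE  (readers can share)
	 *  anything else (NL)    ->  LKM_EXMODE  (nothing held: no constraint)
	 */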
*/ +static inline int ocfs2_highest_compat_lock_level(int level) +{ + int new_level = LKM_EXMODE; + + if (level == LKM_EXMODE) + new_level = LKM_NLMODE; + else if (level == LKM_PRMODE) + new_level = LKM_PRMODE; + return new_level; +} + +static void lockres_set_flags(struct ocfs2_lock_res *lockres, + unsigned long newflags) +{ + struct list_head *pos, *tmp; + struct ocfs2_mask_waiter *mw; + + assert_spin_locked(&lockres->l_lock); + + lockres->l_flags = newflags; + + list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { + mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + continue; + + list_del_init(&mw->mw_item); + mw->mw_status = 0; + complete(&mw->mw_complete); + } +} +static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) +{ + lockres_set_flags(lockres, lockres->l_flags | or); +} +static void lockres_clear_flags(struct ocfs2_lock_res *lockres, + unsigned long clear) +{ + lockres_set_flags(lockres, lockres->l_flags & ~clear); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + BUG_ON(lockres->l_blocking <= LKM_NLMODE); + + lockres->l_level = lockres->l_requested; + if (lockres->l_level <= + ocfs2_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = LKM_NLMODE; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + } + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + + /* Convert from RO to EX doesn't really need anything as our + * information is already up to date.
Convert from NL to + * *anything* however should mark ourselves as needing an + * update */ + if (lockres->l_level == LKM_NLMODE) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + + if (lockres->l_requested > LKM_NLMODE && + !(lockres->l_flags & OCFS2_LOCK_LOCAL)) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static void ocfs2_inode_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + struct inode *inode; + struct dlm_lockstatus *lksb; + unsigned long flags; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + + mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n", + OCFS2_I(inode)->ip_blkno, lockres->l_action, + ocfs2_lock_type_string(lockres->l_type)); + + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + spin_lock_irqsave(&lockres->l_lock, flags); + + lksb = &(lockres->l_lksb); + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " + "on inode %"MLFu64"\n", lksb->status, + OCFS2_I(inode)->ip_blkno); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); + return; + } + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " + "lockres flags = 0x%lx, unlock action: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); + + BUG(); + } + + /* data and rw locking ignores refresh flag for now. */ + if (lockres->l_type != OCFS2_LOCK_TYPE_META) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + wake_up(&lockres->l_event); + + mlog_exit_void(); +} + +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert = 0; + mlog_entry_void(); + + assert_spin_locked(&lockres->l_lock); + + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + + if (level > lockres->l_blocking) { + /* only schedule a downconvert if we haven't already scheduled + * one that goes low enough to satisfy the level we're + * blocking.
this also catches the case where we get + * duplicate BASTs */ + if (ocfs2_highest_compat_lock_level(level) < + ocfs2_highest_compat_lock_level(lockres->l_blocking)) + needs_downconvert = 1; + + lockres->l_blocking = level; + } + + mlog_exit(needs_downconvert); + return needs_downconvert; +} + +static void ocfs2_generic_bast_func(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert; + unsigned long flags; + + mlog_entry_void(); + + BUG_ON(level <= LKM_NLMODE); + + spin_lock_irqsave(&lockres->l_lock, flags); + needs_downconvert = ocfs2_generic_handle_bast(lockres, level); + if (needs_downconvert) + ocfs2_schedule_blocked_lock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_kick_vote_thread(osb); + + wake_up(&lockres->l_event); + mlog_exit_void(); +} + +static void ocfs2_inode_bast_func(void *opaque, int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct inode *inode; + struct ocfs2_super *osb; + + mlog_entry_void(); + + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + inode = ocfs2_lock_res_inode(lockres); + osb = OCFS2_SB(inode->i_sb); + + mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d " + "type = %s\n", OCFS2_I(inode)->ip_blkno, level, + lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, + int ignore_refresh) +{ + struct dlm_lockstatus *lksb = &lockres->l_lksb; + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", + lockres->l_name, lksb->status); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + BUG(); + } + + if (ignore_refresh) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + /* set it to something invalid so if we get called again we + * can catch it. 
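The l_action field is effectively a one-shot token: the request side arms it together with OCFS2_LOCK_BUSY, the AST consumes it and disarms it, and a stray second AST then lands in the switch's default case. In outline:

	/*
	 * request:   l_action = OCFS2_AST_ATTACH/CONVERT/DOWNCONVERT; set BUSY;
	 * AST fires: handle the action; l_action = OCFS2_AST_INVALID; clear BUSY;
	 * stray AST: l_action is OCFS2_AST_INVALID -> default case complains/BUGs.
	 */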
*/ + lockres->l_action = OCFS2_AST_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +static void ocfs2_super_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + + mlog_entry_void(); + mlog(0, "Superblock AST fired\n"); + + BUG_ON(!ocfs2_is_super_lock(lockres)); + ocfs2_generic_ast_func(lockres, 0); + + mlog_exit_void(); +} + +static void ocfs2_super_bast_func(void *opaque, + int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb; + + mlog_entry_void(); + mlog(0, "Superblock BAST fired\n"); + + BUG_ON(!ocfs2_is_super_lock(lockres)); + osb = ocfs2_lock_res_super(lockres); + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static void ocfs2_rename_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + + mlog_entry_void(); + + mlog(0, "Rename AST fired\n"); + + BUG_ON(!ocfs2_is_rename_lock(lockres)); + + ocfs2_generic_ast_func(lockres, 1); + + mlog_exit_void(); +} + +static void ocfs2_rename_bast_func(void *opaque, + int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb; + + mlog_entry_void(); + + mlog(0, "Rename BAST fired\n"); + + BUG_ON(!ocfs2_is_rename_lock(lockres)); + + osb = ocfs2_lock_res_super(lockres); + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert) +{ + unsigned long flags; + + mlog_entry_void(); + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + if (convert) + lockres->l_action = OCFS2_AST_INVALID; + else + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + mlog_exit_void(); +} + +/* Note: If we detect another process working on the lock (i.e., + * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller + * to do the right thing in that case. 
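A caller that sees 0 from ocfs2_lock_create therefore cannot assume the lock exists yet; the usual reaction, as ocfs2_cluster_lock does further below, is to wait for BUSY to clear and retry. A compressed, hedged sketch (real code does the flag checks under l_lock):

	ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
	if (!ret && !(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		ocfs2_wait_on_busy_lock(lockres);	/* another task is creating it */
		/* ...then re-check and retry, as in ocfs2_cluster_lock's "again:" loop */
	}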
+ */ +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int dlm_flags) +{ + int ret = 0; + enum dlm_status status; + unsigned long flags; + + mlog_entry_void(); + + mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, + dlm_flags); + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || + (lockres->l_flags & OCFS2_LOCK_BUSY)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + lockres->l_action = OCFS2_AST_ATTACH; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = dlmlock(osb->dlm, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 1); + } + + mlog(0, "lock %s, successful return from dlmlock\n", lockres->l_name); + +bail: + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, + int flag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = lockres->l_flags & flag; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; +} + +static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); +} + +static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it.
*/ +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); +} + +static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) +{ + INIT_LIST_HEAD(&mw->mw_item); + init_completion(&mw->mw_complete); +} + +static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) +{ + wait_for_completion(&mw->mw_complete); + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return mw->mw_status; +} + +static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw, + unsigned long mask, + unsigned long goal) +{ + BUG_ON(!list_empty(&mw->mw_item)); + + assert_spin_locked(&lockres->l_lock); + + list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); + mw->mw_mask = mask; + mw->mw_goal = goal; +} + +/* returns 0 if the mw that was removed was already satisfied, -EBUSY + * if the mask still hadn't reached its goal */ +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!list_empty(&mw->mw_item)) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + ret = -EBUSY; + + list_del_init(&mw->mw_item); + init_completion(&mw->mw_complete); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; + +} + +static int ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int lkm_flags, + int arg_flags) +{ + struct ocfs2_mask_waiter mw; + enum dlm_status status; + int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); + int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ + unsigned long flags; + + mlog_entry_void(); + + ocfs2_init_mask_waiter(&mw); + +again: + wait = 0; + + if (catch_signals && signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, + "Cluster lock called on freeing lockres %s! flags " + "0x%lx\n", lockres->l_name, lockres->l_flags); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if (lockres->l_flags & OCFS2_LOCK_BUSY && + level > lockres->l_level) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + wait = 1; + goto unlock; + } + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + /* lock has not been created yet. 
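All of the waiting in this function goes through the mask-waiter machinery defined above: a waiter registers a (mask, goal) pair under the spinlock and sleeps on a completion that lockres_set_flags fires as soon as (l_flags & mask) == goal. The pattern, schematically:

	/*
	 * spin_lock;  lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	 * spin_unlock;
	 * ret = ocfs2_wait_for_mask(&mw);  -- sleeps until BUSY drops --
	 * goto again;                      -- re-evaluate everything from scratch --
	 */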
*/ + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto again; + } + + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && + !ocfs2_may_continue_on_blocked_lock(lockres, level)) { + /* the lock is currently blocked on behalf of + * another node */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); + wait = 1; + goto unlock; + } + + if (level > lockres->l_level) { + if (lockres->l_action != OCFS2_AST_INVALID) + mlog(ML_ERROR, "lockres %s has action %u pending\n", + lockres->l_name, lockres->l_action); + + lockres->l_action = OCFS2_AST_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + BUG_ON(level == LKM_IVMODE); + BUG_ON(level == LKM_NLMODE); + + mlog(0, "lock %s, convert from %d to level = %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + status = dlmlock(osb->dlm, + level, + &lockres->l_lksb, + lkm_flags|LKM_CONVERT|LKM_VALBLK, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + if ((lkm_flags & LKM_NOQUEUE) && + (status == DLM_NOTQUEUED)) + ret = -EAGAIN; + else { + ocfs2_log_dlm_error("dlmlock", status, + lockres); + ret = -EINVAL; + } + ocfs2_recover_from_dlm_error(lockres, 1); + goto out; + } + + mlog(0, "lock %s, successful return from dlmlock\n", + lockres->l_name); + + /* At this point we've gone inside the dlm and need to + * complete our work regardless. */ + catch_signals = 0; + + /* wait for busy to clear and carry on */ + goto again; + } + + /* Ok, if we get here then we're good to go. */ + ocfs2_inc_holders(lockres, level); + + ret = 0; +unlock: + spin_unlock_irqrestore(&lockres->l_lock, flags); +out: + /* + * This is helping work around a lock inversion between the page lock + * and dlm locks. One path holds the page lock while calling aops + * which block acquiring dlm locks. The voting thread holds dlm + * locks while acquiring page locks while down converting data locks. + * This block is helping an aop path notice the inversion and back + * off to unlock its page lock before trying the dlm lock again. + */ + if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && + mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { + wait = 0; + if (lockres_remove_mask_waiter(lockres, &mw)) + ret = -EAGAIN; + else + goto again; + } + if (wait) { + ret = ocfs2_wait_for_mask(&mw); + if (ret == 0) + goto again; + mlog_errno(ret); + } + + mlog_exit(ret); + return ret; +} + +static void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + unsigned long flags; + + mlog_entry_void(); + spin_lock_irqsave(&lockres->l_lock, flags); + ocfs2_dec_holders(lockres, level); + ocfs2_vote_on_unlock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); +} + +static int ocfs2_create_new_inode_lock(struct inode *inode, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); +} + +/* Grants us an EX lock on the data and metadata resources, skipping + * the normal cluster directory lookup.
Use this ONLY on newly created + * inodes which other nodes can't possibly see, and which haven't been + * hashed in the inode hash yet. This can give us a good performance + * increase as it'll skip the network broadcast normally associated + * with creating a new lock resource. */ +int ocfs2_create_new_inode_locks(struct inode *inode) +{ + int ret; + + BUG_ON(!inode); + BUG_ON(!ocfs2_inode_is_new(inode)); + + mlog_entry_void(); + + mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + /* NOTE: That we don't increment any of the holder counts, nor + * do we add anything to a journal handle. Since this is + * supposed to be a new inode which the cluster doesn't know + * about yet, there is no need to. As far as the LVB handling + * is concerned, this is basically like acquiring an EX lock + * on a resource which has an invalid one -- we'll set it + * valid when we release the EX. */ + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_rw_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_meta_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_data_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + +bail: + mlog_exit(ret); + return ret; +} + +int ocfs2_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" take %s RW lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, + 0); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +void ocfs2_rw_unlock(struct inode *inode, int write) +{ + int level = write ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s RW lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +int ocfs2_data_lock_full(struct inode *inode, + int write, + int arg_flags) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" take %s DATA lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + /* We'll allow faking a readonly data lock for + * rodevices. */ + if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { + if (write) { + status = -EROFS; + mlog_errno(status); + } + goto out; + } + + lockres = &OCFS2_I(inode)->ip_data_lockres; + + level = write ? 
LKM_EXMODE : LKM_PRMODE; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, + 0, arg_flags); + if (status < 0 && status != -EAGAIN) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +/* see ocfs2_meta_lock_with_page() */ +int ocfs2_data_lock_with_page(struct inode *inode, + int write, + struct page *page) +{ + int ret; + + ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_data_lock(inode, write) == 0) + ocfs2_data_unlock(inode, write); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int kick = 0; + + mlog_entry_void(); + + /* If we know that another node is waiting on our lock, kick + * the vote thread * pre-emptively when we reach a release + * condition. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { + switch(lockres->l_blocking) { + case LKM_EXMODE: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + kick = 1; + break; + case LKM_PRMODE: + if (!lockres->l_ex_holders) + kick = 1; + break; + default: + BUG(); + } + } + + if (kick) + ocfs2_kick_vote_thread(osb); + + mlog_exit_void(); +} + +void ocfs2_data_unlock(struct inode *inode, + int write) +{ + int level = write ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s DATA lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +#define OCFS2_SEC_BITS 34 +#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) + +/* LVB only has room for 64 bits of time here so we pack it for + * now. */ +static u64 ocfs2_pack_timespec(struct timespec *spec) +{ + u64 res; + u64 sec = spec->tv_sec; + u32 nsec = spec->tv_nsec; + + res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); + + return res; +} + +/* Call this with the lockres locked. I am reasonably sure we don't + * need ip_lock in this function as anyone who would be changing those + * values is supposed to be blocked in ocfs2_meta_lock right now. 
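The packing above gives tv_sec the upper 34 bits and tv_nsec the lower 30; since tv_nsec is always below 10^9 < 2^30, a round trip through the LVB is lossless. A quick illustrative check (ocfs2_unpack_timespec is defined just below; the values are arbitrary):

	struct timespec in = { .tv_sec = 1131699027, .tv_nsec = 123456789 };
	struct timespec out;

	ocfs2_unpack_timespec(&out, ocfs2_pack_timespec(&in));
	/* out now equals in: the nanoseconds fit under the 30-bit mask and the
	 * seconds survive in the top 34 bits (good until roughly the year 2514). */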
*/ +static void __ocfs2_stuff_meta_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_entry_void(); + + lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); + lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + lvb->lvb_ictime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + lvb->lvb_imtime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + + mlog_meta_lvb(0, lockres); + + mlog_exit_void(); +} + +static void ocfs2_unpack_timespec(struct timespec *spec, + u64 packed_time) +{ + spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; + spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; +} + +static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_entry_void(); + + mlog_meta_lvb(0, lockres); + + lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + /* We're safe here without the lockres lock... */ + spin_lock(&oi->ip_lock); + oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); + i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); + + /* fast-symlinks are a special case */ + if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = + ocfs2_align_bytes_to_sectors(i_size_read(inode)); + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + ocfs2_unpack_timespec(&inode->i_atime, + be64_to_cpu(lvb->lvb_iatime_packed)); + ocfs2_unpack_timespec(&inode->i_mtime, + be64_to_cpu(lvb->lvb_imtime_packed)); + ocfs2_unpack_timespec(&inode->i_ctime, + be64_to_cpu(lvb->lvb_ictime_packed)); + spin_unlock(&oi->ip_lock); + + mlog_exit_void(); +} + +static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) + return 1; + return 0; +} + +/* Determine whether a lock resource needs to be refreshed, and + * arbitrate who gets to refresh it. + * + * 0 means no refresh needed. + * + * > 0 means you need to refresh this and you MUST call + * ocfs2_complete_lock_res_refresh afterwards. */ +static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + int status = 0; + + mlog_entry_void(); + +refresh_check: + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_on_refreshing_lock(lockres); + goto refresh_check; + } + + /* Ok, I'll be the one to refresh this lock. 
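The two refresh helpers form a strict bracket: a positive return from ocfs2_should_refresh_lock_res commits the caller to calling ocfs2_complete_lock_res_refresh, success or failure, or every later waiter parks forever on OCFS2_LOCK_REFRESHING. The canonical shape (do_refresh is a hypothetical stand-in for reading from the LVB or disk):

	status = ocfs2_should_refresh_lock_res(lockres);
	if (status) {
		status = do_refresh();	/* hypothetical: LVB or disk read */
		ocfs2_complete_lock_res_refresh(lockres, status);
	}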
*/ + lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = 1; +bail: + mlog_exit(status); + return status; +} + +/* If status is nonzero, I'll mark it as not being in refresh + * anymore, but I won't clear the needs refresh flag. */ +static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, + int status) +{ + unsigned long flags; + mlog_entry_void(); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); + if (!status) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + mlog_exit_void(); +} + +/* may or may not return a bh if it went to disk. */ +static int ocfs2_meta_lock_update(struct inode *inode, + struct buffer_head **bh) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + spin_lock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DELETED) { + mlog(0, "Orphaned inode %"MLFu64" was deleted while we " + "were waiting on a lock. ip_flags = 0x%x\n", + oi->ip_blkno, oi->ip_flags); + spin_unlock(&oi->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&oi->ip_lock); + + lockres = &oi->ip_meta_lockres; + + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + + /* This will discard any caching information we might have had + * for the inode metadata. */ + ocfs2_metadata_cache_purge(inode); + + /* will do nothing for inode types that don't use the extent + * map (directories, bitmap files, etc) */ + ocfs2_extent_map_trunc(inode, 0); + + if (ocfs2_meta_lvb_is_trustable(lockres)) { + mlog(0, "Trusting LVB on inode %"MLFu64"\n", + oi->ip_blkno); + ocfs2_refresh_inode_from_lvb(inode); + } else { + /* Boo, we have to go to disk. */ + /* read bh, cast, ocfs2_refresh_inode */ + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, + bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail_refresh; + } + fe = (struct ocfs2_dinode *) (*bh)->b_data; + + /* This is a good chance to make sure we're not + * locking an invalid object. + * + * We bug on a stale inode here because we checked + * above whether it was wiped from disk. The wiping + * node provides a guarantee that we receive that + * message and can mark the inode before dropping any + * locks associated with it. */ + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail_refresh; + } + mlog_bug_on_msg(inode->i_generation != + le32_to_cpu(fe->i_generation), + "Invalid dinode %"MLFu64" disk generation: %u " + "inode->i_generation: %u\n", + oi->ip_blkno, le32_to_cpu(fe->i_generation), + inode->i_generation); + mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), + "Stale dinode %"MLFu64" dtime: %"MLFu64" " + "flags: 0x%x\n", oi->ip_blkno, + le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + + ocfs2_refresh_inode(inode, fe); + } + + status = 0; +bail_refresh: + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_assign_bh(struct inode *inode, + struct buffer_head **ret_bh, + struct buffer_head *passed_bh) +{ + int status; + + if (passed_bh) { + /* Ok, the update went to disk for us, use the + * returned bh.
+		 */
+		*ret_bh = passed_bh;
+		get_bh(*ret_bh);
+
+		return 0;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  ret_bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * returns < 0 error if the callback will never be called, otherwise
+ * the result of the lock will be communicated via the callback.
+ */
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags)
+{
+	int status, level, dlm_flags, acquired;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *local_bh = NULL;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64", take %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	status = 0;
+	acquired = 0;
+	/* We'll allow faking a readonly metadata lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	acquired = 0;
+	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	dlm_flags = 0;
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= LKM_NOQUEUE;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+	if (status < 0) {
+		if (status != -EAGAIN && status != -EIOCBRETRY)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* Notify the error cleanup path to drop the cluster lock. */
+	acquired = 1;
+
+	/* We wait twice because a node may have died while we were in
+	 * the lower dlm layers. The second time though, we've
+	 * committed to owning this lock so we don't allow signals to
+	 * abort the operation. */
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	/* This is fun. The caller may want a bh back, or it may
+	 * not. ocfs2_meta_lock_update definitely wants one in, but
+	 * may or may not read one, depending on what's in the
+	 * LVB. The result of all of this is that we've *only* gone to
+	 * disk if we have to, so the complexity is worthwhile. */
+	status = ocfs2_meta_lock_update(inode, &local_bh);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	if (ret_bh) {
+		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (handle) {
+		status = ocfs2_handle_add_lock(handle, inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+bail:
+	if (status < 0) {
+		if (ret_bh && (*ret_bh)) {
+			brelse(*ret_bh);
+			*ret_bh = NULL;
+		}
+		if (acquired)
+			ocfs2_meta_unlock(inode, ex);
+	}
+
+	if (local_bh)
+		brelse(local_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * This is working around a lock inversion between tasks acquiring DLM locks
+ * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * while acquiring page locks.
+ *
+ * ** These _with_page variants are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * waiting for the vote thread. In that case we unlock our page so the vote
+ * thread can make progress.
Once we've done this we have to return + * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up + * into the VFS who will then immediately retry the aop call. + * + * We do a blocking lock and immediate unlock before returning, though, so that + * the lock has a great chance of being cached on this node by the time the VFS + * calls back to retry the aop. This has a potential to livelock as nodes + * ping locks back and forth, but that's a risk we're willing to take to avoid + * the lock inversion simply. + */ +int ocfs2_meta_lock_with_page(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + struct page *page) +{ + int ret; + + ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex, + OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0) + ocfs2_meta_unlock(inode, ex); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +void ocfs2_meta_unlock(struct inode *inode, + int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s META lock\n", + OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex) +{ + int status; + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + struct buffer_head *bh; + struct ocfs2_slot_info *si = osb->slot_info; + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* The super block lock path is really in the best position to + * know when resources covered by the lock need to be + * refreshed, so we do it here. Of course, making sense of + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (status) { + bh = si->si_bh; + status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, + si->si_inode); + if (status == 0) + ocfs2_update_slot_info(si); + + ocfs2_complete_lock_res_refresh(lockres, status); + + if (status < 0) + mlog_errno(status); + } +bail: + mlog_exit(status); + return status; +} + +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + ocfs2_cluster_unlock(osb, lockres, level); +} + +int ocfs2_rename_lock(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_rename_unlock(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); +} + +/* Reference counting of the dlm debug structure. We want this because + * open references on the debug inodes can live on after a mount, so + * we can't rely on the ocfs2_super to always exist. 
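+ *
+ * A sketch of the intended pattern (the debugfs open and release
+ * paths below do exactly this):
+ *
+ *	ocfs2_get_dlm_debug(dlm_debug);
+ *	...use dlm_debug...
+ *	ocfs2_put_dlm_debug(dlm_debug);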
*/ +static void ocfs2_dlm_debug_free(struct kref *kref) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); + + kfree(dlm_debug); +} + +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) +{ + if (dlm_debug) + kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); +} + +static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) +{ + kref_get(&debug->d_refcnt); +} + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); + if (!dlm_debug) { + mlog_errno(-ENOMEM); + goto out; + } + + kref_init(&dlm_debug->d_refcnt); + INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); + dlm_debug->d_locking_state = NULL; +out: + return dlm_debug; +} + +/* Access to this is arbitrated for us via seq_file->sem. */ +struct ocfs2_dlm_seq_priv { + struct ocfs2_dlm_debug *p_dlm_debug; + struct ocfs2_lock_res p_iter_res; + struct ocfs2_lock_res p_tmp_res; +}; + +static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, + struct ocfs2_dlm_seq_priv *priv) +{ + struct ocfs2_lock_res *iter, *ret = NULL; + struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; + + assert_spin_locked(&ocfs2_dlm_tracking_lock); + + list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { + /* discover the head of the list */ + if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { + mlog(0, "End of list found, %p\n", ret); + break; + } + + /* We track our "dummy" iteration lockres' by a NULL + * l_ops field. */ + if (iter->l_ops != NULL) { + ret = iter; + break; + } + } + + return ret; +} + +static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); + if (iter) { + /* Since lockres' have the lifetime of their container + * (which can be inodes, ocfs2_supers, etc) we want to + * copy this out to a temporary lockres while still + * under the spinlock. Obviously after this we can't + * trust any pointers on the copy returned, but that's + * ok as the information we want isn't typically held + * in them. 
*/ + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) +{ +} + +static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter = v; + struct ocfs2_lock_res *dummy = &priv->p_iter_res; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(iter, priv); + list_del_init(&dummy->l_debug_list); + if (iter) { + list_add(&dummy->l_debug_list, &iter->l_debug_list); + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define OCFS2_DLM_DEBUG_STR_VERSION 1 +static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) +{ + int i; + char *lvb; + struct ocfs2_lock_res *lockres = v; + + if (!lockres) + return -EINVAL; + + seq_printf(m, "0x%x\t" + "%.*s\t" + "%d\t" + "0x%lx\t" + "0x%x\t" + "0x%x\t" + "%u\t" + "%u\t" + "%d\t" + "%d\t", + OCFS2_DLM_DEBUG_STR_VERSION, + OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, + lockres->l_level, + lockres->l_flags, + lockres->l_action, + lockres->l_unlock_action, + lockres->l_ro_holders, + lockres->l_ex_holders, + lockres->l_requested, + lockres->l_blocking); + + /* Dump the raw LVB */ + lvb = lockres->l_lksb.lvb; + for(i = 0; i < DLM_LVB_LEN; i++) + seq_printf(m, "0x%x\t", lvb[i]); + + /* End the line */ + seq_printf(m, "\n"); + return 0; +} + +static struct seq_operations ocfs2_dlm_seq_ops = { + .start = ocfs2_dlm_seq_start, + .stop = ocfs2_dlm_seq_stop, + .next = ocfs2_dlm_seq_next, + .show = ocfs2_dlm_seq_show, +}; + +static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = (struct seq_file *) file->private_data; + struct ocfs2_dlm_seq_priv *priv = seq->private; + struct ocfs2_lock_res *res = &priv->p_iter_res; + + ocfs2_remove_lockres_tracking(res); + ocfs2_put_dlm_debug(priv->p_dlm_debug); + return seq_release_private(inode, file); +} + +static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) +{ + int ret; + struct ocfs2_dlm_seq_priv *priv; + struct seq_file *seq; + struct ocfs2_super *osb; + + priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + osb = (struct ocfs2_super *) inode->u.generic_ip; + ocfs2_get_dlm_debug(osb->osb_dlm_debug); + priv->p_dlm_debug = osb->osb_dlm_debug; + INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); + + ret = seq_open(file, &ocfs2_dlm_seq_ops); + if (ret) { + kfree(priv); + mlog_errno(ret); + goto out; + } + + seq = (struct seq_file *) file->private_data; + seq->private = priv; + + ocfs2_add_lockres_tracking(&priv->p_iter_res, + priv->p_dlm_debug); + +out: + return ret; +} + +static struct file_operations ocfs2_dlm_debug_fops = { + .open = ocfs2_dlm_debug_open, + .release = ocfs2_dlm_debug_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +{ + int ret = 0; + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + dlm_debug->d_locking_state = debugfs_create_file("locking_state", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_dlm_debug_fops); + if (!dlm_debug->d_locking_state) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking state debugfs file.\n"); + goto out; + } + + ocfs2_get_dlm_debug(dlm_debug); +out: + return ret; +} + +static void 
ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) +{ + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + if (dlm_debug) { + debugfs_remove(dlm_debug->d_locking_state); + ocfs2_put_dlm_debug(dlm_debug); + } +} + +int ocfs2_dlm_init(struct ocfs2_super *osb) +{ + int status; + u32 dlm_key; + struct dlm_ctxt *dlm; + + mlog_entry_void(); + + status = ocfs2_dlm_init_debug(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* launch vote thread */ + osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d", + osb->osb_id); + if (IS_ERR(osb->vote_task)) { + status = PTR_ERR(osb->vote_task); + osb->vote_task = NULL; + mlog_errno(status); + goto bail; + } + + /* used by the dlm code to make message headers unique, each + * node in this domain must agree on this. */ + dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); + + /* for now, uuid == domain */ + dlm = dlm_register_domain(osb->uuid_str, dlm_key); + if (IS_ERR(dlm)) { + status = PTR_ERR(dlm); + mlog_errno(status); + goto bail; + } + + ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); + ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + + dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); + + osb->dlm = dlm; + + status = 0; +bail: + if (status < 0) { + ocfs2_dlm_shutdown_debug(osb); + if (osb->vote_task) + kthread_stop(osb->vote_task); + } + + mlog_exit(status); + return status; +} + +void ocfs2_dlm_shutdown(struct ocfs2_super *osb) +{ + mlog_entry_void(); + + dlm_unregister_eviction_cb(&osb->osb_eviction_cb); + + ocfs2_drop_osb_locks(osb); + + if (osb->vote_task) { + kthread_stop(osb->vote_task); + osb->vote_task = NULL; + } + + ocfs2_lock_res_free(&osb->osb_super_lockres); + ocfs2_lock_res_free(&osb->osb_rename_lockres); + + dlm_unregister_domain(osb->dlm); + osb->dlm = NULL; + + ocfs2_dlm_shutdown_debug(osb); + + mlog_exit_void(); +} + +static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status) +{ + struct ocfs2_lock_res *lockres = opaque; + unsigned long flags; + + mlog_entry_void(); + + mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, + lockres->l_unlock_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + /* We tried to cancel a convert request, but it was already + * granted. All we want to do here is clear our unlock + * state. The wake_up call done at the bottom is redundant + * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't + * hurt anything anyway */ + if (status == DLM_CANCELGRANT && + lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + mlog(0, "Got cancelgrant for %s\n", lockres->l_name); + + /* We don't clear the busy flag in this case as it + * should have been cleared by the ast which the dlm + * has called. 
+		 */
+		goto complete_unlock;
+	}
+
+	if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "DLM passed status %d for lock %s, "
+		     "unlock_action %d\n", status, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch (lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+complete_unlock:
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
+
+struct drop_lock_cb {
+	ocfs2_pre_drop_cb_t	*drop_func;
+	void			*drop_data;
+};
+
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres,
+			   struct drop_lock_cb *dcb)
+{
+	enum dlm_status status;
+	unsigned long flags;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (dcb)
+		dcb->drop_func(lockres, dcb->drop_data);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successful return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
+}
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the vote thread before we can consider
+ * it safe to drop.
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore.
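+ *
+ * The teardown sequence is therefore (a sketch; ocfs2_drop_osb_locks
+ * below follows exactly this pattern):
+ *
+ *	ocfs2_mark_lockres_freeing(lockres);
+ *	ocfs2_drop_lock(osb, lockres, NULL);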
*/ +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_mask_waiter mw; + unsigned long flags; + + ocfs2_init_mask_waiter(&mw); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres->l_flags |= OCFS2_LOCK_FREEING; + while (lockres->l_flags & OCFS2_LOCK_QUEUED) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "Waiting on lockres %s\n", lockres->l_name); + + status = ocfs2_wait_for_mask(&mw); + if (status) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); +} + +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) +{ + int status; + + mlog_entry_void(); + + ocfs2_mark_lockres_freeing(&osb->osb_super_lockres); + + status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL); + if (status < 0) + mlog_errno(status); + + ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres); + + status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); +} + +static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data) +{ + struct inode *inode = data; + + /* the metadata lock requires a bit more work as we have an + * LVB to worry about. */ + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level == LKM_EXMODE && + !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + __ocfs2_stuff_meta_lvb(inode); +} + +int ocfs2_drop_inode_locks(struct inode *inode) +{ + int status, err; + struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, }; + + mlog_entry_void(); + + /* No need to call ocfs2_mark_lockres_freeing here - + * ocfs2_clear_inode has done it for us. */ + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_data_lockres, + NULL); + if (err < 0) + mlog_errno(err); + + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_meta_lockres, + &meta_dcb); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_rw_lockres, + NULL); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + mlog_exit(status); + return status; +} + +static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + assert_spin_locked(&lockres->l_lock); + + BUG_ON(lockres->l_blocking <= LKM_NLMODE); + + if (lockres->l_level <= new_level) { + mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", + lockres->l_level, new_level); + BUG(); + } + + mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", + lockres->l_name, new_level, lockres->l_blocking); + + lockres->l_action = OCFS2_AST_DOWNCONVERT; + lockres->l_requested = new_level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); +} + +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb) +{ + int ret, dlm_flags = LKM_CONVERT; + enum dlm_status status; + + mlog_entry_void(); + + if (lvb) + dlm_flags |= LKM_VALBLK; + + status = dlmlock(osb->dlm, + new_level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 1); + goto bail; + } + + ret = 0; +bail: + mlog_exit(ret); + return ret; +} + +/* returns 1 when the caller should unlock and call dlmunlock */ +static 
int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + mlog_entry_void(); + mlog(0, "lock %s\n", lockres->l_name); + + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + /* If we're already trying to cancel a lock conversion + * then just drop the spinlock and allow the caller to + * requeue this lock. */ + + mlog(0, "Lockres %s, skip convert\n", lockres->l_name); + return 0; + } + + /* were we in a convert when we got the bast fire? */ + BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && + lockres->l_action != OCFS2_AST_DOWNCONVERT); + /* set things up for the unlockast to know to just + * clear out the ast_action and unset busy, etc. */ + lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), + "lock %s, invalid flags: 0x%lx\n", + lockres->l_name, lockres->l_flags); + + return 1; +} + +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + enum dlm_status status; + + mlog_entry_void(); + mlog(0, "lock %s\n", lockres->l_name); + + ret = 0; + status = dlmunlock(osb->dlm, + &lockres->l_lksb, + LKM_CANCEL, + lockres->l_ops->unlock_ast, + lockres); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmunlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 0); + } + + mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); + + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, + struct ocfs2_lock_res *lockres, + int new_level) +{ + int ret; + + mlog_entry_void(); + + BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + ret = 0; + mlog(0, "lockres %s currently being refreshed -- backing " + "off!\n", lockres->l_name); + } else if (new_level == LKM_PRMODE) + ret = !lockres->l_ex_holders && + ocfs2_inode_fully_checkpointed(inode); + else /* Must be NLMODE we're converting to. */ + ret = !lockres->l_ro_holders && !lockres->l_ex_holders && + ocfs2_inode_fully_checkpointed(inode); + + mlog_exit(ret); + return ret; +} + +static int ocfs2_do_unblock_meta(struct inode *inode, + int *requeue) +{ + int new_level; + int set_lvb = 0; + int ret = 0; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + unsigned long flags; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry_void(); + + spin_lock_irqsave(&lockres->l_lock, flags); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level, + lockres->l_blocking); + + BUG_ON(lockres->l_level != LKM_EXMODE && + lockres->l_level != LKM_PRMODE); + + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + *requeue = 1; + ret = ocfs2_prepare_cancel_convert(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ret) { + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) + mlog_errno(ret); + } + goto leave; + } + + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n", + lockres->l_level, lockres->l_blocking, new_level); + + if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) { + if (lockres->l_level == LKM_EXMODE) + set_lvb = 1; + + /* If the lock hasn't been refreshed yet (rare), then + * our memory inode values are old and we skip + * stuffing the lvb. 
There's no need to actually clear
+		 * out the lvb here as its value is still valid. */
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			if (set_lvb)
+				__ocfs2_stuff_meta_lvb(inode);
+		} else
+			mlog(0, "lockres %s: downconverting stale lock!\n",
+			     lockres->l_name);
+
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+		goto leave;
+	}
+	if (!ocfs2_inode_fully_checkpointed(inode))
+		ocfs2_start_checkpoint(osb);
+
+	*requeue = 1;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = 0;
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	worker(lockres, blocking);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet.
*/ + goto recheck; + } + +downconvert: + *requeue = 0; + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + ocfs2_prepare_downconvert(lockres, new_level); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0); +leave: + mlog_exit(ret); + return ret; +} + +static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct inode *inode; + struct address_space *mapping; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + mapping = inode->i_mapping; + + if (filemap_fdatawrite(mapping)) { + mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!", + OCFS2_I(inode)->ip_blkno); + } + sync_mapping_buffers(mapping); + if (blocking == LKM_EXMODE) { + truncate_inode_pages(mapping, 0); + unmap_mapping_range(mapping, 0, 0, 0); + } else { + /* We only need to wait on the I/O if we're not also + * truncating pages because truncate_inode_pages waits + * for us above. We don't truncate pages if we're + * blocking anything < EXMODE because we want to keep + * them around in that case. */ + filemap_fdatawait(mapping); + } + + mlog_exit_void(); +} + +int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct inode *inode; + struct ocfs2_super *osb; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + osb = OCFS2_SB(inode->i_sb); + + mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + status = ocfs2_generic_unblock_lock(osb, + lockres, + requeue, + ocfs2_data_convert_worker); + if (status < 0) + mlog_errno(status); + + mlog(0, "inode %"MLFu64", requeue = %d\n", + OCFS2_I(inode)->ip_blkno, *requeue); + + mlog_exit(status); + return status; +} + +static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct inode *inode; + + mlog_entry_void(); + + mlog(0, "Unblock lockres %s\n", lockres->l_name); + + inode = ocfs2_lock_res_inode(lockres); + + status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), + lockres, + requeue, + NULL); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + + +int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct inode *inode; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + + mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + status = ocfs2_do_unblock_meta(inode, requeue); + if (status < 0) + mlog_errno(status); + + mlog(0, "inode %"MLFu64", requeue = %d\n", + OCFS2_I(inode)->ip_blkno, *requeue); + + mlog_exit(status); + return status; +} + +/* Generic unblock function for any lockres whose private data is an + * ocfs2_super pointer. */ +static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct ocfs2_super *osb; + + mlog_entry_void(); + + mlog(0, "Unblock lockres %s\n", lockres->l_name); + + osb = ocfs2_lock_res_super(lockres); + + status = ocfs2_generic_unblock_lock(osb, + lockres, + requeue, + NULL); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int status; + int requeue = 0; + unsigned long flags; + + /* Our reference to the lockres in this function can be + * considered valid until we remove the OCFS2_LOCK_QUEUED + * flag. 
*/ + + mlog_entry_void(); + + BUG_ON(!lockres); + BUG_ON(!lockres->l_ops); + BUG_ON(!lockres->l_ops->unblock); + + mlog(0, "lockres %s blocked.\n", lockres->l_name); + + /* Detect whether a lock has been marked as going away while + * the vote thread was processing other things. A lock can + * still be marked with OCFS2_LOCK_FREEING after this check, + * but short circuiting here will still save us some + * performance. */ + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_FREEING) + goto unqueue; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = lockres->l_ops->unblock(lockres, &requeue); + if (status < 0) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); +unqueue: + if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) { + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + } else + ocfs2_schedule_blocked_lock(osb, lockres); + + mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, + requeue ? "yes" : "no"); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog_exit_void(); +} + +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_flags & OCFS2_LOCK_FREEING) { + /* Do not schedule a lock for downconvert when it's on + * the way to destruction - any nodes wanting access + * to the resource will get it soon. */ + mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + return; + } + + lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); + + spin_lock(&osb->vote_task_lock); + if (list_empty(&lockres->l_blocked_list)) { + list_add_tail(&lockres->l_blocked_list, + &osb->blocked_lock_list); + osb->blocked_lock_count++; + } + spin_unlock(&osb->vote_task_lock); + + mlog_exit_void(); +} + +/* This aids in debugging situations where a bad LVB might be involved. */ +void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + mlog(level, "LVB information for %s (called from %s:%u):\n", + lockres->l_name, function, line); + mlog(level, "version: %u, clusters: %u\n", + be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); + mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n", + be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid), + be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode)); + mlog(level, "nlink %u, atime_packed 0x%"MLFx64", " + "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n", + be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed), + be64_to_cpu(lvb->lvb_ictime_packed), + be64_to_cpu(lvb->lvb_imtime_packed)); +} diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h new file mode 100644 index 000000000000..8f2d1db2d9ea --- /dev/null +++ b/fs/ocfs2/dlmglue.h @@ -0,0 +1,111 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef DLMGLUE_H +#define DLMGLUE_H + +#define OCFS2_LVB_VERSION 2 + +struct ocfs2_meta_lvb { + __be32 lvb_version; + __be32 lvb_iclusters; + __be32 lvb_iuid; + __be32 lvb_igid; + __be64 lvb_iatime_packed; + __be64 lvb_ictime_packed; + __be64 lvb_imtime_packed; + __be64 lvb_isize; + __be16 lvb_imode; + __be16 lvb_inlink; + __be32 lvb_reserved[3]; +}; + +/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ +/* don't wait on recovery. */ +#define OCFS2_META_LOCK_RECOVERY (0x01) +/* Instruct the dlm not to queue ourselves on the other node. */ +#define OCFS2_META_LOCK_NOQUEUE (0x02) +/* don't block waiting for the vote thread, instead return -EAGAIN */ +#define OCFS2_LOCK_NONBLOCK (0x04) + +int ocfs2_dlm_init(struct ocfs2_super *osb); +void ocfs2_dlm_shutdown(struct ocfs2_super *osb); +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct inode *inode); +void ocfs2_lock_res_free(struct ocfs2_lock_res *res); +int ocfs2_create_new_inode_locks(struct inode *inode); +int ocfs2_drop_inode_locks(struct inode *inode); +int ocfs2_data_lock_full(struct inode *inode, + int write, + int arg_flags); +#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0) +int ocfs2_data_lock_with_page(struct inode *inode, + int write, + struct page *page); +void ocfs2_data_unlock(struct inode *inode, + int write); +int ocfs2_rw_lock(struct inode *inode, int write); +void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_meta_lock_full(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + int arg_flags); +int ocfs2_meta_lock_with_page(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + struct page *page); +/* 99% of the time we don't want to supply any additional flags -- + * those are for very specific cases only. 
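+ *
+ * A typical call thus looks like this (a sketch; ocfs2_get_parent in
+ * export.c takes a shared meta lock the same way):
+ *
+ *	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ *	if (status < 0)
+ *		return status;
+ *	...the inode metadata is now stable...
+ *	ocfs2_meta_unlock(inode, 0);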
*/ +#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0) +void ocfs2_meta_unlock(struct inode *inode, + int ex); +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex); +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex); +int ocfs2_rename_lock(struct ocfs2_super *osb); +void ocfs2_rename_unlock(struct ocfs2_super *osb); +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); + +/* for the vote thread */ +void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); + +/* aids in debugging and tracking lvbs */ +void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres); +#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) + +#endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h new file mode 100644 index 000000000000..f226b2207628 --- /dev/null +++ b/fs/ocfs2/endian.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ENDIAN_H +#define OCFS2_ENDIAN_H + +static inline void le16_add_cpu(__le16 *var, u16 val) +{ + *var = cpu_to_le16(le16_to_cpu(*var) + val); +} + +static inline void le32_add_cpu(__le32 *var, u32 val) +{ + *var = cpu_to_le32(le32_to_cpu(*var) + val); +} + +static inline void le32_and_cpu(__le32 *var, u32 val) +{ + *var = cpu_to_le32(le32_to_cpu(*var) & val); +} + +static inline void be32_add_cpu(__be32 *var, u32 val) +{ + *var = cpu_to_be32(be32_to_cpu(*var) + val); +} + +#endif /* OCFS2_ENDIAN_H */ diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c new file mode 100644 index 000000000000..5810160d92a8 --- /dev/null +++ b/fs/ocfs2/export.c @@ -0,0 +1,248 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.c + * + * Functions to facilitate NFS exporting + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result;
+
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+
+	if (handle->ih_blkno == 0) {
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		return (void *)inode;
+	}
+
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	result = d_alloc_anon(inode);
+
+	if (!result) {
+		iput(inode);
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mlog_exit_ptr(result);
+	return result;
+}
+
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+bail:
+	mlog_exit_ptr(parent);
+
+	return parent;
+}
+
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+
+		
spin_unlock(&dentry->d_lock); + + len = 6; + type = 2; + + mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n", + blkno, generation); + } + + *max_len = len; + +bail: + mlog_exit(type); + return type; +} + +static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh, + int fh_len, int fileid_type, + int (*acceptable)(void *context, + struct dentry *de), + void *context) +{ + struct ocfs2_inode_handle handle, parent; + struct dentry *ret = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n", + sb, fh, fh_len, fileid_type, acceptable, context); + + if (fh_len < 3 || fileid_type > 2) + goto bail; + + if (fileid_type == 2) { + if (fh_len < 6) + goto bail; + + parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu(fh[4]); + parent.ih_generation = le32_to_cpu(fh[5]); + + mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n", + parent.ih_blkno, parent.ih_generation); + } + + handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu(fh[1]); + handle.ih_generation = le32_to_cpu(fh[2]); + + mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n", + handle.ih_blkno, handle.ih_generation); + + ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent, + acceptable, context); + +bail: + mlog_exit_ptr(ret); + return ret; +} + +struct export_operations ocfs2_export_ops = { + .decode_fh = ocfs2_decode_fh, + .encode_fh = ocfs2_encode_fh, + + .get_parent = ocfs2_get_parent, + .get_dentry = ocfs2_get_dentry, +}; diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h new file mode 100644 index 000000000000..5b77ee7866ef --- /dev/null +++ b/fs/ocfs2/export.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.h + * + * Function prototypes + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_EXPORT_H +#define OCFS2_EXPORT_H + +extern struct export_operations ocfs2_export_ops; + +#endif /* OCFS2_EXPORT_H */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 000000000000..f2fb40cd296a --- /dev/null +++ b/fs/ocfs2/extent_map.c @@ -0,0 +1,994 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.c + * + * In-memory extent map for OCFS2. Man, this code was prettier in + * the library. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_EXTENT_MAP
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "extent_map.h"
+#include "inode.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+
+/*
+ * SUCK SUCK SUCK
+ * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
+ */
+
+struct ocfs2_extent_map_entry {
+	struct rb_node e_node;
+	int e_tree_depth;
+	struct ocfs2_extent_rec e_rec;
+};
+
+struct ocfs2_em_insert_context {
+	int need_left;
+	int need_right;
+	struct ocfs2_extent_map_entry *new_ent;
+	struct ocfs2_extent_map_entry *old_ent;
+	struct ocfs2_extent_map_entry *left_ent;
+	struct ocfs2_extent_map_entry *right_ent;
+};
+
+static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
+
+
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent);
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth);
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent);
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el);
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos, u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent);
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt);
+
+/* returns 1 only if the rec contains all the given clusters -- that is that
+ * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
+ * clusters) is >= the argument's endpoint */
+static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
+					      u32 cpos, u32 clusters)
+{
+	if (le32_to_cpu(rec->e_cpos) > cpos)
+		return 0;
+	if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
+			      le32_to_cpu(rec->e_clusters))
+		return 0;
+	return 1;
+}
+
+
+/*
+ * Find an entry in the tree that intersects the region passed in.
+ * Note that this will find straddled intervals; it is up to the
+ * callers to enforce any boundary conditions.
+ *
+ * Callers must hold ip_lock. This lookup is not guaranteed to return
+ * a tree_depth 0 match, and as such can race inserts if the lock
+ * were not held.
+ *
+ * The rb_node garbage lets insertion share the search. Trivial
+ * callers pass NULL.
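+ *
+ * A trivial lookup (this is all ocfs2_extent_map_lookup_read does on
+ * its fast path) is just:
+ *
+ *	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);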
+ */
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent)
+{
+	struct rb_node **p = &em->em_extents.rb_node;
+	struct rb_node *parent = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	while (*p)
+	{
+		parent = *p;
+		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
+			       e_node);
+		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
+			p = &(*p)->rb_left;
+			ent = NULL;
+		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
+				    le32_to_cpu(ent->e_rec.e_clusters))) {
+			p = &(*p)->rb_right;
+			ent = NULL;
+		} else
+			break;
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+	return ent;
+}
+
+/*
+ * Find the leaf containing the interval we want. While we're on our
+ * way down the tree, fill in every record we see at any depth, because
+ * we might want it later.
+ *
+ * Note that this code is run without ip_lock. That's because it
+ * sleeps while reading. If someone is also filling the extent list at
+ * the same time we are, we might have to restart.
+ */
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el)
+{
+	int i, ret;
+	struct buffer_head *eb_bh = NULL;
+	u64 blkno;
+	u32 rec_end;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * The bh data containing the el cannot change here, because
+	 * we hold alloc_sem. So we can do this without other
+	 * locks.
+	 */
+	while (el->l_tree_depth)
+	{
+		blkno = 0;
+		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+			rec = &el->l_recs[i];
+			rec_end = (le32_to_cpu(rec->e_cpos) +
+				   le32_to_cpu(rec->e_clusters));
+
+			ret = -EBADR;
+			if (rec_end > OCFS2_I(inode)->ip_clusters) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			if (rec_end <= cpos) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+
+			/*
+			 * We've found a record that matches our
+			 * interval. We don't insert it because we're
+			 * about to traverse it.
+			 */
+
+			/* Check to see if we're straddling */
+			ret = -ESRCH;
+			if (!ocfs2_extent_rec_contains_clusters(rec,
+								cpos,
+								clusters)) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			/*
+			 * If we've already found a record, the el has
+			 * two records covering the same interval.
+			 * EEEK!
+ */ + ret = -EBADR; + if (blkno) { + mlog_errno(ret); + goto out_free; + } + + blkno = le64_to_cpu(rec->e_blkno); + } + + /* + * We don't support holes, and we're still up + * in the branches, so we'd better have found someone + */ + ret = -EBADR; + if (!blkno) { + mlog_errno(ret); + goto out_free; + } + + if (eb_bh) { + brelse(eb_bh); + eb_bh = NULL; + } + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + blkno, &eb_bh, OCFS2_BH_CACHED, + inode); + if (ret) { + mlog_errno(ret); + goto out_free; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out_free; + } + el = &eb->h_list; + } + + if (el->l_tree_depth) + BUG(); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret) { + mlog_errno(ret); + goto out_free; + } + } + + ret = 0; + +out_free: + if (eb_bh) + brelse(eb_bh); + + return ret; +} + +/* + * This lookup actually will read from disk. It has one invariant: + * It will never re-traverse blocks. This means that all inserts should + * be new regions or more granular regions (both allowed by insert). + */ +static int ocfs2_extent_map_lookup_read(struct inode *inode, + u32 cpos, + u32 clusters, + struct ocfs2_extent_map_entry **ret_ent) +{ + int ret; + u64 blkno; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + struct buffer_head *bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_dinode *di; + struct ocfs2_extent_list *el; + + spin_lock(&OCFS2_I(inode)->ip_lock); + ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); + if (ent) { + if (!ent->e_tree_depth) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + *ret_ent = ent; + return 0; + } + blkno = le64_to_cpu(ent->e_rec.e_blkno); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + if (bh) + brelse(bh); + return ret; + } + eb = (struct ocfs2_extent_block *)bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + brelse(bh); + return -EIO; + } + el = &eb->h_list; + } else { + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + if (bh) + brelse(bh); + return ret; + } + di = (struct ocfs2_dinode *)bh->b_data; + if (!OCFS2_IS_VALID_DINODE(di)) { + brelse(bh); + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); + return -EIO; + } + el = &di->id2.i_list; + } + + ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); + brelse(bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); + if (!ent) { + ret = -ESRCH; + mlog_errno(ret); + return ret; + } + + if (ent->e_tree_depth) + BUG(); /* FIXME: Make sure this isn't a corruption */ + + *ret_ent = ent; + + return 0; +} + +/* + * Callers must hold ip_lock. This can insert pieces of the tree, + * thus racing lookup if the lock weren't held. 
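+ *
+ * That is, callers do (as ocfs2_extent_map_try_insert below does):
+ *
+ *	spin_lock(&OCFS2_I(inode)->ip_lock);
+ *	ret = ocfs2_extent_map_insert_entry(em, ent);
+ *	spin_unlock(&OCFS2_I(inode)->ip_lock);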
+ */ +static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, + struct ocfs2_extent_map_entry *ent) +{ + struct rb_node **p, *parent; + struct ocfs2_extent_map_entry *old_ent; + + old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), + le32_to_cpu(ent->e_rec.e_clusters), + &p, &parent); + if (old_ent) + return -EEXIST; + + rb_link_node(&ent->e_node, parent, p); + rb_insert_color(&ent->e_node, &em->em_extents); + + return 0; +} + + +/* + * Simple rule: on any return code other than -EAGAIN, anything left + * in the insert_context will be freed. + */ +static int ocfs2_extent_map_try_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth, + struct ocfs2_em_insert_context *ctxt) +{ + int ret; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *old_ent; + + ctxt->need_left = 0; + ctxt->need_right = 0; + ctxt->old_ent = NULL; + + spin_lock(&OCFS2_I(inode)->ip_lock); + ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + if (!ret) { + ctxt->new_ent = NULL; + goto out_unlock; + } + + old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), + le32_to_cpu(rec->e_clusters), NULL, + NULL); + + if (!old_ent) + BUG(); + + ret = -EEXIST; + if (old_ent->e_tree_depth < tree_depth) + goto out_unlock; + + if (old_ent->e_tree_depth == tree_depth) { + if (!memcmp(rec, &old_ent->e_rec, + sizeof(struct ocfs2_extent_rec))) + ret = 0; + + /* FIXME: Should this be ESRCH/EBADR??? */ + goto out_unlock; + } + + /* + * We do it in this order specifically so that no actual tree + * changes occur until we have all the pieces we need. We + * don't want malloc failures to leave an inconsistent tree. + * Whenever we drop the lock, another process could be + * inserting. Also note that, if another process just beat us + * to an insert, we might not need the same pieces we needed + * the first go round. In the end, the pieces we need will + * be used, and the pieces we don't will be freed. 
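+ * + * The caller's retry loop is roughly (see ocfs2_extent_map_insert + * below): + * + * do { + * allocate any missing left/right entries (GFP_KERNEL); + * ret = ocfs2_extent_map_try_insert(inode, rec, + * tree_depth, &ctxt); + * } while (ret == -EAGAIN);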
+ */ + ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > + le32_to_cpu(old_ent->e_rec.e_cpos)); + ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + + le32_to_cpu(old_ent->e_rec.e_clusters)) > + (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); + ret = -EAGAIN; + if (ctxt->need_left) { + if (!ctxt->left_ent) + goto out_unlock; + *(ctxt->left_ent) = *old_ent; + ctxt->left_ent->e_rec.e_clusters = + cpu_to_le32(le32_to_cpu(rec->e_cpos) - + le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); + } + if (ctxt->need_right) { + if (!ctxt->right_ent) + goto out_unlock; + *(ctxt->right_ent) = *old_ent; + ctxt->right_ent->e_rec.e_cpos = + cpu_to_le32(le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)); + ctxt->right_ent->e_rec.e_clusters = + cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + + le32_to_cpu(old_ent->e_rec.e_clusters)) - + le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); + } + + rb_erase(&old_ent->e_node, &em->em_extents); + /* Now that he's erased, set him up for deletion */ + ctxt->old_ent = old_ent; + + if (ctxt->need_left) { + ret = ocfs2_extent_map_insert_entry(em, + ctxt->left_ent); + if (ret) + goto out_unlock; + ctxt->left_ent = NULL; + } + + if (ctxt->need_right) { + ret = ocfs2_extent_map_insert_entry(em, + ctxt->right_ent); + if (ret) + goto out_unlock; + ctxt->right_ent = NULL; + } + + ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + + if (!ret) + ctxt->new_ent = NULL; + +out_unlock: + spin_unlock(&OCFS2_I(inode)->ip_lock); + + return ret; +} + + +static int ocfs2_extent_map_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth) +{ + int ret; + struct ocfs2_em_insert_context ctxt = {0, }; + + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > + OCFS2_I(inode)->ip_map.em_clusters) { + ret = -EBADR; + mlog_errno(ret); + return ret; + } + + /* Zero e_clusters means a truncated tail record. It better be EOF */ + if (!rec->e_clusters) { + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != + OCFS2_I(inode)->ip_map.em_clusters) { + ret = -EBADR; + mlog_errno(ret); + return ret; + } + + /* Ignore the truncated tail */ + return 0; + } + + ret = -ENOMEM; + ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.new_ent) { + mlog_errno(ret); + return ret; + } + + ctxt.new_ent->e_rec = *rec; + ctxt.new_ent->e_tree_depth = tree_depth; + + do { + ret = -ENOMEM; + if (ctxt.need_left && !ctxt.left_ent) { + ctxt.left_ent = + kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.left_ent) + break; + } + if (ctxt.need_right && !ctxt.right_ent) { + ctxt.right_ent = + kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.right_ent) + break; + } + + ret = ocfs2_extent_map_try_insert(inode, rec, + tree_depth, &ctxt); + } while (ret == -EAGAIN); + + if (ret < 0) + mlog_errno(ret); + + if (ctxt.left_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); + if (ctxt.right_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); + if (ctxt.old_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); + if (ctxt.new_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + + return ret; +} + +/* + * Append this record to the tail of the extent map. It must be + * tree_depth 0. The record might be an extension of an existing + * record, and as such that needs to be handled. 
eg: + * + * Existing record in the extent map: + * + * cpos = 10, len = 10 + * |---------| + * + * New Record: + * + * cpos = 10, len = 20 + * |------------------| + * + * The passed record is the new on-disk record. The new_clusters value + * is how many clusters were added to the file. If the append is a + * contiguous append, the new_clusters has been added to + * rec->e_clusters. If the append is an entirely new extent, then + * rec->e_clusters is == new_clusters. + */ +int ocfs2_extent_map_append(struct inode *inode, + struct ocfs2_extent_rec *rec, + u32 new_clusters) +{ + int ret; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + struct ocfs2_extent_rec *old; + + BUG_ON(!new_clusters); + BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + + if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) != + (em->em_clusters + new_clusters), + "Inode %"MLFu64":\n" + "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" + "em->em_clusters = %u + new_clusters = %u = %u\n", + OCFS2_I(inode)->ip_blkno, + le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), + le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), + em->em_clusters, new_clusters, + em->em_clusters + new_clusters); + + em->em_clusters += new_clusters; + + ret = -ENOENT; + if (le32_to_cpu(rec->e_clusters) > new_clusters) { + /* This is a contiguous append */ + ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, + NULL, NULL); + if (ent) { + old = &ent->e_rec; + BUG_ON((le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) != + (le32_to_cpu(old->e_cpos) + + le32_to_cpu(old->e_clusters) + + new_clusters)); + if (ent->e_tree_depth == 0) { + BUG_ON(le32_to_cpu(old->e_cpos) != + le32_to_cpu(rec->e_cpos)); + BUG_ON(le64_to_cpu(old->e_blkno) != + le64_to_cpu(rec->e_blkno)); + ret = 0; + } + /* + * Let non-leafs fall through as -ENOENT to + * force insertion of the new leaf. + */ + le32_add_cpu(&old->e_clusters, new_clusters); + } + } + + if (ret == -ENOENT) + ret = ocfs2_extent_map_insert(inode, rec, 0); + if (ret < 0) + mlog_errno(ret); + return ret; +} + +#if 0 +/* Code here is included but defined out as it completes the extent + * map api and may be used in the future. */ + +/* + * Look up the record containing this cluster offset. This record is + * part of the extent map. Do not free it. Any changes you make to + * it will reflect in the extent map. So, if your last extent + * is (cpos = 10, clusters = 10) and you truncate the file by 5 + * clusters, you can do: + * + * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); + * rec->e_clusters -= 5; + * + * The lookup does not read from disk. If the map isn't filled in for + * an entry, you won't find it. + * + * Also note that the returned record is valid until alloc_sem is + * dropped. After that, truncate and extend can happen. Caveat Emptor. 
+ */ +int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, + struct ocfs2_extent_rec **rec, + int *tree_depth) +{ + int ret = -ENOENT; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + *rec = NULL; + + if (cpos >= OCFS2_I(inode)->ip_clusters) + return -EINVAL; + + if (cpos >= em->em_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters ; + } + + ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, + NULL, NULL); + + if (ent) { + *rec = &ent->e_rec; + if (tree_depth) + *tree_depth = ent->e_tree_depth; + ret = 0; + } + + return ret; +} + +int ocfs2_extent_map_get_clusters(struct inode *inode, + u32 v_cpos, int count, + u32 *p_cpos, int *ret_count) +{ + int ret; + u32 coff, ccount; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent = NULL; + + *p_cpos = ccount = 0; + + if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) + return -EINVAL; + + if ((v_cpos + count) > em->em_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + + ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); + if (ret) + return ret; + + if (ent) { + /* We should never find ourselves straddling an interval */ + if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, + v_cpos, + count)) + return -ESRCH; + + coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); + *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(ent->e_rec.e_blkno)) + + coff; + + if (ret_count) + *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; + + return 0; + } + + + return -ENOENT; +} + +#endif /* 0 */ + +int ocfs2_extent_map_get_blocks(struct inode *inode, + u64 v_blkno, int count, + u64 *p_blkno, int *ret_count) +{ + int ret; + u64 boff; + u32 cpos, clusters; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + struct ocfs2_extent_map_entry *ent = NULL; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_rec *rec; + + *p_blkno = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); + clusters = ocfs2_blocks_to_clusters(inode->i_sb, + (u64)count + bpc - 1); + if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } + + if ((cpos + clusters) > em->em_clusters) { + /* + * Size changed underneath us on disk. 
Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ent) + { + rec = &ent->e_rec; + + /* We should never find ourselves straddling an interval */ + if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { + ret = -ESRCH; + mlog_errno(ret); + return ret; + } + + boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - + le32_to_cpu(rec->e_cpos)); + boff += (v_blkno & (u64)(bpc - 1)); + *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(rec->e_clusters)) - boff; + } + + return 0; + } + + return -ENOENT; +} + +int ocfs2_extent_map_init(struct inode *inode) +{ + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + + em->em_extents = RB_ROOT; + em->em_clusters = 0; + + return 0; +} + +/* Needs the lock */ +static void __ocfs2_extent_map_drop(struct inode *inode, + u32 new_clusters, + struct rb_node **free_head, + struct ocfs2_extent_map_entry **tail_ent) +{ + struct rb_node *node, *next; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + *free_head = NULL; + + ent = NULL; + node = rb_last(&em->em_extents); + while (node) + { + next = rb_prev(node); + + ent = rb_entry(node, struct ocfs2_extent_map_entry, + e_node); + if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) + break; + + rb_erase(&ent->e_node, &em->em_extents); + + node->rb_right = *free_head; + *free_head = node; + + ent = NULL; + node = next; + } + + /* Do we have an entry straddling new_clusters? */ + if (tail_ent) { + if (ent && + ((le32_to_cpu(ent->e_rec.e_cpos) + + le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) + *tail_ent = ent; + else + *tail_ent = NULL; + } +} + +static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) +{ + struct rb_node *node; + struct ocfs2_extent_map_entry *ent; + + while (free_head) { + node = free_head; + free_head = node->rb_right; + + ent = rb_entry(node, struct ocfs2_extent_map_entry, + e_node); + kmem_cache_free(ocfs2_em_ent_cachep, ent); + } +} + +/* + * Remove all entries past new_clusters, inclusive of an entry that + * contains new_clusters. This is effectively a cache forget. + * + * If you want to also clip the last extent by some number of clusters, + * you need to call ocfs2_extent_map_trunc(). + * This code does not check or modify ip_clusters. + */ +int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +{ + struct rb_node *free_head = NULL; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + spin_lock(&OCFS2_I(inode)->ip_lock); + + __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + + if (ent) { + rb_erase(&ent->e_node, &em->em_extents); + ent->e_node.rb_right = free_head; + free_head = &ent->e_node; + } + + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (free_head) + __ocfs2_extent_map_drop_cleanup(free_head); + + return 0; +} + +/* + * Remove all entries past new_clusters and also clip any extent + * straddling new_clusters, if there is one. 
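+ * For example, if the last cached extent is (cpos = 10, clusters = 10) + * and new_clusters is 15, that extent is clipped to (cpos = 10, + * clusters = 5).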
This does not check + * or modify ip_clusters + */ +int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) +{ + struct rb_node *free_head = NULL; + struct ocfs2_extent_map_entry *ent = NULL; + + spin_lock(&OCFS2_I(inode)->ip_lock); + + __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + + if (ent) + ent->e_rec.e_clusters = cpu_to_le32(new_clusters - + le32_to_cpu(ent->e_rec.e_cpos)); + + OCFS2_I(inode)->ip_map.em_clusters = new_clusters; + + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (free_head) + __ocfs2_extent_map_drop_cleanup(free_head); + + return 0; +} + +int __init init_ocfs2_extent_maps(void) +{ + ocfs2_em_ent_cachep = + kmem_cache_create("ocfs2_em_ent", + sizeof(struct ocfs2_extent_map_entry), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ocfs2_em_ent_cachep) + return -ENOMEM; + + return 0; +} + +void __exit exit_ocfs2_extent_maps(void) +{ + kmem_cache_destroy(ocfs2_em_ent_cachep); +} diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 000000000000..fa3745efa886 --- /dev/null +++ b/fs/ocfs2/extent_map.h @@ -0,0 +1,46 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.h + * + * In-memory file extent mappings for OCFS2. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _EXTENT_MAP_H +#define _EXTENT_MAP_H + +int init_ocfs2_extent_maps(void); +void exit_ocfs2_extent_maps(void); + +/* + * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem + * to be held. The allocation cannot change at all while the map is + * in the process of being updated. + */ +int ocfs2_extent_map_init(struct inode *inode); +int ocfs2_extent_map_append(struct inode *inode, + struct ocfs2_extent_rec *rec, + u32 new_clusters); +int ocfs2_extent_map_get_blocks(struct inode *inode, + u64 v_blkno, int count, + u64 *p_blkno, int *ret_count); +int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); +int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); + +#endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 000000000000..72ae9e3306f4 --- /dev/null +++ b/fs/ocfs2/file.c @@ -0,0 +1,1237 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c + * + * File open, close, extend, truncate + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_INODE +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "mmap.h" +#include "suballoc.h" +#include "super.h" + +#include "buffer_head_io.h" + +static int ocfs2_sync_inode(struct inode *inode) +{ + filemap_fdatawrite(inode->i_mapping); + return sync_mapping_buffers(inode->i_mapping); +} + +static int ocfs2_file_open(struct inode *inode, struct file *file) +{ + int status; + int mode = file->f_flags; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, + file->f_dentry->d_name.len, file->f_dentry->d_name.name); + + spin_lock(&oi->ip_lock); + + /* Check that the inode hasn't been wiped from disk by another + * node. If it hasn't then we're safe as long as we hold the + * spin lock until our increment of open count. */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&oi->ip_lock); + + status = -ENOENT; + goto leave; + } + + if (mode & O_DIRECT) + oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; + + oi->ip_open_count++; + spin_unlock(&oi->ip_lock); + status = 0; +leave: + mlog_exit(status); + return status; +} + +static int ocfs2_file_release(struct inode *inode, struct file *file) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, + file->f_dentry->d_name.len, + file->f_dentry->d_name.name); + + spin_lock(&oi->ip_lock); + if (!--oi->ip_open_count) + oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + spin_unlock(&oi->ip_lock); + + mlog_exit(0); + + return 0; +} + +static int ocfs2_sync_file(struct file *file, + struct dentry *dentry, + int datasync) +{ + int err = 0; + journal_t *journal; + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, + dentry->d_name.len, dentry->d_name.name); + + err = ocfs2_sync_inode(dentry->d_inode); + if (err) + goto bail; + + journal = osb->journal->j_journal; + err = journal_force_commit(journal); + +bail: + mlog_exit(err); + + return (err < 0) ? 
-EIO : 0; +} + +int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + + mlog_entry_void(); + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_journal_handle *handle = NULL; + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_set_inode_size(handle, inode, di_bh, + new_i_size); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(handle); +out: + return ret; +} + +static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + struct ocfs2_journal_handle *handle; + + mlog_entry_void(); + + /* TODO: This needs to actually orphan the inode in this + * transaction. */ + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(handle); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_truncate_context *tc = NULL; + + mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64")\n", + OCFS2_I(inode)->ip_blkno, new_i_size); + + truncate_inode_pages(inode->i_mapping, new_i_size); + + fe = (struct ocfs2_dinode *) di_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), + "Inode %"MLFu64", inode i_size = %lld != di " + "i_size = %"MLFu64", i_flags = 0x%x\n", + OCFS2_I(inode)->ip_blkno, + i_size_read(inode), + le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); + + if (new_i_size > le64_to_cpu(fe->i_size)) { + mlog(0, "asked to truncate file with size (%"MLFu64") " + "to size (%"MLFu64")!\n", + le64_to_cpu(fe->i_size), new_i_size); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", + le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); + + /* let's handle the simple truncate cases before doing any more + * cluster locking. */ + if (new_i_size == le64_to_cpu(fe->i_size)) + goto bail; + + if (le32_to_cpu(fe->i_clusters) == + ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { + mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", + fe->i_clusters); + /* No allocation change is required, so let's fast path + * this truncate.
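+ * Only i_size, i_blocks and the timestamps change, all within a + * single OCFS2_INODE_UPDATE_CREDITS transaction.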
*/ + status = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (status < 0) + mlog_errno(status); + goto bail; + } + + /* This forces other nodes to sync and drop their pages */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + + /* alright, we're going to need to do a full blown alloc size + * change. Orphan the inode so that recovery can complete the + * truncate if necessary. This does the task of marking + * i_size. */ + status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_commit_truncate(osb, inode, di_bh, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: orphan dir cleanup here. */ +bail: + + mlog_exit(status); + return status; +} + +/* + * extend allocation only here. + * we'll update all the disk stuff, and oip->alloc_size + * + * expect stuff to be locked, a transaction started and enough data / + * metadata reservations in the contexts. + * + * Will return -EAGAIN, and a reason if a restart is needed. + * If passed in, *reason will always be set, even in error. + */ +int ocfs2_do_extend_allocation(struct ocfs2_super *osb, + struct inode *inode, + u32 clusters_to_add, + struct buffer_head *fe_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + + BUG_ON(!clusters_to_add); + + free_extents = ocfs2_num_free_extents(osb, inode, fe); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. 
*/ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(fe))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = ocfs2_claim_clusters(osb, handle, data_ac, 1, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", + num_bits, bit_off, OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, + num_bits, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + le32_add_cpu(&fe->i_clusters, num_bits); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, clusters = %u, wanted = " + "%u\n", fe->i_clusters, clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: + mlog_exit(status); + if (reason_ret) + *reason_ret = reason; + return status; +} + +static int ocfs2_extend_allocation(struct inode *inode, + u32 clusters_to_add) +{ + int status = 0; + int restart_func = 0; + int drop_alloc_sem = 0; + int credits, num_free_extents; + u32 prev_clusters; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto leave; + } + +restart_all: + BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); + + mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " + "clusters_to_add = %u\n", + OCFS2_I(inode)->ip_blkno, i_size_read(inode), + fe->i_clusters, clusters_to_add); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + num_free_extents = ocfs2_num_free_extents(osb, + inode, + fe); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto leave; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, + handle, + fe, + &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + } + + status = ocfs2_reserve_clusters(osb, + handle, + clusters_to_add, + &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + /* blocks people in read/write from
reading our allocation + * until we're done changing it. We depend on i_sem to block + * other extend/truncate calls while we're here. Ordering wrt + * start_trans is important here -- always do it before! */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + drop_alloc_sem = 1; + + credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + /* reserve a write to the file entry early on - that way if we + * run out of credits in the allocation path, we can still + * update i_size. */ + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = OCFS2_I(inode)->ip_clusters; + + status = ocfs2_do_extend_allocation(osb, + inode, + clusters_to_add, + bh, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + fe, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + + mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", + fe->i_clusters, fe->i_size); + mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", + OCFS2_I(inode)->ip_clusters, i_size_read(inode)); + +leave: + if (drop_alloc_sem) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); + drop_alloc_sem = 0; + } + if (handle) { + ocfs2_commit_trans(handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + if (bh) { + brelse(bh); + bh = NULL; + } + + mlog_exit(status); + return status; +} + +/* Some parts of this taken from generic_cont_expand, which turned out + * to be too fragile to do exactly what we need without us having to + * worry about recursive locking in ->commit_write(). */ +static int ocfs2_write_zero_page(struct inode *inode, + u64 size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index; + unsigned int offset; + struct ocfs2_journal_handle *handle = NULL; + int ret; + + offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare.
make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(mapping, index); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_write(NULL, page, offset, offset); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + if (ocfs2_should_order_data(inode)) { + handle = ocfs2_start_walk_page_trans(inode, page, offset, + offset); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out_unlock; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, offset, offset); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + + if (handle) + ocfs2_commit_trans(handle); +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +static int ocfs2_zero_extend(struct inode *inode, + u64 zero_to_size) +{ + int ret = 0; + u64 start_off; + struct super_block *sb = inode->i_sb; + + start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + while (start_off < zero_to_size) { + ret = ocfs2_write_zero_page(inode, start_off); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + start_off += sb->s_blocksize; + } + +out: + return ret; +} + +static int ocfs2_extend_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret = 0; + u32 clusters_to_add; + + /* setattr sometimes calls us like this. */ + if (new_i_size == 0) + goto out; + + if (i_size_read(inode) == new_i_size) + goto out; + BUG_ON(new_i_size < i_size_read(inode)); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - + OCFS2_I(inode)->ip_clusters; + + if (clusters_to_add) { + ret = ocfs2_extend_allocation(inode, clusters_to_add); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_zero_extend(inode, new_i_size); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* No allocation required, we just use this helper to + * do a trivial update of i_size. 
*/ + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + return ret; +} + +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + int status = 0, size_change; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + + mlog_entry("(0x%p, '%.*s')\n", dentry, + dentry->d_name.len, dentry->d_name.name); + + if (attr->ia_valid & ATTR_MODE) + mlog(0, "mode change: %d\n", attr->ia_mode); + if (attr->ia_valid & ATTR_UID) + mlog(0, "uid change: %d\n", attr->ia_uid); + if (attr->ia_valid & ATTR_GID) + mlog(0, "gid change: %d\n", attr->ia_gid); + if (attr->ia_valid & ATTR_SIZE) + mlog(0, "size change...\n"); + if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) + mlog(0, "time change...\n"); + +#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ + | ATTR_GID | ATTR_UID | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { + mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); + return 0; + } + + status = inode_change_ok(inode, attr); + if (status) + return status; + + size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; + if (size_change) { + status = ocfs2_rw_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_meta_lock(inode, NULL, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail_unlock_rw; + } + + if (size_change && attr->ia_size != i_size_read(inode)) { + if (i_size_read(inode) > attr->ia_size) + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + else + status = ocfs2_extend_file(inode, bh, attr->ia_size); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + status = -ENOSPC; + goto bail_unlock; + } + } + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + status = inode_setattr(inode, attr); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(handle); +bail_unlock: + ocfs2_meta_unlock(inode, 1); +bail_unlock_rw: + if (size_change) + ocfs2_rw_unlock(inode, 1); +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +int ocfs2_getattr(struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_inode->i_sb; + struct ocfs2_super *osb = sb->s_fs_info; + int err; + + mlog_entry_void(); + + err = ocfs2_inode_revalidate(dentry); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + generic_fillattr(inode, stat); + + /* We set the blksize from the cluster size for performance */ + stat->blksize = osb->s_clustersize; + +bail: + mlog_exit(err); + + return err; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_journal_handle *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + + mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, + inode->i_mode); + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + 
mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + if (ret < 0) { + mlog_errno(ret); + goto out_trans; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_bh; + } + + inode->i_mode &= ~S_ISUID; + if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) + inode->i_mode &= ~S_ISGID; + + di = (struct ocfs2_dinode *) bh->b_data; + di->i_mode = cpu_to_le16(inode->i_mode); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) + mlog_errno(ret); +out_bh: + brelse(bh); +out_trans: + ocfs2_commit_trans(handle); +out: + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_write_should_remove_suid(struct inode *inode) +{ + mode_t mode = inode->i_mode; + + if (!capable(CAP_FSETID)) { + if (unlikely(mode & S_ISUID)) + return 1; + + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + return 1; + } + return 0; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; + u32 clusters; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; + loff_t newsize, saved_pos; +#ifdef OCFS2_ORACORE_WORKAROUNDS + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +#endif + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, + (unsigned int)count, + filp->f_dentry->d_name.len, + filp->f_dentry->d_name.name); + + /* happy write of zero bytes */ + if (count == 0) + return 0; + + if (!inode) { + mlog(0, "bad inode\n"); + return -EIO; + } + +#ifdef OCFS2_ORACORE_WORKAROUNDS + /* ugh, work around some applications which open everything O_DIRECT + + * O_APPEND and really don't mean to use O_DIRECT. */ + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && + (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) + filp->f_flags &= ~O_DIRECT; +#endif + + down(&inode->i_sem); + /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */ + if (filp->f_flags & O_DIRECT) { + have_alloc_sem = 1; + down_read(&inode->i_alloc_sem); + } + + /* concurrent O_DIRECT writes are allowed */ + rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + rw_level = -1; + mlog_errno(ret); + goto out; + } + + /* + * We sample i_size under a read level meta lock to see if our write + * is extending the file, if it is we back off and get a write level + * meta lock. + */ + meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; + for(;;) { + ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); + if (ret < 0) { + meta_level = -1; + mlog_errno(ret); + goto out; + } + + /* Clear suid / sgid if necessary. We do this here + * instead of later in the write path because + * remove_suid() calls ->setattr without any hint that + * we may have already done our cluster locking. Since + * ocfs2_setattr() *must* take cluster locks to + * proceed, this will lead us to recursively lock the + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write.)
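+ * So we clear suid/sgid ourselves, under our own cluster locks, + * instead of going through remove_suid().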
*/ + if (ocfs2_write_should_remove_suid(inode)) { + if (meta_level == 0) { + ocfs2_meta_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + ret = ocfs2_write_remove_suid(inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* work on a copy of ppos until we're sure that we won't have + * to recalculate it due to relocking. */ + if (filp->f_flags & O_APPEND) { + saved_pos = i_size_read(inode); + mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); + } else { + saved_pos = iocb->ki_pos; + } + newsize = count + saved_pos; + + mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", + saved_pos, newsize, i_size_read(inode)); + + /* No need for a higher level metadata lock if we're + * never going past i_size. */ + if (newsize <= i_size_read(inode)) + break; + + if (meta_level == 0) { + ocfs2_meta_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - + OCFS2_I(inode)->ip_clusters; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + mlog(0, "Writing at EOF, may need more allocation: " + "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", + i_size_read(inode), newsize, clusters); + + /* We only want to continue the rest of this loop if + * our extend will actually require more + * allocation. */ + if (!clusters) + break; + + ret = ocfs2_extend_allocation(inode, clusters); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* Fill any holes which would've been created by this + * write. If we're O_APPEND, this will wind up + * (correctly) being a noop. */ + ret = ocfs2_zero_extend(inode, (u64) newsize - count); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + break; + } + + /* ok, we're done with i_size and alloc work */ + iocb->ki_pos = saved_pos; + ocfs2_meta_unlock(inode, meta_level); + meta_level = -1; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb); + +#ifdef OCFS2_ORACORE_WORKAROUNDS + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && + filp->f_flags & O_DIRECT) { + unsigned int saved_flags = filp->f_flags; + int sector_size = 1 << osb->s_sectsize_bits; + + if ((saved_pos & (sector_size - 1)) || + (count & (sector_size - 1)) || + ((unsigned long)buf & (sector_size - 1))) { + filp->f_flags |= O_SYNC; + filp->f_flags &= ~O_DIRECT; + } + + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + + filp->f_flags = saved_flags; + } else +#endif + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. (it's the clustered equivalent of + * i_alloc_sem; protects truncate from racing with pending ios). + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. 
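+ * Hence the check below: rw_level and have_alloc_sem are only + * cleared here when the completion path owns, or has already + * dropped, the locks.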
+ */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +out: + if (meta_level != -1) + ocfs2_meta_unlock(inode, meta_level); + if (have_alloc_sem) + up_read(&inode->i_alloc_sem); + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + up(&inode->i_sem); + + mlog_exit(ret); + return ret; +} + +static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + int ret = 0, rw_level = -1, have_alloc_sem = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; +#ifdef OCFS2_ORACORE_WORKAROUNDS + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +#endif + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, + (unsigned int)count, + filp->f_dentry->d_name.len, + filp->f_dentry->d_name.name); + + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto bail; + } + +#ifdef OCFS2_ORACORE_WORKAROUNDS + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { + if (filp->f_flags & O_DIRECT) { + int sector_size = 1 << osb->s_sectsize_bits; + + if ((pos & (sector_size - 1)) || + (count & (sector_size - 1)) || + ((unsigned long)buf & (sector_size - 1)) || + (i_size_read(inode) & (sector_size -1))) { + filp->f_flags &= ~O_DIRECT; + } + } + } +#endif + + /* + * buffered reads protect themselves in ->readpage(). O_DIRECT reads + * need locks to protect pending reads from racing with truncate. + */ + if (filp->f_flags & O_DIRECT) { + down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; + + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + rw_level = 0; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb); + } + + ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); + if (ret == -EINVAL) + mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* see ocfs2_file_aio_write */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +bail: + if (have_alloc_sem) + up_read(&inode->i_alloc_sem); + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + mlog_exit(ret); + + return ret; +} + +struct inode_operations ocfs2_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; + +struct inode_operations ocfs2_special_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; + +struct file_operations ocfs2_fops = { + .read = do_sync_read, + .write = do_sync_write, + .sendfile = generic_file_sendfile, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, +}; + +struct file_operations ocfs2_dops = { + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, +}; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h new file mode 100644 index 000000000000..a5ea33b24060 --- /dev/null +++ b/fs/ocfs2/file.h @@ -0,0 +1,57 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_FILE_H +#define OCFS2_FILE_H + +extern struct file_operations ocfs2_fops; +extern struct file_operations ocfs2_dops; +extern struct inode_operations ocfs2_file_iops; +extern struct inode_operations ocfs2_special_file_iops; +struct ocfs2_alloc_context; + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_do_extend_allocation(struct ocfs2_super *osb, + struct inode *inode, + u32 clusters_to_add, + struct buffer_head *fe_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason); +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); +int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); + +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 000000000000..0bbd22f46c80 --- /dev/null +++ b/fs/ocfs2/heartbeat.c @@ -0,0 +1,378 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.c + * + * Register ourselves with the heartbeat service, keep our node maps + * up to date, and fire off recovery when needed. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA.
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define MLOG_MASK_PREFIX ML_SUPER +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_HB_NODE_DOWN_PRI (0x0000002) +#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit); +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit); +static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); +static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, + struct ocfs2_node_map *from); +static void __ocfs2_node_map_set(struct ocfs2_node_map *target, + struct ocfs2_node_map *from); + +void ocfs2_init_node_maps(struct ocfs2_super *osb) +{ + spin_lock_init(&osb->node_map_lock); + ocfs2_node_map_init(&osb->mounted_map); + ocfs2_node_map_init(&osb->recovery_map); + ocfs2_node_map_init(&osb->umount_map); +} + +static void ocfs2_do_node_down(int node_num, + struct ocfs2_super *osb) +{ + BUG_ON(osb->node_num == node_num); + + mlog(0, "ocfs2: node down event for %d\n", node_num); + + if (!osb->dlm) { + /* + * No DLM means we're not even ready to participate yet. + * We check the slots after the DLM comes up, so we will + * notice the node death then. We can safely ignore it + * here. + */ + return; + } + + if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) { + /* If a node is in the umount map, then we've been + * expecting him to go down and we know ahead of time + * that recovery is not necessary. */ + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); + return; + } + + ocfs2_recovery_thread(osb, node_num); + + ocfs2_remove_node_from_vote_queues(osb, node_num); +} + +static void ocfs2_hb_node_down_cb(struct o2nm_node *node, + int node_num, + void *data) +{ + ocfs2_do_node_down(node_num, (struct ocfs2_super *) data); +} + +/* Called from the dlm when it's about to evict a node. We may also + * get a heartbeat callback later. */ +static void ocfs2_dlm_eviction_cb(int node_num, + void *data) +{ + struct ocfs2_super *osb = (struct ocfs2_super *) data; + struct super_block *sb = osb->sb; + + mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", + MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); + + ocfs2_do_node_down(node_num, osb); +} + +static void ocfs2_hb_node_up_cb(struct o2nm_node *node, + int node_num, + void *data) +{ + struct ocfs2_super *osb = data; + + BUG_ON(osb->node_num == node_num); + + mlog(0, "node up event for %d\n", node_num); + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); +} + +void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) +{ + o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB, + ocfs2_hb_node_down_cb, osb, + OCFS2_HB_NODE_DOWN_PRI); + + o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB, + ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI); + + /* Not exactly a heartbeat callback, but leads to essentially + * the same path so we set it up here. */ + dlm_setup_eviction_cb(&osb->osb_eviction_cb, + ocfs2_dlm_eviction_cb, + osb); +} + +/* Most functions here are just stubs for now... 
*/ +int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) +{ + int status; + + status = o2hb_register_callback(&osb->osb_hb_down); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = o2hb_register_callback(&osb->osb_hb_up); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) +{ + int status; + + status = o2hb_unregister_callback(&osb->osb_hb_down); + if (status < 0) + mlog_errno(status); + + status = o2hb_unregister_callback(&osb->osb_hb_up); + if (status < 0) + mlog_errno(status); +} + +void ocfs2_stop_heartbeat(struct ocfs2_super *osb) +{ + int ret; + char *argv[5], *envp[3]; + + if (!osb->uuid_str) { + /* This can happen if we don't get far enough in mount... */ + mlog(0, "No UUID with which to stop heartbeat!\n\n"); + return; + } + + argv[0] = (char *)o2nm_get_hb_ctl_path(); + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = osb->uuid_str; + argv[4] = NULL; + + mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, 1); + if (ret < 0) + mlog_errno(ret); +} + +/* special case -1 for now + * TODO: should *really* make sure the calling func never passes -1!! */ +void ocfs2_node_map_init(struct ocfs2_node_map *map) +{ + map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; + memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * + sizeof(unsigned long)); +} + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit) +{ + set_bit(bit, map->map); +} + +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_set_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit) +{ + clear_bit(bit, map->map); +} + +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_clear_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + int ret; + if (bit >= map->num_nodes) { + mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); + BUG(); + } + spin_lock(&osb->node_map_lock); + ret = test_bit(bit, map->map); + spin_unlock(&osb->node_map_lock); + return ret; +} + +static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) +{ + int bit; + bit = find_next_bit(map->map, map->num_nodes, 0); + if (bit < map->num_nodes) + return 0; + return 1; +} + +int ocfs2_node_map_is_empty(struct ocfs2_super *osb, + struct ocfs2_node_map *map) +{ + int ret; + BUG_ON(map->num_nodes == 0); + spin_lock(&osb->node_map_lock); + ret = __ocfs2_node_map_is_empty(map); + spin_unlock(&osb->node_map_lock); + return ret; +} + +static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, + struct ocfs2_node_map *from) +{ + BUG_ON(from->num_nodes == 0); + ocfs2_node_map_init(target); + __ocfs2_node_map_set(target, from); +} + +/* returns 1 if bit is the only bit set in target, 0 otherwise */ +int ocfs2_node_map_is_only(struct ocfs2_super *osb, + struct ocfs2_node_map *target, + int bit) +{ + struct 
ocfs2_node_map temp; + int ret; + + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_dup(&temp, target); + __ocfs2_node_map_clear_bit(&temp, bit); + ret = __ocfs2_node_map_is_empty(&temp); + spin_unlock(&osb->node_map_lock); + + return ret; +} + +static void __ocfs2_node_map_set(struct ocfs2_node_map *target, + struct ocfs2_node_map *from) +{ + int num_longs, i; + + BUG_ON(target->num_nodes != from->num_nodes); + BUG_ON(target->num_nodes == 0); + + num_longs = BITS_TO_LONGS(target->num_nodes); + for (i = 0; i < num_longs; i++) + target->map[i] = from->map[i]; +} + +/* Returns whether the recovery bit was actually set - it may not be + * if a node is still marked as needing recovery */ +int ocfs2_recovery_map_set(struct ocfs2_super *osb, + int num) +{ + int set = 0; + + spin_lock(&osb->node_map_lock); + + __ocfs2_node_map_clear_bit(&osb->mounted_map, num); + + if (!test_bit(num, osb->recovery_map.map)) { + __ocfs2_node_map_set_bit(&osb->recovery_map, num); + set = 1; + } + + spin_unlock(&osb->node_map_lock); + + return set; +} + +void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + int num) +{ + ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); +} + +int ocfs2_node_map_iterate(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int idx) +{ + int i = idx; + + idx = O2NM_INVALID_NODE_NUM; + spin_lock(&osb->node_map_lock); + if ((i != O2NM_INVALID_NODE_NUM) && + (i >= 0) && + (i < map->num_nodes)) { + while(i < map->num_nodes) { + if (test_bit(i, map->map)) { + idx = i; + break; + } + i++; + } + } + spin_unlock(&osb->node_map_lock); + return idx; +} diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 000000000000..e8fb079122e4 --- /dev/null +++ b/fs/ocfs2/heartbeat.h @@ -0,0 +1,67 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_HEARTBEAT_H +#define OCFS2_HEARTBEAT_H + +void ocfs2_init_node_maps(struct ocfs2_super *osb); + +void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); +int ocfs2_register_hb_callbacks(struct ocfs2_super *osb); +void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb); +void ocfs2_stop_heartbeat(struct ocfs2_super *osb); + +/* node map functions - used to keep track of mounted and in-recovery + * nodes. 
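
The node map helpers above all reduce to one pattern: a fixed-size bitmap whose readers and writers go through osb->node_map_lock. A rough userspace sketch of that pattern (plain C; the names, and the pthread mutex standing in for the kernel spinlock, are invented for illustration):

/* Minimal sketch of the ocfs2_node_map pattern: a fixed-size bitmap
 * whose updates are serialized by a lock. Illustrative only. */
#include <limits.h>
#include <pthread.h>
#include <stdio.h>

#define MAX_NODES 255
#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))
#define LONGS_FOR(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct node_map {
	pthread_mutex_t lock;	/* stands in for osb->node_map_lock */
	unsigned long map[LONGS_FOR(MAX_NODES)];
};

static void node_map_set(struct node_map *m, int bit)
{
	pthread_mutex_lock(&m->lock);
	m->map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
	pthread_mutex_unlock(&m->lock);
}

static int node_map_test(struct node_map *m, int bit)
{
	int ret;

	pthread_mutex_lock(&m->lock);
	ret = !!(m->map[bit / BITS_PER_LONG] &
		 (1UL << (bit % BITS_PER_LONG)));
	pthread_mutex_unlock(&m->lock);
	return ret;
}

int main(void)
{
	struct node_map m = { .lock = PTHREAD_MUTEX_INITIALIZER };

	node_map_set(&m, 3);
	printf("node 3 mounted: %d\n", node_map_test(&m, 3));
	return 0;
}
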
*/ +void ocfs2_node_map_init(struct ocfs2_node_map *map); +int ocfs2_node_map_is_empty(struct ocfs2_super *osb, + struct ocfs2_node_map *map); +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_iterate(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int idx); +static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map) +{ + return ocfs2_node_map_iterate(osb, map, 0); +} +int ocfs2_recovery_map_set(struct ocfs2_super *osb, + int num); +void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + int num); +/* returns 1 if bit is the only bit set in target, 0 otherwise */ +int ocfs2_node_map_is_only(struct ocfs2_super *osb, + struct ocfs2_node_map *target, + int bit); + +#endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 000000000000..a91ba4dec936 --- /dev/null +++ b/fs/ocfs2/inode.c @@ -0,0 +1,1140 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c + * + * vfs' aops, fops, dops and iops + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> + +#include <asm/byteorder.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_FI_FLAG_NOWAIT 0x1 +#define OCFS2_FI_FLAG_DELETE 0x2 +struct ocfs2_find_inode_args +{ + u64 fi_blkno; + unsigned long fi_ino; + unsigned int fi_flags; +}; + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args); +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); +static int ocfs2_find_actor(struct inode *inode, void *opaque); +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh); + +struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, + u64 blkno, + int delete_vote) +{ + struct ocfs2_find_inode_args args; + + /* ocfs2_ilookup_for_vote should *only* be called from the + * vote thread */ + BUG_ON(current != osb->vote_task); + + args.fi_blkno = blkno; + args.fi_flags = OCFS2_FI_FLAG_NOWAIT; + if (delete_vote) + args.fi_flags |= OCFS2_FI_FLAG_DELETE; + args.fi_ino = ino_from_blkno(osb->sb, blkno); + return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); +} + +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) +{ + struct inode *inode = NULL; + struct super_block *sb = osb->sb; + struct ocfs2_find_inode_args args; + + mlog_entry("(blkno = %"MLFu64")\n", blkno); + + /* Ok. By now we've either got the offsets passed to us by the + * caller, or we just pulled them off the bh. Let's do some + * sanity checks to make sure they're OK. */ + if (blkno == 0) { + inode = ERR_PTR(-EINVAL); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + + inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, + ocfs2_init_locked_inode, &args); + /* inode was *not* in the inode cache. 2.6.x requires + * us to do our own read_inode call and unlock it + * afterwards. */ + if (inode && inode->i_state & I_NEW) { + mlog(0, "Inode was not in inode cache, reading it.\n"); + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + +bail: + if (!IS_ERR(inode)) { + mlog(0, "returning inode with number %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno); + mlog_exit_ptr(inode); + } else + mlog_errno(PTR_ERR(inode)); + + return inode; +} + + +/* + * here's how inodes get read from disk: + * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR + * found? : return the in-memory inode + * not found?
: get_new_inode -> OCFS2_INIT_LOCKED_INODE + */ + +static int ocfs2_find_actor(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); + + args = opaque; + + mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + + if (oi->ip_blkno != args->fi_blkno) + goto bail; + + /* OCFS2_FI_FLAG_NOWAIT is *only* set from + * ocfs2_ilookup_for_vote which won't create an inode for one + * that isn't found. The vote thread which doesn't want to get + * an inode which is in the process of going away - otherwise + * the call to __wait_on_freeing_inode in find_inode_fast will + * cause it to deadlock on an inode which may be waiting on a + * vote (or lock release) in delete_inode */ + if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && + (inode->i_state & (I_FREEING|I_CLEAR))) { + /* As stated above, we're not going to return an + * inode. In the case of a delete vote, the voting + * code is going to signal the other node to go + * ahead. Mark that state here, so this freeing inode + * has the state when it gets to delete_inode. */ + if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { + spin_lock(&oi->ip_lock); + ocfs2_mark_inode_remotely_deleted(inode); + spin_unlock(&oi->ip_lock); + } + goto bail; + } + + ret = 1; +bail: + mlog_exit(ret); + return ret; +} + +/* + * initialize the new inode, but don't do anything that would cause + * us to sleep. + * return 0 on success, 1 on failure + */ +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = opaque; + + mlog_entry("inode = %p, opaque = %p\n", inode, opaque); + + inode->i_ino = args->fi_ino; + OCFS2_I(inode)->ip_blkno = args->fi_blkno; + + mlog_exit(0); + return 0; +} + +int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) +{ + struct super_block *sb; + struct ocfs2_super *osb; + int status = -EINVAL; + + mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size); + + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + /* this means that read_inode cannot create a superblock inode + * today. change if needed. */ + if (!OCFS2_IS_VALID_DINODE(fe) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", " + "signature = %.*s, flags = 0x%x\n", + inode->i_ino, le64_to_cpu(fe->i_blkno), 7, + fe->i_signature, le32_to_cpu(fe->i_flags)); + goto bail; + } + + if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { + mlog(ML_ERROR, "file entry generation does not match " + "superblock! osb->fs_generation=%x, " + "fe->i_fs_generation=%x\n", + osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); + goto bail; + } + + inode->i_version = 1; + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_blksize = (u32)osb->s_clustersize; + + /* Fast symlinks will have i_size but no allocated clusters. 
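
The lookup flow described above is the standard iget5_locked() idiom: a match callback decides whether a cached inode corresponds to the search key, and an init callback seeds a freshly created one. A compact userspace model of that shape (all names invented; this is not the kernel API):

/* Userspace model of the iget5_locked() lookup-or-create idiom:
 * search a cache with a caller-supplied match callback, and run an
 * init callback only when a new entry is created. */
#include <stdio.h>
#include <stdlib.h>

struct cached_inode {
	unsigned long long blkno;
	int is_new;
	struct cached_inode *next;
};

static struct cached_inode *cache;

static struct cached_inode *lookup_or_create(
		int (*test)(struct cached_inode *, void *),
		int (*init)(struct cached_inode *, void *),
		void *data)
{
	struct cached_inode *ci;

	for (ci = cache; ci; ci = ci->next)
		if (test(ci, data))
			return ci;		/* found: return cached copy */

	ci = calloc(1, sizeof(*ci));		/* not found: create it */
	if (!ci)
		return NULL;
	if (init(ci, data)) {
		free(ci);
		return NULL;
	}
	ci->is_new = 1;				/* caller must "read" it in */
	ci->next = cache;
	cache = ci;
	return ci;
}

static int test_blkno(struct cached_inode *ci, void *data)
{
	return ci->blkno == *(unsigned long long *)data;
}

static int init_blkno(struct cached_inode *ci, void *data)
{
	ci->blkno = *(unsigned long long *)data;
	return 0;
}

int main(void)
{
	unsigned long long key = 42;
	struct cached_inode *ci = lookup_or_create(test_blkno,
						   init_blkno, &key);

	if (ci)
		printf("blkno=%llu new=%d\n", ci->blkno, ci->is_new);
	return 0;
}
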
*/ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = + ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); + inode->i_mapping->a_ops = &ocfs2_aops; + inode->i_flags |= S_NOATIME; + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) + mlog(ML_ERROR, + "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n", + OCFS2_I(inode)->ip_blkno, fe->i_blkno); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; + + if (create_ino) + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n", + fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); + + inode->i_nlink = le16_to_cpu(fe->i_links_count); + + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); + } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { + mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); + /* we can't actually hit this as read_inode can't + * handle superblocks today ;-) */ + BUG(); + } + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &ocfs2_fops; + inode->i_op = &ocfs2_file_iops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFDIR: + inode->i_op = &ocfs2_dir_iops; + inode->i_fop = &ocfs2_dops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFLNK: + if (ocfs2_inode_is_fast_symlink(inode)) + inode->i_op = &ocfs2_fast_symlink_inode_operations; + else + inode->i_op = &ocfs2_symlink_inode_operations; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + default: + inode->i_op = &ocfs2_special_file_iops; + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + break; + } + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, + OCFS2_LOCK_TYPE_RW, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + OCFS2_LOCK_TYPE_META, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, + OCFS2_LOCK_TYPE_DATA, inode); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args) +{ + struct super_block *sb; + struct ocfs2_super *osb; + struct ocfs2_dinode *fe; + struct buffer_head *bh = NULL; + int status; + int sysfile = 0; + + mlog_entry("(0x%p, 0x%p)\n", inode, args); + + status = -EINVAL; + if (inode == NULL || inode->i_sb == NULL) { + mlog(ML_ERROR, "bad inode\n"); + goto bail; + } + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if (!args) { + mlog(ML_ERROR, "bad inode args\n"); + make_bad_inode(inode); + goto bail; + } + + /* Read the FE off disk. This is safe because the kernel only + * does one read_inode2 for a new inode, and if it doesn't + * exist yet then nobody can be working on it! 
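
The S_IFMT switch above is how per-type file operations get chosen. The same dispatch works verbatim in userspace with <sys/stat.h>; a minimal, self-contained illustration:

/* Mask the mode with S_IFMT (or use the S_IS*() macros) to pick
 * per-type behavior, exactly as ocfs2_populate_inode() does. */
#include <stdio.h>
#include <sys/stat.h>

static const char *mode_kind(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:	return "regular file";
	case S_IFDIR:	return "directory";
	case S_IFLNK:	return "symlink";
	default:	return "special file";	/* chr/blk/fifo/sock */
	}
}

int main(void)
{
	struct stat st;

	if (stat(".", &st) == 0)
		printf(". is a %s\n", mode_kind(st.st_mode));
	return 0;
}
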
*/ + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); + if (status < 0) { + mlog_errno(status); + make_bad_inode(inode); + goto bail; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", + fe->i_blkno, 7, fe->i_signature); + make_bad_inode(inode); + goto bail; + } + + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) + sysfile = 1; + + if (S_ISCHR(le16_to_cpu(fe->i_mode)) || + S_ISBLK(le16_to_cpu(fe->i_mode))) + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + + status = -EINVAL; + if (ocfs2_populate_inode(inode, fe, 0) < 0) { + mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", " + "i_ino=%lu\n", fe->i_blkno, inode->i_ino); + make_bad_inode(inode); + goto bail; + } + + BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + + if (sysfile) + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + + status = 0; + +bail: + if (args && bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +void ocfs2_sync_blockdev(struct super_block *sb) +{ + sync_blockdev(sb->s_bdev); +} + +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh) +{ + int status = 0; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_truncate_context *tc = NULL; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* zero allocation, zero truncate :) */ + if (!fe->i_clusters) + goto bail; + + handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_commit_trans(handle); + handle = NULL; + + status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } +bail: + if (handle) + ocfs2_commit_trans(handle); + + mlog_exit(status); + return status; +} + +static int ocfs2_remove_inode(struct inode *inode, + struct buffer_head *di_bh, + struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh) +{ + int status; + struct inode *inode_alloc_inode = NULL; + struct buffer_head *inode_alloc_bh = NULL; + struct ocfs2_journal_handle *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + le16_to_cpu(di->i_suballoc_slot)); + if (!inode_alloc_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + down(&inode_alloc_inode->i_sem); + status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); + if (status < 0) { + up(&inode_alloc_inode->i_sem); + + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + /* set the inodes dtime */ + status = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + 
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + + status = ocfs2_journal_dirty(handle, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + ocfs2_remove_from_cache(inode, di_bh); + + status = ocfs2_free_dinode(handle, inode_alloc_inode, + inode_alloc_bh, di); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(handle); +bail_unlock: + ocfs2_meta_unlock(inode_alloc_inode, 1); + up(&inode_alloc_inode->i_sem); + brelse(inode_alloc_bh); +bail: + iput(inode_alloc_inode); + + return status; +} + +static int ocfs2_wipe_inode(struct inode *inode, + struct buffer_head *di_bh) +{ + int status, orphaned_slot; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We've already voted on this so it should be readonly - no + * spinlock needed. */ + orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + down(&orphan_dir_inode->i_sem); + status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); + if (status < 0) { + up(&orphan_dir_inode->i_sem); + + mlog_errno(status); + goto bail; + } + + /* we do this while holding the orphan dir lock because we + * don't want recovery being run from another node to vote for + * an inode delete on us -- this will result in two nodes + * truncating the same file! */ + status = ocfs2_truncate_for_delete(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, + orphan_dir_bh); + if (status < 0) + mlog_errno(status); + +bail_unlock_dir: + ocfs2_meta_unlock(orphan_dir_inode, 1); + up(&orphan_dir_inode->i_sem); + brelse(orphan_dir_bh); +bail: + iput(orphan_dir_inode); + + return status; +} + +/* There is a series of simple checks that should be done before a + * vote is even considered. Encapsulate those in this function. */ +static int ocfs2_inode_is_valid_to_delete(struct inode *inode) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We shouldn't be getting here for the root directory + * inode. */ + if (inode == osb->root_inode) { + mlog(ML_ERROR, "Skipping delete of root inode.\n"); + goto bail; + } + + /* If we're coming from process_vote we can't go into our own + * voting [hello, deadlock city!], so unfortunately we just + * have to skip deleting this guy. That's OK though because + * the node who's doing the actual deleting should handle it + * anyway. */ + if (current == osb->vote_task) { + mlog(0, "Skipping delete of %lu because we're currently " + "in process_vote\n", inode->i_ino); + goto bail; + } + + spin_lock(&oi->ip_lock); + /* OCFS2 *never* deletes system files. We should technically + * never get here as system file inodes should always have a + * positive link count.
*/ + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n", + oi->ip_blkno); + goto bail_unlock; + } + + /* If we have voted "yes" on the wipe of this inode for + * another node, it will be marked here so we can safely skip + * it. Recovery will clean up any inodes we might inadvertently + * skip here. */ + if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { + mlog(0, "Skipping delete of %lu because another node " + "has done this for us.\n", inode->i_ino); + goto bail_unlock; + } + + ret = 1; +bail_unlock: + spin_unlock(&oi->ip_lock); +bail: + return ret; +} + +/* Query the cluster to determine whether we should wipe an inode from + * disk or not. + * + * Requires the inode to have the cluster lock. */ +static int ocfs2_query_inode_wipe(struct inode *inode, + struct buffer_head *di_bh, + int *wipe) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di; + + *wipe = 0; + + /* While we were waiting for the cluster lock in + * ocfs2_delete_inode, another node might have asked to delete + * the inode. Recheck our flags to catch this. */ + if (!ocfs2_inode_is_valid_to_delete(inode)) { + mlog(0, "Skipping delete of %"MLFu64" because flags changed\n", + oi->ip_blkno); + goto bail; + } + + /* Now that we have an up to date inode, we can double check + * the link count. */ + if (inode->i_nlink) { + mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n", + oi->ip_blkno, inode->i_nlink); + goto bail; + } + + /* Do some basic inode verification... */ + di = (struct ocfs2_dinode *) di_bh->b_data; + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { + /* for lack of a better error? */ + status = -EEXIST; + mlog(ML_ERROR, + "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! " + "Disk flags 0x%x, inode flags 0x%x\n", + oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags); + goto bail; + } + + /* has someone already deleted us?! baaad... */ + if (di->i_dtime) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + status = ocfs2_request_delete_vote(inode); + /* -EBUSY means that other nodes are still using the + * inode. We're done here though, so avoid doing anything on + * disk and let them worry about deleting it. */ + if (status == -EBUSY) { + status = 0; + mlog(0, "Skipping delete of %"MLFu64" because it is in use on " + "other nodes\n", oi->ip_blkno); + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + spin_lock(&oi->ip_lock); + if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { + /* Nobody knew which slot this inode was orphaned + * into. This may happen during node death and + * recovery knows how to clean it up so we can safely + * ignore this inode from now on. */ + mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n", + oi->ip_blkno); + } else { + *wipe = 1; + + mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n", + oi->ip_blkno, oi->ip_orphaned_slot); + } + spin_unlock(&oi->ip_lock); + +bail: + return status; +} + +/* Support function for ocfs2_delete_inode. Will help us keep the + * inode data in a consistent state for clear_inode. Always truncates + * pages, optionally syncs them first.
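
ocfs2_query_inode_wipe() above is essentially a veto ladder: any single check can cancel the wipe, and only when every check passes is the on-disk removal attempted. Condensed into a pure function (the struct and field names below are invented for illustration):

/* Condensed sketch of the "should we wipe?" ladder. */
#include <stdio.h>

struct wipe_state {
	int valid_to_delete;	/* ocfs2_inode_is_valid_to_delete() result */
	unsigned int nlink;	/* up-to-date link count */
	int disk_orphaned;	/* OCFS2_ORPHANED_FL set on disk */
	long long dtime;	/* non-zero: someone already deleted it */
	int cluster_busy;	/* delete vote came back -EBUSY */
};

static int should_wipe(const struct wipe_state *s)
{
	if (!s->valid_to_delete)
		return 0;	/* flags changed under us */
	if (s->nlink)
		return 0;	/* still linked somewhere */
	if (!s->disk_orphaned)
		return 0;	/* never fully orphaned */
	if (s->dtime)
		return 0;	/* already wiped elsewhere */
	if (s->cluster_busy)
		return 0;	/* another node still uses it */
	return 1;
}

int main(void)
{
	struct wipe_state s = { 1, 0, 1, 0, 0 };

	printf("wipe? %d\n", should_wipe(&s));
	return 0;
}
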
*/ +static void ocfs2_cleanup_delete_inode(struct inode *inode, + int sync_data) +{ + mlog(0, "Cleanup inode %"MLFu64", sync = %d\n", + OCFS2_I(inode)->ip_blkno, sync_data); + if (sync_data) + write_inode_now(inode, 1); + truncate_inode_pages(&inode->i_data, 0); +} + +void ocfs2_delete_inode(struct inode *inode) +{ + int wipe, status; + sigset_t blocked, oldset; + struct buffer_head *di_bh = NULL; + + mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + + if (is_bad_inode(inode)) { + mlog(0, "Skipping delete of bad inode\n"); + goto bail; + } + + if (!ocfs2_inode_is_valid_to_delete(inode)) { + /* It's probably not necessary to truncate_inode_pages + * here but we do it for safety anyway (it will most + * likely be a no-op anyway) */ + ocfs2_cleanup_delete_inode(inode, 0); + goto bail; + } + + /* We want to block signals in delete_inode as the lock and + * messaging paths may return us -ERESTARTSYS. Which would + * cause us to exit early, resulting in inodes being orphaned + * forever. */ + sigfillset(&blocked); + status = sigprocmask(SIG_BLOCK, &blocked, &oldset); + if (status < 0) { + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 1); + goto bail; + } + + /* Lock down the inode. This gives us an up to date view of + * it's metadata (for verification), and allows us to + * serialize delete_inode votes. + * + * Even though we might be doing a truncate, we don't take the + * allocation lock here as it won't be needed - nobody will + * have the file open. + */ + status = ocfs2_meta_lock(inode, NULL, &di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } + + /* Query the cluster. This will be the final decision made + * before we go ahead and wipe the inode. */ + status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); + if (!wipe || status < 0) { + /* Error and inode busy vote both mean we won't be + * removing the inode, so they take almost the same + * path. */ + if (status < 0) + mlog_errno(status); + + /* Someone in the cluster has voted to not wipe this + * inode, or it was never completely orphaned. Write + * out the pages and exit now. */ + ocfs2_cleanup_delete_inode(inode, 1); + goto bail_unlock_inode; + } + + ocfs2_cleanup_delete_inode(inode, 0); + + status = ocfs2_wipe_inode(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + /* Mark the inode as successfully deleted. This is important + * for ocfs2_clear_inode as it will check this flag and skip + * any checkpointing work */ + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; + +bail_unlock_inode: + ocfs2_meta_unlock(inode, 1); + brelse(di_bh); +bail_unblock: + status = sigprocmask(SIG_SETMASK, &oldset, NULL); + if (status < 0) + mlog_errno(status); +bail: + clear_inode(inode); + mlog_exit_void(); +} + +void ocfs2_clear_inode(struct inode *inode) +{ + int status; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry_void(); + + if (!inode) + goto bail; + + mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n", + OCFS2_I(inode)->ip_blkno, inode->i_nlink); + + mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + "Inode=%lu\n", inode->i_ino); + + /* Do these before all the other work so that we don't bounce + * the vote thread while waiting to destroy the locks. 
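
The sigfillset()/sigprocmask() pair used in ocfs2_delete_inode() above is a general technique for keeping signal interruptions (and hence -ERESTARTSYS) out of a region that must run to completion. The userspace calls behave the same way:

/* Block every signal across a critical region, then restore the old
 * mask - the same dance ocfs2_delete_inode() does in the kernel. */
#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t blocked, oldset;

	sigfillset(&blocked);
	if (sigprocmask(SIG_BLOCK, &blocked, &oldset) < 0) {
		perror("sigprocmask");
		return 1;
	}

	/* ... critical section: no signal delivery here ... */
	printf("signals blocked\n");

	if (sigprocmask(SIG_SETMASK, &oldset, NULL) < 0)
		perror("sigprocmask restore");
	return 0;
}
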
*/ + ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + + /* We very well may get a clear_inode before all of an inode's + * metadata has hit disk. Of course, we can't drop any cluster + * locks until the journal has finished with it. The only + * exceptions here are successfully wiped inodes - their + * metadata can now be considered to be part of the system + * inodes from which they came. */ + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + ocfs2_checkpoint_inode(inode); + + mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), + "Clear inode of %"MLFu64", inode has io markers\n", + oi->ip_blkno); + + ocfs2_extent_map_drop(inode, 0); + ocfs2_extent_map_init(inode); + + status = ocfs2_drop_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + ocfs2_lock_res_free(&oi->ip_rw_lockres); + ocfs2_lock_res_free(&oi->ip_meta_lockres); + ocfs2_lock_res_free(&oi->ip_data_lockres); + + ocfs2_metadata_cache_purge(inode); + + mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, + "Clear inode of %"MLFu64", inode has %u cache items\n", + oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); + + mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + "Clear inode of %"MLFu64", inode has a bad flag\n", + oi->ip_blkno); + + mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), + "Clear inode of %"MLFu64", inode is locked\n", + oi->ip_blkno); + + mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), + "Clear inode of %"MLFu64", io_sem is locked\n", + oi->ip_blkno); + up(&oi->ip_io_sem); + + /* + * down_trylock() returns 0, down_write_trylock() returns 1 + * kernel 1, world 0 + */ + mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), + "Clear inode of %"MLFu64", alloc_sem is locked\n", + oi->ip_blkno); + up_write(&oi->ip_alloc_sem); + + mlog_bug_on_msg(oi->ip_open_count, + "Clear inode of %"MLFu64" has open count %d\n", + oi->ip_blkno, oi->ip_open_count); + mlog_bug_on_msg(!list_empty(&oi->ip_handle_list), + "Clear inode of %"MLFu64" has non empty handle list\n", + oi->ip_blkno); + mlog_bug_on_msg(oi->ip_handle, + "Clear inode of %"MLFu64" has non empty handle pointer\n", + oi->ip_blkno); + + /* Clear all other flags. */ + oi->ip_flags = OCFS2_INODE_CACHE_INLINE; + oi->ip_created_trans = 0; + oi->ip_last_trans = 0; + oi->ip_dir_start_lookup = 0; + oi->ip_blkno = 0ULL; + +bail: + mlog_exit_void(); +} + +/* Called under inode_lock, with no more references on the + * struct inode, so it's safe here to check the flags field + * and to manipulate i_nlink without any other locks. */ +void ocfs2_drop_inode(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry_void(); + + mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n", + oi->ip_blkno, inode->i_nlink, oi->ip_flags); + + /* Testing ip_orphaned_slot here wouldn't work because we may + * not have gotten a delete_inode vote from any other nodes + * yet. */ + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { + mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); + inode->i_nlink = 0; + } + + generic_drop_inode(inode); + + mlog_exit_void(); +} + +/* + * TODO: this should probably be merged into ocfs2_get_block + * + * However, you now need to pay attention to the cont_prepare_write() + * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much + * expects never to extend).
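
Earlier, in ocfs2_clear_inode(), a comment flags a real trap: kernel trylock primitives disagree about return conventions (down_trylock() gives 0 on success, down_write_trylock() gives 1). POSIX adds a third convention worth keeping straight - pthread_mutex_trylock() returns 0 on success and EBUSY on contention:

/* POSIX trylock convention: 0 means the lock was acquired. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	int rc = pthread_mutex_trylock(&m);

	if (rc == 0) {			/* 0 == got the lock */
		printf("lock acquired\n");
		pthread_mutex_unlock(&m);
	} else if (rc == EBUSY) {
		printf("lock busy\n");
	}
	return 0;
}
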
+ */ +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada) +{ + struct buffer_head *bh = NULL; + int tmperr; + u64 p_blkno; + int readflags = OCFS2_BH_CACHED; + +#if 0 + /* only turn this on if we know we can deal with read_block + * returning nothing */ + if (reada) + readflags |= OCFS2_BH_READAHEAD; +#endif + + if (((u64)block << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!reada); + return NULL; + } + + tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, + &p_blkno, NULL); + if (tmperr < 0) { + mlog_errno(tmperr); + goto fail; + } + + tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, + readflags, inode); + if (tmperr < 0) + goto fail; + + tmperr = 0; + + *err = 0; + return bh; + +fail: + if (bh) { + brelse(bh); + bh = NULL; + } + *err = -EIO; + return NULL; +} + +/* + * This is called from our getattr. + */ +int ocfs2_inode_revalidate(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int status = 0; + + mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode, + inode ? OCFS2_I(inode)->ip_blkno : 0ULL); + + if (!inode) { + mlog(0, "eep, no inode!\n"); + status = -ENOENT; + goto bail; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + mlog(0, "inode deleted!\n"); + status = -ENOENT; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Let ocfs2_meta_lock do the work of updating our struct + * inode for us. */ + status = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + ocfs2_meta_unlock(inode, 0); +bail: + mlog_exit(status); + + return status; +} + +/* + * Updates a disk inode from a + * struct inode. + * Only takes ip_lock. + */ +int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh) +{ + int status; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + + mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno); + + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + fe->i_size = cpu_to_le64(i_size_read(inode)); + fe->i_links_count = cpu_to_le16(inode->i_nlink); + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) + mlog_errno(status); + + status = 0; +leave: + + mlog_exit(status); + return status; +} + +/* + * + * Updates a struct inode from a disk inode. + * does no i/o, only takes ip_lock. 
+ */ +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + spin_lock(&OCFS2_I(inode)->ip_lock); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + i_size_write(inode, le64_to_cpu(fe->i_size)); + inode->i_nlink = le16_to_cpu(fe->i_links_count); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_blksize = (u32) osb->s_clustersize; + if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + spin_unlock(&OCFS2_I(inode)->ip_lock); +} diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h new file mode 100644 index 000000000000..9b0177433653 --- /dev/null +++ b/fs/ocfs2/inode.h @@ -0,0 +1,145 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_INODE_H +#define OCFS2_INODE_H + +/* OCFS2 Inode Private Data */ +struct ocfs2_inode_info +{ + u64 ip_blkno; + + struct ocfs2_lock_res ip_rw_lockres; + struct ocfs2_lock_res ip_meta_lockres; + struct ocfs2_lock_res ip_data_lockres; + + /* protects allocation changes on this inode. */ + struct rw_semaphore ip_alloc_sem; + + /* These fields are protected by ip_lock */ + spinlock_t ip_lock; + u32 ip_open_count; + u32 ip_clusters; + struct ocfs2_extent_map ip_map; + struct list_head ip_io_markers; + int ip_orphaned_slot; + + struct semaphore ip_io_sem; + + /* Used by the journalling code to attach an inode to a + * handle. These are protected by ip_io_sem in order to lock + * out other I/O to the inode until we either commit or + * abort. */ + struct list_head ip_handle_list; + struct ocfs2_journal_handle *ip_handle; + + u32 ip_flags; /* see below */ + + /* protected by recovery_lock. */ + struct inode *ip_next_orphan; + + u32 ip_dir_start_lookup; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ip_created_trans; + /* last transaction we were a part of. 
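
ocfs2_refresh_inode() above is one long little-endian-to-CPU copy. Userspace code doing the same job can use the le*toh()/htole*() helpers from <endian.h> (assuming a glibc-style libc; they are no-ops on little-endian hosts and byte swaps on big-endian ones, the same job le64_to_cpu() does in the kernel):

/* Converting little-endian on-disk fields to host order. The struct
 * here is invented for illustration. */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct disk_inode {		/* fields stored little-endian on disk */
	uint64_t i_size_le;
	uint32_t i_uid_le;
};

int main(void)
{
	struct disk_inode di = { htole64(4096), htole32(1000) };

	printf("size=%llu uid=%u\n",
	       (unsigned long long)le64toh(di.i_size_le),
	       (unsigned)le32toh(di.i_uid_le));
	return 0;
}
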
*/ + unsigned long ip_last_trans; + + struct ocfs2_caching_info ip_metadata_cache; + + struct inode vfs_inode; +}; + +/* + * Flags for the ip_flags field + */ +/* System file inodes */ +#define OCFS2_INODE_SYSTEM_FILE 0x00000001 +#define OCFS2_INODE_JOURNAL 0x00000002 +#define OCFS2_INODE_BITMAP 0x00000004 +/* This inode has been wiped from disk */ +#define OCFS2_INODE_DELETED 0x00000008 +/* Another node is deleting, so our delete is a nop */ +#define OCFS2_INODE_SKIP_DELETE 0x00000010 +/* Has the inode been orphaned on another node? + * + * This hints to ocfs2_drop_inode that it should clear i_nlink before + * continuing. + * + * We *only* set this on unlink vote from another node. If the inode + * was locally orphaned, then we're sure of the state and don't need + * to twiddle i_nlink later - it's either zero or not depending on + * whether our unlink succeeded. Otherwise we got this from a node + * whose intention was to orphan the inode, however he may have + * crashed, failed etc, so we let ocfs2_drop_inode zero the value and + * rely on ocfs2_delete_inode to sort things out under the proper + * cluster locks. + */ +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +/* Does someone have the file open O_DIRECT */ +#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Indicates that the metadata cache should be used as an array. */ +#define OCFS2_INODE_CACHE_INLINE 0x00000080 + +static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) +{ + return container_of(inode, struct ocfs2_inode_info, vfs_inode); +} + +#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) +#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) + +extern kmem_cache_t *ocfs2_inode_cache; + +extern struct address_space_operations ocfs2_aops; + +struct buffer_head *ocfs2_bread(struct inode *inode, int block, + int *err, int reada); +void ocfs2_clear_inode(struct inode *inode); +void ocfs2_delete_inode(struct inode *inode); +void ocfs2_drop_inode(struct inode *inode); +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); +struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, + u64 blkno, + int delete_vote); +int ocfs2_inode_init_private(struct inode *inode); +int ocfs2_inode_revalidate(struct dentry *dentry); +int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); +void ocfs2_read_inode(struct inode *inode); +void ocfs2_read_inode2(struct inode *inode, void *opaque); +ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, + size_t size, loff_t *offp); +void ocfs2_sync_blockdev(struct super_block *sb); +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe); +int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh); +int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); + +#endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 000000000000..04428042e5e5 --- /dev/null +++ b/fs/ocfs2/journal.c @@ -0,0 +1,1652 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.c + * + * Defines functions of journalling api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/kthread.h> + +#define MLOG_MASK_PREFIX ML_JOURNAL +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "vote.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; + +static int ocfs2_force_read_journal(struct inode *inode); +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num); +static int __ocfs2_recovery_thread(void *arg); +static int ocfs2_commit_cache(struct ocfs2_super *osb); +static int ocfs2_wait_on_mount(struct ocfs2_super *osb); +static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, + struct ocfs2_journal_handle *handle); +static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle); +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty); +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num); +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot); +static int ocfs2_commit_thread(void *arg); + +static int ocfs2_commit_cache(struct ocfs2_super *osb) +{ + int status = 0; + unsigned int flushed; + unsigned long old_id; + struct ocfs2_journal *journal = NULL; + + mlog_entry_void(); + + journal = osb->journal; + + /* Flush all pending commits and checkpoint the journal.
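
The commit path that follows pivots on j_trans_barrier: every transaction holds it for read, and the flush path takes it for write, so a flush waits out running transactions and holds off new ones. The same shape with a POSIX rwlock (a sketch; the function names are invented, and the kernel uses an atomic counter where this uses a plain int):

/* Reader/writer lock as a transaction barrier, in the style of
 * j_trans_barrier above. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t trans_barrier = PTHREAD_RWLOCK_INITIALIZER;
static int num_trans;

static void run_transaction(void)
{
	pthread_rwlock_rdlock(&trans_barrier);	/* like ocfs2_start_trans */
	num_trans++;				/* ... transaction work ... */
	pthread_rwlock_unlock(&trans_barrier);	/* like ocfs2_commit_trans */
}

static void flush_all(void)
{
	pthread_rwlock_wrlock(&trans_barrier);	/* no transactions running */
	printf("flushing %d transactions\n", num_trans);
	num_trans = 0;
	pthread_rwlock_unlock(&trans_barrier);
}

int main(void)
{
	run_transaction();
	flush_all();
	return 0;
}
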
*/ + down_write(&journal->j_trans_barrier); + + if (atomic_read(&journal->j_num_trans) == 0) { + up_write(&journal->j_trans_barrier); + mlog(0, "No transactions for me to flush!\n"); + goto finally; + } + + journal_lock_updates(journal->j_journal); + status = journal_flush(journal->j_journal); + journal_unlock_updates(journal->j_journal); + if (status < 0) { + up_write(&journal->j_trans_barrier); + mlog_errno(status); + goto finally; + } + + old_id = ocfs2_inc_trans_id(journal); + + flushed = atomic_read(&journal->j_num_trans); + atomic_set(&journal->j_num_trans, 0); + up_write(&journal->j_trans_barrier); + + mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", + journal->j_trans_id, flushed); + + ocfs2_kick_vote_thread(osb); + wake_up(&journal->j_checkpointed); +finally: + mlog_exit(status); + return status; +} + +struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) +{ + struct ocfs2_journal_handle *retval = NULL; + + retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); + if (!retval) { + mlog(ML_ERROR, "Failed to allocate memory for journal " + "handle!\n"); + return NULL; + } + + retval->max_buffs = 0; + retval->num_locks = 0; + retval->k_handle = NULL; + + INIT_LIST_HEAD(&retval->locks); + INIT_LIST_HEAD(&retval->inode_list); + retval->journal = osb->journal; + + return retval; +} + +/* pass it NULL and it will allocate a new handle object for you. If + * you pass it a handle however, it may still return error, in which + * case it has free'd the passed handle for you. */ +struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + int max_buffs) +{ + int ret; + journal_t *journal = osb->journal->j_journal; + + mlog_entry("(max_buffs = %d)\n", max_buffs); + + if (!osb || !osb->journal->j_journal) + BUG(); + + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + goto done_free; + } + + BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); + BUG_ON(max_buffs <= 0); + + /* JBD might support this, but our journalling code doesn't yet. */ + if (journal_current_handle()) { + mlog(ML_ERROR, "Recursive transaction attempted!\n"); + BUG(); + } + + if (!handle) + handle = ocfs2_alloc_handle(osb); + if (!handle) { + ret = -ENOMEM; + mlog(ML_ERROR, "Failed to allocate memory for journal " + "handle!\n"); + goto done_free; + } + + handle->max_buffs = max_buffs; + + down_read(&osb->journal->j_trans_barrier); + + /* actually start the transaction now */ + handle->k_handle = journal_start(journal, max_buffs); + if (IS_ERR(handle->k_handle)) { + up_read(&osb->journal->j_trans_barrier); + + ret = PTR_ERR(handle->k_handle); + handle->k_handle = NULL; + mlog_errno(ret); + + if (is_journal_aborted(journal)) { + ocfs2_abort(osb->sb, "Detected aborted journal"); + ret = -EROFS; + } + goto done_free; + } + + atomic_inc(&(osb->journal->j_num_trans)); + handle->flags |= OCFS2_HANDLE_STARTED; + + mlog_exit_ptr(handle); + return handle; + +done_free: + if (handle) + ocfs2_commit_unstarted_handle(handle); /* will kfree handle */ + + mlog_exit(ret); + return ERR_PTR(ret); +} + +void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, + struct inode *inode) +{ + BUG_ON(!handle); + BUG_ON(!inode); + + atomic_inc(&inode->i_count); + + /* we're obviously changing it... 
*/ + down(&inode->i_sem); + + /* sanity check */ + BUG_ON(OCFS2_I(inode)->ip_handle); + BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); + + OCFS2_I(inode)->ip_handle = handle; + list_del(&(OCFS2_I(inode)->ip_handle_list)); + list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); +} + +static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) +{ + struct list_head *p, *n; + struct inode *inode; + struct ocfs2_inode_info *oi; + + list_for_each_safe(p, n, &handle->inode_list) { + oi = list_entry(p, struct ocfs2_inode_info, + ip_handle_list); + inode = &oi->vfs_inode; + + OCFS2_I(inode)->ip_handle = NULL; + list_del_init(&OCFS2_I(inode)->ip_handle_list); + + up(&inode->i_sem); + iput(inode); + } +} + +/* This is trivial so we do it out of the main commit + * paths. Beware, it can be called from start_trans too! */ +static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle) +{ + mlog_entry_void(); + + BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); + + ocfs2_handle_unlock_inodes(handle); + /* You are allowed to add journal locks before the transaction + * has started. */ + ocfs2_handle_cleanup_locks(handle->journal, handle); + + kfree(handle); + + mlog_exit_void(); +} + +void ocfs2_commit_trans(struct ocfs2_journal_handle *handle) +{ + handle_t *jbd_handle; + int retval; + struct ocfs2_journal *journal = handle->journal; + + mlog_entry_void(); + + BUG_ON(!handle); + + if (!(handle->flags & OCFS2_HANDLE_STARTED)) { + ocfs2_commit_unstarted_handle(handle); + mlog_exit_void(); + return; + } + + /* release inode semaphores we took during this transaction */ + ocfs2_handle_unlock_inodes(handle); + + /* ocfs2_extend_trans may have had to call journal_restart + * which will always commit the transaction, but may return + * error for any number of reasons. If this is the case, we + * clear k_handle as it's not valid any more. */ + if (handle->k_handle) { + jbd_handle = handle->k_handle; + + if (handle->flags & OCFS2_HANDLE_SYNC) + jbd_handle->h_sync = 1; + else + jbd_handle->h_sync = 0; + + /* actually stop the transaction. if we've set h_sync, + * it'll have been committed when we return */ + retval = journal_stop(jbd_handle); + if (retval < 0) { + mlog_errno(retval); + mlog(ML_ERROR, "Could not commit transaction\n"); + BUG(); + } + + handle->k_handle = NULL; /* it's been free'd in journal_stop */ + } + + ocfs2_handle_cleanup_locks(journal, handle); + + up_read(&journal->j_trans_barrier); + + kfree(handle); + mlog_exit_void(); +} + +/* + * 'nblocks' is what you want to add to the current + * transaction. extend_trans will either extend the current handle by + * nblocks, or commit it and start a new one with nblocks credits. + * + * WARNING: This will not release any semaphores or disk locks taken + * during the transaction, so make sure they were taken *before* + * start_trans or we'll have ordering deadlocks. + * + * WARNING2: Note that we do *not* drop j_trans_barrier here. This is + * good because transaction ids haven't yet been recorded on the + * cluster locks associated with this handle. 
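
ocfs2_start_trans(), shown earlier, has an ownership contract worth spelling out: pass NULL and it allocates a handle for you; pass your own handle and, on error, it has already freed it. A stripped-down model of that contract (names invented; this is not the kernel API):

/* Allocate-if-NULL with error-path ownership, in the style of
 * ocfs2_start_trans(). */
#include <stdio.h>
#include <stdlib.h>

struct handle { int started; };

static struct handle *start_trans(struct handle *h, int fail)
{
	if (!h)
		h = calloc(1, sizeof(*h));	/* allocate-if-NULL */
	if (!h)
		return NULL;
	if (fail) {
		free(h);	/* error path owns and frees the handle */
		return NULL;
	}
	h->started = 1;
	return h;
}

int main(void)
{
	struct handle *h = start_trans(NULL, 0);

	if (h) {
		printf("started=%d\n", h->started);
		free(h);
	}
	return 0;
}
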
+ */ +int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, + int nblocks) +{ + int status; + + BUG_ON(!handle); + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + BUG_ON(!nblocks); + + mlog_entry_void(); + + mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); + + status = journal_extend(handle->k_handle, nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + mlog(0, "journal_extend failed, trying journal_restart\n"); + status = journal_restart(handle->k_handle, nblocks); + if (status < 0) { + handle->k_handle = NULL; + mlog_errno(status); + goto bail; + } + handle->max_buffs = nblocks; + } else + handle->max_buffs += nblocks; + + status = 0; +bail: + + mlog_exit(status); + return status; +} + +int ocfs2_journal_access(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh, + int type) +{ + int status; + + BUG_ON(!inode); + BUG_ON(!handle); + BUG_ON(!bh); + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + + mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n", + (unsigned long long)bh->b_blocknr, type, + (type == OCFS2_JOURNAL_ACCESS_CREATE) ? + "OCFS2_JOURNAL_ACCESS_CREATE" : + "OCFS2_JOURNAL_ACCESS_WRITE", + bh->b_size); + + /* we can safely remove this assertion after testing. */ + if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); + mlog(ML_ERROR, "b_blocknr=%llu\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + /* Set the current transaction information on the inode so + * that the locking code knows whether it can drop its locks + * on this inode or not. We're protected from the commit + * thread updating the current transaction id until + * ocfs2_commit_trans() because ocfs2_start_trans() took + * j_trans_barrier for us. */ + ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); + + down(&OCFS2_I(inode)->ip_io_sem); + switch (type) { + case OCFS2_JOURNAL_ACCESS_CREATE: + case OCFS2_JOURNAL_ACCESS_WRITE: + status = journal_get_write_access(handle->k_handle, bh); + break; + + case OCFS2_JOURNAL_ACCESS_UNDO: + status = journal_get_undo_access(handle->k_handle, bh); + break; + + default: + status = -EINVAL; + mlog(ML_ERROR, "Unknown access type!\n"); + } + up(&OCFS2_I(inode)->ip_io_sem); + + if (status < 0) + mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", + status, type); + + mlog_exit(status); + return status; +} + +int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, + struct buffer_head *bh) +{ + int status; + + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + + mlog_entry("(bh->b_blocknr=%llu)\n", + (unsigned long long)bh->b_blocknr); + + status = journal_dirty_metadata(handle->k_handle, bh); + if (status < 0) + mlog(ML_ERROR, "Could not dirty metadata buffer. " + "(bh->b_blocknr=%llu)\n", + (unsigned long long)bh->b_blocknr); + + mlog_exit(status); + return status; +} + +int ocfs2_journal_dirty_data(handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + mlog_errno(err); + /* TODO: When we can handle it, abort the handle and go RO on + * error here.
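
ocfs2_extend_trans() above leans on a JBD detail: journal_extend() returns 0 when the running handle can be stretched by nblocks and a positive value when it cannot, in which case the code falls back to journal_restart(), committing what it has and opening a fresh handle. The same control flow with stub journal calls (the stubs are invented for illustration):

/* Extend-else-restart fallback, mirroring ocfs2_extend_trans(). */
#include <stdio.h>

static int journal_extend_stub(int nblocks)
{
	(void)nblocks;
	return 1;	/* pretend there is no room in this transaction */
}

static int journal_restart_stub(int nblocks)
{
	(void)nblocks;
	return 0;
}

static int extend_trans(int *max_buffs, int nblocks)
{
	int status = journal_extend_stub(nblocks);

	if (status < 0)
		return status;			/* hard error */
	if (status > 0) {			/* could not extend: restart */
		status = journal_restart_stub(nblocks);
		if (status < 0)
			return status;
		*max_buffs = nblocks;		/* fresh handle, fresh budget */
	} else {
		*max_buffs += nblocks;		/* same handle, bigger budget */
	}
	return 0;
}

int main(void)
{
	int max_buffs = 8;

	extend_trans(&max_buffs, 4);
	printf("max_buffs=%d\n", max_buffs);
	return 0;
}
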
*/ + + return err; +} + +/* We always assume you're adding a metadata lock at level 'ex' */ +int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, + struct inode *inode) +{ + int status; + struct ocfs2_journal_lock *lock; + + BUG_ON(!inode); + + lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS); + if (!lock) { + status = -ENOMEM; + mlog_errno(-ENOMEM); + goto bail; + } + + if (!igrab(inode)) + BUG(); + lock->jl_inode = inode; + + list_add_tail(&(lock->jl_lock_list), &(handle->locks)); + handle->num_locks++; + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, + struct ocfs2_journal_handle *handle) +{ + struct list_head *p, *n; + struct ocfs2_journal_lock *lock; + struct inode *inode; + + list_for_each_safe(p, n, &(handle->locks)) { + lock = list_entry(p, struct ocfs2_journal_lock, + jl_lock_list); + list_del(&lock->jl_lock_list); + handle->num_locks--; + + inode = lock->jl_inode; + ocfs2_meta_unlock(inode, 1); + if (atomic_read(&inode->i_count) == 1) + mlog(ML_ERROR, + "Inode %"MLFu64", I'm doing a last iput for!", + OCFS2_I(inode)->ip_blkno); + iput(inode); + kmem_cache_free(ocfs2_lock_cache, lock); + } +} + +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) + +void ocfs2_set_journal_params(struct ocfs2_super *osb) +{ + journal_t *journal = osb->journal->j_journal; + + spin_lock(&journal->j_state_lock); + journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} + +int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_super *osb; + int meta_lock = 0; + + mlog_entry_void(); + + BUG_ON(!journal); + + osb = journal->j_osb; + + /* already have the inode for our journal */ + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + osb->slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto done; + } + + SET_INODE_JOURNAL(inode); + OCFS2_I(inode)->ip_open_count++; + + status = ocfs2_meta_lock(inode, NULL, &bh, 1); + if (status < 0) { + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not get lock on journal!\n"); + goto done; + } + + meta_lock = 1; + di = (struct ocfs2_dinode *)bh->b_data; + + if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", + inode->i_size); + status = -EINVAL; + goto done; + } + + mlog(0, "inode->i_size = %lld\n", inode->i_size); + mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks); + mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); + + /* call the kernels journal init function now */ + j_journal = journal_init_inode(inode); + if (j_journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EINVAL; + goto done; + } + + mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); + + *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL); + + journal->j_journal = j_journal; + journal->j_inode = inode; + journal->j_bh = bh; + + ocfs2_set_journal_params(osb); + + journal->j_state = 
OCFS2_JOURNAL_LOADED; + + status = 0; +done: + if (status < 0) { + if (meta_lock) + ocfs2_meta_unlock(inode, 1); + if (bh != NULL) + brelse(bh); + if (inode) { + OCFS2_I(inode)->ip_open_count--; + iput(inode); + } + } + + mlog_exit(status); + return status; +} + +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty) +{ + int status; + unsigned int flags; + struct ocfs2_journal *journal = osb->journal; + struct buffer_head *bh = journal->j_bh; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + fe = (struct ocfs2_dinode *)bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + /* This is called from startup/shutdown which will + * handle the errors in a specific manner, so no need + * to call ocfs2_error() here. */ + mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid " + "signature: %.*s", fe->i_blkno, 7, fe->i_signature); + status = -EIO; + goto out; + } + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + if (dirty) + flags |= OCFS2_JOURNAL_DIRTY_FL; + else + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + status = ocfs2_write_block(osb, bh, journal->j_inode); + if (status < 0) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +/* + * If the journal has been kmalloc'd it needs to be freed after this + * call. + */ +void ocfs2_journal_shutdown(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = NULL; + int status = 0; + struct inode *inode = NULL; + int num_running_trans = 0; + + mlog_entry_void(); + + if (!osb) + BUG(); + + journal = osb->journal; + if (!journal) + goto done; + + inode = journal->j_inode; + + if (journal->j_state != OCFS2_JOURNAL_LOADED) + goto done; + + /* need to inc inode use count as journal_destroy will iput. */ + if (!igrab(inode)) + BUG(); + + num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + if (num_running_trans > 0) + mlog(0, "Shutting down journal: must wait on %d " + "running transactions!\n", + num_running_trans); + + /* Do a commit_cache here. It will flush our journal, *and* + * release any locks that are still held. + * set the SHUTDOWN flag and release the trans lock. + * the commit thread will take the trans lock for us below. */ + journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; + + /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not + * drop the trans_lock (which we want to hold until we + * completely destroy the journal. 
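
ocfs2_journal_init() above follows the kernel's goto-based cleanup idiom: acquire resources one by one, jump to a single exit label on failure, and unwind only what was actually taken, in reverse order. A self-contained rendering of the shape (names invented):

/* goto-based cleanup: on error, free exactly what was acquired. */
#include <stdio.h>
#include <stdlib.h>

static int init_resources(char **out_a, char **out_b)
{
	int status = -1;
	char *a = NULL, *b = NULL;

	a = malloc(16);
	if (!a)
		goto done;
	b = malloc(16);
	if (!b)
		goto done;

	*out_a = a;		/* success: hand ownership to the caller */
	*out_b = b;
	status = 0;
done:
	if (status < 0) {	/* error: unwind whatever we grabbed */
		free(b);
		free(a);
	}
	return status;
}

int main(void)
{
	char *a, *b;
	int status = init_resources(&a, &b);

	printf("init: %d\n", status);
	if (status == 0) {
		free(b);
		free(a);
	}
	return 0;
}
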
*/ + if (osb->commit_task) { + /* Wait for the commit thread */ + mlog(0, "Waiting for ocfs2commit to exit...\n"); + kthread_stop(osb->commit_task); + osb->commit_task = NULL; + } + + BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + + status = ocfs2_journal_toggle_dirty(osb, 0); + if (status < 0) + mlog_errno(status); + + /* Shutdown the kernel journal system */ + journal_destroy(journal->j_journal); + + OCFS2_I(inode)->ip_open_count--; + + /* unlock our journal */ + ocfs2_meta_unlock(inode, 1); + + brelse(journal->j_bh); + journal->j_bh = NULL; + + journal->j_state = OCFS2_JOURNAL_FREE; + +// up_write(&journal->j_trans_barrier); +done: + if (inode) + iput(inode); + mlog_exit_void(); +} + +static void ocfs2_clear_journal_error(struct super_block *sb, + journal_t *journal, + int slot) +{ + int olderr; + + olderr = journal_errno(journal); + if (olderr) { + mlog(ML_ERROR, "File system error %d recorded in " + "journal %u.\n", olderr, slot); + mlog(ML_ERROR, "File system on device %s needs checking.\n", + sb->s_id); + + journal_ack_err(journal); + journal_clear_err(journal); + } +} + +int ocfs2_journal_load(struct ocfs2_journal *journal) +{ + int status = 0; + struct ocfs2_super *osb; + + mlog_entry_void(); + + if (!journal) + BUG(); + + osb = journal->j_osb; + + status = journal_load(journal->j_journal); + if (status < 0) { + mlog(ML_ERROR, "Failed to load journal!\n"); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + + status = ocfs2_journal_toggle_dirty(osb, 1); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Launch the commit thread */ + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d", + osb->osb_id); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", + status); + goto done; + } + +done: + mlog_exit(status); + return status; +} + + +/* 'full' flag tells us whether we clear out all blocks or if we just + * mark the journal clean */ +int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) +{ + int status; + + mlog_entry_void(); + + if (!journal) + BUG(); + + status = journal_wipe(journal->j_journal, full); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); + if (status < 0) + mlog_errno(status); + +bail: + mlog_exit(status); + return status; +} + +/* + * JBD might read a cached version of another node's journal file. We + * don't want this as this file changes often and we get no + * notification on those changes. The only way to be sure that we've + * got the most up-to-date version of those blocks is to force-read + * them off disk. Just searching through the buffer cache won't + * work as there may be pages backing this file which are still marked + * up to date.
We know things can't change on this file underneath us + * as we have the lock by now :) + */ +static int ocfs2_force_read_journal(struct inode *inode) +{ + int status = 0; + int i, p_blocks; + u64 v_blkno, p_blkno; +#define CONCURRENT_JOURNAL_FILL 32 + struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; + + mlog_entry_void(); + + BUG_ON(inode->i_blocks != + ocfs2_align_bytes_to_sectors(i_size_read(inode))); + + memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + + mlog(0, "Force reading %lu blocks\n", + (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))); + + v_blkno = 0; + while (v_blkno < + (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { + + status = ocfs2_extent_map_get_blocks(inode, v_blkno, + 1, &p_blkno, + &p_blocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (p_blocks > CONCURRENT_JOURNAL_FILL) + p_blocks = CONCURRENT_JOURNAL_FILL; + + status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs, 0, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = 0; i < p_blocks; i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + + v_blkno += p_blocks; + } + +bail: + for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) + if (bhs[i]) + brelse(bhs[i]); + mlog_exit(status); + return status; +} + +struct ocfs2_la_recovery_item { + struct list_head lri_list; + int lri_slot; + struct ocfs2_dinode *lri_la_dinode; + struct ocfs2_dinode *lri_tl_dinode; +}; + +/* Does the second half of the recovery process. By this point, the + * node is marked clean and can actually be considered recovered, + * hence it's no longer in the recovery map, but there's still some + * cleanup we can do which shouldn't happen within the recovery thread + * as locking in that context becomes very difficult if we are to take + * recovering nodes into account. + * + * NOTE: This function can and will sleep on recovery of other nodes + * during cluster locking, just like any other ocfs2 process. 
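 + * + * In rough outline (a sketch of the flow implemented below): for each + * queued ocfs2_la_recovery_item we sync the saved local alloc copy back + * to the main bitmap, replay the saved truncate log, and then scan that + * slot's orphan directory -- in that order.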
 + */ +void ocfs2_complete_recovery(void *data) +{ + int ret; + struct ocfs2_super *osb = data; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *la_dinode, *tl_dinode; + struct ocfs2_la_recovery_item *item; + struct list_head *p, *n; + LIST_HEAD(tmp_la_list); + + mlog_entry_void(); + + mlog(0, "completing recovery from keventd\n"); + + spin_lock(&journal->j_lock); + list_splice_init(&journal->j_la_cleanups, &tmp_la_list); + spin_unlock(&journal->j_lock); + + list_for_each_safe(p, n, &tmp_la_list) { + item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); + list_del_init(&item->lri_list); + + mlog(0, "Complete recovery for slot %d\n", item->lri_slot); + + la_dinode = item->lri_la_dinode; + if (la_dinode) { + mlog(0, "Clean up local alloc %"MLFu64"\n", + le64_to_cpu(la_dinode->i_blkno)); + + ret = ocfs2_complete_local_alloc_recovery(osb, + la_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(la_dinode); + } + + tl_dinode = item->lri_tl_dinode; + if (tl_dinode) { + mlog(0, "Clean up truncate log %"MLFu64"\n", + le64_to_cpu(tl_dinode->i_blkno)); + + ret = ocfs2_complete_truncate_log_recovery(osb, + tl_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(tl_dinode); + } + + ret = ocfs2_recover_orphans(osb, item->lri_slot); + if (ret < 0) + mlog_errno(ret); + + kfree(item); + } + + mlog(0, "Recovery completion\n"); + mlog_exit_void(); +} + +/* NOTE: This function always eats your references to la_dinode and + * tl_dinode, either manually on error, or by passing them to + * ocfs2_complete_recovery */ +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode) +{ + struct ocfs2_la_recovery_item *item; + + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); + if (!item) { + /* Though we wish to avoid it, we are in fact safe in + * skipping local alloc cleanup as fsck.ocfs2 is more + * than capable of reclaiming unused space. */ + if (la_dinode) + kfree(la_dinode); + + if (tl_dinode) + kfree(tl_dinode); + + mlog_errno(-ENOMEM); + return; + } + + INIT_LIST_HEAD(&item->lri_list); + item->lri_la_dinode = la_dinode; + item->lri_slot = slot_num; + item->lri_tl_dinode = tl_dinode; + + spin_lock(&journal->j_lock); + list_add_tail(&item->lri_list, &journal->j_la_cleanups); + queue_work(ocfs2_wq, &journal->j_recovery_work); + spin_unlock(&journal->j_lock); +} + +/* Called by the mount code to queue the last part of recovery for + * its own slot. */ +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = osb->journal; + + if (osb->dirty) { + /* No need to queue up our truncate_log as regular + * cleanup will catch that.
*/ + ocfs2_queue_recovery_completion(journal, + osb->slot_num, + osb->local_alloc_copy, + NULL); + ocfs2_schedule_truncate_log_flush(osb, 0); + + osb->local_alloc_copy = NULL; + osb->dirty = 0; + } +} + +static int __ocfs2_recovery_thread(void *arg) +{ + int status, node_num; + struct ocfs2_super *osb = arg; + + mlog_entry_void(); + + status = ocfs2_wait_on_mount(osb); + if (status < 0) { + goto bail; + } + +restart: + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + node_num = ocfs2_node_map_first_set_bit(osb, + &osb->recovery_map); + if (node_num == O2NM_INVALID_NODE_NUM) { + mlog(0, "Out of nodes to recover.\n"); + break; + } + + status = ocfs2_recover_node(osb, node_num); + if (status < 0) { + mlog(ML_ERROR, + "Error %d recovering node %d on device (%u,%u)!\n", + status, node_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + mlog(ML_ERROR, "Volume requires unmount.\n"); + continue; + } + + ocfs2_recovery_map_clear(osb, node_num); + } + ocfs2_super_unlock(osb, 1); + + /* We always run recovery on our own orphan dir - the dead + * node(s) may have voted "no" on an inode delete earlier. A + * revote is therefore required. */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL); + +bail: + down(&osb->recovery_lock); + if (!status && + !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + up(&osb->recovery_lock); + goto restart; + } + + osb->recovery_thread_task = NULL; + mb(); /* sync with ocfs2_recovery_thread_running */ + wake_up(&osb->recovery_event); + + up(&osb->recovery_lock); + + mlog_exit(status); + /* no one is calling kthread_stop() for us so the kthread() API + * requires that we call do_exit(). And it isn't exported, but + * complete_and_exit() seems to be a minimal wrapper around it. */ + complete_and_exit(NULL, status); + return status; +} + +void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) +{ + mlog_entry("(node_num=%d, osb->node_num = %d)\n", + node_num, osb->node_num); + + down(&osb->recovery_lock); + if (osb->disable_recovery) + goto out; + + /* People waiting on recovery will wait on + * the recovery map to empty. */ + if (!ocfs2_recovery_map_set(osb, node_num)) + mlog(0, "node %d is already in recovery.\n", node_num); + + mlog(0, "starting recovery thread...\n"); + + if (osb->recovery_thread_task) + goto out; + + osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, + "ocfs2rec-%d", osb->osb_id); + if (IS_ERR(osb->recovery_thread_task)) { + mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); + osb->recovery_thread_task = NULL; + } + +out: + up(&osb->recovery_lock); + wake_up(&osb->recovery_event); + + mlog_exit_void(); +} + +/* Does the actual journal replay and marks the journal inode as + * clean. Will only replay if the journal inode is marked dirty.
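 + * + * Roughly, the steps below are: take the journal's cluster lock with + * OCFS2_META_LOCK_RECOVERY, force-read the journal blocks off disk, + * journal_load() and journal_flush() to replay the log, then clear + * OCFS2_JOURNAL_DIRTY_FL and write the journal dinode back.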
*/ +static int ocfs2_replay_journal(struct ocfs2_super *osb, + int node_num, + int slot_num) +{ + int status; + int got_lock = 0; + unsigned int flags; + struct inode *inode = NULL; + struct ocfs2_dinode *fe; + journal_t *journal = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + status = -EACCES; + iput(inode); + inode = NULL; + mlog_errno(status); + goto done; + } + SET_INODE_JOURNAL(inode); + + status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not lock journal!\n"); + goto done; + } + got_lock = 1; + + fe = (struct ocfs2_dinode *) bh->b_data; + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + + if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { + mlog(0, "No recovery required for node %d\n", node_num); + goto done; + } + + mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", + node_num, slot_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + + status = ocfs2_force_read_journal(inode); + if (status < 0) { + mlog_errno(status); + goto done; + } + + mlog(0, "calling journal_init_inode\n"); + journal = journal_init_inode(inode); + if (journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EIO; + goto done; + } + + status = journal_load(journal); + if (status < 0) { + mlog_errno(status); + if (!igrab(inode)) + BUG(); + journal_destroy(journal); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal, slot_num); + + /* flush the journal */ + mlog(0, "flushing the journal.\n"); + journal_lock_updates(journal); + status = journal_flush(journal); + journal_unlock_updates(journal); + if (status < 0) + mlog_errno(status); + + /* This will mark the node clean */ + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + status = ocfs2_write_block(osb, bh, inode); + if (status < 0) + mlog_errno(status); + + if (!igrab(inode)) + BUG(); + + journal_destroy(journal); + +done: + /* drop the lock on this node's journal */ + if (got_lock) + ocfs2_meta_unlock(inode, 1); + + if (inode) + iput(inode); + + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +/* + * Do the most important parts of node recovery: + * - Replay its journal + * - Stamp a clean local allocator file + * - Stamp a clean truncate log + * - Mark the node clean + * + * If this function completes without error, a node in OCFS2 can be + * said to have been safely recovered. As a result, failure during the + * second part of a node's recovery process (local alloc recovery) is + * far less concerning. + */ +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num) +{ + int status = 0; + int slot_num; + struct ocfs2_slot_info *si = osb->slot_info; + struct ocfs2_dinode *la_copy = NULL; + struct ocfs2_dinode *tl_copy = NULL; + + mlog_entry("(node_num=%d, osb->node_num = %d)\n", + node_num, osb->node_num); + + mlog(0, "checking node %d\n", node_num); + + /* Should not ever be called to recover ourselves -- in that + * case we should've called ocfs2_journal_load instead.
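 + * + * The ordering below matters: replay the journal first, then stamp the + * clean local alloc and truncate log copies, clear the slot, and only + * then queue the second-stage cleanup, which may take cluster locks of + * its own.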
*/ + if (osb->node_num == node_num) + BUG(); + + slot_num = ocfs2_node_num_to_slot(si, node_num); + if (slot_num == OCFS2_INVALID_SLOT) { + status = 0; + mlog(0, "no slot for this node, so no recovery required.\n"); + goto done; + } + + mlog(0, "node %d was using slot %d\n", node_num, slot_num); + + status = ocfs2_replay_journal(osb, node_num, slot_num); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Stamp a clean local alloc file AFTER recovering the journal... */ + status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* An error from begin_truncate_log_recovery is not + * serious enough to warrant halting the rest of + * recovery. */ + status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); + if (status < 0) + mlog_errno(status); + + /* Likewise, this would be a strange but ultimately not so + * harmful place to get an error... */ + ocfs2_clear_slot(si, slot_num); + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) + mlog_errno(status); + + /* This will kfree the memory pointed to by la_copy and tl_copy */ + ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, + tl_copy); + + status = 0; +done: + + mlog_exit(status); + return status; +} + +/* Test node liveness by trylocking his journal. If we get the lock, + * we drop it here. Return 0 if we got the lock, -EAGAIN if node is + * still alive (we couldn't get the lock) and < 0 on error. */ +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num) +{ + int status, flags; + struct inode *inode = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + mlog(ML_ERROR, "access error\n"); + status = -EACCES; + goto bail; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto bail; + } + SET_INODE_JOURNAL(inode); + + flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; + status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + ocfs2_meta_unlock(inode, 1); +bail: + if (inode) + iput(inode); + + return status; +} + +/* Call this underneath ocfs2_super_lock. It also assumes that the + * slot info struct has been updated from disk. */ +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) +{ + int status, i, node_num; + struct ocfs2_slot_info *si = osb->slot_info; + + /* This is called with the super block cluster lock, so we + * know that the slot map can't change underneath us. */ + + spin_lock(&si->si_lock); + for(i = 0; i < si->si_num_slots; i++) { + if (i == osb->slot_num) + continue; + if (ocfs2_is_empty_slot(si, i)) + continue; + + node_num = si->si_global_node_nums[i]; + if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) + continue; + spin_unlock(&si->si_lock); + + /* Ok, we have a slot occupied by another node which + * is not in the recovery map. We trylock his journal + * file here to test if he's alive. */ + status = ocfs2_trylock_journal(osb, i); + if (!status) { + /* Since we're called from mount, we know that + * the recovery thread can't race us on + * setting / checking the recovery bits. 
*/ + ocfs2_recovery_thread(osb, node_num); + } else if ((status < 0) && (status != -EAGAIN)) { + mlog_errno(status); + goto bail; + } + + spin_lock(&si->si_lock); + } + spin_unlock(&si->si_lock); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int status = 0; + int have_disk_lock = 0; + struct inode *inode = NULL; + struct inode *iter; + struct inode *orphan_dir_inode = NULL; + unsigned long offset, blk, local; + struct buffer_head *bh = NULL; + struct ocfs2_dir_entry *de; + struct super_block *sb = osb->sb; + struct ocfs2_inode_info *oi; + + mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto out; + } + + down(&orphan_dir_inode->i_sem); + status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); + if (status < 0) { + up(&orphan_dir_inode->i_sem); + mlog_errno(status); + goto out; + } + have_disk_lock = 1; + + offset = 0; + iter = NULL; + while(offset < i_size_read(orphan_dir_inode)) { + blk = offset >> sb->s_blocksize_bits; + + bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); + if (!bh) + status = -EINVAL; + if (status < 0) { + up(&orphan_dir_inode->i_sem); + if (bh) + brelse(bh); + mlog_errno(status); + goto out; + } + + local = 0; + while(offset < i_size_read(orphan_dir_inode) + && local < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + local); + + if (!ocfs2_check_dir_entry(orphan_dir_inode, + de, bh, local)) { + up(&orphan_dir_inode->i_sem); + status = -EINVAL; + mlog_errno(status); + brelse(bh); + goto out; + } + + local += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + + /* I guess we silently fail on no inode? */ + if (!le64_to_cpu(de->inode)) + continue; + if (de->file_type > OCFS2_FT_MAX) { + mlog(ML_ERROR, + "block %llu contains invalid de: " + "inode = %"MLFu64", rec_len = %u, " + "name_len = %u, file_type = %u, " + "name='%.*s'\n", + (unsigned long long)bh->b_blocknr, + le64_to_cpu(de->inode), + le16_to_cpu(de->rec_len), + de->name_len, + de->file_type, + de->name_len, + de->name); + continue; + } + if (de->name_len == 1 && !strncmp(".", de->name, 1)) + continue; + if (de->name_len == 2 && !strncmp("..", de->name, 2)) + continue; + + iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); + if (IS_ERR(iter)) + continue; + + mlog(0, "queue orphan %"MLFu64"\n", + OCFS2_I(iter)->ip_blkno); + OCFS2_I(iter)->ip_next_orphan = inode; + inode = iter; + } + brelse(bh); + } + up(&orphan_dir_inode->i_sem); + + ocfs2_meta_unlock(orphan_dir_inode, 0); + have_disk_lock = 0; + + iput(orphan_dir_inode); + orphan_dir_inode = NULL; + + while (inode) { + oi = OCFS2_I(inode); + mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno); + + iter = oi->ip_next_orphan; + + spin_lock(&oi->ip_lock); + /* Delete voting may have set these on the assumption + * that the other node would wipe them successfully. + * If they are still in the node's orphan dir, we need + * to reset that state. */ + oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); + + /* Set the proper information to get us going into + * ocfs2_delete_inode. 
*/ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + oi->ip_orphaned_slot = slot; + spin_unlock(&oi->ip_lock); + + iput(inode); + + inode = iter; + } + +out: + if (have_disk_lock) + ocfs2_meta_unlock(orphan_dir_inode, 0); + + if (orphan_dir_inode) + iput(orphan_dir_inode); + + return status; +} + +static int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + /* This check is good because ocfs2 will wait on our recovery + * thread before changing it to something other than MOUNTED + * or DISABLED. */ + wait_event(osb->osb_mount_event, + atomic_read(&osb->vol_state) == VOLUME_MOUNTED || + atomic_read(&osb->vol_state) == VOLUME_DISABLED); + + /* If there's an error on mount, then we may never get to the + * MOUNTED flag, but this is set right before + * dismount_volume() so we can trust it. */ + if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + mlog(0, "mount error, exiting!\n"); + return -EBUSY; + } + + return 0; +} + +static int ocfs2_commit_thread(void *arg) +{ + int status; + struct ocfs2_super *osb = arg; + struct ocfs2_journal *journal = osb->journal; + + /* we can trust j_num_trans here because _should_stop() is only set in + * shutdown and nobody other than ourselves should be able to start + * transactions. committing on shutdown might take a few iterations + * as final transactions put deleted inodes on the list */ + while (!(kthread_should_stop() && + atomic_read(&journal->j_num_trans) == 0)) { + + wait_event_interruptible_timeout(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop(), + OCFS2_CHECKPOINT_INTERVAL); + + status = ocfs2_commit_cache(osb); + if (status < 0) + mlog_errno(status); + + if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ + mlog(ML_KTHREAD, + "commit_thread: %u transactions pending on " + "shutdown\n", + atomic_read(&journal->j_num_trans)); + } + } + + return 0; +} + +/* Look for a dirty journal without taking any cluster locks. Used for + * hard readonly access to determine whether the file system journals + * require recovery. */ +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) +{ + int ret = 0; + unsigned int slot; + struct buffer_head *di_bh; + struct ocfs2_dinode *di; + struct inode *journal = NULL; + + for(slot = 0; slot < osb->max_slots; slot++) { + journal = ocfs2_get_system_file_inode(osb, + JOURNAL_SYSTEM_INODE, + slot); + if (!journal || is_bad_inode(journal)) { + ret = -EACCES; + mlog_errno(ret); + goto out; + } + + di_bh = NULL; + ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, + 0, journal); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + + if (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL) + ret = -EROFS; + + brelse(di_bh); + if (ret) + break; + } + +out: + if (journal) + iput(journal); + + return ret; +} diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 000000000000..7d0a816184fa --- /dev/null +++ b/fs/ocfs2/journal.h @@ -0,0 +1,457 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.h + * + * Defines journalling api and structures. + * + * Copyright (C) 2003, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
 + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_JOURNAL_H +#define OCFS2_JOURNAL_H + +#include <linux/fs.h> +#include <linux/jbd.h> + +#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) + +enum ocfs2_journal_state { + OCFS2_JOURNAL_FREE = 0, + OCFS2_JOURNAL_LOADED, + OCFS2_JOURNAL_IN_SHUTDOWN, +}; + +struct ocfs2_super; +struct ocfs2_dinode; +struct ocfs2_journal_handle; + +struct ocfs2_journal { + enum ocfs2_journal_state j_state; /* Journal's current state */ + + journal_t *j_journal; /* The kernel's journal type */ + struct inode *j_inode; /* Kernel inode pointing to + * this journal */ + struct ocfs2_super *j_osb; /* pointer to the super + * block for the node + * we're currently + * running on -- not + * necessarily the super + * block from the node + * which we usually run + * from (recovery, + * etc) */ + struct buffer_head *j_bh; /* Journal disk inode block */ + atomic_t j_num_trans; /* Number of transactions + * currently in the system. */ + unsigned long j_trans_id; + struct rw_semaphore j_trans_barrier; + wait_queue_head_t j_checkpointed; + + spinlock_t j_lock; + struct list_head j_la_cleanups; + struct work_struct j_recovery_work; +}; + +extern spinlock_t trans_inc_lock; + +/* wrap j_trans_id so we never have it equal to zero. */ +static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) +{ + unsigned long old_id; + spin_lock(&trans_inc_lock); + old_id = j->j_trans_id++; + if (unlikely(!j->j_trans_id)) + j->j_trans_id = 1; + spin_unlock(&trans_inc_lock); + return old_id; +} + +static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, + struct inode *inode) +{ + spin_lock(&trans_inc_lock); + OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +/* Used to figure out whether it's safe to drop a metadata lock on an + * inode. Returns true if all the inode's changes have been + * checkpointed to disk. You should be holding the spinlock on the + * metadata lock while calling this to be sure that nobody can take + * the lock and put it on another transaction. */ +static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) +{ + int ret; + struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; + + spin_lock(&trans_inc_lock); + ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); + spin_unlock(&trans_inc_lock); + return ret; +} + +/* Convenience function to check if an inode is still new (has never + * hit disk). Will do you a favor and set created_trans = 0 when you've + * been checkpointed. Returns '1' if the inode is still new. */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ + int ret; + + /* System files are never "new" as they're written out by + * mkfs. This helps us early during mount, before we have the + * journal open and j_trans_id could be junk.
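 + * + * (The comparison below uses time_after() so the check stays correct + * when j_trans_id wraps -- ocfs2_inc_trans_id above skips zero on + * wrap.)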
*/ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + return 0; + spin_lock(&trans_inc_lock); + ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, + OCFS2_I(inode)->ip_created_trans)); + if (!ret) + OCFS2_I(inode)->ip_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, + struct inode *inode) +{ + spin_lock(&trans_inc_lock); + OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; + spin_unlock(&trans_inc_lock); +} + +extern kmem_cache_t *ocfs2_lock_cache; + +struct ocfs2_journal_lock { + struct inode *jl_inode; + struct list_head jl_lock_list; +}; + +struct ocfs2_journal_handle { + handle_t *k_handle; /* kernel handle. */ + struct ocfs2_journal *journal; + u32 flags; /* see flags below. */ + int max_buffs; /* Buffs reserved by this handle */ + + /* The following two fields are for ocfs2_handle_add_lock */ + int num_locks; + struct list_head locks; /* A bunch of locks to + * release on commit. This + * should be a list_head */ + + struct list_head inode_list; +}; + +#define OCFS2_HANDLE_STARTED 1 +/* should we sync-commit this handle? */ +#define OCFS2_HANDLE_SYNC 2 +static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle) +{ + return handle->flags & OCFS2_HANDLE_STARTED; +} + +static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync) +{ + if (sync) + handle->flags |= OCFS2_HANDLE_SYNC; + else + handle->flags &= ~OCFS2_HANDLE_SYNC; +} + +/* Exported only for the journal struct init code in super.c. Do not call. */ +void ocfs2_complete_recovery(void *data); + +/* + * Journal Control: + * Initialize, Load, Shutdown, Wipe a journal. + * + * ocfs2_journal_init - Initialize journal structures in the OSB. + * ocfs2_journal_load - Load the given journal off disk. Replay it if + * there are transactions still in there. + * ocfs2_journal_shutdown - Shutdown a journal; this will flush all + * uncommitted, uncheckpointed transactions. + * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally + * zero out each block. + * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. + * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat + * event on. + * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. + */ +void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_init(struct ocfs2_journal *journal, + int *dirty); +void ocfs2_journal_shutdown(struct ocfs2_super *osb); +int ocfs2_journal_wipe(struct ocfs2_journal *journal, + int full); +int ocfs2_journal_load(struct ocfs2_journal *journal); +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); +void ocfs2_recovery_thread(struct ocfs2_super *osb, + int node_num); +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); + +static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) +{ + atomic_set(&osb->needs_checkpoint, 1); + wake_up(&osb->checkpoint_event); +} + +static inline void ocfs2_checkpoint_inode(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_inode_fully_checkpointed(inode)) { + /* WARNING: This only kicks off a single + * checkpoint. If someone races you and adds more + * metadata to the journal, you won't know, and will + * wind up waiting *a lot* longer than necessary. Right + * now we only use this in clear_inode so that's + * OK.
*/ + ocfs2_start_checkpoint(osb); + + wait_event(osb->journal->j_checkpointed, + ocfs2_inode_fully_checkpointed(inode)); + } +} + +/* + * Transaction Handling: + * Manage the lifetime of a transaction handle. + * + * ocfs2_alloc_handle - Only allocate a handle so we can start putting + * cluster locks on it. To actually change blocks, + * call ocfs2_start_trans with the handle returned + * from this function. You may call ocfs2_commit_trans + * at any time in the lifetime of a handle. + * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of + * the number of blocks that will be changed during + * this handle. + * ocfs2_commit_trans - Complete a handle. + * ocfs2_extend_trans - Extend a handle by nblocks credits. This may + * commit the handle to disk in the process, but will + * not release any locks taken during the transaction. + * ocfs2_journal_access - Notify the handle that we want to journal this + * buffer. Will have to call ocfs2_journal_dirty once + * we've actually dirtied it. Type is one of + * OCFS2_JOURNAL_ACCESS_CREATE or + * OCFS2_JOURNAL_ACCESS_WRITE. + * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. + * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before + * the current handle commits. + * ocfs2_handle_add_lock - Sometimes we need to delay lock release + * until after a transaction has been completed. Use + * ocfs2_handle_add_lock to indicate that a lock needs + * to be released at the end of that handle. Locks + * will be released in the order that they are added. + * ocfs2_handle_add_inode - Add a locked inode to a transaction. + */ + +/* You must always start_trans with a number of buffs > 0, but it's + * perfectly legal to go through an entire transaction without having + * dirtied any buffers. */ +struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb); +struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + int max_buffs); +void ocfs2_commit_trans(struct ocfs2_journal_handle *handle); +int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, + int nblocks); + +/* + * Create access is for when we get a newly created buffer and we're + * not gonna read it off disk, but rather fill it ourselves. Right + * now, we don't do anything special with this (it turns into a write + * request), but this is a good placeholder in case we do... + * + * Write access is for when we read a block off disk and are going to + * modify it. This way the journalling layer knows it may need to make + * a copy of that block (if it's part of another, uncommitted + * transaction) before we do so. + */ +#define OCFS2_JOURNAL_ACCESS_CREATE 0 +#define OCFS2_JOURNAL_ACCESS_WRITE 1 +#define OCFS2_JOURNAL_ACCESS_UNDO 2 + +int ocfs2_journal_access(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh, + int type); +/* + * A word about the journal_access/journal_dirty "dance". It is + * entirely legal to journal_access a buffer more than once (as long + * as the access type is the same -- I'm not sure what will happen if + * access type is different but this should never happen anyway). It is + * also legal to journal_dirty a buffer more than once. In fact, you + * can even journal_access a buffer after you've done a + * journal_access/journal_dirty pair. The only thing you cannot do + * however, is journal_dirty a buffer which you haven't yet passed to + * journal_access at least once.
 + * + * That said, 99% of the time this doesn't matter and this is what the + * path looks like: + * + * + * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); + * + * ocfs2_journal_dirty(handle, bh); + */ +int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, + struct buffer_head *bh); +int ocfs2_journal_dirty_data(handle_t *handle, + struct buffer_head *bh); +int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, + struct inode *inode); +/* + * Use this to protect from other processes reading buffer state while + * it's in flight. + */ +void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, + struct inode *inode); + +/* + * Credit Macros: + * Convenience macros to calculate the number of credits needed. + * + * For convenience's sake, I have a set of macros here which calculate + * the *maximum* number of sectors which will be changed for various + * metadata updates. + */ + +/* simple file updates like chmod, etc. */ +#define OCFS2_INODE_UPDATE_CREDITS 1 + +/* get one bit out of a suballocator: dinode + group descriptor + + * prev. group desc. if we relink. */ +#define OCFS2_SUBALLOC_ALLOC (3) + +/* dinode + group descriptor update. We don't relink on free yet. */ +#define OCFS2_SUBALLOC_FREE (2) + +#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS +#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + + OCFS2_TRUNCATE_LOG_UPDATE) + +/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + + * bitmap block for the new bit) */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) + +/* parent fe, parent block, new file entry, inode alloc fe, inode alloc + * group descriptor + mkdir/symlink blocks */ +#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ + + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) + +/* local alloc metadata change + main bitmap updates */ +#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) + +/* used when we don't need an allocation change for a dir extend. One + * for the dinode, one for the new block. */ +#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) + +/* file update (nlink, etc) + dir entry block */ +#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* inode + dir inode (if we unlink a dir), + dir entry block + orphan + * dir inode link */ +#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_LINK_CREDITS) + +/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + + * inode alloc group descriptor */ +#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) + +/* dinode update, old dir dinode update, new dir dinode update, old + * dir dir entry, new dir dir entry, dir entry update for renaming + * directory + target unlink */ +#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ + + OCFS2_UNLINK_CREDITS) + +static inline int ocfs2_calc_extend_credits(struct super_block *sb, + struct ocfs2_dinode *fe, + u32 bits_wanted) +{ + int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; + + /* bitmap dinode, group desc. + relinked group. */ + bitmap_blocks = OCFS2_SUBALLOC_ALLOC; + + /* we might need to shift tree depth so let's assume an + * absolute worst case of complete fragmentation. Even with + * that, we only need one update for the dinode, and then + * however many metadata chunks needed * a remaining suballoc + * alloc.
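 + * + * As a worked example (assuming a tree depth of 2 and an + * ocfs2_extend_meta_needed() result of 2): bitmap_blocks = 3, + * sysfile_bitmap_blocks = 1 + (3 - 1) * 2 = 5 and + * dinode_blocks = 1 + 1 + 2 = 4, for 12 credits in total.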
*/ + sysfile_bitmap_blocks = 1 + + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); + + /* this does not include *new* metadata blocks, which are + * accounted for in sysfile_bitmap_blocks. fe + + * prev. last_eb_blk + blocks along edge of tree. + * calc_symlink_credits passes because we just need 1 + * credit for the dinode there. */ + dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); + + return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; +} + +static inline int ocfs2_calc_symlink_credits(struct super_block *sb) +{ + int blocks = OCFS2_MKNOD_CREDITS; + + /* links can be longer than one block so we may update many + * within our single allocated extent. */ + blocks += ocfs2_clusters_to_blocks(sb, 1); + + return blocks; +} + +static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, + unsigned int cpg) +{ + int blocks; + int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; + /* parent inode update + new block group header + bitmap inode update + + bitmap blocks affected */ + blocks = 1 + 1 + 1 + bitmap_blocks; + return blocks; +} + +static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, + unsigned int clusters_to_del, + struct ocfs2_dinode *fe, + struct ocfs2_extent_list *last_el) +{ + /* for dinode + all headers in this pass + update to next leaf */ + u16 next_free = le16_to_cpu(last_el->l_next_free_rec); + u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); + int credits = 1 + tree_depth + 1; + int i; + + i = next_free - 1; + BUG_ON(i < 0); + + /* We may be deleting metadata blocks, so metadata alloc dinode + + one desc. block for each possible delete. */ + if (tree_depth && next_free == 1 && + le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) + credits += 1 + tree_depth; + + /* update to the truncate log. */ + credits += OCFS2_TRUNCATE_LOG_UPDATE; + + return credits; +} + +#endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c new file mode 100644 index 000000000000..fe373a2101d9 --- /dev/null +++ b/fs/ocfs2/localalloc.c @@ -0,0 +1,983 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.c + * + * Node local data allocation + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
 + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bitops.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) + +static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 numbits); + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); + +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh); + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh); + +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac); + +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode); + +/* + * Determine how large our local alloc window should be, in bits. + * + * These values (and the behavior in ocfs2_alloc_should_use_local) have + * been chosen so that most allocations, including new block groups, go + * through the local alloc. + */ +static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +{ + BUG_ON(osb->s_clustersize_bits < 12); + + return 2048 >> (osb->s_clustersize_bits - 12); +} + +/* + * Tell us whether a given allocation should use the local alloc + * file. Otherwise, it has to go to the main bitmap. + */ +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) +{ + int la_bits = ocfs2_local_alloc_window_bits(osb); + + if (osb->local_alloc_state != OCFS2_LA_ENABLED) + return 0; + + /* la_bits should be at least twice the size (in clusters) of + * a new block group. We want to be sure block group + * allocations go through the local alloc, so allow an + * allocation to take up to half the bitmap.
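 + * + * For example, assuming 4k clusters (s_clustersize_bits == 12), + * ocfs2_local_alloc_window_bits() returns 2048, so requests of up to + * 1024 clusters (4 megabytes) are served from the local alloc.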
*/ + if (bits > (la_bits / 2)) + return 0; + + return 1; +} + +int ocfs2_load_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_dinode *alloc = NULL; + struct buffer_head *alloc_bh = NULL; + u32 num_used; + struct inode *inode = NULL; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + + /* read the alloc off disk */ + inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, + &alloc_bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (!(le32_to_cpu(alloc->i_flags) & + (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { + mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno); + status = -EINVAL; + goto bail; + } + + if ((la->la_size == 0) || + (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { + mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", + le16_to_cpu(la->la_size)); + status = -EINVAL; + goto bail; + } + + /* do a little verification. */ + num_used = ocfs2_local_alloc_count_bits(alloc); + + /* hopefully the local alloc has always been recovered before + * we load it. */ + if (num_used + || alloc->id1.bitmap1.i_used + || alloc->id1.bitmap1.i_total + || la->la_bm_off) + mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + "found = %u, set = %u, taken = %u, off = %u\n", + num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), + le32_to_cpu(alloc->id1.bitmap1.i_total), + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off)); + + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_state = OCFS2_LA_ENABLED; + +bail: + if (status < 0) + if (alloc_bh) + brelse(alloc_bh); + if (inode) + iput(inode); + + mlog_exit(status); + return status; +} + +/* + * return any unused bits to the bitmap and write out a clean + * local_alloc. + * + * Note: this works on the local alloc buffer cached off the osb + * (osb->local_alloc_bh); be warned that it *will* be returned + * brelse'd and NULL'd out on the way. + */ +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_journal_handle *handle = NULL; + struct inode *local_alloc_inode = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_dinode *alloc = NULL; + + mlog_entry_void(); + + if (osb->local_alloc_state == OCFS2_LA_UNUSED) + goto bail; + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + osb->local_alloc_state = OCFS2_LA_DISABLED; + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + ocfs2_handle_add_inode(handle, main_bm_inode); + status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* WINDOW_MOVE_CREDITS is a bit heavy...
*/ + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + handle = NULL; + goto bail; + } + + bh = osb->local_alloc_bh; + alloc = (struct ocfs2_dinode *) bh->b_data; + + alloc_copy = kmalloc(bh->b_size, GFP_KERNEL); + if (!alloc_copy) { + status = -ENOMEM; + goto bail; + } + memcpy(alloc_copy, alloc, bh->b_size); + + status = ocfs2_journal_access(handle, local_alloc_inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + brelse(bh); + osb->local_alloc_bh = NULL; + osb->local_alloc_state = OCFS2_LA_UNUSED; + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (local_alloc_inode) + iput(local_alloc_inode); + + if (alloc_copy) + kfree(alloc_copy); + + mlog_exit_void(); +} + +/* + * We want to free the bitmap bits outside of any recovery context as + * we'll need a cluster lock to do so, but we must clear the local + * alloc before giving up the recovered node's journal. To solve this, + * we kmalloc a copy of the local alloc before it's changed, for the + * caller to process with ocfs2_complete_local_alloc_recovery(). + */ +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **alloc_copy) +{ + int status = 0; + struct buffer_head *alloc_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_dinode *alloc; + + mlog_entry("(slot_num = %d)\n", slot_num); + + *alloc_copy = NULL; + + inode = ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + down(&inode->i_sem); + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, + &alloc_bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); + if (!(*alloc_copy)) { + status = -ENOMEM; + goto bail; + } + memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_write_block(osb, alloc_bh, inode); + if (status < 0) + mlog_errno(status); + +bail: + if ((status < 0) && (*alloc_copy)) { + kfree(*alloc_copy); + *alloc_copy = NULL; + } + + if (alloc_bh) + brelse(alloc_bh); + + if (inode) { + up(&inode->i_sem); + iput(inode); + } + + mlog_exit(status); + return status; +} + +/* + * Step 2: By now, we've completed the journal recovery, we've stamped + * a clean local alloc on disk and dropped the node out of the + * recovery map. DLM locks will no longer stall, so let's clear out the + * main bitmap.
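 + * + * (The alloc passed in is the copy that ocfs2_begin_local_alloc_recovery() + * made before stamping the on-disk local alloc clean; every bit still + * set in it is handed back to the main bitmap via + * ocfs2_sync_local_to_main().)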
+ */ +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) +{ + int status; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + ocfs2_handle_add_inode(handle, main_bm_inode); + status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + /* we want the bitmap change to be recorded on disk asap */ + ocfs2_handle_set_sync(handle, 1); + + status = ocfs2_sync_local_to_main(osb, handle, alloc, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + mlog_exit(status); + return status; +} + +/* + * make sure we've got at least bitswanted contiguous bits in the + * local alloc. You lose them when you drop i_sem. + * + * We will add ourselves to the transaction passed in, but may start + * our own in order to shift windows. + */ +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *passed_handle, + u32 bits_wanted, + struct ocfs2_alloc_context *ac) +{ + int status; + struct ocfs2_dinode *alloc; + struct inode *local_alloc_inode; + unsigned int free_bits; + + mlog_entry_void(); + + BUG_ON(!passed_handle); + BUG_ON(!ac); + BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED); + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + ocfs2_handle_add_inode(passed_handle, local_alloc_inode); + + if (osb->local_alloc_state != OCFS2_LA_ENABLED) { + status = -ENOSPC; + goto bail; + } + + if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { + mlog(0, "Asking for more than my max window size!\n"); + status = -ENOSPC; + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) != + ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has " + "%u free bits, but a count shows %u", + le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) { + /* uhoh, window change time. 
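 + * Not enough free bits are left in this window, so slide it: + * return what remains to the main bitmap and reserve a fresh + * window's worth of clusters.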
*/ + status = + ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + ac->ac_inode = igrab(local_alloc_inode); + get_bh(osb->local_alloc_bh); + ac->ac_bh = osb->local_alloc_bh; + ac->ac_which = OCFS2_AC_USE_LOCAL; + status = 0; +bail: + if (local_alloc_inode) + iput(local_alloc_inode); + + mlog_exit(status); + return status; +} + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_bits, + u32 *bit_off, + u32 *num_bits) +{ + int status, start; + struct inode *local_alloc_inode; + u32 bits_wanted; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + /* TODO: Shouldn't we just BUG here? */ + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + bitmap = la->la_bitmap; + *bit_off = le32_to_cpu(la->la_bm_off) + start; + /* local alloc is always contiguous by nature -- we never + * delete bits from it! */ + *num_bits = bits_wanted; + + status = ocfs2_journal_access(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while(bits_wanted--) + ocfs2_set_bit(start++, bitmap); + + alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits + + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +{ + int i; + u8 *buffer; + u32 count = 0; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + mlog_entry_void(); + + buffer = la->la_bitmap; + for (i = 0; i < le16_to_cpu(la->la_size); i++) + count += hweight8(buffer[i]); + + mlog_exit(count); + return count; +} + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 numbits) +{ + int numfound, bitoff, left, startoff, lastzero; + void *bitmap = NULL; + + mlog_entry("(numbits wanted = %u)\n", numbits); + + if (!alloc->id1.bitmap1.i_total) { + mlog(0, "No bits in my window!\n"); + bitoff = -1; + goto bail; + } + + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + + numfound = bitoff = startoff = 0; + lastzero = -1; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. 
or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == numbits) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, + numfound); + + if (numfound == numbits) + bitoff = startoff - numfound; + else + bitoff = -1; + +bail: + mlog_exit(bitoff); + return bitoff; +} + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) +{ + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + int i; + mlog_entry_void(); + + alloc->id1.bitmap1.i_total = 0; + alloc->id1.bitmap1.i_used = 0; + la->la_bm_off = 0; + for(i = 0; i < le16_to_cpu(la->la_size); i++) + la->la_bitmap[i] = 0; + + mlog_exit_void(); +} + +#if 0 +/* turn this on and uncomment below to aid debugging window shifts. */ +static void ocfs2_verify_zero_bits(unsigned long *bitmap, + unsigned int start, + unsigned int count) +{ + unsigned int tmp = count; + while(tmp--) { + if (ocfs2_test_bit(start + tmp, bitmap)) { + printk("ocfs2_verify_zero_bits: start = %u, count = " + "%u\n", start, count); + printk("ocfs2_verify_zero_bits: bit %u is set!", + start + tmp); + BUG(); + } + } +} +#endif + +/* + * sync the local alloc to main bitmap. + * + * assumes you've already locked the main bitmap -- the bitmap inode + * passed is used for caching. + */ +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh) +{ + int status = 0; + int bit_off, left, count, start; + u64 la_start_blk; + u64 blkno; + void *bitmap; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + mlog_entry("total = %u, COUNT = %u, used = %u\n", + le32_to_cpu(alloc->id1.bitmap1.i_total), + ocfs2_local_alloc_count_bits(alloc), + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + if (!alloc->id1.bitmap1.i_total) { + mlog(0, "nothing to sync!\n"); + goto bail; + } + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) == + le32_to_cpu(alloc->id1.bitmap1.i_total)) { + mlog(0, "all bits were taken!\n"); + goto bail; + } + + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_bm_off)); + bitmap = la->la_bitmap; + start = count = bit_off = 0; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) + != -1) { + if ((bit_off < left) && (bit_off == start)) { + count++; + start++; + continue; + } + if (count) { + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + start - count); + + mlog(0, "freeing %u bits starting at local " + "alloc bit %u (la_start_blk = %"MLFu64", " + "blkno = %"MLFu64")\n", count, start - count, + la_start_blk, blkno); + + status = ocfs2_free_clusters(handle, main_bm_inode, + main_bm_bh, blkno, count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) + break; + count = 1; + start = bit_off + 1; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh) +{ + int status; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_handle = handle; 
+ (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); + + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + *bitmap_inode = (*ac)->ac_inode; + igrab(*bitmap_inode); + *bitmap_bh = (*ac)->ac_bh; + get_bh(*bitmap_bh); + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * pass it the bitmap lock in lock_bh if you have it. + */ +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac) +{ + int status = 0; + u32 cluster_off, cluster_count; + struct ocfs2_dinode *alloc = NULL; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (alloc->id1.bitmap1.i_total) + mlog(0, "asking me to alloc a new window over a non-empty " + "one\n"); + + mlog(0, "Allocating %u clusters for a new window.\n", + ocfs2_local_alloc_window_bits(osb)); + /* we used the generic suballoc reserve function, but we set + * everything up nicely, so there's no reason why we can't use + * the more specific cluster api to claim bits. */ + status = ocfs2_claim_clusters(osb, handle, ac, + ocfs2_local_alloc_window_bits(osb), + &cluster_off, &cluster_count); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + la->la_bm_off = cpu_to_le32(cluster_off); + alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); + /* just in case... In the future when we find space ourselves, + * we don't have to get all contiguous -- but we'll have to + * set all previously used bits in bitmap and update + * la_bits_set before setting the bits in the main bitmap. */ + alloc->id1.bitmap1.i_used = 0; + memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, + le16_to_cpu(la->la_size)); + + mlog(0, "New window allocated:\n"); + mlog(0, "window la_bm_off = %u\n", + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total)); + +bail: + mlog_exit(status); + return status; +} + +/* Note that we do *NOT* lock the local alloc inode here as + * it's been locked already for us. */ +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode) +{ + int status = 0; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_dinode *alloc; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_alloc_context *ac = NULL; + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* This will lock the main bitmap for us. */ + status = ocfs2_local_alloc_reserve_for_window(osb, + handle, + &ac, + &main_bm_inode, + &main_bm_bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + /* We want to clear the local alloc before doing anything + * else, so that if we error later during this operation, + * local alloc shutdown won't try to double free main bitmap + * bits. Make a copy so the sync function knows which bits to + * free. 
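+	 * The copy gets handed to ocfs2_sync_local_to_main() once the
+	 * cleared bitmap has been journalled.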
*/ + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); + + status = ocfs2_journal_access(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_local_alloc_new_window(osb, handle, ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + atomic_inc(&osb->alloc_stats.moves); + + status = 0; +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (alloc_copy) + kfree(alloc_copy); + + if (ac) + ocfs2_free_alloc_context(ac); + + mlog_exit(status); + return status; +} + diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 000000000000..30f88ce14e46 --- /dev/null +++ b/fs/ocfs2/localalloc.h @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCALALLOC_H +#define OCFS2_LOCALALLOC_H + +int ocfs2_load_local_alloc(struct ocfs2_super *osb); + +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); + +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int node_num, + struct ocfs2_dinode **alloc_copy); + +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); + +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, + u64 bits); + +struct ocfs2_alloc_context; +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *passed_handle, + u32 bits_wanted, + struct ocfs2_alloc_context *ac); + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_bits, + u32 *bit_off, + u32 *num_bits); + +#endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 000000000000..afdeec4b0eef --- /dev/null +++ b/fs/ocfs2/mmap.c @@ -0,0 +1,102 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mmap.c + * + * Code to deal with the mess that is clustered mmap. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/signal.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int *type)
+{
+	struct inode *inode = area->vm_file->f_dentry->d_inode;
+	struct page *page = NOPAGE_SIGBUS;
+	sigset_t blocked, oldset;
+	int ret;
+
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+
+	/* The best way to deal with signals in this path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(&blocked);
+
+	/* We should technically never get a bad ret return
+	 * from sigprocmask */
+	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	page = filemap_nopage(area, address, type);
+
+	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+out:
+	mlog_exit_ptr(page);
+	return page;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage = ocfs2_nopage,
+};
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	/* We don't want to support shared writable mappings yet. */
+	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
+	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
+		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
+		/* This is -EINVAL because generic_file_readonly_mmap
+		 * returns it in a similar situation. */
+		return -EINVAL;
+	}
+
+	update_atime(inode);
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 000000000000..1274ee0f1fe2
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 000000000000..f6b77ff1d2bf
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * Portions of this code from linux/fs/ext3/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/dir.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define NAMEI_RA_CHUNKS 2
+#define NAMEI_RA_BLOCKS 4
+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir);
+
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh);
+
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh);
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+				  struct dentry
*dentry, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh) +{ + return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + dentry->d_name.name, dentry->d_name.len, + inode, blkno, parent_fe_bh, insert_bh); +} + +/* An orphan dir name is an 8 byte value, printed as a hex string */ +#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) + +static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int status; + u64 blkno; + struct buffer_head *dirent_bh = NULL; + struct inode *inode = NULL; + struct dentry *ret; + struct ocfs2_dir_entry *dirent; + struct ocfs2_inode_info *oi; + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { + ret = ERR_PTR(-ENAMETOOLONG); + goto bail; + } + + mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len, + dentry->d_name.name, OCFS2_I(dir)->ip_blkno); + + status = ocfs2_meta_lock(dir, NULL, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ret = ERR_PTR(status); + goto bail; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, + dir, &dirent_bh, &dirent); + if (status < 0) + goto bail_add; + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); + if (IS_ERR(inode)) { + mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); + ret = ERR_PTR(-EACCES); + goto bail_unlock; + } + + oi = OCFS2_I(inode); + /* Clear any orphaned state... If we were able to look up the + * inode from a directory, it certainly can't be orphaned. We + * might have the bad state from a node which intended to + * orphan this inode but crashed before it could commit the + * unlink. */ + spin_lock(&oi->ip_lock); + oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; + oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; + spin_unlock(&oi->ip_lock); + +bail_add: + + dentry->d_op = &ocfs2_dentry_ops; + ret = d_splice_alias(inode, dentry); + +bail_unlock: + /* Don't drop the cluster lock until *after* the d_add -- + * unlink on another node will message us to remove that + * dentry under this lock so otherwise we can race this with + * the vote thread and have a stale dentry. 
*/ + ocfs2_meta_unlock(dir, 0); + +bail: + if (dirent_bh) + brelse(dirent_bh); + + mlog_exit_ptr(ret); + + return ret; +} + +static int ocfs2_fill_new_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac) +{ + int status; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de = NULL; + + mlog_entry_void(); + + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, + data_ac, NULL, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + status = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, osb->sb->s_blocksize); + + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + de->name_len = 1; + de->rec_len = + cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ocfs2_set_de_type(de, S_IFDIR); + de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize - + OCFS2_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ocfs2_set_de_type(de, S_IFDIR); + + status = ocfs2_journal_dirty(handle, new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + i_size_write(inode, inode->i_sb->s_blocksize); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if (new_bh) + brelse(new_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_mknod(struct inode *dir, + struct dentry *dentry, + int mode, + dev_t dev) +{ + int status = 0; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_super *osb; + struct ocfs2_dinode *dirfe; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, + (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + + /* get our super block */ + osb = OCFS2_SB(dir->i_sb); + + if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { + mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n", + OCFS2_I(dir)->ip_blkno, dir->i_nlink); + status = -EMLINK; + goto leave; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. 
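+	 * de_bh will point at the dirent block that receives the new
+	 * entry when ocfs2_add_entry() runs further down.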
*/ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + /* are we making a directory? If so, reserve a cluster for his + * 1st extent. */ + if (S_ISDIR(mode)) { + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, + &new_fe_bh, parent_fe_bh, handle, + &inode, inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(mode)) { + status = ocfs2_fill_new_dir(osb, handle, dir, inode, + new_fe_bh, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, dir, parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + le16_add_cpu(&dirfe->i_links_count, 1); + status = ocfs2_journal_dirty(handle, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + dir->i_nlink++; + } + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); + status = 0; +leave: + if (handle) + ocfs2_commit_trans(handle); + + if (status == -ENOSPC) + mlog(0, "Disk is full\n"); + + if (new_fe_bh) + brelse(new_fe_bh); + + if (de_bh) + brelse(de_bh); + + if (parent_fe_bh) + brelse(parent_fe_bh); + + if ((status < 0) && inode) + iput(inode); + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + mlog_exit(status); + + return status; +} + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct dentry *dentry, int mode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + struct ocfs2_journal_handle *handle, + struct inode **ret_inode, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u64 fe_blkno = 0; + u16 suballoc_bit; + struct inode *inode = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, + (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + + *new_fe_bh = NULL; + *ret_inode = NULL; + + status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, + &fe_blkno); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + inode = new_inode(dir->i_sb); + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + mlog(ML_ERROR, "new_inode failed!\n"); + goto leave; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. 
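+	 * The on-disk dinode is then initialized from these in-memory
+	 * fields once new_fe_bh has been allocated and journalled below.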
*/ + inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); + OCFS2_I(inode)->ip_blkno = fe_blkno; + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode->i_mode = mode; + spin_lock(&osb->osb_lock); + inode->i_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + *new_fe_bh = sb_getblk(osb->sb, fe_blkno); + if (!*new_fe_bh) { + status = -EIO; + mlog_errno(status); + goto leave; + } + ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); + + status = ocfs2_journal_access(handle, inode, *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; + memset(fe, 0, osb->sb->s_blocksize); + + fe->i_generation = cpu_to_le32(inode->i_generation); + fe->i_fs_generation = cpu_to_le32(osb->fs_generation); + fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); + fe->i_uid = cpu_to_le32(current->fsuid); + if (dir->i_mode & S_ISGID) { + fe->i_gid = cpu_to_le32(dir->i_gid); + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + fe->i_gid = cpu_to_le32(current->fsgid); + fe->i_mode = cpu_to_le16(mode); + if (S_ISCHR(mode) || S_ISBLK(mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); + + fe->i_links_count = cpu_to_le16(inode->i_nlink); + + fe->i_last_eb_blk = 0; + strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); + le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_atime = fe->i_ctime = fe->i_mtime = + cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = + cpu_to_le32(CURRENT_TIME.tv_nsec); + fe->i_dtime = 0; + + fel = &fe->id2.i_list; + fel->l_tree_depth = 0; + fel->l_next_free_rec = 0; + fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); + + status = ocfs2_journal_dirty(handle, *new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (ocfs2_populate_inode(inode, fe, 1) < 0) { + mlog(ML_ERROR, "populate inode failed! 
bh->b_blocknr=%llu, " + "i_blkno=%"MLFu64", i_ino=%lu\n", + (unsigned long long) (*new_fe_bh)->b_blocknr, + fe->i_blkno, inode->i_ino); + BUG(); + } + + ocfs2_inode_set_new(osb, inode); + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + status = 0; /* error in ocfs2_create_new_inode_locks is not + * critical */ + + *ret_inode = inode; +leave: + if (status < 0) { + if (*new_fe_bh) { + brelse(*new_fe_bh); + *new_fe_bh = NULL; + } + if (inode) + iput(inode); + } + + mlog_exit(status); + return status; +} + +static int ocfs2_mkdir(struct inode *dir, + struct dentry *dentry, + int mode) +{ + int ret; + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, + dentry->d_name.len, dentry->d_name.name); + ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + mlog_exit(ret); + + return ret; +} + +static int ocfs2_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int ret; + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, + dentry->d_name.len, dentry->d_name.name); + ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + mlog_exit(ret); + + return ret; +} + +static int ocfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct ocfs2_journal_handle *handle = NULL; + struct inode *inode = old_dentry->d_inode; + int err; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + if (S_ISDIR(inode->i_mode)) { + err = -EPERM; + goto bail; + } + + if (inode->i_nlink >= OCFS2_LINK_MAX) { + err = -EMLINK; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + err = -ENOMEM; + goto bail; + } + + err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (err) + goto bail; + + err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (err < 0) { + mlog_errno(err); + goto bail; + } + + err = ocfs2_meta_lock(inode, handle, &fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { + err = -EMLINK; + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + mlog_errno(err); + goto bail; + } + + err = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (err < 0) { + mlog_errno(err); + goto bail; + } + + inode->i_nlink++; + inode->i_ctime = CURRENT_TIME; + fe->i_links_count = cpu_to_le16(inode->i_nlink); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + err = ocfs2_journal_dirty(handle, fe_bh); + if (err < 0) { + le16_add_cpu(&fe->i_links_count, -1); + inode->i_nlink--; + mlog_errno(err); + goto bail; + } + + err = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, + parent_fe_bh, de_bh); + if (err) { + le16_add_cpu(&fe->i_links_count, -1); + inode->i_nlink--; + mlog_errno(err); + goto bail; + } + + 
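+	/* each name holds its own reference on the inode -- take one
+	 * for the new link before instantiating the dentry */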
atomic_inc(&inode->i_count); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); +bail: + if (handle) + ocfs2_commit_trans(handle); + if (de_bh) + brelse(de_bh); + if (fe_bh) + brelse(fe_bh); + if (parent_fe_bh) + brelse(parent_fe_bh); + + mlog_exit(err); + + return err; +} + +static int ocfs2_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + unsigned int saved_nlink = 0; + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + u64 blkno; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_node_bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_dir_entry *dirent = NULL; + struct buffer_head *dirent_bh = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *orphan_entry_bh = NULL; + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + BUG_ON(dentry->d_parent->d_inode != dir); + + mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + if (inode == osb->root_inode) { + mlog(0, "Cannot delete the root directory\n"); + status = -EPERM; + goto leave; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, + dir, &dirent_bh, &dirent); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (OCFS2_I(inode)->ip_blkno != blkno) { + status = -ENOENT; + + mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") " + "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno, + OCFS2_I(inode)->ip_flags); + goto leave; + } + + status = ocfs2_meta_lock(inode, handle, &fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) { + if (!ocfs2_empty_dir(inode)) { + status = -ENOTEMPTY; + goto leave; + } else if (inode->i_nlink != 2) { + status = -ENOTEMPTY; + goto leave; + } + } + + /* There are still a few steps left until we can consider the + * unlink to have succeeded. Save off nlink here before + * modification so we can set it back in case we hit an issue + * before commit. */ + saved_nlink = inode->i_nlink; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 0; + else + inode->i_nlink--; + + status = ocfs2_request_unlink_vote(inode, dentry, + (unsigned int) inode->i_nlink); + if (status < 0) { + /* This vote should succeed under all normal + * circumstances. 
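+		 * If another node rejects it we bail out here, before
+		 * any journalled changes have been made.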
*/ + mlog_errno(status); + goto leave; + } + + if (!inode->i_nlink) { + status = ocfs2_prepare_orphan_dir(osb, handle, inode, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (!inode->i_nlink) { + status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, + orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* delete the name from the parent dir */ + status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* We can set nlink on the dinode now. clear the saved version + * so that it doesn't get set later. */ + fe->i_links_count = cpu_to_le16(inode->i_nlink); + saved_nlink = 0; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) { + dir->i_nlink--; + status = ocfs2_mark_inode_dirty(handle, dir, + parent_node_bh); + if (status < 0) { + mlog_errno(status); + dir->i_nlink++; + } + } + +leave: + if (status < 0 && saved_nlink) + inode->i_nlink = saved_nlink; + + if (handle) + ocfs2_commit_trans(handle); + + if (fe_bh) + brelse(fe_bh); + + if (dirent_bh) + brelse(dirent_bh); + + if (parent_node_bh) + brelse(parent_node_bh); + + if (orphan_entry_bh) + brelse(orphan_entry_bh); + + mlog_exit(status); + + return status; +} + +/* + * The only place this should be used is rename! + * if they have the same id, then the 1st one is the only one locked. + */ +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2) +{ + int status; + struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); + struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); + struct buffer_head **tmpbh; + struct inode *tmpinode; + + mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n", + oi1->ip_blkno, oi2->ip_blkno); + + BUG_ON(!handle); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* we always want to lock the one with the lower lockid first. 
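+	 * Taking the meta locks in consistent blkno order keeps two
+	 * concurrent renames over the same pair of inodes from
+	 * deadlocking each other.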
*/
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		if (oi1->ip_blkno < oi2->ip_blkno) {
+			/* switch id1 and id2 around */
+			mlog(0, "switching them around...\n");
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2 */
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* lock id1 */
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+#define PARENT_INO(buffer) \
+	((struct ocfs2_dir_entry *) \
+	 ((char *)buffer + \
+	  le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
+							       // and new_dentry
+	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
+	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
+						    // this is the 1st dirent bh
+	nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
+	unsigned int links_count;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+		   old_dir, old_dentry, new_dir, new_dentry,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	if (atomic_read(&old_dentry->d_count) > 2) {
+		shrink_dcache_parent(old_dentry);
+		if (atomic_read(&old_dentry->d_count) > 2) {
+			status = -EBUSY;
+			goto bail;
+		}
+	}
+
+	/* Assume a directory hierarchy thusly:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* if old and new are the same, this'll just do one lock.
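+	 * (ocfs2_double_lock() compares the block numbers itself and
+	 * takes only a single meta lock when they match.)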
*/
+	status = ocfs2_double_lock(osb, handle,
+				   &old_dir_bh, old_dir,
+				   &new_dir_bh, new_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		/* Directories actually require metadata updates to
+		 * the directory info so we can't get away with not
+		 * doing node locking on it. */
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = -EIO;
+		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
+		if (!old_inode_de_bh)
+			goto bail;
+
+		status = -EIO;
+		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
+		    OCFS2_I(old_dir)->ip_blkno)
+			goto bail;
+		status = -EMLINK;
+		if (!new_inode && new_dir!=old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+			goto bail;
+	} else {
+		/* Ah, the simple case - we're a file so just send a
+		 * message. */
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = -ENOENT;
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh)
+		goto bail;
+
+	/*
+	 * Check for inode number is _not_ due to possible IO errors.
+	 * We might rmdir the source, keep it as pwd of some process
+	 * and merrily kill the link to whatever was created under the
+	 * same name. Goodbye sticky bit ;-<
+	 */
+	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+		goto bail;
+
+	/* check if the target already exists (in which case we need
+	 * to delete it) */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!new_de && new_inode)
+		mlog(ML_ERROR, "inode %lu does not exist in its parent "
+		     "directory!", new_inode->i_ino);
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (new_de) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
+			     "disagree.
ip_flags = %x\n", + OCFS2_I(new_inode)->ip_blkno, newfe_blkno, + OCFS2_I(new_inode)->ip_flags); + goto bail; + } + + status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode)) + links_count = 0; + else + links_count = (unsigned int) (new_inode->i_nlink - 1); + + status = ocfs2_request_unlink_vote(new_inode, new_dentry, + links_count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + newfe = (struct ocfs2_dinode *) newfe_bh->b_data; + + mlog(0, "aha rename over existing... new_de=%p " + "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n", + new_de, newfe_blkno, newfe_bh, newfe_bh ? + (unsigned long long)newfe_bh->b_blocknr : 0ULL); + + if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { + status = ocfs2_prepare_orphan_dir(osb, handle, + new_inode, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + } else { + BUG_ON(new_dentry->d_parent->d_inode != new_dir); + + status = ocfs2_check_dir_for_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, + new_dentry->d_name.name, + new_dentry->d_name.len, + &insert_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (new_de) { + if (S_ISDIR(new_inode->i_mode)) { + if (!ocfs2_empty_dir(new_inode) || + new_inode->i_nlink != 2) { + status = -ENOTEMPTY; + goto bail; + } + } + status = ocfs2_journal_access(handle, new_inode, newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode) || + (newfe->i_links_count == cpu_to_le16(1))){ + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe, orphan_name, + orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* change the dirent to point to the correct inode */ + status = ocfs2_journal_access(handle, new_dir, new_de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno); + new_de->file_type = old_de->file_type; + new_dir->i_version++; + status = ocfs2_journal_dirty(handle, new_de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode)) + newfe->i_links_count = 0; + else + le16_add_cpu(&newfe->i_links_count, -1); + + status = ocfs2_journal_dirty(handle, newfe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } else { + /* if the name was not found in new_dir, add it now */ + status = ocfs2_add_entry(handle, new_dentry, old_inode, + OCFS2_I(old_inode)->ip_blkno, + new_dir_bh, insert_entry_bh); + } + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + /* now that the name has been added to new_dir, remove the old name */ + status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + if (old_inode_de_bh) { + status = ocfs2_journal_access(handle, old_inode, + old_inode_de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + 
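+		/* point the '..' entry of the moved directory at its
+		 * new parent */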
PARENT_INO(old_inode_de_bh->b_data) = + cpu_to_le64(OCFS2_I(new_dir)->ip_blkno); + status = ocfs2_journal_dirty(handle, old_inode_de_bh); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; + mark_inode_dirty(new_dir); + } + } + mark_inode_dirty(old_dir); + if (new_inode) + mark_inode_dirty(new_inode); + + if (old_dir != new_dir) + if (new_dir_nlink != new_dir->i_nlink) { + if (!new_dir_bh) { + mlog(ML_ERROR, "need to change nlink for new " + "dir %"MLFu64" from %d to %d but bh is " + "NULL\n", OCFS2_I(new_dir)->ip_blkno, + (int)new_dir_nlink, new_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access(handle, + new_dir, + new_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) new_dir_bh->b_data; + fe->i_links_count = cpu_to_le16(new_dir->i_nlink); + status = ocfs2_journal_dirty(handle, new_dir_bh); + } + } + + if (old_dir_nlink != old_dir->i_nlink) { + if (!old_dir_bh) { + mlog(ML_ERROR, "need to change nlink for old dir " + "%"MLFu64" from %d to %d but bh is NULL!\n", + OCFS2_I(old_dir)->ip_blkno, + (int)old_dir_nlink, + old_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access(handle, old_dir, + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) old_dir_bh->b_data; + fe->i_links_count = cpu_to_le16(old_dir->i_nlink); + status = ocfs2_journal_dirty(handle, old_dir_bh); + } + } + + status = 0; +bail: + if (rename_lock) + ocfs2_rename_unlock(osb); + + if (handle) + ocfs2_commit_trans(handle); + + if (new_inode) + sync_mapping_buffers(old_inode->i_mapping); + + if (new_inode) + iput(new_inode); + if (newfe_bh) + brelse(newfe_bh); + if (old_dir_bh) + brelse(old_dir_bh); + if (new_dir_bh) + brelse(new_dir_bh); + if (new_de_bh) + brelse(new_de_bh); + if (old_de_bh) + brelse(old_de_bh); + if (old_inode_de_bh) + brelse(old_inode_de_bh); + if (orphan_entry_bh) + brelse(orphan_entry_bh); + if (insert_entry_bh) + brelse(insert_entry_bh); + + mlog_exit(status); + + return status; +} + +/* + * we expect i_size = strlen(symname). Copy symname into the file + * data, including the null terminator. + */ +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + const char *symname) +{ + struct buffer_head **bhs = NULL; + const char *c; + struct super_block *sb = osb->sb; + u64 p_blkno; + int p_blocks; + int virtual, blocks, status, i, bytes_left; + + bytes_left = i_size_read(inode) + 1; + /* we can't trust i_blocks because we're actually going to + * write i_size + 1 bytes. */ + blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n", + inode->i_blocks, i_size_read(inode), blocks); + + /* Sanity check -- make sure we're going to fit. */ + if (bytes_left > + ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, + &p_blocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* links can never be larger than one cluster so we know this + * is all going to be contiguous, but do a sanity check + * anyway. 
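+	 * (p_blocks is the length of the contiguous run that the
+	 * extent map lookup above found at p_blkno.)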
*/ + if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + virtual = 0; + while(bytes_left > 0) { + c = &symname[virtual * sb->s_blocksize]; + + bhs[virtual] = sb_getblk(sb, p_blkno); + if (!bhs[virtual]) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); + + status = ocfs2_journal_access(handle, inode, bhs[virtual], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[virtual]->b_data, 0, sb->s_blocksize); + + memcpy(bhs[virtual]->b_data, c, + (bytes_left > sb->s_blocksize) ? sb->s_blocksize : + bytes_left); + + status = ocfs2_journal_dirty(handle, bhs[virtual]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + virtual++; + p_blkno++; + bytes_left -= sb->s_blocksize; + } + + status = 0; +bail: + + if (bhs) { + for(i = 0; i < blocks; i++) + if (bhs[i]) + brelse(bhs[i]); + kfree(bhs); + } + + mlog_exit(status); + return status; +} + +static int ocfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int status, l, credits; + u64 newsize; + struct ocfs2_super *osb = NULL; + struct inode *inode = NULL; + struct super_block *sb; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_dinode *dirfe; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + + mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, + dentry, symname, dentry->d_name.len, dentry->d_name.name); + + sb = dir->i_sb; + osb = OCFS2_SB(sb); + + l = strlen(symname) + 1; + + credits = ocfs2_calc_symlink_credits(sb); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* lock the parent directory */ + status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto bail; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* don't reserve bitmap space for fast symlinks. 
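+	 * Short targets are stored directly in the dinode
+	 * (fe->id2.i_symlink), so only longer ones need a data cluster.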
*/ + if (l > ocfs2_fast_symlink_chars(sb)) { + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_mknod_locked(osb, dir, dentry, + S_IFLNK | S_IRWXUGO, 0, + &new_fe_bh, parent_fe_bh, handle, + &inode, inode_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; + inode->i_rdev = 0; + newsize = l - 1; + if (l > ocfs2_fast_symlink_chars(sb)) { + inode->i_op = &ocfs2_symlink_inode_operations; + status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, + handle, data_ac, NULL, + NULL); + if (status < 0) { + if (status != -ENOSPC && status != -EINTR) { + mlog(ML_ERROR, "Failed to extend file to " + "%"MLFu64"\n", + newsize); + mlog_errno(status); + status = -ENOSPC; + } + goto bail; + } + i_size_write(inode, newsize); + inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); + } else { + inode->i_op = &ocfs2_fast_symlink_inode_operations; + memcpy((char *) fe->id2.i_symlink, symname, l); + i_size_write(inode, newsize); + inode->i_blocks = 0; + } + + status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!ocfs2_inode_is_fast_symlink(inode)) { + status = ocfs2_create_symlink_data(osb, handle, inode, + symname); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); +bail: + if (handle) + ocfs2_commit_trans(handle); + if (new_fe_bh) + brelse(new_fe_bh); + if (parent_fe_bh) + brelse(parent_fe_bh); + if (de_bh) + brelse(de_bh); + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if ((status < 0) && inode) + iput(inode); + + mlog_exit(status); + + return status; +} + +int ocfs2_check_dir_entry(struct inode * dir, + struct ocfs2_dir_entry * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (rlen < OCFS2_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + + if (error_msg != NULL) + mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - " + "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n", + OCFS2_I(dir)->ip_blkno, error_msg, offset, + le64_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; +} + +/* we don't always have a dentry for what we want to add, so people + * like orphan dir can call this instead. + * + * If you pass me insert_bh, I'll skip the search of the other dir + * blocks and put the record in there. 
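+ * The caller is expected to have made room in insert_bh already,
+ * typically via ocfs2_prepare_dir_for_insert().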
+ */ +static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh) +{ + unsigned long offset; + unsigned short rec_len; + struct ocfs2_dir_entry *de, *de1; + struct super_block *sb; + int retval, status; + + mlog_entry_void(); + + sb = dir->i_sb; + + if (!namelen) + return -EINVAL; + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) insert_bh->b_data; + while (1) { + BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data); + /* These checks should've already been passed by the + * prepare function, but I guess we can leave them + * here anyway. */ + if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + retval = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + retval = -EEXIST; + goto bail; + } + if (((le64_to_cpu(de->inode) == 0) && + (le16_to_cpu(de->rec_len) >= rec_len)) || + (le16_to_cpu(de->rec_len) >= + (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + status = ocfs2_journal_access(handle, dir, insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + de1 = (struct ocfs2_dir_entry *)((char *) de + + OCFS2_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = OCFS2_FT_UNKNOWN; + if (blkno) { + de->inode = cpu_to_le64(blkno); + ocfs2_set_de_type(de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_version++; + status = ocfs2_journal_dirty(handle, insert_bh); + retval = 0; + goto bail; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); + } + + /* when you think about it, the assert above should prevent us + * from ever getting here. 
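+	 * ("the assert" being the BUG_ON() at the top of the while
+	 * loop -- walking past the end of insert_bh means the prepare
+	 * step and this function disagree about the free space.)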
*/ + retval = -ENOSPC; +bail: + + mlog_exit(retval); + return retval; +} + + +/* + * ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + struct ocfs2_dir_entry *de, *pde; + int i, status = -ENOENT; + + mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); + + i = 0; + pde = NULL; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (i < bh->b_size) { + if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (de == de_del) { + status = ocfs2_journal_access(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + status = ocfs2_journal_dirty(handle, bh); + goto bail; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); + } +bail: + mlog_exit(status); + return status; +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static int inline ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + struct ocfs2_dir_entry **res_dir) +{ + struct ocfs2_dir_entry *de; + char *dlimit, *de_buf; + int de_len; + int ret = 0; + + mlog_entry_void(); + + de_buf = bh->b_data; + dlimit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + de = (struct ocfs2_dir_entry *) de_buf; + + if (de_buf + namelen <= dlimit && + ocfs2_match(namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + ret = -1; + goto bail; + } + *res_dir = de; + ret = 1; + goto bail; + } + + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) { + ret = -1; + goto bail; + } + + de_buf += de_len; + offset += de_len; + } + +bail: + mlog_exit(ret); + return ret; +} + +struct buffer_head *ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + + mlog_entry_void(); + + *res_dir = NULL; + sb = dir->i_sb; + + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + start = OCFS2_I(dir)->ip_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + + /* XXX: questionable readahead stuff here */ + bh = ocfs2_bread(dir, b++, &err, 1); + bh_use[ra_max] = bh; +#if 0 // ??? 
+ if (bh) + ll_rw_block(READ, 1, &bh); +#endif + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + brelse(bh); + goto next; + } + i = ocfs2_search_dirblock(bh, dir, name, namelen, + block << sb->s_blocksize_bits, + res_dir); + if (i == 1) { + OCFS2_I(dir)->ip_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + + mlog_exit_ptr(ret); + return ret; +} + +static int ocfs2_blkno_stringify(u64 blkno, char *name) +{ + int status, namelen; + + mlog_entry_void(); + + namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64, + blkno); + if (namelen <= 0) { + if (namelen) + status = namelen; + else + status = -EINVAL; + mlog_errno(status); + goto bail; + } + if (namelen != OCFS2_ORPHAN_NAMELEN) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, + namelen); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + char *name, + struct buffer_head **de_bh) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + ocfs2_handle_add_inode(handle, orphan_dir_inode); + status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + +leave: + if (orphan_dir_inode) + iput(orphan_dir_inode); + + if (orphan_dir_bh) + brelse(orphan_dir_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct ocfs2_dinode *fe, + char *name, + struct buffer_head *de_bh) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + struct ocfs2_dinode *orphan_fe; + + mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + status = ocfs2_read_block(osb, + OCFS2_I(orphan_dir_inode)->ip_blkno, + &orphan_dir_bh, OCFS2_BH_CACHED, + orphan_dir_inode); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) 
{
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* we're a cluster, and nlink can change on disk from
+ * underneath us... */
+ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+ if (S_ISDIR(inode->i_mode))
+ le16_add_cpu(&orphan_fe->i_links_count, 1);
+ orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+ status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+ OCFS2_ORPHAN_NAMELEN, inode,
+ OCFS2_I(inode)->ip_blkno,
+ orphan_dir_bh, de_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+
+ /* Record which orphan dir our inode now resides
+ * in. delete_inode will use this to determine which orphan
+ * dir to lock. */
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+ mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
+ OCFS2_I(inode)->ip_blkno, osb->slot_num);
+
+leave:
+ if (orphan_dir_inode)
+ iput(orphan_dir_inode);
+
+ if (orphan_dir_bh)
+ brelse(orphan_dir_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *orphan_dir_inode,
+ struct inode *inode,
+ struct buffer_head *orphan_dir_bh)
+{
+ char name[OCFS2_ORPHAN_NAMELEN + 1];
+ struct ocfs2_dinode *orphan_fe;
+ int status = 0;
+ struct buffer_head *target_de_bh = NULL;
+ struct ocfs2_dir_entry *target_de = NULL;
+
+ mlog_entry_void();
+
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
+ name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
+
+ /* find its spot in the orphan directory */
+ target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
+ orphan_dir_inode, &target_de);
+ if (!target_de_bh) {
+ status = -ENOENT;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* remove it from the orphan directory */
+ status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
+ target_de_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* do the i_nlink dance!
:) */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + le16_add_cpu(&orphan_fe->i_links_count, -1); + orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + + status = ocfs2_journal_dirty(handle, orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + +leave: + if (target_de_bh) + brelse(target_de_bh); + + mlog_exit(status); + return status; +} + +struct inode_operations ocfs2_dir_iops = { + .create = ocfs2_create, + .lookup = ocfs2_lookup, + .link = ocfs2_link, + .unlink = ocfs2_unlink, + .rmdir = ocfs2_unlink, + .symlink = ocfs2_symlink, + .mkdir = ocfs2_mkdir, + .mknod = ocfs2_mknod, + .rename = ocfs2_rename, + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h new file mode 100644 index 000000000000..deaaa97dbf0b --- /dev/null +++ b/fs/ocfs2/namei.h @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_NAMEI_H +#define OCFS2_NAMEI_H + +extern struct inode_operations ocfs2_dir_iops; + +struct dentry *ocfs2_get_parent(struct dentry *child); + +int ocfs2_check_dir_entry (struct inode *dir, + struct ocfs2_dir_entry *de, + struct buffer_head *bh, + unsigned long offset); +struct buffer_head *ocfs2_find_entry(const char *name, + int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir); +int ocfs2_orphan_del(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh); + +static inline int ocfs2_match(int len, + const char * const name, + struct ocfs2_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +#endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 000000000000..0b499bccec5a --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -0,0 +1,109 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs1_fs_compat.h + * + * OCFS1 volume header definitions. OCFS2 creates valid but unmountable + * OCFS1 volume headers on the first two sectors of an OCFS2 volume. + * This allows an OCFS1 volume to see the partition and cleanly fail to + * mount it. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS1_FS_COMPAT_H
+#define _OCFS1_FS_COMPAT_H
+
+#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
+#define OCFS1_MAX_MOUNT_POINT_LEN 128
+#define OCFS1_MAX_VOL_ID_LENGTH 16
+#define OCFS1_MAX_VOL_LABEL_LEN 64
+#define OCFS1_MAX_CLUSTER_NAME_LEN 64
+
+#define OCFS1_MAJOR_VERSION (2)
+#define OCFS1_MINOR_VERSION (0)
+#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
+
+/*
+ * OCFS1 superblock. Lives at sector 0.
+ */
+struct ocfs1_vol_disk_hdr
+{
+/*00*/ __u32 minor_version;
+ __u32 major_version;
+/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
+/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
+/*108*/ __u64 serial_num;
+/*110*/ __u64 device_size;
+ __u64 start_off;
+/*120*/ __u64 bitmap_off;
+ __u64 publ_off;
+/*130*/ __u64 vote_off;
+ __u64 root_bitmap_off;
+/*140*/ __u64 data_start_off;
+ __u64 root_bitmap_size;
+/*150*/ __u64 root_off;
+ __u64 root_size;
+/*160*/ __u64 cluster_size;
+ __u64 num_nodes;
+/*170*/ __u64 num_clusters;
+ __u64 dir_node_size;
+/*180*/ __u64 file_node_size;
+ __u64 internal_off;
+/*190*/ __u64 node_cfg_off;
+ __u64 node_cfg_size;
+/*1A0*/ __u64 new_cfg_off;
+ __u32 prot_bits;
+ __s32 excl_mount;
+/*1B0*/
+};
+
+
+struct ocfs1_disk_lock
+{
+/*00*/ __u32 curr_master;
+ __u8 file_lock;
+ __u8 compat_pad[3]; /* Not in original definition. Used to
+ make the already existing alignment
+ explicit */
+ __u64 last_write_time;
+/*10*/ __u64 last_read_time;
+ __u32 writer_node_num;
+ __u32 reader_node_num;
+/*20*/ __u64 oin_node_map;
+ __u64 dlock_seq_num;
+/*30*/
+};
+
+/*
+ * OCFS1 volume label. Lives at sector 1.
+ */
+struct ocfs1_vol_label
+{
+/*00*/ struct ocfs1_disk_lock disk_lock;
+/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
+/*70*/ __u16 label_len;
+/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
+/*82*/ __u16 vol_id_len;
+/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
+/*A4*/ __u16 cluster_name_len;
+/*A6*/
+};
+
+
+#endif /* _OCFS1_FS_COMPAT_H */
+
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 000000000000..f468c600cf92
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2.h
+ *
+ * Defines macros and structures used in OCFS2
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_H
+#define OCFS2_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "ocfs2_fs.h"
+#include "endian.h"
+#include "ocfs2_lockid.h"
+
+struct ocfs2_extent_map {
+ u32 em_clusters;
+ struct rb_root em_extents;
+};
+
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * number of inlined blocks to be stored in an array and grow the
+ * structure into an rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+
+struct ocfs2_caching_info {
+ unsigned int ci_num_cached;
+ union {
+ sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+ struct rb_root ci_tree;
+ } ci_cache;
+};
+
+/* this limits us to 256 nodes
+ * if we need more, we can do a kmalloc for the map */
+#define OCFS2_NODE_MAP_MAX_NODES 256
+struct ocfs2_node_map {
+ u16 num_nodes;
+ unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
+};
+
+enum ocfs2_ast_action {
+ OCFS2_AST_INVALID = 0,
+ OCFS2_AST_ATTACH,
+ OCFS2_AST_CONVERT,
+ OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+ OCFS2_UNLOCK_INVALID = 0,
+ OCFS2_UNLOCK_CANCEL_CONVERT,
+ OCFS2_UNLOCK_DROP_LOCK,
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
+ * the lvb */
+#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
+ * dlm_lock */
+#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
+ * downconvert*/
+#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING (0x00000020)
+#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
+ * for shutdown paths */
+#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
+ * when to skip queueing
+ * a lock because it's
+ * about to be
+ * dropped. */
+#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
+
+struct ocfs2_lock_res_ops;
+
+typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+
+struct ocfs2_lock_res {
+ void *l_priv;
+ struct ocfs2_lock_res_ops *l_ops;
+ spinlock_t l_lock;
+
+ struct list_head l_blocked_list;
+ struct list_head l_mask_waiters;
+
+ enum ocfs2_lock_type l_type;
+ unsigned long l_flags;
+ char l_name[OCFS2_LOCK_ID_MAX_LEN];
+ int l_level;
+ unsigned int l_ro_holders;
+ unsigned int l_ex_holders;
+ struct dlm_lockstatus l_lksb;
+
+ /* used from AST/BAST funcs.
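 * (Editorial gloss: l_action and l_unlock_action below record which
 * asynchronous DLM transition, if any, is currently in flight for
 * this lock resource; see the two enums above.)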
*/
+ enum ocfs2_ast_action l_action;
+ enum ocfs2_unlock_action l_unlock_action;
+ int l_requested;
+ int l_blocking;
+
+ wait_queue_head_t l_event;
+
+ struct list_head l_debug_list;
+};
+
+struct ocfs2_dlm_debug {
+ struct kref d_refcnt;
+ struct dentry *d_locking_state;
+ struct list_head d_lockres_tracking;
+};
+
+enum ocfs2_vol_state
+{
+ VOLUME_INIT = 0,
+ VOLUME_MOUNTED,
+ VOLUME_DISMOUNTED,
+ VOLUME_DISABLED
+};
+
+struct ocfs2_alloc_stats
+{
+ atomic_t moves;
+ atomic_t local_data;
+ atomic_t bitmap_data;
+ atomic_t bg_allocs;
+ atomic_t bg_extends;
+};
+
+enum ocfs2_local_alloc_state
+{
+ OCFS2_LA_UNUSED = 0,
+ OCFS2_LA_ENABLED,
+ OCFS2_LA_DISABLED
+};
+
+enum ocfs2_mount_options
+{
+ OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
+ OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
+ OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
+ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+ OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
+#endif
+};
+
+#define OCFS2_OSB_SOFT_RO 0x0001
+#define OCFS2_OSB_HARD_RO 0x0002
+#define OCFS2_OSB_ERROR_FS 0x0004
+
+struct ocfs2_journal;
+struct ocfs2_journal_handle;
+struct ocfs2_super
+{
+ u32 osb_id; /* id used by the proc interface */
+ struct task_struct *commit_task;
+ struct super_block *sb;
+ struct inode *root_inode;
+ struct inode *sys_root_inode;
+ struct inode *system_inodes[NUM_SYSTEM_INODES];
+
+ struct ocfs2_slot_info *slot_info;
+
+ spinlock_t node_map_lock;
+ struct ocfs2_node_map mounted_map;
+ struct ocfs2_node_map recovery_map;
+ struct ocfs2_node_map umount_map;
+
+ u32 num_clusters;
+ u64 root_blkno;
+ u64 system_dir_blkno;
+ u64 bitmap_blkno;
+ u32 bitmap_cpg;
+ u8 *uuid;
+ char *uuid_str;
+ u8 *vol_label;
+ u64 first_cluster_group_blkno;
+ u32 fs_generation;
+
+ u32 s_feature_compat;
+ u32 s_feature_incompat;
+ u32 s_feature_ro_compat;
+
+ /* Protects s_next_generation, osb_flags. Could protect more on
+ * osb as it's very short lived. */
+ spinlock_t osb_lock;
+ u32 s_next_generation;
+ unsigned long osb_flags;
+
+ unsigned long s_mount_opt;
+
+ u16 max_slots;
+ u16 num_nodes;
+ s16 node_num;
+ s16 slot_num;
+ int s_sectsize_bits;
+ int s_clustersize;
+ int s_clustersize_bits;
+ struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/ */
+
+ atomic_t vol_state;
+ struct semaphore recovery_lock;
+ struct task_struct *recovery_thread_task;
+ int disable_recovery;
+ wait_queue_head_t checkpoint_event;
+ atomic_t needs_checkpoint;
+ struct ocfs2_journal *journal;
+
+ enum ocfs2_local_alloc_state local_alloc_state;
+ struct buffer_head *local_alloc_bh;
+
+ /* Next two fields are for local node slot recovery during
+ * mount.
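 * (Editorial note: i.e. `dirty' and `local_alloc_copy' immediately
 * below.)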
*/ + int dirty; + struct ocfs2_dinode *local_alloc_copy; + + struct ocfs2_alloc_stats alloc_stats; + char dev_str[20]; /* "major,minor" of the device */ + + struct dlm_ctxt *dlm; + struct ocfs2_lock_res osb_super_lockres; + struct ocfs2_lock_res osb_rename_lockres; + struct dlm_eviction_cb osb_eviction_cb; + struct ocfs2_dlm_debug *osb_dlm_debug; + + struct dentry *osb_debug_root; + + wait_queue_head_t recovery_event; + + spinlock_t vote_task_lock; + struct task_struct *vote_task; + wait_queue_head_t vote_event; + unsigned long vote_wake_sequence; + unsigned long vote_work_sequence; + + struct list_head blocked_lock_list; + unsigned long blocked_lock_count; + + struct list_head vote_list; + int vote_count; + + u32 net_key; + spinlock_t net_response_lock; + unsigned int net_response_ids; + struct list_head net_response_list; + + struct o2hb_callback_func osb_hb_up; + struct o2hb_callback_func osb_hb_down; + + struct list_head osb_net_handlers; + + wait_queue_head_t osb_mount_event; + + /* Truncate log info */ + struct inode *osb_tl_inode; + struct buffer_head *osb_tl_bh; + struct work_struct osb_truncate_log_wq; +}; + +#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) +#define OCFS2_MAX_OSB_ID 65536 + +static inline int ocfs2_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) + return 0; + return 1; +} + +/* set / clear functions because cluster events can make these happen + * in parallel so we want the transitions to be atomic. this also + * means that any future flags osb_flags must be protected by spinlock + * too! */ +static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags |= flag; + spin_unlock(&osb->osb_lock); +} + +static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, + int hard) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); + if (hard) + osb->osb_flags |= OCFS2_OSB_HARD_RO; + else + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); +} + +static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_HARD_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +#define OCFS2_IS_VALID_DINODE(ptr) \ + (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) + +#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ + typeof(__di) ____di = (__di); \ + ocfs2_error((__sb), \ + "Dinode # %"MLFu64" has bad signature %.*s", \ + (____di)->i_blkno, 7, \ + (____di)->i_signature); \ +} while (0); + +#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ + (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) + +#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ + typeof(__eb) ____eb = (__eb); \ + ocfs2_error((__sb), \ + "Extent Block # %"MLFu64" has bad signature %.*s", \ + (____eb)->h_blkno, 7, \ + (____eb)->h_signature); \ +} while (0); + +#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ + (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) + +#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ + typeof(__gd) ____gd = (__gd); \ + ocfs2_error((__sb), \ + "Group Descriptor # %"MLFu64" has bad signature %.*s", \ + (____gd)->bg_blkno, 7, \ + 
(____gd)->bg_signature); \ +} while (0); + +static inline unsigned long ino_from_blkno(struct super_block *sb, + u64 blkno) +{ + return (unsigned long)(blkno & (u64)ULONG_MAX); +} + +static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, + u32 clusters) +{ + int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u64)clusters << c_to_b_bits; +} + +static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u32)(blocks >> b_to_c_bits); +} + +static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + bytes += OCFS2_SB(sb)->s_clustersize - 1; + /* OCFS2 just cannot have enough clusters to overflow this */ + clusters = (unsigned int)(bytes >> cl_bits); + + return clusters; +} + +static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, + u64 bytes) +{ + bytes += sb->s_blocksize - 1; + return bytes >> sb->s_blocksize_bits; +} + +static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, + u32 clusters) +{ + return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; +} + +static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = ocfs2_clusters_for_bytes(sb, bytes); + return (u64)clusters << cl_bits; +} + +static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, + u64 bytes) +{ + u64 blocks; + + blocks = ocfs2_blocks_for_bytes(sb, bytes); + return blocks << sb->s_blocksize_bits; +} + +static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) +{ + return (unsigned long)((bytes + 511) >> 9); +} + +#define ocfs2_set_bit ext2_set_bit +#define ocfs2_clear_bit ext2_clear_bit +#define ocfs2_test_bit ext2_test_bit +#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit +#endif /* OCFS2_H */ + diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 000000000000..dfb8a5bedfc8 --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h @@ -0,0 +1,638 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_fs.h + * + * On-disk structures for OCFS2. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_FS_H +#define _OCFS2_FS_H + +/* Version */ +#define OCFS2_MAJOR_REV_LEVEL 0 +#define OCFS2_MINOR_REV_LEVEL 90 + +/* + * An OCFS2 volume starts this way: + * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. + * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. + * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. + * + * All other structures are found from the superblock information. 
+ *
+ * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. E.g., for a
+ * blocksize of 2K, it is 4096 bytes into disk.
+ */
+#define OCFS2_SUPER_BLOCK_BLKNO 2
+
+/*
+ * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
+ * grow if needed.
+ */
+#define OCFS2_MIN_CLUSTERSIZE 4096
+#define OCFS2_MAX_CLUSTERSIZE 1048576
+
+/*
+ * Blocks cannot be bigger than clusters, so the maximum blocksize is the
+ * minimum cluster size.
+ */
+#define OCFS2_MIN_BLOCKSIZE 512
+#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
+
+/* Filesystem magic number */
+#define OCFS2_SUPER_MAGIC 0x7461636f
+
+/* Object signatures */
+#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
+#define OCFS2_INODE_SIGNATURE "INODE01"
+#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
+
+/* Compatibility flags */
+#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
+ ( OCFS2_SB(sb)->s_feature_compat & (mask) )
+#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
+#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ( OCFS2_SB(sb)->s_feature_incompat & (mask) )
+#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
+ OCFS2_SB(sb)->s_feature_compat |= (mask)
+#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
+ OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
+#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
+ OCFS2_SB(sb)->s_feature_incompat |= (mask)
+#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
+ OCFS2_SB(sb)->s_feature_compat &= ~(mask)
+#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
+#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
+
+#define OCFS2_FEATURE_COMPAT_SUPP 0
+#define OCFS2_FEATURE_INCOMPAT_SUPP 0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
+
+/*
+ * Heartbeat-only devices are missing journals and other files. The
+ * filesystem driver can't load them, but the library can. Never put
+ * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
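 */

[Editorial aside, not part of the patch: a minimal sketch of how the
compat/ro-compat/incompat split above is typically consumed at mount
time. The helper name and error-code choices are assumptions made for
illustration only.]

static int example_check_features(struct super_block *sb, int readonly)
{
	/* any unsupported incompat bit makes the volume unmountable */
	if (OCFS2_HAS_INCOMPAT_FEATURE(sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))
		return -EINVAL;	/* illustrative error choice */

	/* unsupported ro-compat bits only forbid writable mounts */
	if (!readonly &&
	    OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))
		return -EROFS;	/* illustrative error choice */

	/* unknown compat bits are safe to ignore by definition */
	return 0;
}

/* (aside ends; the patch resumes here)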
+ */ +#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 + + +/* + * Flags on ocfs2_dinode.i_flags + */ +#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ +#define OCFS2_UNUSED2_FL (0x00000002) +#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ +#define OCFS2_UNUSED3_FL (0x00000008) +/* System inode flags */ +#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ +#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ +#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ +#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ +#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ +#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ +#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ +#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ + +/* + * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) + */ +#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ + +/* + * superblock s_state flags + */ +#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ + +/* Limit of space in ocfs2_dir_entry */ +#define OCFS2_MAX_FILENAME_LEN 255 + +/* Maximum slots on an ocfs2 file system */ +#define OCFS2_MAX_SLOTS 255 + +/* Slot map indicator for an empty slot */ +#define OCFS2_INVALID_SLOT -1 + +#define OCFS2_VOL_UUID_LEN 16 +#define OCFS2_MAX_VOL_LABEL_LEN 64 + +/* Journal limits (in bytes) */ +#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) +#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) + +struct ocfs2_system_inode_info { + char *si_name; + int si_iflags; + int si_mode; +}; + +/* System file index */ +enum { + BAD_BLOCK_SYSTEM_INODE = 0, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, + SLOT_MAP_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE + HEARTBEAT_SYSTEM_INODE, + GLOBAL_BITMAP_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE + ORPHAN_DIR_SYSTEM_INODE, + EXTENT_ALLOC_SYSTEM_INODE, + INODE_ALLOC_SYSTEM_INODE, + JOURNAL_SYSTEM_INODE, + LOCAL_ALLOC_SYSTEM_INODE, + TRUNCATE_LOG_SYSTEM_INODE, + NUM_SYSTEM_INODES +}; + +static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { + /* Global system inodes (single copy) */ + /* The first two are only used from userspace mfks/tunefs */ + [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, + [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + + /* These are used by the running filesystem */ + [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, + [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, + [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + + /* Slot-specific system inodes (one copy per slot) */ + [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, + [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, + [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } +}; + +/* Parameter passed from mount.ocfs2 to module */ +#define OCFS2_HB_NONE "heartbeat=none" +#define OCFS2_HB_LOCAL "heartbeat=local" + +/* + * OCFS2 directory file types. 
Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define OCFS2_FT_UNKNOWN 0 +#define OCFS2_FT_REG_FILE 1 +#define OCFS2_FT_DIR 2 +#define OCFS2_FT_CHRDEV 3 +#define OCFS2_FT_BLKDEV 4 +#define OCFS2_FT_FIFO 5 +#define OCFS2_FT_SOCK 6 +#define OCFS2_FT_SYMLINK 7 + +#define OCFS2_FT_MAX 8 + +/* + * OCFS2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define OCFS2_DIR_PAD 4 +#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) +#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) +#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ + OCFS2_DIR_ROUND) & \ + ~OCFS2_DIR_ROUND) + +#define OCFS2_LINK_MAX 32000 + +#define S_SHIFT 12 +static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, + [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, +}; + + +/* + * Convenience casts + */ +#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) + +/* + * On disk extent record for OCFS2 + * It describes a range of clusters on disk. + */ +struct ocfs2_extent_rec { +/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ + __le32 e_clusters; /* Clusters covered by this extent */ + __le64 e_blkno; /* Physical disk offset, in blocks */ +/*10*/ +}; + +struct ocfs2_chain_rec { + __le32 c_free; /* Number of free bits in this chain. */ + __le32 c_total; /* Number of total bits in this chain */ + __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ +}; + +struct ocfs2_truncate_rec { + __le32 t_start; /* 1st cluster in this log */ + __le32 t_clusters; /* Number of total clusters covered */ +}; + +/* + * On disk extent list for OCFS2 (node in the tree). Note that this + * is contained inside ocfs2_dinode or ocfs2_extent_block, so the + * offsets are relative to ocfs2_dinode.id2.i_list or + * ocfs2_extent_block.h_list, respectively. + */ +struct ocfs2_extent_list { +/*00*/ __le16 l_tree_depth; /* Extent tree depth from this + point. 0 means data extents + hang directly off this + header (a leaf) */ + __le16 l_count; /* Number of extent records */ + __le16 l_next_free_rec; /* Next unused extent slot */ + __le16 l_reserved1; + __le64 l_reserved2; /* Pad to + sizeof(ocfs2_extent_rec) */ +/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +}; + +/* + * On disk allocation chain list for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_chain. + */ +struct ocfs2_chain_list { +/*00*/ __le16 cl_cpg; /* Clusters per Block Group */ + __le16 cl_bpc; /* Bits per cluster */ + __le16 cl_count; /* Total chains in this list */ + __le16 cl_next_free_rec; /* Next unused chain slot */ + __le64 cl_reserved1; +/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +}; + +/* + * On disk deallocation log for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_dealloc. 
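 *
 * (Editorial aside follows; the original comment ends here.)
 */

[Editorial aside, not part of the patch: the log's capacity falls out of
the block size, restating the ocfs2_truncate_recs_per_inode() helper
defined further down in this header. With 4096-byte blocks, tl_recs
starts at dinode offset 200 (the 0xC0-byte dinode header plus the 8
bytes of tl_count, tl_used and tl_reserved1), so one dinode holds
(4096 - 200) / sizeof(struct ocfs2_truncate_rec) = 3896 / 8 = 487
truncate records.]

/* (aside ends; the patch resumes here)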
+ */ +struct ocfs2_truncate_log { +/*00*/ __le16 tl_count; /* Total records in this log */ + __le16 tl_used; /* Number of records in use */ + __le32 tl_reserved1; +/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +}; + +/* + * On disk extent block (indirect block) for OCFS2 + */ +struct ocfs2_extent_block +{ +/*00*/ __u8 h_signature[8]; /* Signature for verification */ + __le64 h_reserved1; +/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this + extent_header belongs to */ + __le16 h_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 h_fs_generation; /* Must match super block */ + __le64 h_blkno; /* Offset on disk, in blocks */ +/*20*/ __le64 h_reserved3; + __le64 h_next_leaf_blk; /* Offset on disk, in blocks, + of next leaf header pointing + to data */ +/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + +/* + * On disk superblock for OCFS2 + * Note that it is contained inside an ocfs2_dinode, so all offsets + * are relative to the start of ocfs2_dinode.id2. + */ +struct ocfs2_super_block { +/*00*/ __le16 s_major_rev_level; + __le16 s_minor_rev_level; + __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le32 s_checkinterval; /* Max time between checks */ +/*10*/ __le64 s_lastcheck; /* Time of last check */ + __le32 s_creator_os; /* OS */ + __le32 s_feature_compat; /* Compatible feature set */ +/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ + __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ + __le64 s_root_blkno; /* Offset, in blocks, of root directory + dinode */ +/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system + directory dinode */ + __le32 s_blocksize_bits; /* Blocksize for this fs */ + __le32 s_clustersize_bits; /* Clustersize for this fs */ +/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts + before tunefs required */ + __le16 s_reserved1; + __le32 s_reserved2; + __le64 s_first_cluster_group; /* Block offset of 1st cluster + * group header */ +/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ +/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ +/*A0*/ +}; + +/* + * Local allocation bitmap for OCFS2 slots + * Note that it exists inside an ocfs2_dinode, so all offsets are + * relative to the start of ocfs2_dinode.id2. 
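 *
 * (Editorial aside follows; the original comment ends here.)
 */

[Editorial aside, not part of the patch: the same arithmetic as the
ocfs2_local_alloc_size() helper later in this header. la_bitmap lands
at dinode offset 208 (0xC0 for id2 plus the 16 bytes of la_bm_off,
la_size, la_reserved1 and la_reserved2), so with 4096-byte blocks the
local alloc bitmap can track up to 8 * (4096 - 208) = 31104 clusters.]

/* (aside ends; the patch resumes here)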
+ */
+struct ocfs2_local_alloc
+{
+/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
+ __le16 la_size; /* Size of included bitmap, in bytes */
+ __le16 la_reserved1;
+ __le64 la_reserved2;
+/*10*/ __u8 la_bitmap[0];
+};
+
+/*
+ * On disk inode for OCFS2
+ */
+struct ocfs2_dinode {
+/*00*/ __u8 i_signature[8]; /* Signature for validation */
+ __le32 i_generation; /* Generation number */
+ __le16 i_suballoc_slot; /* Slot suballocator this inode
+ belongs to */
+ __le16 i_suballoc_bit; /* Bit offset in suballocator
+ block group */
+/*10*/ __le32 i_reserved0;
+ __le32 i_clusters; /* Cluster count */
+ __le32 i_uid; /* Owner UID */
+ __le32 i_gid; /* Owning GID */
+/*20*/ __le64 i_size; /* Size in bytes */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+ __le32 i_flags; /* File flags */
+/*30*/ __le64 i_atime; /* Access time */
+ __le64 i_ctime; /* Creation time */
+/*40*/ __le64 i_mtime; /* Modification time */
+ __le64 i_dtime; /* Deletion time */
+/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
+ __le64 i_last_eb_blk; /* Pointer to last extent
+ block */
+/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
+ __le32 i_atime_nsec;
+ __le32 i_ctime_nsec;
+ __le32 i_mtime_nsec;
+/*70*/ __le64 i_reserved1[9];
+/*B8*/ union {
+ __le64 i_pad1; /* Generic way to refer to this
+ 64bit union */
+ struct {
+ __le64 i_rdev; /* Device number */
+ } dev1;
+ struct { /* Info for bitmap system
+ inodes */
+ __le32 i_used; /* Bits (ie, clusters) used */
+ __le32 i_total; /* Total bits (clusters)
+ available */
+ } bitmap1;
+ struct { /* Info for journal system
+ inodes */
+ __le32 ij_flags; /* Mounted, version, etc. */
+ __le32 ij_pad;
+ } journal1;
+ } id1; /* Inode type dependent 1 */
+/*C0*/ union {
+ struct ocfs2_super_block i_super;
+ struct ocfs2_local_alloc i_lab;
+ struct ocfs2_chain_list i_chain;
+ struct ocfs2_extent_list i_list;
+ struct ocfs2_truncate_log i_dealloc;
+ __u8 i_symlink[0];
+ } id2;
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On-disk directory entry structure for OCFS2
+ *
+ * Packed as this structure could be accessed unaligned on 64-bit platforms
+ */
+struct ocfs2_dir_entry {
+/*00*/ __le64 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type;
+/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
+/* Actual on-disk length specified by rec_len */
+} __attribute__ ((packed));
+
+/*
+ * On disk allocator group structure for OCFS2
+ */
+struct ocfs2_group_desc
+{
+/*00*/ __u8 bg_signature[8]; /* Signature for validation */
+ __le16 bg_size; /* Size of included bitmap in
+ bytes. */
+ __le16 bg_bits; /* Bits represented by this
+ group. */
+ __le16 bg_free_bits_count; /* Free bits count */
+ __le16 bg_chain; /* What chain I am in.
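 * (Editorial note: the index of
 * this group's chain in the owning
 * allocator's cl_recs[] array.)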
*/ +/*10*/ __le32 bg_generation; + __le32 bg_reserved1; + __le64 bg_next_group; /* Next group in my list, in + blocks */ +/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in + blocks */ + __le64 bg_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 bg_reserved2[2]; +/*40*/ __u8 bg_bitmap[0]; +}; + +#ifdef __KERNEL__ +static inline int ocfs2_fast_symlink_chars(struct super_block *sb) +{ + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_local_alloc_size(struct super_block *sb) +{ + u16 size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} +#else +static inline int ocfs2_fast_symlink_chars(int blocksize) +{ + return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_extent_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline int ocfs2_extent_recs_per_eb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_local_alloc_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} +#endif /* __KERNEL__ */ + + +static inline int ocfs2_system_inode_is_global(int type) +{ + return ((type >= 0) && + (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); +} + +static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, + int type, int slot) +{ + int chars; + + /* + * Global system inodes can only have one copy. Everything + * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode + * list has a copy per slot. 
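 * (Editorial worked example: type == JOURNAL_SYSTEM_INODE with
 * slot == 3 goes through the "journal:%04d" format above and yields
 * "journal:0003", while GLOBAL_BITMAP_SYSTEM_INODE always yields
 * plain "global_bitmap".)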
+ */ + if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name); + else + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name, + slot); + + return chars; +} + +static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, + umode_t mode) +{ + de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +#endif /* _OCFS2_FS_H */ + diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 000000000000..7dd9e1e705b0 --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockid.h + * + * Defines OCFS2 lockid bits. + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCKID_H +#define OCFS2_LOCKID_H + +/* lock ids are made up in the following manner: + * name[0] --> type + * name[1-6] --> 6 pad characters, reserved for now + * name[7-22] --> block number, expressed in hex as 16 chars + * name[23-30] --> i_generation, expressed in hex 8 chars + * name[31] --> '\0' */ +#define OCFS2_LOCK_ID_MAX_LEN 32 +#define OCFS2_LOCK_ID_PAD "000000" + +enum ocfs2_lock_type { + OCFS2_LOCK_TYPE_META = 0, + OCFS2_LOCK_TYPE_DATA, + OCFS2_LOCK_TYPE_SUPER, + OCFS2_LOCK_TYPE_RENAME, + OCFS2_LOCK_TYPE_RW, + OCFS2_NUM_LOCK_TYPES +}; + +static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) +{ + char c; + switch (type) { + case OCFS2_LOCK_TYPE_META: + c = 'M'; + break; + case OCFS2_LOCK_TYPE_DATA: + c = 'D'; + break; + case OCFS2_LOCK_TYPE_SUPER: + c = 'S'; + break; + case OCFS2_LOCK_TYPE_RENAME: + c = 'R'; + break; + case OCFS2_LOCK_TYPE_RW: + c = 'W'; + break; + default: + c = '\0'; + } + + return c; +} + +#endif /* OCFS2_LOCKID_H */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 000000000000..871627961d6d --- /dev/null +++ b/fs/ocfs2/slot_map.c @@ -0,0 +1,303 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slot_map.c + * + * + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include
+#include
+#include
+#include
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ s16 global);
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+ s16 slot_num,
+ s16 node_num);
+
+/* Use the slot information we've collected to create a map of mounted
+ * nodes. Should be holding an EX on super block. Assumes slot info is
+ * up to date. Note that we call this *after* we find a slot, so our
+ * own node should be set in the map too... */
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
+{
+ int i;
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ spin_lock(&si->si_lock);
+
+ for (i = 0; i < si->si_size; i++)
+ if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
+ ocfs2_node_map_set_bit(osb, &osb->mounted_map,
+ si->si_global_node_nums[i]);
+
+ spin_unlock(&si->si_lock);
+}
+
+/* copy the on-disk slot information into our slot_info struct. */
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+ int i;
+ __le16 *disk_info;
+
+ /* we don't read the slot block here as ocfs2_super_lock
+ * should've made sure we have the most recent copy. */
+ spin_lock(&si->si_lock);
+ disk_info = (__le16 *) si->si_bh->b_data;
+
+ for (i = 0; i < si->si_size; i++)
+ si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+
+ spin_unlock(&si->si_lock);
+}
+
+/* post our slot info into its destination bh and write it
+ * out. */
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si)
+{
+ int status, i;
+ __le16 *disk_info = (__le16 *) si->si_bh->b_data;
+
+ spin_lock(&si->si_lock);
+ for (i = 0; i < si->si_size; i++)
+ disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+ spin_unlock(&si->si_lock);
+
+ status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+ if (status < 0)
+ mlog_errno(status);
+
+ return status;
+}
+
+/* try to find global node in the slot info. Returns
+ * OCFS2_INVALID_SLOT if nothing is found.
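 */

[Editorial aside, not part of the patch: a minimal usage sketch of the
exported wrapper below; the helper name and caller context are assumed
for illustration only.]

static int example_node_has_slot(struct ocfs2_super *osb, s16 node_num)
{
	/* returns nonzero iff node_num currently owns a slot, i.e. has
	 * this filesystem mounted according to the slot map */
	return ocfs2_node_num_to_slot(osb->slot_info, node_num)
		!= OCFS2_INVALID_SLOT;
}

/* (aside ends; the patch resumes here)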
*/ +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global) +{ + int i; + s16 ret = OCFS2_INVALID_SLOT; + + for(i = 0; i < si->si_num_slots; i++) { + if (global == si->si_global_node_nums[i]) { + ret = (s16) i; + break; + } + } + return ret; +} + +static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) +{ + int i; + s16 ret = OCFS2_INVALID_SLOT; + + for(i = 0; i < si->si_num_slots; i++) { + if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { + ret = (s16) i; + break; + } + } + return ret; +} + +s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global) +{ + s16 ret; + + spin_lock(&si->si_lock); + ret = __ocfs2_node_num_to_slot(si, global); + spin_unlock(&si->si_lock); + return ret; +} + +static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, + s16 slot_num, + s16 node_num) +{ + BUG_ON(slot_num == OCFS2_INVALID_SLOT); + BUG_ON(slot_num >= si->si_num_slots); + BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && + (node_num >= O2NM_MAX_NODES)); + + si->si_global_node_nums[slot_num] = node_num; +} + +void ocfs2_clear_slot(struct ocfs2_slot_info *si, + s16 slot_num) +{ + spin_lock(&si->si_lock); + __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); + spin_unlock(&si->si_lock); +} + +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status, i; + u64 blkno; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_slot_info *si; + + si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL); + if (!si) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + spin_lock_init(&si->si_lock); + si->si_num_slots = osb->max_slots; + si->si_size = OCFS2_MAX_SLOTS; + + for(i = 0; i < si->si_num_slots; i++) + si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; + + inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_block(osb, blkno, &bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_inode = inode; + si->si_bh = bh; + osb->slot_info = si; +bail: + if (status < 0 && si) + ocfs2_free_slot_info(si); + + return status; +} + +void ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) + brelse(si->si_bh); + kfree(si); +} + +int ocfs2_find_slot(struct ocfs2_super *osb) +{ + int status; + s16 slot; + struct ocfs2_slot_info *si; + + mlog_entry_void(); + + si = osb->slot_info; + + ocfs2_update_slot_info(si); + + spin_lock(&si->si_lock); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot == OCFS2_INVALID_SLOT) { + /* if no slot yet, then just take 1st available + * one. 
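 * (Editorial note: __ocfs2_find_empty_slot() scans from slot 0, so
 * this is a first-fit claim of the lowest OCFS2_INVALID_SLOT entry.)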
*/ + slot = __ocfs2_find_empty_slot(si); + if (slot == OCFS2_INVALID_SLOT) { + spin_unlock(&si->si_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", + slot); + + __ocfs2_fill_slot(si, slot, osb->node_num); + osb->slot_num = slot; + spin_unlock(&si->si_lock); + + mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num); + + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) + mlog_errno(status); + +bail: + mlog_exit(status); + return status; +} + +void ocfs2_put_slot(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_slot_info *si = osb->slot_info; + + if (!si) + return; + + ocfs2_update_slot_info(si); + + spin_lock(&si->si_lock); + __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&si->si_lock); + + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + osb->slot_info = NULL; + ocfs2_free_slot_info(si); +} + diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 000000000000..d8c8ceed031b --- /dev/null +++ b/fs/ocfs2/slot_map.h @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slotmap.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +struct ocfs2_slot_info { + spinlock_t si_lock; + + struct inode *si_inode; + struct buffer_head *si_bh; + unsigned int si_num_slots; + unsigned int si_size; + s16 si_global_node_nums[OCFS2_MAX_SLOTS]; +}; + +int ocfs2_init_slot_info(struct ocfs2_super *osb); +void ocfs2_free_slot_info(struct ocfs2_slot_info *si); + +int ocfs2_find_slot(struct ocfs2_super *osb); +void ocfs2_put_slot(struct ocfs2_super *osb); + +void ocfs2_update_slot_info(struct ocfs2_slot_info *si); +int ocfs2_update_disk_slots(struct ocfs2_super *osb, + struct ocfs2_slot_info *si); + +s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global); +void ocfs2_clear_slot(struct ocfs2_slot_info *si, + s16 slot_num); + +void ocfs2_populate_mounted_map(struct ocfs2_super *osb); + +static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON(slot_num == OCFS2_INVALID_SLOT); + assert_spin_locked(&si->si_lock); + + return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; +} + +#endif diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 000000000000..c46c164aefbb --- /dev/null +++ b/fs/ocfs2/suballoc.c @@ -0,0 +1,1651 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.c + * + * metadata alloc and free + * Inspired by ext3 block groups. 
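 * (Editorial note: unlike ext3's statically sized group descriptor
 * table, these groups are linked into chains hung off each allocator
 * inode's ocfs2_chain_list; see ocfs2_group_desc.bg_chain and
 * bg_next_group in ocfs2_fs.h above.)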
+ * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); +static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + u16 my_chain, + struct ocfs2_chain_list *cl); +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh); + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); + +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found); +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found); +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno); +static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno); +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr); +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + u16 *bit_off, + u16 *bits_found); +static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); +static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + +static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain); +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted); +static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + 
struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); +static inline u64 ocfs2_which_suballoc_group(u64 block, + unsigned int bit); +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off); +static inline u64 ocfs2_which_cluster_group(struct inode *inode, + u32 cluster); +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off); + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + if (ac->ac_inode) + iput(ac->ac_inode); + if (ac->ac_bh) + brelse(ac->ac_bh); + kfree(ac); +} + +static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) +{ + return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); +} + +static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + u16 my_chain, + struct ocfs2_chain_list *cl) +{ + int status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct super_block * sb = alloc_inode->i_sb; + + mlog_entry_void(); + + if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { + ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") " + "!= b_blocknr (%llu)", group_blkno, + (unsigned long long) bg_bh->b_blocknr); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, + alloc_inode, + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bg, 0, sb->s_blocksize); + strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); + bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + bg->bg_chain = cpu_to_le16(my_chain); + bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; + bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); + bg->bg_blkno = cpu_to_le64(group_blkno); + /* set the 1st bit in the bitmap to account for the descriptor block */ + ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); + bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); + + status = ocfs2_journal_dirty(handle, bg_bh); + if (status < 0) + mlog_errno(status); + + /* There is no need to zero out or otherwise initialize the + * other blocks in a group - All valid FS metadata in a block + * group stores the superblock fs_generation value at + * allocation time. */ + +bail: + mlog_exit(status); + return status; +} + +static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_count)) { + if (le32_to_cpu(cl->cl_recs[best].c_total) > + le32_to_cpu(cl->cl_recs[curr].c_total)) + best = curr; + curr++; + } + return best; +} + +/* + * We expect the block group allocator to already be locked. 
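+ * + * A block group is grown out of the cluster bitmap: reserve cl_cpg + * clusters, initialize a descriptor in the first block of that + * region, then splice the new group onto the chain with the fewest + * total bits so the chains stay roughly balanced.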
+ */ +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh) +{ + int status, credits; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + struct ocfs2_chain_list *cl; + struct ocfs2_alloc_context *ac = NULL; + struct ocfs2_journal_handle *handle = NULL; + u32 bit_off, num_bits; + u16 alloc_rec; + u64 bg_blkno; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + + BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + cl = &fe->id2.i_chain; + status = ocfs2_reserve_clusters(osb, + handle, + le16_to_cpu(cl->cl_cpg), + &ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg)); + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_claim_clusters(osb, + handle, + ac, + le16_to_cpu(cl->cl_cpg), + &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + alloc_rec = ocfs2_find_smallest_chain(cl); + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %"MLFu64"\n", + alloc_rec, bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); + + status = ocfs2_block_group_fill(handle, + alloc_inode, + bg_bh, + bg_blkno, + alloc_rec, + cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + status = ocfs2_journal_access(handle, alloc_inode, + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); + if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) + le16_add_cpu(&cl->cl_next_free_rec, 1); + + le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); + le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = + ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); + + status = 0; +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (ac) + ocfs2_free_alloc_context(ac); + + if (bg_bh) + brelse(bg_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + u32 bits_wanted = ac->ac_bits_wanted; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *bh = NULL; + struct ocfs2_journal_handle *handle = 
ac->ac_handle; + struct ocfs2_dinode *fe; + u32 free_bits; + + mlog_entry_void(); + + BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); + + ocfs2_handle_add_inode(handle, alloc_inode); + status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto bail; + } + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { + ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator " + "# %"MLFu64, le64_to_cpu(fe->i_blkno)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - + le32_to_cpu(fe->id1.bitmap1.i_used); + + if (bits_wanted > free_bits) { + /* cluster bitmap never grows */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", + bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + + status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > + (le32_to_cpu(fe->id1.bitmap1.i_total) + - le32_to_cpu(fe->id1.bitmap1.i_used))); + } + + get_bh(bh); + ac->ac_bh = bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *fe, + struct ocfs2_alloc_context **ac) +{ + int status; + struct inode *alloc_inode = NULL; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_handle = handle; + (*ac)->ac_which = OCFS2_AC_USE_META; + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + 0); +#else + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num); +#endif + if (!alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_inode = igrab(alloc_inode); + (*ac)->ac_group_search = ocfs2_block_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, (*ac)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (alloc_inode) + iput(alloc_inode); + + mlog_exit(status); + return status; +} + +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac) +{ + int status; + struct inode *alloc_inode = NULL; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = 1; + (*ac)->ac_handle = handle; + (*ac)->ac_which = OCFS2_AC_USE_INODE; + + alloc_inode = ocfs2_get_system_file_inode(osb, + INODE_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_inode = igrab(alloc_inode); + (*ac)->ac_group_search = ocfs2_block_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) 
{ + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (alloc_inode) + iput(alloc_inode); + + mlog_exit(status); + return status; +} + +/* The local alloc code has to do the same thing, so rather than do this + * twice we export it here. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + + ac->ac_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!ac->ac_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + ac->ac_which = OCFS2_AC_USE_MAIN; + ac->ac_group_search = ocfs2_cluster_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, ac); + if (status < 0 && status != -ENOSPC) + mlog_errno(status); +bail: + return status; +} + +/* Callers don't need to care which bitmap (local alloc or main) to + * use so we figure it out for them, but unfortunately this clutters + * things a bit. */ +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + int status; + + mlog_entry_void(); + + BUG_ON(!handle); + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_handle = handle; + + status = -ENOSPC; + if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { + status = ocfs2_reserve_local_alloc_bits(osb, + handle, + bits_wanted, + *ac); + if ((status < 0) && (status != -ENOSPC)) { + mlog_errno(status); + goto bail; + } else if (status == -ENOSPC) { + /* reserve_local_bits will return enospc with + * the local alloc inode still locked, so we + * can change this safely here. */ + mlog(0, "Disabling local alloc\n"); + /* We set to OCFS2_LA_DISABLED so that umount + * can clean up what's left of the local + * allocation */ + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + } + + if (status == -ENOSPC) { + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * More or less lifted from ext3. I'll leave their description below: + * + * "For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes." + * + * Note: OCFS2 already does this differently for metadata vs data + * allocations, as those bitmaps are separate and undo access is never + * called on a metadata group descriptor.
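+ * + * In practice this means ocfs2_test_bg_bit_allocatable() below must + * consult the journal's b_committed_data copy of a cluster group's + * bitmap before it can declare a bit free.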
+ */ +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr) +{ + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) + return 0; + if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) + return 1; + + bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); +} + +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + u16 *bit_off, + u16 *bits_found) +{ + void *bitmap; + u16 best_offset, best_size; + int offset, start, found, status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); + return -EIO; + } + + found = start = best_offset = best_size = 0; + bitmap = bg->bg_bitmap; + + while((offset = ocfs2_find_next_zero_bit(bitmap, + le16_to_cpu(bg->bg_bits), + start)) != -1) { + if (offset == le16_to_cpu(bg->bg_bits)) + break; + + if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { + /* We found a zero, but we can't use it as it + * hasn't been put to disk yet! */ + found = 0; + start = offset + 1; + } else if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_size) { + best_size = found; + best_offset = start - found; + } + /* we got everything we needed */ + if (found == bits_wanted) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + /* XXX: I think the first clause is equivalent to the second + * - jlbec */ + if (found == bits_wanted) { + *bit_off = start - found; + *bits_found = found; + } else if (best_size) { + *bit_off = best_offset; + *bits_found = best_size; + } else { + status = -ENOSPC; + /* No error log here -- see the comment above + * ocfs2_test_bg_bit_allocatable */ + } + + return status; +} + +static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, + num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access(handle, + alloc_inode, + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + + while(num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + status = ocfs2_journal_dirty(handle, + group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +/* find the one with the most empty bits */ +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + BUG_ON(!cl->cl_next_free_rec); + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_next_free_rec)) { + if (le32_to_cpu(cl->cl_recs[curr].c_free) > + le32_to_cpu(cl->cl_recs[best].c_free)) + best = curr; + 
curr++; + } + + BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); + return best; +} + +static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain) +{ + int status; + /* there is a really tiny chance the journal calls could fail, + * but we wouldn't want inconsistent blocks in *any* case. */ + u64 fe_ptr, bg_ptr, prev_bg_ptr; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto out; + } + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto out; + } + if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); + status = -EIO; + goto out; + } + + mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to " + "top, prev = %"MLFu64"\n", + fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno); + + fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); + bg_ptr = le64_to_cpu(bg->bg_next_group); + prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); + + status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + prev_bg->bg_next_group = bg->bg_next_group; + + status = ocfs2_journal_dirty(handle, prev_bg_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = ocfs2_journal_access(handle, alloc_inode, bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + + status = ocfs2_journal_dirty(handle, bg_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = ocfs2_journal_access(handle, alloc_inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = 0; +out_rollback: + if (status < 0) { + fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); + bg->bg_next_group = cpu_to_le64(bg_ptr); + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + } +out: + mlog_exit(status); + return status; +} + +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted) +{ + return le16_to_cpu(bg->bg_free_bits_count) > wanted; +} + +/* return 0 on success, -ENOSPC to keep searching and any other < 0 + * value on error. 
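+ * + * Both search functions below implement the group_search_t contract: + * the cluster variant also enforces min_bits, while the block variant + * only ever deals in single-bit minimums.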
*/ +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found) +{ + int search = -ENOSPC; + int ret; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + u16 tmp_off, tmp_found; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) { + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + &tmp_off, &tmp_found); + if (ret) + return ret; + + /* ocfs2_block_group_find_clear_bits() might + * return success, but we still want to return + * -ENOSPC unless it found the minimum number + * of bits. */ + if (min_bits <= tmp_found) { + *bit_off = tmp_off; + *bits_found = tmp_found; + search = 0; /* success */ + } + } + + return search; +} + +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found) +{ + int ret = -ENOSPC; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON(min_bits != 1); + BUG_ON(ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + bit_off, bits_found); + + return ret; +} + +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno) +{ + int status; + u16 chain, tmp_bits; + u32 tmp_used; + u64 next_group; + struct ocfs2_journal_handle *handle = ac->ac_handle; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *group_bh = NULL; + struct buffer_head *prev_group_bh = NULL; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + struct ocfs2_group_desc *bg; + + chain = ac->ac_chain; + mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n", + bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno); + + status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh, OCFS2_BH_CACHED, alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + + status = -ENOSPC; + /* for now, the chain search is a bit simplistic. We just use + * the 1st group with any empty bits. 
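+ * + * Each failed search follows bg_next_group to the next descriptor in + * the chain, keeping the previous buffer around in case we decide to + * relink the winner to the front.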
*/ + while ((status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, bit_off, + &tmp_bits)) == -ENOSPC) { + if (!bg->bg_next_group) + break; + + if (prev_group_bh) { + brelse(prev_group_bh); + prev_group_bh = NULL; + } + next_group = le64_to_cpu(bg->bg_next_group); + prev_group_bh = group_bh; + group_bh = NULL; + status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + next_group, &group_bh, + OCFS2_BH_CACHED, alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n", + tmp_bits, bg->bg_blkno); + + *num_bits = tmp_bits; + + BUG_ON(*num_bits == 0); + + /* + * Keep track of previous block descriptor read. When + * we find a target, if we have read more than X + * number of descriptors, and the target is reasonably + * empty, relink him to top of his chain. + * + * We've read 0 extra blocks and only send one more to + * the transaction, yet the next guy to search has a + * much easier time. + * + * Do this *after* figuring out how many bits we're taking out + * of our target group. + */ + if (ac->ac_allow_chain_relink && + (prev_group_bh) && + (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { + status = ocfs2_relink_block_group(handle, alloc_inode, + ac->ac_bh, group_bh, + prev_group_bh, chain); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Ok, claim our bits now: set the info on dinode, chainlist + * and then the group */ + status = ocfs2_journal_access(handle, + alloc_inode, + ac->ac_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); + + status = ocfs2_journal_dirty(handle, + ac->ac_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_set_bits(handle, + alloc_inode, + bg, + group_bh, + *bit_off, + *num_bits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n", + *num_bits, fe->i_blkno); + + *bg_blkno = le64_to_cpu(bg->bg_blkno); +bail: + if (group_bh) + brelse(group_bh); + if (prev_group_bh) + brelse(prev_group_bh); + + mlog_exit(status); + return status; +} + +/* will give out up to bits_wanted contiguous bits. 
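+ * + * We try the victim chain (the one with the most free bits) first, + * and only fall back to scanning the remaining chains, with relinking + * disabled, if that comes up empty.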
*/ +static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno) +{ + int status; + u16 victim, i; + struct ocfs2_chain_list *cl; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); + BUG_ON(!ac->ac_bh); + + fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); + status = -EIO; + goto bail; + } + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= + le32_to_cpu(fe->id1.bitmap1.i_total)) { + ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u " + "used bits but only %u total.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); + status = -EIO; + goto bail; + } + + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + + victim = ocfs2_find_victim_chain(cl); + ac->ac_chain = victim; + ac->ac_allow_chain_relink = 1; + + status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, + num_bits, bg_blkno); + if (!status) + goto bail; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Search of victim chain %u came up with nothing, " + "trying all chains now.\n", victim); + + /* If we didn't pick a good victim, then just default to + * searching each chain in order. Don't allow chain relinking + * because we only calculate enough journal credits for one + * relink per alloc. */ + ac->ac_allow_chain_relink = 0; + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { + if (i == victim) + continue; + if (!cl->cl_recs[i].c_free) + continue; + + ac->ac_chain = i; + status = ocfs2_search_chain(ac, bits_wanted, min_bits, + bit_off, num_bits, + bg_blkno); + if (!status) + break; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } +bail: + + mlog_exit(status); + return status; +} + +int ocfs2_claim_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u16 *suballoc_bit_start, + unsigned int *num_bits, + u64 *blkno_start) +{ + int status; + u64 bg_blkno; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); + BUG_ON(ac->ac_which != OCFS2_AC_USE_META); + BUG_ON(ac->ac_handle != handle); + + status = ocfs2_claim_suballoc_bits(osb, + ac, + bits_wanted, + 1, + suballoc_bit_start, + num_bits, + &bg_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_allocs); + + *blkno_start = bg_blkno + (u64) *suballoc_bit_start; + ac->ac_bits_given += (*num_bits); + status = 0; +bail: + mlog_exit(status); + return status; +} + +int ocfs2_claim_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u16 *suballoc_bit, + u64 *fe_blkno) +{ + int status; + unsigned int num_bits; + u64 bg_blkno; + + mlog_entry_void(); + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + BUG_ON(ac->ac_handle != handle); + + status = ocfs2_claim_suballoc_bits(osb, + ac, + 1, + 1, + suballoc_bit, + &num_bits, + &bg_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_allocs); + + BUG_ON(num_bits != 1); + + *fe_blkno = bg_blkno + (u64) (*suballoc_bit); + ac->ac_bits_given++;
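+ /* fe_blkno now names the disk block that will back the new inode. */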
+ status = 0; +bail: + mlog_exit(status); + return status; +} + +/* translate a group desc. blkno and its bitmap offset into + * disk cluster offset. */ +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 cluster = 0; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg_blkno != osb->first_cluster_group_blkno) + cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); + cluster += (u32) bg_bit_off; + return cluster; +} + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +static inline u64 ocfs2_which_cluster_group(struct inode *inode, + u32 cluster) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 group_no; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + group_no = cluster / osb->bitmap_cpg; + if (!group_no) + return osb->first_cluster_group_blkno; + return ocfs2_clusters_to_blocks(inode->i_sb, + group_no * osb->bitmap_cpg); +} + +/* given the block number of a cluster start, calculate which cluster + * group and descriptor bitmap offset that corresponds to. */ +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + *bg_blkno = ocfs2_which_cluster_group(inode, + data_cluster); + + if (*bg_blkno == osb->first_cluster_group_blkno) + *bg_bit_off = (u16) data_cluster; + else + *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, + data_blkno - *bg_blkno); +} + +/* + * min_bits - minimum contiguous chunk from this total allocation we + * can handle. set to what we asked for originally for a full + * contig. allocation, set to '1' to indicate we can deal with extents + * of any size. + */ +int ocfs2_claim_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters) +{ + int status; + unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + u64 bg_blkno; + u16 bg_bit_off; + + mlog_entry_void(); + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL + && ac->ac_which != OCFS2_AC_USE_MAIN); + BUG_ON(ac->ac_handle != handle); + + if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + status = ocfs2_claim_local_alloc_bits(osb, + handle, + ac, + bits_wanted, + cluster_start, + num_clusters); + if (!status) + atomic_inc(&osb->alloc_stats.local_data); + } else { + if (min_clusters > (osb->bitmap_cpg - 1)) { + /* The only paths asking for contiguousness + * should know about this already. */ + mlog(ML_ERROR, "minimum allocation requested exceeds " + "group bitmap size!"); + status = -ENOSPC; + goto bail; + } + /* clamp the current request down to a realistic size.
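+ * A cluster group covers bitmap_cpg clusters but its first bit backs + * the group descriptor itself, so bitmap_cpg - 1 is the most any + * single claim can return.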
*/ + if (bits_wanted > (osb->bitmap_cpg - 1)) + bits_wanted = osb->bitmap_cpg - 1; + + status = ocfs2_claim_suballoc_bits(osb, + ac, + bits_wanted, + min_clusters, + &bg_bit_off, + num_clusters, + &bg_blkno); + if (!status) { + *cluster_start = + ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, + bg_blkno, + bg_bit_off); + atomic_inc(&osb->alloc_stats.bitmap_data); + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + ac->ac_bits_given += *num_clusters; + +bail: + mlog_exit(status); + return status; +} + +static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + unsigned int tmp; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + struct ocfs2_group_desc *undo_bg = NULL; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + + mlog(0, "off = %u, num = %u\n", bit_off, num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access(handle, alloc_inode, group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; + + tmp = num_bits; + while(tmp--) { + ocfs2_clear_bit((bit_off + tmp), + (unsigned long *) bg->bg_bitmap); + if (ocfs2_is_cluster_bitmap(alloc_inode)) + ocfs2_set_bit(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); + } + le16_add_cpu(&bg->bg_free_bits_count, num_bits); + + status = ocfs2_journal_dirty(handle, group_bh); + if (status < 0) + mlog_errno(status); +bail: + return status; +} + +/* + * expects the suballoc inode to already be locked. 
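+ * + * Freeing is the mirror image of claiming: clear the bits in the + * group, then credit c_free and debit i_used in the chain allocator's + * dinode within the same transaction.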
+ */ +static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + int status = 0; + u32 tmp_used; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto bail; + } + BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); + + mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64 + ", starting at %u\n", + OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno, + start_bit); + + status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, + alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(group)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); + status = -EIO; + goto bail; + } + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + + status = ocfs2_block_group_clear_bits(handle, alloc_inode, + group, group_bh, + start_bit, count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, + count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + + status = ocfs2_journal_dirty(handle, alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + if (group_bh) + brelse(group_bh); + + mlog_exit(status); + return status; +} + +static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + +int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di) +{ + u64 blk = le64_to_cpu(di->i_blkno); + u16 bit = le16_to_cpu(di->i_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, + inode_alloc_bh, bit, bg_blkno, 1); +} + +int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, + struct inode *eb_alloc_inode, + struct buffer_head *eb_alloc_bh, + struct ocfs2_extent_block *eb) +{ + u64 blk = le64_to_cpu(eb->h_blkno); + u16 bit = le16_to_cpu(eb->h_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, + bit, bg_blkno, 1); +} + +int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + int status; + u16 bg_start_bit; + u64 bg_blkno; + struct ocfs2_dinode *fe; + + /* You can't ever have a contiguous set of clusters + * bigger than a block group bitmap so we never have to worry + * about looping on them. */ + + mlog_entry_void(); + + /* This is expensive. We can safely remove once this stuff has + * gotten tested really well. 
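+ * + * The BUG_ON below just verifies that start_blk sits on a cluster + * boundary - converting to clusters and back must be an identity.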
*/ + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + + fe = (struct ocfs2_dinode *) bitmap_bh->b_data; + + ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, + &bg_start_bit); + + mlog(0, "want to free %u clusters starting at block %"MLFu64"\n", + num_clusters, start_blk); + mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n", + bg_blkno, bg_start_bit); + + status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) +{ + printk("Block Group:\n"); + printk("bg_signature: %s\n", bg->bg_signature); + printk("bg_size: %u\n", bg->bg_size); + printk("bg_bits: %u\n", bg->bg_bits); + printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); + printk("bg_chain: %u\n", bg->bg_chain); + printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); + printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group); + printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode); + printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno); +} + +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) +{ + int i; + + printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno); + printk("i_signature: %s\n", fe->i_signature); + printk("i_size: %"MLFu64"\n", fe->i_size); + printk("i_clusters: %u\n", fe->i_clusters); + printk("i_generation: %u\n", + le32_to_cpu(fe->i_generation)); + printk("id1.bitmap1.i_used: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_used)); + printk("id1.bitmap1.i_total: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_total)); + printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); + printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); + printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); + printk("id2.i_chain.cl_next_free_rec: %u\n", + fe->id2.i_chain.cl_next_free_rec); + for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { + printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_free); + printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_total); + printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i, + fe->id2.i_chain.cl_recs[i].c_blkno); + } +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 000000000000..a76c82a7ceac --- /dev/null +++ b/fs/ocfs2/suballoc.h @@ -0,0 +1,132 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.h + * + * Defines sub allocator api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef _CHAINALLOC_H_ +#define _CHAINALLOC_H_ + +typedef int (group_search_t)(struct inode *, + struct buffer_head *, + u32, + u32, + u16 *, + u16 *); + +struct ocfs2_alloc_context { + struct inode *ac_inode; /* which bitmap are we allocating from? */ + struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_bits_wanted; + u32 ac_bits_given; +#define OCFS2_AC_USE_LOCAL 1 +#define OCFS2_AC_USE_MAIN 2 +#define OCFS2_AC_USE_INODE 3 +#define OCFS2_AC_USE_META 4 + u32 ac_which; + struct ocfs2_journal_handle *ac_handle; + + /* these are used by the chain search */ + u16 ac_chain; + int ac_allow_chain_relink; + group_search_t *ac_group_search; +}; + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); +static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) +{ + return ac->ac_bits_wanted - ac->ac_bits_given; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *fe, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u32 bits_wanted, + struct ocfs2_alloc_context **ac); + +int ocfs2_claim_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u16 *suballoc_bit_start, + u32 *num_bits, + u64 *blkno_start); +int ocfs2_claim_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u16 *suballoc_bit, + u64 *fe_blkno); +int ocfs2_claim_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters); + +int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di); +int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, + struct inode *eb_alloc_inode, + struct buffer_head *eb_alloc_bh, + struct ocfs2_extent_block *eb); +int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); + +static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, + u64 bg_blkno) +{ + /* This should work for all block group descriptors as only + * the 1st group descriptor of the cluster bitmap is + * different. */ + + if (bg_blkno == osb->first_cluster_group_blkno) + return 0; + + /* the rest of the block groups are located at the beginning + * of their 1st cluster, so a direct translation just + * works. */ + return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); +} + +static inline int ocfs2_is_cluster_bitmap(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; +} + +/* This is for local alloc ONLY. Others should use the task-specific + * apis above. 
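+ * + * It reserves bits straight from the global bitmap with no local + * alloc fallback, which is exactly what the local alloc window + * refill path needs.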
*/ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); + +#endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 000000000000..48bf7f0ce544 --- /dev/null +++ b/fs/ocfs2/super.c @@ -0,0 +1,1733 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.c + * + * load/unload driver, mount/dismount volumes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/random.h> +#include <linux/statfs.h> +#include <linux/moduleparam.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/parser.h> +#include <linux/crc32.h> +#include <linux/debugfs.h> + +#include <cluster/nodemanager.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +/* this should be the only file to include a version 1 header */ +#include "ocfs1_fs_compat.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "export.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ver.h" +#include "vote.h" + +#include "buffer_head_io.h" + +/* + * Globals + */ +static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED; + +static u32 osb_id; /* Keeps track of next available OSB Id */ + +static kmem_cache_t *ocfs2_inode_cachep = NULL; + +kmem_cache_t *ocfs2_lock_cache = NULL; + +/* OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own.
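+ * + * Everything queued here must therefore be safe to run in process + * context and may block on cluster locks and I/O.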
*/ +struct workqueue_struct *ocfs2_wq = NULL; + +static struct dentry *ocfs2_debugfs_root = NULL; + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +static int ocfs2_parse_options(struct super_block *sb, char *options, + unsigned long *mount_opt, int is_remount); +static void ocfs2_put_super(struct super_block *sb); +static int ocfs2_mount_volume(struct super_block *sb); +static int ocfs2_remount(struct super_block *sb, int *flags, char *data); +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); +static int ocfs2_initialize_mem_caches(void); +static void ocfs2_free_mem_caches(void); +static void ocfs2_delete_osb(struct ocfs2_super *osb); + +static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); + +static int ocfs2_sync_fs(struct super_block *sb, int wait); + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); +static int ocfs2_release_system_inodes(struct ocfs2_super *osb); +static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); +static int ocfs2_check_volume(struct ocfs2_super *osb); +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 sectsize); +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size); +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size); +static void ocfs2_write_super(struct super_block *sb); +static struct inode *ocfs2_alloc_inode(struct super_block *sb); +static void ocfs2_destroy_inode(struct inode *inode); + +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); + +static struct super_operations ocfs2_sops = { + .statfs = ocfs2_statfs, + .alloc_inode = ocfs2_alloc_inode, + .destroy_inode = ocfs2_destroy_inode, + .drop_inode = ocfs2_drop_inode, + .clear_inode = ocfs2_clear_inode, + .delete_inode = ocfs2_delete_inode, + .sync_fs = ocfs2_sync_fs, + .write_super = ocfs2_write_super, + .put_super = ocfs2_put_super, + .remount_fs = ocfs2_remount, +}; + +enum { + Opt_barrier, + Opt_err_panic, + Opt_err_ro, + Opt_intr, + Opt_nointr, + Opt_hb_none, + Opt_hb_local, + Opt_data_ordered, + Opt_data_writeback, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_barrier, "barrier=%u"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_intr, "intr"}, + {Opt_nointr, "nointr"}, + {Opt_hb_none, OCFS2_HB_NONE}, + {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_err, NULL} +}; + +/* + * write_super and sync_fs ripped right out of ext3. 
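+ * + * sync_fs additionally flushes the truncate log and, when wait is + * set, blocks until the journal's commit thread has pushed the + * current transaction to disk.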
+ */ +static void ocfs2_write_super(struct super_block *sb) +{ + if (down_trylock(&sb->s_lock) == 0) + BUG(); + sb->s_dirt = 0; +} + +static int ocfs2_sync_fs(struct super_block *sb, int wait) +{ + int status = 0; + tid_t target; + struct ocfs2_super *osb = OCFS2_SB(sb); + + sb->s_dirt = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (wait) { + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + } else { + ocfs2_schedule_truncate_log_flush(osb, 0); + } + + if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (wait) + log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); + } + return 0; +} + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + mlog_entry_void(); + + new = ocfs2_iget(osb, osb->root_blkno); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->root_inode = new; + + new = ocfs2_iget(osb, osb->system_dir_blkno); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->sys_root_inode = new; + + for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; + i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog_errno(status); + /* FIXME: Should ERROR_RO_FS */ + mlog(ML_ERROR, "Unable to load system inode %d, " + "possibly corrupt fs?", i); + goto bail; + } + // the array now has one ref, so drop this one + iput(new); + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + mlog_entry_void(); + + for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; + i < NUM_SYSTEM_INODES; + i++) { + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", + status, i, osb->slot_num); + goto bail; + } + /* the array now has one ref, so drop this one */ + iput(new); + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_release_system_inodes(struct ocfs2_super *osb) +{ + int status = 0, i; + struct inode *inode; + + mlog_entry_void(); + + for (i = 0; i < NUM_SYSTEM_INODES; i++) { + inode = osb->system_inodes[i]; + if (inode) { + iput(inode); + osb->system_inodes[i] = NULL; + } + } + + inode = osb->sys_root_inode; + if (inode) { + iput(inode); + osb->sys_root_inode = NULL; + } + + inode = osb->root_inode; + if (inode) { + iput(inode); + osb->root_inode = NULL; + } + + mlog_exit(status); + return status; +} + +/* We're allocating fs objects, use GFP_NOFS */ +static struct inode *ocfs2_alloc_inode(struct super_block *sb) +{ + struct ocfs2_inode_info *oi; + + oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS); + if (!oi) + return NULL; + + return &oi->vfs_inode; +} + +static void ocfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); +} + +/* From xfs_super.c:xfs_max_file_offset + * Copyright (c) 2000-2004 Silicon Graphics, Inc. + */ +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_prepare_write does this in an [unsigned] long... 
+ * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBD) + BUG_ON(sizeof(sector_t) != 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((unsigned long long)pagefactor) << bitshift) - 1; +} + +static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +{ + int incompat_features; + int ret = 0; + unsigned long parsed_options; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + ret = -EINVAL; + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != + (parsed_options & OCFS2_MOUNT_HB_LOCAL)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != + (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change data mode on remount\n"); + goto out; + } + + /* We're going to/from readonly mode. */ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Lock here so the check of HARD_RO and the potential + * setting of SOFT_RO is atomic. */ + spin_lock(&osb->osb_lock); + if (osb->osb_flags & OCFS2_OSB_HARD_RO) { + mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); + ret = -EROFS; + goto unlock_osb; + } + + if (*flags & MS_RDONLY) { + mlog(0, "Going to ro mode.\n"); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + } else { + mlog(0, "Making ro filesystem writeable.\n"); + + if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { + mlog(ML_ERROR, "Cannot remount RDWR " + "filesystem due to previous errors.\n"); + ret = -EROFS; + goto unlock_osb; + } + incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); + if (incompat_features) { + mlog(ML_ERROR, "Cannot remount RDWR because " + "of unsupported optional features " + "(%x).\n", incompat_features); + ret = -EINVAL; + goto unlock_osb; + } + sb->s_flags &= ~MS_RDONLY; + osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; + } +unlock_osb: + spin_unlock(&osb->osb_lock); + } + + if (!ret) { + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); + + /* Only save off the new mount options in case of a successful + * remount. */ + osb->s_mount_opt = parsed_options; + } +out: + return ret; +} + +static int ocfs2_sb_probe(struct super_block *sb, + struct buffer_head **bh, + int *sector_size) +{ + int status = 0, tmpstat; + struct ocfs1_vol_disk_hdr *hdr; + struct ocfs2_dinode *di; + int blksize; + + *bh = NULL; + + /* may be > 512 */ + *sector_size = bdev_hardsect_size(sb->s_bdev); + if (*sector_size > OCFS2_MAX_BLOCKSIZE) { + mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", + *sector_size, OCFS2_MAX_BLOCKSIZE); + status = -EINVAL; + goto bail; + } + + /* Can this really happen? 
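+ * (A hardware sector smaller than OCFS2_MIN_BLOCKSIZE would be + * surprising, but clamping to the minimum blocksize costs nothing.)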
*/ + if (*sector_size < OCFS2_MIN_BLOCKSIZE) + *sector_size = OCFS2_MIN_BLOCKSIZE; + + /* check block zero for old format */ + status = ocfs2_get_sector(sb, bh, 0, *sector_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; + if (hdr->major_version == OCFS1_MAJOR_VERSION) { + mlog(ML_ERROR, "incompatible version: %u.%u\n", + hdr->major_version, hdr->minor_version); + status = -EINVAL; + } + if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, + strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { + mlog(ML_ERROR, "incompatible volume signature: %8s\n", + hdr->signature); + status = -EINVAL; + } + brelse(*bh); + *bh = NULL; + if (status < 0) { + mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " + "upgraded before mounting with ocfs v2\n"); + goto bail; + } + + /* + * Now check at magic offset for 512, 1024, 2048, 4096 + * blocksizes. 4096 is the maximum blocksize because it is + * the minimum clustersize. + */ + status = -EINVAL; + for (blksize = *sector_size; + blksize <= OCFS2_MAX_BLOCKSIZE; + blksize <<= 1) { + tmpstat = ocfs2_get_sector(sb, bh, + OCFS2_SUPER_BLOCK_BLKNO, + blksize); + if (tmpstat < 0) { + status = tmpstat; + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) (*bh)->b_data; + status = ocfs2_verify_volume(di, *bh, blksize); + if (status >= 0) + goto bail; + brelse(*bh); + *bh = NULL; + if (status != -EAGAIN) + break; + } + +bail: + return status; +} + +static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct dentry *root; + int status, sector_size; + unsigned long parsed_opt; + struct inode *inode = NULL; + struct ocfs2_super *osb = NULL; + struct buffer_head *bh = NULL; + + mlog_entry("%p, %p, %i", sb, data, silent); + + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (!o2hb_check_local_node_heartbeating()) { + status = -EINVAL; + goto read_super_error; + } + + /* probe for superblock */ + status = ocfs2_sb_probe(sb, &bh, &sector_size); + if (status < 0) { + mlog(ML_ERROR, "superblock probe failed!\n"); + goto read_super_error; + } + + status = ocfs2_initialize_super(sb, bh, sector_size); + osb = OCFS2_SB(sb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + brelse(bh); + bh = NULL; + + if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { + status = -EINVAL; + goto read_super_error; + } + osb->s_mount_opt = parsed_opt; + + sb->s_magic = OCFS2_SUPER_MAGIC; + + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + * heartbeat=none */ + if (bdev_read_only(sb->s_bdev)) { + if (!(sb->s_flags & MS_RDONLY)) { + status = -EACCES; + mlog(ML_ERROR, "Readonly device detected but readonly " + "mount was not specified.\n"); + goto read_super_error; + } + + /* You should not be able to start a local heartbeat + * on a readonly device. */ + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + status = -EROFS; + mlog(ML_ERROR, "Local heartbeat specified on readonly " + "device.\n"); + goto read_super_error; + } + + status = ocfs2_check_journals_nolocks(osb); + if (status < 0) { + if (status == -EROFS) + mlog(ML_ERROR, "Recovery required on readonly " + "file system, but write access is " + "unavailable.\n"); + else + mlog_errno(status); + goto read_super_error; + } + + ocfs2_set_ro_flag(osb, 1); + + printk(KERN_NOTICE "Readonly device detected. No cluster " + "services will be utilized for this mount.
Recovery " + "will be skipped.\n"); + } + + if (!ocfs2_is_hard_readonly(osb)) { + /* If this isn't a hard readonly mount, then we need + * to make sure that heartbeat is in a valid state, + * and that we mark ourselves soft readonly is -oro + * was specified. */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + mlog(ML_ERROR, "No heartbeat for device (%s)\n", + sb->s_id); + status = -EINVAL; + goto read_super_error; + } + + if (sb->s_flags & MS_RDONLY) + ocfs2_set_ro_flag(osb, 0); + } + + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, + ocfs2_debugfs_root); + if (!osb->osb_debug_root) { + status = -EINVAL; + mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); + goto read_super_error; + } + + status = ocfs2_mount_volume(sb); + if (osb->root_inode) + inode = igrab(osb->root_inode); + + if (status < 0) + goto read_super_error; + + if (!inode) { + status = -EIO; + mlog_errno(status); + goto read_super_error; + } + + root = d_alloc_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + + printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s " + "data mode.\n", + MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num, + osb->slot_num, + osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : + "ordered"); + + atomic_set(&osb->vol_state, VOLUME_MOUNTED); + wake_up(&osb->osb_mount_event); + + mlog_exit(status); + return status; + +read_super_error: + if (bh != NULL) + brelse(bh); + + if (inode) + iput(inode); + + if (osb) { + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + } + + mlog_exit(status); + return status; +} + +static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); +} + +static struct file_system_type ocfs2_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2", + .get_sb = ocfs2_get_sb, /* is this called when we mount + * the fs? */ + .kill_sb = kill_block_super, /* set to the generic one + * right now, but do we + * need to change that? */ + .fs_flags = FS_REQUIRES_DEV, + .next = NULL +}; + +static int ocfs2_parse_options(struct super_block *sb, + char *options, + unsigned long *mount_opt, + int is_remount) +{ + int status; + char *p; + + mlog_entry("remount: %d, options: \"%s\"\n", is_remount, + options ? 
options : "(none)"); + + *mount_opt = 0; + + if (!options) { + status = 1; + goto bail; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token, option; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_hb_local: + *mount_opt |= OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_hb_none: + *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_barrier: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + *mount_opt |= OCFS2_MOUNT_BARRIER; + else + *mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + *mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_nointr: + *mount_opt |= OCFS2_MOUNT_NOINTR; + break; + case Opt_err_panic: + *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_err_ro: + *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_data_ordered: + *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_data_writeback: + *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; + break; + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " + "or missing value\n", p); + status = 0; + goto bail; + } + } + + status = 1; + +bail: + mlog_exit(status); + return status; +} + +static int __init ocfs2_init(void) +{ + int status; + + mlog_entry_void(); + + ocfs2_print_version(); + + if (init_ocfs2_extent_maps()) + return -ENOMEM; + + status = init_ocfs2_uptodate_cache(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_initialize_mem_caches(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!ocfs2_wq) { + status = -ENOMEM; + goto leave; + } + + spin_lock(&ocfs2_globals_lock); + osb_id = 0; + spin_unlock(&ocfs2_globals_lock); + + ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); + if (!ocfs2_debugfs_root) { + status = -EFAULT; + mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + } + +leave: + if (status < 0) { + ocfs2_free_mem_caches(); + exit_ocfs2_uptodate_cache(); + exit_ocfs2_extent_maps(); + } + + mlog_exit(status); + + if (status >= 0) { + return register_filesystem(&ocfs2_fs_type); + } else + return -1; +} + +static void __exit ocfs2_exit(void) +{ + mlog_entry_void(); + + if (ocfs2_wq) { + flush_workqueue(ocfs2_wq); + destroy_workqueue(ocfs2_wq); + } + + debugfs_remove(ocfs2_debugfs_root); + + ocfs2_free_mem_caches(); + + unregister_filesystem(&ocfs2_fs_type); + + exit_ocfs2_extent_maps(); + + exit_ocfs2_uptodate_cache(); + + mlog_exit_void(); +} + +static void ocfs2_put_super(struct super_block *sb) +{ + mlog_entry("(0x%p)\n", sb); + + ocfs2_sync_blockdev(sb); + ocfs2_dismount_volume(sb, 0); + + mlog_exit_void(); +} + +static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct ocfs2_super *osb; + u32 numbits, freebits; + int status; + struct ocfs2_dinode *bm_lock; + struct buffer_head *bh = NULL; + struct inode *inode = NULL; + + mlog_entry("(%p, %p)\n", sb, buf); + + osb = OCFS2_SB(sb); + + inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + mlog(ML_ERROR, "failed to get bitmap inode\n"); + status = -EIO; + goto bail; + } + + status = ocfs2_meta_lock(inode, NULL, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bm_lock = (struct ocfs2_dinode *) bh->b_data; + + numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); + freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); + + buf->f_type = OCFS2_SUPER_MAGIC; + 
buf->f_bsize = sb->s_blocksize;
+ buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
+ buf->f_blocks = ((sector_t) numbits) *
+ (osb->s_clustersize >> osb->sb->s_blocksize_bits);
+ buf->f_bfree = ((sector_t) freebits) *
+ (osb->s_clustersize >> osb->sb->s_blocksize_bits);
+ buf->f_bavail = buf->f_bfree;
+ buf->f_files = numbits;
+ buf->f_ffree = freebits;
+
+ brelse(bh);
+
+ ocfs2_meta_unlock(inode, 0);
+ status = 0;
+bail:
+ if (inode)
+ iput(inode);
+
+ mlog_exit(status);
+
+ return status;
+}
+
+static void ocfs2_inode_init_once(void *data,
+ kmem_cache_t *cachep,
+ unsigned long flags)
+{
+ struct ocfs2_inode_info *oi = data;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR) {
+ oi->ip_flags = 0;
+ oi->ip_open_count = 0;
+ spin_lock_init(&oi->ip_lock);
+ ocfs2_extent_map_init(&oi->vfs_inode);
+ INIT_LIST_HEAD(&oi->ip_handle_list);
+ INIT_LIST_HEAD(&oi->ip_io_markers);
+ oi->ip_handle = NULL;
+ oi->ip_created_trans = 0;
+ oi->ip_last_trans = 0;
+ oi->ip_dir_start_lookup = 0;
+
+ init_rwsem(&oi->ip_alloc_sem);
+ init_MUTEX(&(oi->ip_io_sem));
+
+ oi->ip_blkno = 0ULL;
+ oi->ip_clusters = 0;
+
+ ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+
+ ocfs2_metadata_cache_init(&oi->vfs_inode);
+
+ inode_init_once(&oi->vfs_inode);
+ }
+}
+
+static int ocfs2_initialize_mem_caches(void)
+{
+ ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
+ sizeof(struct ocfs2_inode_info),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+ ocfs2_inode_init_once, NULL);
+ if (!ocfs2_inode_cachep)
+ return -ENOMEM;
+
+ ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
+ sizeof(struct ocfs2_journal_lock),
+ 0,
+ SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!ocfs2_lock_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void ocfs2_free_mem_caches(void)
+{
+ if (ocfs2_inode_cachep)
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ if (ocfs2_lock_cache)
+ kmem_cache_destroy(ocfs2_lock_cache);
+
+ ocfs2_inode_cachep = NULL;
+ ocfs2_lock_cache = NULL;
+}
+
+static int ocfs2_get_sector(struct super_block *sb,
+ struct buffer_head **bh,
+ int block,
+ int sect_size)
+{
+ if (!sb_set_blocksize(sb, sect_size)) {
+ mlog(ML_ERROR, "unable to set blocksize\n");
+ return -EIO;
+ }
+
+ *bh = sb_getblk(sb, block);
+ if (!*bh) {
+ mlog_errno(-EIO);
+ return -EIO;
+ }
+ lock_buffer(*bh);
+ if (!buffer_dirty(*bh))
+ clear_buffer_uptodate(*bh);
+ unlock_buffer(*bh);
+ ll_rw_block(READ, 1, bh);
+ wait_on_buffer(*bh);
+ return 0;
+}
+
+/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
+static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
+{
+ int status;
+
+ /* XXX hold a ref on the node while mounted? easy enough, if
+ * desirable. 
*/ + osb->node_num = o2nm_this_node(); + if (osb->node_num == O2NM_MAX_NODES) { + mlog(ML_ERROR, "could not find this host's node number\n"); + status = -ENOENT; + goto bail; + } + + mlog(ML_NOTICE, "I am node %d\n", osb->node_num); + + status = 0; +bail: + return status; +} + +static int ocfs2_mount_volume(struct super_block *sb) +{ + int status = 0; + int unlock_super = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb)) + goto leave; + + status = ocfs2_fill_local_node_info(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_register_hb_callbacks(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_dlm_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* requires vote_thread to be running. */ + status = ocfs2_register_net_handlers(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + unlock_super = 1; + + /* This will load up the node map and add ourselves to it. */ + status = ocfs2_find_slot(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_populate_mounted_map(osb); + + /* load all node-local system inodes */ + status = ocfs2_init_local_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_check_volume(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_truncate_log_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* This should be sent *after* we recovered our journal as it + * will cause other nodes to unmark us as needing + * recovery. However, we need to send it *before* dropping the + * super block lock as otherwise their recovery threads might + * try to clean us up while we're live! */ + status = ocfs2_request_mount_vote(osb); + if (status < 0) + mlog_errno(status); + +leave: + if (unlock_super) + ocfs2_super_unlock(osb, 1); + + mlog_exit(status); + return status; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) +{ + int tmp; + struct ocfs2_super *osb = NULL; + + mlog_entry("(0x%p)\n", sb); + + BUG_ON(!sb); + osb = OCFS2_SB(sb); + BUG_ON(!osb); + + ocfs2_shutdown_local_alloc(osb); + + ocfs2_truncate_log_shutdown(osb); + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + down(&osb->recovery_lock); + osb->disable_recovery = 1; + up(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + ocfs2_journal_shutdown(osb); + + ocfs2_sync_blockdev(sb); + + /* No dlm means we've failed during mount, so skip all the + * steps which depended on that to complete. 
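+ * Concretely, the umount vote, the slot release, the super
+ * lock/unlock and the net handler/dlm teardown below are all
+ * guarded by osb->dlm checks.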
*/ + if (osb->dlm) { + tmp = ocfs2_super_lock(osb, 1); + if (tmp < 0) { + mlog_errno(tmp); + return; + } + + tmp = ocfs2_request_umount_vote(osb); + if (tmp < 0) + mlog_errno(tmp); + + if (osb->slot_num != OCFS2_INVALID_SLOT) + ocfs2_put_slot(osb); + + ocfs2_super_unlock(osb, 1); + } + + ocfs2_release_system_inodes(osb); + + if (osb->dlm) { + ocfs2_unregister_net_handlers(osb); + + ocfs2_dlm_shutdown(osb); + } + + ocfs2_clear_hb_callbacks(osb); + + debugfs_remove(osb->osb_debug_root); + + if (!mnt_err) + ocfs2_stop_heartbeat(osb); + + atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); + + printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num); + + ocfs2_delete_osb(osb); + kfree(osb); + sb->s_dev = 0; + sb->s_fs_info = NULL; +} + +static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, + unsigned uuid_bytes) +{ + int i, ret; + char *ptr; + + BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); + + osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + if (osb->uuid_str == NULL) + return -ENOMEM; + + memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN); + + for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { + /* print with null */ + ret = snprintf(ptr, 3, "%02X", uuid[i]); + if (ret != 2) /* drop super cleans up */ + return -EINVAL; + /* then only advance past the last char */ + ptr += 2; + } + + return 0; +} + +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size) +{ + int status = 0; + int i; + struct ocfs2_dinode *di = NULL; + struct inode *inode = NULL; + struct buffer_head *bitmap_bh = NULL; + struct ocfs2_journal *journal; + __le32 uuid_net_key; + struct ocfs2_super *osb; + + mlog_entry_void(); + + osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL); + if (!osb) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + sb->s_fs_info = osb; + sb->s_op = &ocfs2_sops; + sb->s_export_op = &ocfs2_export_ops; + sb->s_flags |= MS_NOATIME; + /* this is needed to support O_LARGEFILE */ + sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); + + osb->sb = sb; + /* Save off for ocfs2_rw_direct */ + osb->s_sectsize_bits = blksize_bits(sector_size); + if (!osb->s_sectsize_bits) + BUG(); + + osb->net_response_ids = 0; + spin_lock_init(&osb->net_response_lock); + INIT_LIST_HEAD(&osb->net_response_list); + + INIT_LIST_HEAD(&osb->osb_net_handlers); + init_waitqueue_head(&osb->recovery_event); + spin_lock_init(&osb->vote_task_lock); + init_waitqueue_head(&osb->vote_event); + osb->vote_work_sequence = 0; + osb->vote_wake_sequence = 0; + INIT_LIST_HEAD(&osb->blocked_lock_list); + osb->blocked_lock_count = 0; + INIT_LIST_HEAD(&osb->vote_list); + spin_lock_init(&osb->osb_lock); + + atomic_set(&osb->alloc_stats.moves, 0); + atomic_set(&osb->alloc_stats.local_data, 0); + atomic_set(&osb->alloc_stats.bitmap_data, 0); + atomic_set(&osb->alloc_stats.bg_allocs, 0); + atomic_set(&osb->alloc_stats.bg_extends, 0); + + ocfs2_init_node_maps(osb); + + snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + init_MUTEX(&osb->recovery_lock); + + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + + init_waitqueue_head(&osb->checkpoint_event); + atomic_set(&osb->needs_checkpoint, 0); + + osb->node_num = O2NM_INVALID_NODE_NUM; + osb->slot_num = OCFS2_INVALID_SLOT; + + osb->local_alloc_state = OCFS2_LA_UNUSED; + osb->local_alloc_bh = NULL; + + ocfs2_setup_hb_callbacks(osb); + + 
init_waitqueue_head(&osb->osb_mount_event); + + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); + if (!osb->vol_label) { + mlog(ML_ERROR, "unable to alloc vol label\n"); + status = -ENOMEM; + goto bail; + } + + osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL); + if (!osb->uuid) { + mlog(ML_ERROR, "unable to alloc uuid\n"); + status = -ENOMEM; + goto bail; + } + + di = (struct ocfs2_dinode *)bh->b_data; + + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); + + osb->s_feature_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); + osb->s_feature_ro_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); + osb->s_feature_incompat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); + + if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount because of unsupported " + "optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + if (!(osb->sb->s_flags & MS_RDONLY) && + (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount RDWR because of " + "unsupported optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + + get_random_bytes(&osb->s_next_generation, sizeof(u32)); + + /* FIXME + * This should be done in ocfs2_journal_init(), but unknown + * ordering issues will cause the filesystem to crash. + * If anyone wants to figure out what part of the code + * refers to osb->journal before ocfs2_journal_init() is run, + * be my guest. + */ + /* initialize our journal structure */ + + journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = (unsigned long) 1; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb); + journal->j_state = OCFS2_JOURNAL_FREE; + + /* get some pseudo constants for clustersize bits */ + osb->s_clustersize_bits = + le32_to_cpu(di->id2.i_super.s_clustersize_bits); + osb->s_clustersize = 1 << osb->s_clustersize_bits; + mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); + + if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || + osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { + mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", + osb->s_clustersize); + status = -EINVAL; + goto bail; + } + + if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) + > (u32)~0UL) { + mlog(ML_ERROR, "Volume might try to write to blocks beyond " + "what jbd can address in 32 bits.\n"); + status = -EINVAL; + goto bail; + } + + if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid))) { + mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); + status = -ENOMEM; + goto bail; + } + + memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key)); + osb->net_key = le32_to_cpu(uuid_net_key); + + strncpy(osb->vol_label, di->id2.i_super.s_label, 63); + osb->vol_label[63] = '\0'; + osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 
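+ /* A worked example for the jbd addressability check above
+ * (illustrative geometry, not from this change): with 1M clusters
+ * (s_clustersize_bits == 20) and a 4K block size,
+ * ocfs2_clusters_to_blocks() shifts left by 20 - 12 == 8, so any
+ * volume with more than 2^24 clusters (16TB) would overflow
+ * (u32)~0UL and is refused. */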
+ osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
+ osb->first_cluster_group_blkno =
+ le64_to_cpu(di->id2.i_super.s_first_cluster_group);
+ osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+ mlog(0, "vol_label: %s\n", osb->vol_label);
+ mlog(0, "uuid: %s\n", osb->uuid_str);
+ mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
+ osb->root_blkno, osb->system_dir_blkno);
+
+ osb->osb_dlm_debug = ocfs2_new_dlm_debug();
+ if (!osb->osb_dlm_debug) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ atomic_set(&osb->vol_state, VOLUME_INIT);
+
+ /* load root, system_dir, and all global system inodes */
+ status = ocfs2_init_global_system_inodes(osb);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /*
+ * global bitmap
+ */
+ inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!inode) {
+ status = -EINVAL;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+
+ status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
+ inode);
+ iput(inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ di = (struct ocfs2_dinode *) bitmap_bh->b_data;
+ osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+ osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
+ brelse(bitmap_bh);
+ mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
+ osb->bitmap_blkno, osb->bitmap_cpg);
+
+ status = ocfs2_init_slot_info(osb);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* Link this osb onto the global linked list of all osb structures. */
+ /* The global linked list is maintained for the whole driver. */
+ spin_lock(&ocfs2_globals_lock);
+ osb->osb_id = osb_id;
+ if (osb_id < OCFS2_MAX_OSB_ID)
+ osb_id++;
+ else {
+ mlog(ML_ERROR, "Too many volumes mounted\n");
+ status = -ENOMEM;
+ }
+ spin_unlock(&ocfs2_globals_lock);
+
+bail:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * will return: -EAGAIN if it is ok to keep searching for superblocks
+ * -EINVAL if there is a bad superblock
+ * 0 on success
+ */
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+ struct buffer_head *bh,
+ u32 blksz)
+{
+ int status = -EAGAIN;
+
+ mlog_entry_void();
+
+ if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
+ strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+ status = -EINVAL;
+ if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+ mlog(ML_ERROR, "found superblock with incorrect block "
+ "size: found %u, should be %u\n",
+ 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
+ blksz);
+ } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
+ OCFS2_MAJOR_REV_LEVEL ||
+ le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
+ OCFS2_MINOR_REV_LEVEL) {
+ mlog(ML_ERROR, "found superblock with bad version: "
+ "found %u.%u, should be %u.%u\n",
+ le16_to_cpu(di->id2.i_super.s_major_rev_level),
+ le16_to_cpu(di->id2.i_super.s_minor_rev_level),
+ OCFS2_MAJOR_REV_LEVEL,
+ OCFS2_MINOR_REV_LEVEL);
+ } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
+ mlog(ML_ERROR, "bad block number on superblock: "
+ "found %"MLFu64", should be %llu\n",
+ di->i_blkno, (unsigned long long)bh->b_blocknr);
+ } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
+ le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
+ mlog(ML_ERROR, "bad cluster size found: %u\n",
+ 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
+ } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
+ mlog(ML_ERROR, "bad root_blkno: 
0\n"); + } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { + mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); + } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { + mlog(ML_ERROR, + "Superblock slots found greater than file system " + "maximum: found %u, max %u\n", + le16_to_cpu(di->id2.i_super.s_max_slots), + OCFS2_MAX_SLOTS); + } else { + /* found it! */ + status = 0; + } + } + + mlog_exit(status); + return status; +} + +static int ocfs2_check_volume(struct ocfs2_super *osb) +{ + int status = 0; + int dirty; + struct ocfs2_dinode *local_alloc = NULL; /* only used if we + * recover + * ourselves. */ + + mlog_entry_void(); + + /* Init our journal object. */ + status = ocfs2_journal_init(osb->journal, &dirty); + if (status < 0) { + mlog(ML_ERROR, "Could not initialize journal!\n"); + goto finally; + } + + /* If the journal was unmounted cleanly then we don't want to + * recover anything. Otherwise, journal_load will do that + * dirty work for us :) */ + if (!dirty) { + status = ocfs2_journal_wipe(osb->journal, 0); + if (status < 0) { + mlog_errno(status); + goto finally; + } + } else { + mlog(ML_NOTICE, "File system was not unmounted cleanly, " + "recovering volume.\n"); + } + + /* will play back anything left in the journal. */ + ocfs2_journal_load(osb->journal); + + if (dirty) { + /* recover my local alloc if we didn't unmount cleanly. */ + status = ocfs2_begin_local_alloc_recovery(osb, + osb->slot_num, + &local_alloc); + if (status < 0) { + mlog_errno(status); + goto finally; + } + /* we complete the recovery process after we've marked + * ourselves as mounted. */ + } + + mlog(0, "Journal loaded.\n"); + + status = ocfs2_load_local_alloc(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + if (dirty) { + /* Recovery will be completed after we've mounted the + * rest of the volume. */ + osb->dirty = 1; + osb->local_alloc_copy = local_alloc; + local_alloc = NULL; + } + + /* go through each journal, trylock it and if you get the + * lock, and it's marked as dirty, set the bit in the recover + * map and launch a recovery thread for it. */ + status = ocfs2_mark_dead_nodes(osb); + if (status < 0) + mlog_errno(status); + +finally: + if (local_alloc) + kfree(local_alloc); + + mlog_exit(status); + return status; +} + +/* + * The routine gets called from dismount or close whenever a dismount on + * volume is requested and the osb open count becomes 1. + * It will remove the osb from the global list and also free up all the + * initialized resources and fileobject. + */ +static void ocfs2_delete_osb(struct ocfs2_super *osb) +{ + mlog_entry_void(); + + /* This function assumes that the caller has the main osb resource */ + + if (osb->slot_info) + ocfs2_free_slot_info(osb->slot_info); + + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the start of ocfs2_initalize_osb(), + * we free it here. + */ + kfree(osb->journal); + if (osb->local_alloc_copy) + kfree(osb->local_alloc_copy); + kfree(osb->uuid_str); + ocfs2_put_dlm_debug(osb->osb_dlm_debug); + memset(osb, 0, sizeof(struct ocfs2_super)); + + mlog_exit_void(); +} + +/* Put OCFS2 into a readonly state, or (if the user specifies it), + * panic(). We do not support continue-on-error operation. 
*/
+static void ocfs2_handle_error(struct super_block *sb)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
+ panic("OCFS2: (device %s): panic forced after error\n",
+ sb->s_id);
+
+ ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return;
+
+ printk(KERN_CRIT "File system is now read-only due to the potential "
+ "of on-disk corruption. Please run fsck.ocfs2 once the file "
+ "system is unmounted.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+}
+
+static char error_buf[1024];
+
+void __ocfs2_error(struct super_block *sb,
+ const char *function,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsprintf(error_buf, fmt, args);
+ va_end(args);
+
+ /* Not using mlog here because we want to show the actual
+ * function the error came from. */
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
+ sb->s_id, function, error_buf);
+
+ ocfs2_handle_error(sb);
+}
+
+/* Handle critical errors. This is intentionally more drastic than
+ * ocfs2_handle_error, so we only use it for things like journal errors,
+ * etc. */
+void __ocfs2_abort(struct super_block* sb,
+ const char *function,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsprintf(error_buf, fmt, args);
+ va_end(args);
+
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
+ sb->s_id, function, error_buf);
+
+ /* We don't have the cluster support yet to go straight to
+ * hard readonly in here. Until then, we want to keep
+ * ocfs2_abort() so that we can at least mark critical
+ * errors.
+ *
+ * TODO: This should abort the journal and alert other nodes
+ * that our slot needs recovery. */
+
+ /* Force a panic(). This stinks, but it's better than letting
+ * things continue without having a proper hard readonly
+ * here. */
+ OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+ ocfs2_handle_error(sb);
+}
+
+module_init(ocfs2_init);
+module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 000000000000..c564177dfbdc
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SUPER_H
+#define OCFS2_SUPER_H
+
+extern struct workqueue_struct *ocfs2_wq;
+
+int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
+ int node_num);
+
+void __ocfs2_error(struct super_block *sb,
+ const char *function,
+ const char *fmt, ...);
+#define ocfs2_error(sb, fmt, args...) 
__ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) + +void __ocfs2_abort(struct super_block *sb, + const char *function, + const char *fmt, ...); +#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) + +#endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 000000000000..f6986bd79e75 --- /dev/null +++ b/fs/ocfs2/symlink.c @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * linux/cluster/ssi/cfs/symlink.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE + * or NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * Portions Copyright (C) 2001 Compaq Computer Corporation + * + * ocfs2 symlink handling code. + * + * Copyright (C) 2004, 2005 Oracle. + * + */ + +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_NAMEI +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "symlink.h" + +#include "buffer_head_io.h" + +static char *ocfs2_page_getlink(struct dentry * dentry, + struct page **ppage); +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh); + +/* get the link contents into pagecache */ +static char *ocfs2_page_getlink(struct dentry * dentry, + struct page **ppage) +{ + struct page * page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_cache_page(mapping, 0, + (filler_t *)mapping->a_ops->readpage, NULL); + if (IS_ERR(page)) + goto sync_fail; + wait_on_page_locked(page); + if (!PageUptodate(page)) + goto async_fail; + *ppage = page; + return kmap(page); + +async_fail: + page_cache_release(page); + return ERR_PTR(-EIO); + +sync_fail: + return (char*)page; +} + +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh) +{ + int status; + char *link = NULL; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + bh, + OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + link = ERR_PTR(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) (*bh)->b_data; + link = (char *) fe->id2.i_symlink; +bail: + mlog_exit(status); + + return link; +} + +static int ocfs2_readlink(struct dentry *dentry, + char __user *buffer, + int buflen) +{ + int ret; + char *link; + struct buffer_head *bh = NULL; + struct inode *inode = dentry->d_inode; + + mlog_entry_void(); + + link = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(link)) { + ret = PTR_ERR(link); + goto out; + } + + ret = vfs_readlink(dentry, 
buffer, buflen, link); + + brelse(bh); +out: + mlog_exit(ret); + return ret; +} + +static void *ocfs2_follow_link(struct dentry *dentry, + struct nameidata *nd) +{ + int status; + char *link; + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + struct buffer_head *bh = NULL; + + if (ocfs2_inode_is_fast_symlink(inode)) + link = ocfs2_fast_symlink_getlink(inode, &bh); + else + link = ocfs2_page_getlink(dentry, &page); + if (IS_ERR(link)) { + status = PTR_ERR(link); + mlog_errno(status); + goto bail; + } + + status = vfs_follow_link(nd, link); + if (status) + mlog_errno(status); +bail: + if (page) { + kunmap(page); + page_cache_release(page); + } + if (bh) + brelse(bh); + + return ERR_PTR(status); +} + +struct inode_operations ocfs2_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = ocfs2_follow_link, + .getattr = ocfs2_getattr, +}; +struct inode_operations ocfs2_fast_symlink_inode_operations = { + .readlink = ocfs2_readlink, + .follow_link = ocfs2_follow_link, + .getattr = ocfs2_getattr, +}; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 000000000000..1ea9e4d9e9eb --- /dev/null +++ b/fs/ocfs2/symlink.h @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYMLINK_H +#define OCFS2_SYMLINK_H + +extern struct inode_operations ocfs2_symlink_inode_operations; +extern struct inode_operations ocfs2_fast_symlink_inode_operations; + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) +{ + return (S_ISLNK(inode->i_mode) && + inode->i_blocks == 0); +} + + +#endif /* OCFS2_SYMLINK_H */ diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 000000000000..600a8bc5b541 --- /dev/null +++ b/fs/ocfs2/sysfile.c @@ -0,0 +1,131 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.c + * + * Initialize, read, write, etc. system files. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include + +#include "ocfs2.h" + +#define MLOG_MASK_PREFIX ML_INODE +#include + +#include "alloc.h" +#include "dir.h" +#include "inode.h" +#include "journal.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +static inline int is_global_system_inode(int type); +static inline int is_in_system_inode_array(struct ocfs2_super *osb, + int type, + u32 slot); + +static inline int is_global_system_inode(int type) +{ + return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && + type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; +} + +static inline int is_in_system_inode_array(struct ocfs2_super *osb, + int type, + u32 slot) +{ + return slot == osb->slot_num || is_global_system_inode(type); +} + +struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + struct inode *inode = NULL; + struct inode **arr = NULL; + + /* avoid the lookup if cached in local system file array */ + if (is_in_system_inode_array(osb, type, slot)) + arr = &(osb->system_inodes[type]); + + if (arr && ((inode = *arr) != NULL)) { + /* get a ref in addition to the array ref */ + inode = igrab(inode); + if (!inode) + BUG(); + + return inode; + } + + /* this gets one ref thru iget */ + inode = _ocfs2_get_system_file_inode(osb, type, slot); + + /* add one more if putting into array for first time */ + if (arr && inode) { + *arr = igrab(inode); + if (!*arr) + BUG(); + } + return inode; +} + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + char namebuf[40]; + struct inode *inode = NULL; + u64 blkno; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *de = NULL; + int status = 0; + + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); + + status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf), + &blkno, osb->sys_root_inode, + &dirent_bh, &de); + if (status < 0) { + goto bail; + } + + inode = ocfs2_iget(osb, blkno); + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + inode = NULL; + goto bail; + } +bail: + if (dirent_bh) + brelse(dirent_bh); + return inode; +} + diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 000000000000..cc9ea661ffc1 --- /dev/null +++ b/fs/ocfs2/sysfile.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_SYSFILE_H +#define OCFS2_SYSFILE_H + +struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#endif /* OCFS2_SYSFILE_H */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 000000000000..3a0458fd3e1b --- /dev/null +++ b/fs/ocfs2/uptodate.c @@ -0,0 +1,544 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.c + * + * Tracking the up-to-date-ness of a local buffer_head with respect to + * the cluster. + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Standard buffer head caching flags (uptodate, etc) are insufficient + * in a clustered environment - a buffer may be marked up to date on + * our local node but could have been modified by another cluster + * member. As a result an additional (and performant) caching scheme + * is required. A further requirement is that we consume as little + * memory as possible - we never pin buffer_head structures in order + * to cache them. + * + * We track the existence of up to date buffers on the inodes which + * are associated with them. Because we don't want to pin + * buffer_heads, this is only a (strong) hint and several other checks + * are made in the I/O path to ensure that we don't use a stale or + * invalid buffer without going to disk: + * - buffer_jbd is used liberally - if a bh is in the journal on + * this node then it *must* be up to date. + * - the standard buffer_uptodate() macro is used to detect buffers + * which may be invalid (even if we have an up to date tracking + * item for them) + * + * For a full understanding of how this code works together, one + * should read the callers in dlmglue.c, the I/O functions in + * buffer_head_io.c and ocfs2_journal_access in journal.c + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_UPTODATE + +#include + +#include "ocfs2.h" + +#include "inode.h" +#include "uptodate.h" + +struct ocfs2_meta_cache_item { + struct rb_node c_node; + sector_t c_block; +}; + +static kmem_cache_t *ocfs2_uptodate_cachep = NULL; + +void ocfs2_metadata_cache_init(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; + ci->ci_num_cached = 0; +} + +/* No lock taken here as 'root' is not expected to be visible to other + * processes. 
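+ * (ocfs2_metadata_cache_purge() detaches the root from the inode
+ * under ip_lock and hands us a private copy to walk.)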
*/ +static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) +{ + unsigned int purged = 0; + struct rb_node *node; + struct ocfs2_meta_cache_item *item; + + while ((node = rb_last(root)) != NULL) { + item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); + + mlog(0, "Purge item %llu\n", + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, root); + kmem_cache_free(ocfs2_uptodate_cachep, item); + + purged++; + } + return purged; +} + +/* Called from locking and called from ocfs2_clear_inode. Dump the + * cache for a given inode. + * + * This function is a few more lines longer than necessary due to some + * accounting done here, but I think it's worth tracking down those + * bugs sooner -- Mark */ +void ocfs2_metadata_cache_purge(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int tree, to_purge, purged; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + struct rb_root root = RB_ROOT; + + spin_lock(&oi->ip_lock); + tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + to_purge = ci->ci_num_cached; + + mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge, + tree ? "array" : "tree", oi->ip_blkno); + + /* If we're a tree, save off the root so that we can safely + * initialize the cache. We do the work to free tree members + * without the spinlock. */ + if (tree) + root = ci->ci_cache.ci_tree; + + ocfs2_metadata_cache_init(inode); + spin_unlock(&oi->ip_lock); + + purged = ocfs2_purge_copied_metadata_tree(&root); + /* If possible, track the number wiped so that we can more + * easily detect counting errors. Unfortunately, this is only + * meaningful for trees. */ + if (tree && purged != to_purge) + mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n", + oi->ip_blkno, to_purge, purged); +} + +/* Returns the index in the cache array, -1 if not found. + * Requires ip_lock. */ +static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, + sector_t item) +{ + int i; + + for (i = 0; i < ci->ci_num_cached; i++) { + if (item == ci->ci_cache.ci_array[i]) + return i; + } + + return -1; +} + +/* Returns the cache item if found, otherwise NULL. + * Requires ip_lock. */ +static struct ocfs2_meta_cache_item * +ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, + sector_t block) +{ + struct rb_node * n = ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *item = NULL; + + while (n) { + item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); + + if (block < item->c_block) + n = n->rb_left; + else if (block > item->c_block) + n = n->rb_right; + else + return item; + } + + return NULL; +} + +static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, + struct buffer_head *bh) +{ + int index = -1; + struct ocfs2_meta_cache_item *item = NULL; + + spin_lock(&oi->ip_lock); + + mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n", + oi->ip_blkno, (unsigned long long) bh->b_blocknr, + !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); + + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) + index = ocfs2_search_cache_array(&oi->ip_metadata_cache, + bh->b_blocknr); + else + item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, + bh->b_blocknr); + + spin_unlock(&oi->ip_lock); + + mlog(0, "index = %d, item = %p\n", index, item); + + return (index != -1) || (item != NULL); +} + +/* Warning: even if it returns true, this does *not* guarantee that + * the block is stored in our inode metadata cache. 
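+ * (A buffer that is currently in the journal, for instance, is
+ * trusted via the buffer_jbd() test below without a cache lookup.)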
*/ +int ocfs2_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + /* Doesn't matter if the bh is in our cache or not -- if it's + * not marked uptodate then we know it can't have correct + * data. */ + if (!buffer_uptodate(bh)) + return 0; + + /* OCFS2 does not allow multiple nodes to be changing the same + * block at the same time. */ + if (buffer_jbd(bh)) + return 1; + + /* Ok, locally the buffer is marked as up to date, now search + * our cache to see if we can trust that. */ + return ocfs2_buffer_cached(OCFS2_I(inode), bh); +} + +/* Requires ip_lock */ +static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, + sector_t block) +{ + BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); + + mlog(0, "block %llu takes position %u\n", (unsigned long long) block, + ci->ci_num_cached); + + ci->ci_cache.ci_array[ci->ci_num_cached] = block; + ci->ci_num_cached++; +} + +/* By now the caller should have checked that the item does *not* + * exist in the tree. + * Requires ip_lock. */ +static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *new) +{ + sector_t block = new->c_block; + struct rb_node *parent = NULL; + struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *tmp; + + mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, + ci->ci_num_cached); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); + + if (block < tmp->c_block) + p = &(*p)->rb_left; + else if (block > tmp->c_block) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate block %llu cached!\n", + (unsigned long long) block); + BUG(); + } + } + + rb_link_node(&new->c_node, parent, p); + rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached++; +} + +static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, + struct ocfs2_caching_info *ci) +{ + assert_spin_locked(&oi->ip_lock); + + return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && + (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); +} + +/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the + * pointers in tree after we use them - this allows caller to detect + * when to free in case of error. */ +static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, + struct ocfs2_meta_cache_item **tree) +{ + int i; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, + "Inode %"MLFu64", num cached = %u, should be %u\n", + oi->ip_blkno, ci->ci_num_cached, + OCFS2_INODE_MAX_CACHE_ARRAY); + mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + "Inode %"MLFu64" not marked as inline anymore!\n", + oi->ip_blkno); + assert_spin_locked(&oi->ip_lock); + + /* Be careful to initialize the tree members *first* because + * once the ci_tree is used, the array is junk... */ + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + tree[i]->c_block = ci->ci_cache.ci_array[i]; + + oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; + ci->ci_cache.ci_tree = RB_ROOT; + /* this will be set again by __ocfs2_insert_cache_tree */ + ci->ci_num_cached = 0; + + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + __ocfs2_insert_cache_tree(ci, tree[i]); + tree[i] = NULL; + } + + mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n", + oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); +} + +/* Slow path function - memory allocation is necessary. 
See the
+ * comment above ocfs2_set_buffer_uptodate for more information. */
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+ sector_t block,
+ int expand_tree)
+{
+ int i;
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+ struct ocfs2_meta_cache_item *new = NULL;
+ struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+ { NULL, };
+
+ mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
+ oi->ip_blkno, (unsigned long long) block, expand_tree);
+
+ new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
+ if (!new) {
+ mlog_errno(-ENOMEM);
+ return;
+ }
+ new->c_block = block;
+
+ if (expand_tree) {
+ /* Do *not* allocate an array here - the removal code
+ * has no way of tracking that. */
+ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
+ GFP_KERNEL);
+ if (!tree[i]) {
+ mlog_errno(-ENOMEM);
+ goto out_free;
+ }
+
+ /* These are initialized in ocfs2_expand_cache! */
+ }
+ }
+
+ spin_lock(&oi->ip_lock);
+ if (ocfs2_insert_can_use_array(oi, ci)) {
+ mlog(0, "Someone cleared the tree underneath us\n");
+ /* Ok, items were removed from the cache in between
+ * locks. Detect this and revert back to the fast path */
+ ocfs2_append_cache_array(ci, block);
+ spin_unlock(&oi->ip_lock);
+ goto out_free;
+ }
+
+ if (expand_tree)
+ ocfs2_expand_cache(oi, tree);
+
+ __ocfs2_insert_cache_tree(ci, new);
+ spin_unlock(&oi->ip_lock);
+
+ new = NULL;
+out_free:
+ if (new)
+ kmem_cache_free(ocfs2_uptodate_cachep, new);
+
+ /* If these were used, then ocfs2_expand_cache re-set them to
+ * NULL for us. */
+ if (tree[0]) {
+ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ if (tree[i])
+ kmem_cache_free(ocfs2_uptodate_cachep,
+ tree[i]);
+ }
+}
+
+/* Item insertion is guarded by ip_io_sem, so the insertion path takes
+ * advantage of this by not rechecking for a duplicate insert during
+ * the slow case. Additionally, if the cache needs to be bumped up to
+ * a tree, the code will not recheck after acquiring the lock --
+ * multiple paths cannot be expanding to a tree at the same time.
+ *
+ * The slow path takes into account that items can be removed
+ * (including the whole tree wiped and reset) while this process is out
+ * allocating memory. In those cases, it reverts back to the fast
+ * path.
+ *
+ * Note that this function may actually fail to insert the block if
+ * memory cannot be allocated. This is not fatal however (but may
+ * result in a performance penalty) */
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+ struct buffer_head *bh)
+{
+ int expand;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+ /* The block may very well exist in our cache already, so avoid
+ * doing any more work in that case. */
+ if (ocfs2_buffer_cached(oi, bh))
+ return;
+
+ mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
+ (unsigned long long) bh->b_blocknr);
+
+ /* No need to recheck under spinlock - insertion is guarded by
+ * ip_io_sem */
+ spin_lock(&oi->ip_lock);
+ if (ocfs2_insert_can_use_array(oi, ci)) {
+ /* Fast case - it's an array and there's a free
+ * spot. */
+ ocfs2_append_cache_array(ci, bh->b_blocknr);
+ spin_unlock(&oi->ip_lock);
+ return;
+ }
+
+ expand = 0;
+ if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ /* We need to bump things up to a tree. */
+ expand = 1;
+ }
+ spin_unlock(&oi->ip_lock);
+
+ __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+}
+
+/* Called against a newly allocated buffer. 
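(a metadata block freshly claimed from the allocator, say).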
Most likely nobody should + * be able to read this sort of metadata while it's still being + * allocated, but this is careful to take ip_io_sem anyway. */ +void ocfs2_set_new_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + /* This should definitely *not* exist in our cache */ + BUG_ON(ocfs2_buffer_cached(oi, bh)); + + set_buffer_uptodate(bh); + + down(&oi->ip_io_sem); + ocfs2_set_buffer_uptodate(inode, bh); + up(&oi->ip_io_sem); +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, + int index) +{ + sector_t *array = ci->ci_cache.ci_array; + int bytes; + + BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(index >= ci->ci_num_cached); + BUG_ON(!ci->ci_num_cached); + + mlog(0, "remove index %d (num_cached = %u\n", index, + ci->ci_num_cached); + + ci->ci_num_cached--; + + /* don't need to copy if the array is now empty, or if we + * removed at the tail */ + if (ci->ci_num_cached && index < ci->ci_num_cached) { + bytes = sizeof(sector_t) * (ci->ci_num_cached - index); + memmove(&array[index], &array[index + 1], bytes); + } +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *item) +{ + mlog(0, "remove block %llu from tree\n", + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached--; +} + +/* Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. */ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + int index; + sector_t block = bh->b_blocknr; + struct ocfs2_meta_cache_item *item = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + spin_lock(&oi->ip_lock); + mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n", + oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached, + oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + index = ocfs2_search_cache_array(ci, block); + if (index != -1) + ocfs2_remove_metadata_array(ci, index); + } else { + item = ocfs2_search_cache_tree(ci, block); + if (item) + ocfs2_remove_metadata_tree(ci, item); + } + spin_unlock(&oi->ip_lock); + + if (item) + kmem_cache_free(ocfs2_uptodate_cachep, item); +} + +int __init init_ocfs2_uptodate_cache(void) +{ + ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", + sizeof(struct ocfs2_meta_cache_item), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ocfs2_uptodate_cachep) + return -ENOMEM; + + mlog(0, "%u inlined cache items per inode.\n", + OCFS2_INODE_MAX_CACHE_ARRAY); + + return 0; +} + +void __exit exit_ocfs2_uptodate_cache(void) +{ + if (ocfs2_uptodate_cachep) + kmem_cache_destroy(ocfs2_uptodate_cachep); +} diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 000000000000..e5aacdf4eabf --- /dev/null +++ b/fs/ocfs2/uptodate.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.h + * + * Cluster uptodate tracking + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_UPTODATE_H +#define OCFS2_UPTODATE_H + +int __init init_ocfs2_uptodate_cache(void); +void __exit exit_ocfs2_uptodate_cache(void); + +void ocfs2_metadata_cache_init(struct inode *inode); +void ocfs2_metadata_cache_purge(struct inode *inode); + +int ocfs2_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_set_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_set_new_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh); + +#endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 000000000000..5405ce121c99 --- /dev/null +++ b/fs/ocfs2/ver.c @@ -0,0 +1,43 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include + +#include "ver.h" + +#define OCFS2_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION + +void ocfs2_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 000000000000..d7395cb91d2f --- /dev/null +++ b/fs/ocfs2/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_VER_H +#define OCFS2_VER_H + +void ocfs2_print_version(void); + +#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c new file mode 100644 index 000000000000..021978e0576b --- /dev/null +++ b/fs/ocfs2/vote.c @@ -0,0 +1,1202 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * vote.c + * + * description here + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#define MLOG_MASK_PREFIX ML_VOTE +#include + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "slot_map.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_MESSAGE_TYPE_VOTE (0x1) +#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2) +struct ocfs2_msg_hdr +{ + __be32 h_response_id; /* used to lookup message handle on sending + * node. */ + __be32 h_request; + __be64 h_blkno; + __be32 h_generation; + __be32 h_node_num; /* node sending this particular message. */ +}; + +/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this + * for the network. 
*/ +#define OCFS2_VOTE_FILENAME_LEN 256 +struct ocfs2_vote_msg +{ + struct ocfs2_msg_hdr v_hdr; + union { + __be32 v_generic1; + __be32 v_orphaned_slot; /* Used during delete votes */ + __be32 v_nlink; /* Used during unlink votes */ + } md1; /* Message type dependent 1 */ + __be32 v_unlink_namelen; + __be64 v_unlink_parent; + u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN]; +}; + +/* Responses are given these values to maintain backwards + * compatibility with older ocfs2 versions */ +#define OCFS2_RESPONSE_OK (0) +#define OCFS2_RESPONSE_BUSY (-16) +#define OCFS2_RESPONSE_BAD_MSG (-22) + +struct ocfs2_response_msg +{ + struct ocfs2_msg_hdr r_hdr; + __be32 r_response; + __be32 r_orphaned_slot; +}; + +struct ocfs2_vote_work { + struct list_head w_list; + struct ocfs2_vote_msg w_msg; +}; + +enum ocfs2_vote_request { + OCFS2_VOTE_REQ_INVALID = 0, + OCFS2_VOTE_REQ_DELETE, + OCFS2_VOTE_REQ_UNLINK, + OCFS2_VOTE_REQ_RENAME, + OCFS2_VOTE_REQ_MOUNT, + OCFS2_VOTE_REQ_UMOUNT, + OCFS2_VOTE_REQ_LAST +}; + +static inline int ocfs2_is_valid_vote_request(int request) +{ + return OCFS2_VOTE_REQ_INVALID < request && + request < OCFS2_VOTE_REQ_LAST; +} + +typedef void (*ocfs2_net_response_callback)(void *priv, + struct ocfs2_response_msg *resp); +struct ocfs2_net_response_cb { + ocfs2_net_response_callback rc_cb; + void *rc_priv; +}; + +struct ocfs2_net_wait_ctxt { + struct list_head n_list; + u32 n_response_id; + wait_queue_head_t n_event; + struct ocfs2_node_map n_node_map; + int n_response; /* an aggregate response. 0 if + * all nodes are go, < 0 on any + * negative response from any + * node or network error. */ + struct ocfs2_net_response_cb *n_callback; +}; + +static void ocfs2_process_mount_request(struct ocfs2_super *osb, + unsigned int node_num) +{ + mlog(0, "MOUNT vote from node %u\n", node_num); + /* The other node only sends us this message when it has an EX + * on the superblock, so our recovery threads (if they have been + * launched) are waiting on it. */ + ocfs2_recovery_map_clear(osb, node_num); + ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num); + + /* We clear the umount map here because a node may have been + * previously mounted, safely unmounted but never stopped + * heartbeating - in which case we'd have a stale entry. */ + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); +} + +static void ocfs2_process_umount_request(struct ocfs2_super *osb, + unsigned int node_num) +{ + mlog(0, "UMOUNT vote from node %u\n", node_num); + ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num); + ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); +} + +void ocfs2_mark_inode_remotely_deleted(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + assert_spin_locked(&oi->ip_lock); + /* We set the SKIP_DELETE flag on the inode so we don't try to + * delete it in delete_inode ourselves, thus avoiding + * unnecessary lock pinging. If the other node failed to wipe + * the inode as a result of a crash, then recovery will pick + * up the slack. */ + oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; +}
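Everything in struct ocfs2_msg_hdr and struct ocfs2_vote_msg is fixed-width and big-endian (__be32/__be64), so nodes of different endianness agree on the wire layout. A rough user-space equivalent of filling such a header, with htonl() and a hand-rolled 64-bit swap standing in for cpu_to_be32()/cpu_to_be64() (hypothetical struct, not the real wire format):

#include <arpa/inet.h>		/* htonl() */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical header mirroring the field order of ocfs2_msg_hdr;
 * every member is stored big-endian before it hits the wire. */
struct msg_hdr {
	uint32_t h_response_id;
	uint32_t h_request;
	uint64_t h_blkno;
	uint32_t h_generation;
	uint32_t h_node_num;
};

/* user-space stand-in for the kernel's cpu_to_be64() */
static uint64_t to_be64(uint64_t v)
{
	uint8_t b[8];
	int i;

	for (i = 0; i < 8; i++)
		b[i] = v >> (56 - 8 * i);	/* most significant first */
	memcpy(&v, b, 8);
	return v;
}

int main(void)
{
	struct msg_hdr hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.h_response_id = htonl(42);
	hdr.h_request = htonl(1);		/* e.g. a DELETE vote */
	hdr.h_blkno = to_be64(8193);
	hdr.h_generation = htonl(7);
	hdr.h_node_num = htonl(0);
	printf("header is %zu bytes on the wire\n", sizeof(hdr));
	return 0;
}

+ +static int ocfs2_process_delete_request(struct inode *inode, + int *orphaned_slot) +{ + int response = OCFS2_RESPONSE_BUSY; + + mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", + inode->i_ino, inode->i_nlink, *orphaned_slot); + + spin_lock(&OCFS2_I(inode)->ip_lock); + + /* Whatever our vote response is, we want to make sure that + * the orphaned slot is recorded properly on this node *and* + * on the requesting node.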
Technically, if the requesting node + * did not know which slot the inode is orphaned in but we + * respond with BUSY, it doesn't actually need the orphaned + * slot, but it doesn't hurt to do it here anyway. */ + if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { + mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != + OCFS2_INVALID_SLOT && + OCFS2_I(inode)->ip_orphaned_slot != + (*orphaned_slot), + "Inode %"MLFu64": This node thinks it's " + "orphaned in slot %d, messaged it's in %d\n", + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_orphaned_slot, + *orphaned_slot); + + mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n", + OCFS2_I(inode)->ip_blkno, *orphaned_slot); + + OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; + } else { + mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n", + OCFS2_I(inode)->ip_orphaned_slot, + OCFS2_I(inode)->ip_blkno); + + *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + } + + /* vote no if the file is still open. */ + if (OCFS2_I(inode)->ip_open_count) { + mlog(0, "open count = %u\n", + OCFS2_I(inode)->ip_open_count); + spin_unlock(&OCFS2_I(inode)->ip_lock); + goto done; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* directories are a bit ugly... What if someone is sitting in + * it? We want to make sure the inode is removed completely as + * a result of the iput in process_vote. */ + if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { + mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); + goto done; + } + + if (filemap_fdatawrite(inode->i_mapping)) { + mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n", + OCFS2_I(inode)->ip_blkno); + goto done; + } + sync_mapping_buffers(inode->i_mapping); + truncate_inode_pages(inode->i_mapping, 0); + ocfs2_extent_map_trunc(inode, 0); + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* double check open count - someone might have raced this + * thread into ocfs2_file_open while we were writing out + * data. If we're to allow a wipe of this inode now, we *must* + * hold the spinlock until we've marked it. */ + if (OCFS2_I(inode)->ip_open_count) { + mlog(0, "Raced to wipe! open count = %u\n", + OCFS2_I(inode)->ip_open_count); + spin_unlock(&OCFS2_I(inode)->ip_lock); + goto done; + } + + /* Mark the inode as being wiped from disk. */ + ocfs2_mark_inode_remotely_deleted(inode); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Not sure this is necessary anymore. */ + d_prune_aliases(inode); + + /* If we get here, then we're voting 'yes', so commit the + * delete on our side. */ + response = OCFS2_RESPONSE_OK; +done: + return response; +}
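The dentry matcher that follows compares name lengths before calling memcmp(), which is what makes the byte comparison safe on names that are not NUL-terminated. The same idiom stand-alone (hypothetical example):

#include <stdio.h>
#include <string.h>

/* Compare lengths first so the memcmp below can never read past
 * either buffer, even though "b" need not be NUL-terminated. */
static int names_match(const char *a, unsigned int alen,
		       const char *b, unsigned int blen)
{
	if (alen != blen)
		return 0;
	return memcmp(a, b, alen) == 0;	/* safe: lengths are equal */
}

int main(void)
{
	char raw[5] = { 'h', 'e', 'l', 'l', 'o' };	/* no NUL */

	printf("%d\n", names_match("hello", 5, raw, 5));	/* 1 */
	printf("%d\n", names_match("help", 4, raw, 5));		/* 0 */
	return 0;
}

+ +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + unsigned int namelen, + const char *name) +{ + struct inode *parent; + + if (!dentry->d_parent) { + mlog(0, "Detached from parent.\n"); + return 0; + } + + parent = dentry->d_parent->d_inode; + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + if (dentry->d_name.len != namelen) + return 0; + + /* comparison above guarantees this is safe.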
*/ + if (memcmp(dentry->d_name.name, name, namelen)) + return 0; + + return 1; +} + +static void ocfs2_process_dentry_request(struct inode *inode, + int rename, + unsigned int new_nlink, + u64 parent_blkno, + unsigned int namelen, + const char *name) +{ + struct dentry *dentry = NULL; + struct list_head *p; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno, + namelen, namelen, name); + + spin_lock(&dcache_lock); + + /* Another node is removing this name from the system. It is + * up to us to find the corresponding dentry and if it exists, + * unhash it from the dcache. */ + list_for_each(p, &inode->i_dentry) { + dentry = list_entry(p, struct dentry, d_alias); + + if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { + mlog(0, "dentry found: %.*s\n", + dentry->d_name.len, dentry->d_name.name); + + dget_locked(dentry); + break; + } + + dentry = NULL; + } + + spin_unlock(&dcache_lock); + + if (dentry) { + d_delete(dentry); + dput(dentry); + } + + /* rename votes don't send link counts */ + if (!rename) { + mlog(0, "new_nlink = %u\n", new_nlink); + + /* We don't have the proper locks here to directly + * change i_nlink and besides, the vote is sent + * *before* the operation so it may have failed on the + * other node. This passes a hint to ocfs2_drop_inode + * to force ocfs2_delete_inode, who will take the + * proper cluster locks to sort things out. */ + if (new_nlink == 0) { + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&OCFS2_I(inode)->ip_lock); + } + } +} + +static void ocfs2_process_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *msg) +{ + int net_status, vote_response; + int orphaned_slot = 0; + int rename = 0; + unsigned int node_num, generation, new_nlink, namelen; + u64 blkno, parent_blkno; + enum ocfs2_vote_request request; + struct inode *inode = NULL; + struct ocfs2_msg_hdr *hdr = &msg->v_hdr; + struct ocfs2_response_msg response; + + /* decode the network mumbo jumbo into local variables. */ + request = be32_to_cpu(hdr->h_request); + blkno = be64_to_cpu(hdr->h_blkno); + generation = be32_to_cpu(hdr->h_generation); + node_num = be32_to_cpu(hdr->h_node_num); + if (request == OCFS2_VOTE_REQ_DELETE) + orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); + + mlog(0, "processing vote: request = %u, blkno = %"MLFu64", " + "generation = %u, node_num = %u, priv1 = %u\n", request, + blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1)); + + if (!ocfs2_is_valid_vote_request(request)) { + mlog(ML_ERROR, "Invalid vote request %d from node %u\n", + request, node_num); + vote_response = OCFS2_RESPONSE_BAD_MSG; + goto respond; + } + + vote_response = OCFS2_RESPONSE_OK; + + switch (request) { + case OCFS2_VOTE_REQ_UMOUNT: + ocfs2_process_umount_request(osb, node_num); + goto respond; + case OCFS2_VOTE_REQ_MOUNT: + ocfs2_process_mount_request(osb, node_num); + goto respond; + default: + /* avoids a gcc warning */ + break; + } + + /* We cannot process the remaining message types before we're + * fully mounted. It's perfectly safe however to send a 'yes' + * response as we can't possibly have any of the state they're + * asking us to modify yet. */ + if (atomic_read(&osb->vol_state) == VOLUME_INIT) + goto respond; + + /* If we get here, then the request is against an inode. 
*/ + inode = ocfs2_ilookup_for_vote(osb, blkno, + request == OCFS2_VOTE_REQ_DELETE); + + /* Not finding the inode is perfectly valid - it means we're + * not interested in what the other node is about to do to it + * so in those cases we automatically respond with an + * affirmative. Cluster locking ensures that we won't race + * interest in the inode with this vote request. */ + if (!inode) + goto respond; + + /* Check generation values. It's possible for us to get a + * request against a stale inode. If so then we proceed as if + * we had not found an inode in the first place. */ + if (inode->i_generation != generation) { + mlog(0, "generation passed %u != inode generation = %u, " + "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", " + "i_count = %u, message type = %u\n", + generation, inode->i_generation, OCFS2_I(inode)->ip_flags, + OCFS2_I(inode)->ip_blkno, blkno, + atomic_read(&inode->i_count), request); + iput(inode); + inode = NULL; + goto respond; + } + + switch (request) { + case OCFS2_VOTE_REQ_DELETE: + vote_response = ocfs2_process_delete_request(inode, + &orphaned_slot); + break; + case OCFS2_VOTE_REQ_RENAME: + rename = 1; + /* fall through */ + case OCFS2_VOTE_REQ_UNLINK: + parent_blkno = be64_to_cpu(msg->v_unlink_parent); + namelen = be32_to_cpu(msg->v_unlink_namelen); + /* new_nlink will be ignored in case of a rename vote */ + new_nlink = be32_to_cpu(msg->md1.v_nlink); + ocfs2_process_dentry_request(inode, rename, new_nlink, + parent_blkno, namelen, + msg->v_unlink_dirent); + break; + default: + mlog(ML_ERROR, "node %u, invalid request: %u\n", + node_num, request); + vote_response = OCFS2_RESPONSE_BAD_MSG; + } + +respond: + /* Response structure is small so we just put it on the stack + * and stuff it inline. */ + memset(&response, 0, sizeof(struct ocfs2_response_msg)); + response.r_hdr.h_response_id = hdr->h_response_id; + response.r_hdr.h_blkno = hdr->h_blkno; + response.r_hdr.h_generation = hdr->h_generation; + response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); + response.r_response = cpu_to_be32(vote_response); + response.r_orphaned_slot = cpu_to_be32(orphaned_slot); + + net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, + osb->net_key, + &response, + sizeof(struct ocfs2_response_msg), + node_num, + NULL); + /* We still want to error print for ENOPROTOOPT here.
The + * sending node shouldn't have unregistered its net handler + * without sending an unmount vote first */ + if (net_status < 0 + && net_status != -ETIMEDOUT + && net_status != -ENOTCONN) + mlog(ML_ERROR, "message to node %u fails with error %d!\n", + node_num, net_status); + + if (inode) + iput(inode); +} + +static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + struct ocfs2_lock_res *lockres; + struct ocfs2_vote_work *work; + + mlog_entry_void(); + + spin_lock(&osb->vote_task_lock); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->vote_work_sequence = osb->vote_wake_sequence; + + processed = osb->blocked_lock_count; + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock(&osb->vote_task_lock); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock(&osb->vote_task_lock); + } + + while (osb->vote_count) { + BUG_ON(list_empty(&osb->vote_list)); + work = list_entry(osb->vote_list.next, + struct ocfs2_vote_work, w_list); + list_del(&work->w_list); + osb->vote_count--; + spin_unlock(&osb->vote_task_lock); + + ocfs2_process_vote(osb, &work->w_msg); + kfree(work); + + spin_lock(&osb->vote_task_lock); + } + spin_unlock(&osb->vote_task_lock); + + mlog_exit_void(); +} + +static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + + spin_lock(&osb->vote_task_lock); + if (list_empty(&osb->blocked_lock_list) && + list_empty(&osb->vote_list)) + empty = 1; + + spin_unlock(&osb->vote_task_lock); + return empty; +} + +static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + + spin_lock(&osb->vote_task_lock); + if (osb->vote_work_sequence != osb->vote_wake_sequence) + should_wake = 1; + spin_unlock(&osb->vote_task_lock); + + return should_wake; +} + +int ocfs2_vote_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_vote_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->vote_event, + ocfs2_vote_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "vote_thread: awoken\n"); + + ocfs2_vote_thread_do_work(osb); + } + + osb->vote_task = NULL; + return status; +} + +static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id) +{ + struct ocfs2_net_wait_ctxt *w; + + w = kcalloc(1, sizeof(*w), GFP_KERNEL); + if (!w) { + mlog_errno(-ENOMEM); + goto bail; + } + + INIT_LIST_HEAD(&w->n_list); + init_waitqueue_head(&w->n_event); + ocfs2_node_map_init(&w->n_node_map); + w->n_response_id = response_id; + w->n_callback = NULL; +bail: + return w; +} + +static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb) +{ + unsigned int ret; + + spin_lock(&osb->net_response_lock); + ret = ++osb->net_response_ids; + spin_unlock(&osb->net_response_lock); + + return ret; +} + +static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w) +{ + spin_lock(&osb->net_response_lock); + list_del(&w->n_list); + spin_unlock(&osb->net_response_lock); +}
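These wait-context helpers implement the broadcast bookkeeping: a context holds a map of nodes still expected to answer, each response (or node-down event) clears a bit, and the sender sleeps until the map is empty. A condensed sketch with a bitmask and a pthread condition variable standing in for the node map and wait queue (hypothetical names):

#include <pthread.h>
#include <stdio.h>

struct wait_ctxt {
	pthread_mutex_t lock;	/* stands in for net_response_lock */
	pthread_cond_t event;	/* stands in for w->n_event */
	unsigned long pending;	/* one bit per node still to answer */
};

static void mark_responded(struct wait_ctxt *w, int node)
{
	pthread_mutex_lock(&w->lock);
	w->pending &= ~(1UL << node);
	if (!w->pending)
		pthread_cond_broadcast(&w->event);
	pthread_mutex_unlock(&w->lock);
}

static void wait_for_all(struct wait_ctxt *w)
{
	pthread_mutex_lock(&w->lock);
	while (w->pending)
		pthread_cond_wait(&w->event, &w->lock);
	pthread_mutex_unlock(&w->lock);
}

int main(void)
{
	struct wait_ctxt w = { PTHREAD_MUTEX_INITIALIZER,
			       PTHREAD_COND_INITIALIZER, 0 };

	w.pending = (1UL << 1) | (1UL << 2);	/* expect nodes 1 and 2 */
	mark_responded(&w, 1);
	mark_responded(&w, 2);
	wait_for_all(&w);	/* returns at once: map already empty */
	printf("all nodes responded\n");
	return 0;
}

This is also why a node-down callback can simply "respond on behalf of" a dead node: clearing its bit is indistinguishable from a real answer as far as the waiter is concerned.

+ +static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w) +{ + spin_lock(&osb->net_response_lock); +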
list_add_tail(&w->n_list, + &osb->net_response_list); + spin_unlock(&osb->net_response_lock); +} + +static void __ocfs2_mark_node_responded(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w, + int node_num) +{ + assert_spin_locked(&osb->net_response_lock); + + ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num); + if (ocfs2_node_map_is_empty(osb, &w->n_node_map)) + wake_up(&w->n_event); +} + +/* Intended to be called from the node down callback, we fake remove + * the node from all our response contexts */ +void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, + int node_num) +{ + struct list_head *p; + struct ocfs2_net_wait_ctxt *w = NULL; + + spin_lock(&osb->net_response_lock); + + list_for_each(p, &osb->net_response_list) { + w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); + + __ocfs2_mark_node_responded(osb, w, node_num); + } + + spin_unlock(&osb->net_response_lock); +} + +static int ocfs2_broadcast_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *request, + unsigned int response_id, + int *response, + struct ocfs2_net_response_cb *callback) +{ + int status, i, remote_err; + struct ocfs2_net_wait_ctxt *w = NULL; + int dequeued = 0; + + mlog_entry_void(); + + w = ocfs2_new_net_wait_ctxt(response_id); + if (!w) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + w->n_callback = callback; + + /* we're pretty much ready to go at this point, and this fills + * in n_response which we need anyway... */ + ocfs2_queue_net_wait_ctxt(osb, w); + + i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0); + + while (i != O2NM_INVALID_NODE_NUM) { + if (i != osb->node_num) { + mlog(0, "trying to send request to node %i\n", i); + ocfs2_node_map_set_bit(osb, &w->n_node_map, i); + + remote_err = 0; + status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE, + osb->net_key, + request, + sizeof(*request), + i, + &remote_err); + if (status == -ETIMEDOUT) { + mlog(0, "remote node %d timed out!\n", i); + status = -EAGAIN; + goto bail; + } + if (remote_err < 0) { + status = remote_err; + mlog(0, "remote error %d on node %d!\n", + remote_err, i); + mlog_errno(status); + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i++; + i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i); + mlog(0, "next is %d, i am %d\n", i, osb->node_num); + } + mlog(0, "done sending, now waiting on responses...\n"); + + wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map)); + + ocfs2_dequeue_net_wait_ctxt(osb, w); + dequeued = 1; + + *response = w->n_response; + status = 0; +bail: + if (w) { + if (!dequeued) + ocfs2_dequeue_net_wait_ctxt(osb, w); + kfree(w); + } + + mlog_exit(status); + return status; +} + +static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, + u64 blkno, + unsigned int generation, + enum ocfs2_vote_request type, + u32 priv) +{ + struct ocfs2_vote_msg *request; + struct ocfs2_msg_hdr *hdr; + + BUG_ON(!ocfs2_is_valid_vote_request(type)); + + request = kcalloc(1, sizeof(*request), GFP_KERNEL); + if (!request) { + mlog_errno(-ENOMEM); + } else { + hdr = &request->v_hdr; + hdr->h_node_num = cpu_to_be32(osb->node_num); + hdr->h_request = cpu_to_be32(type); + hdr->h_blkno = cpu_to_be64(blkno); + hdr->h_generation = cpu_to_be32(generation); + + request->md1.v_generic1 = cpu_to_be32(priv); + } + + return request; +} + +/* Complete the buildup of a new vote request and process the + * broadcast return value. 
*/ +static int ocfs2_do_request_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *request, + struct ocfs2_net_response_cb *callback) +{ + int status, response; + unsigned int response_id; + struct ocfs2_msg_hdr *hdr; + + response_id = ocfs2_new_response_id(osb); + + hdr = &request->v_hdr; + hdr->h_response_id = cpu_to_be32(response_id); + + status = ocfs2_broadcast_vote(osb, request, response_id, &response, + callback); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = response; +bail: + + return status; +} + +static int ocfs2_request_vote(struct inode *inode, + struct ocfs2_vote_msg *request, + struct ocfs2_net_response_cb *callback) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_inode_is_new(inode)) + return 0; + + status = -EAGAIN; + while (status == -EAGAIN) { + if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && + signal_pending(current)) + return -ERESTARTSYS; + + status = ocfs2_super_lock(osb, 0); + if (status < 0) { + mlog_errno(status); + break; + } + + status = 0; + if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) + status = ocfs2_do_request_vote(osb, request, callback); + + ocfs2_super_unlock(osb, 0); + } + return status; +} + +static void ocfs2_delete_response_cb(void *priv, + struct ocfs2_response_msg *resp) +{ + int orphaned_slot, node; + struct inode *inode = priv; + + orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); + node = be32_to_cpu(resp->r_hdr.h_node_num); + mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot " + "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot); + + /* The other node may not actually know which slot the inode + * is orphaned in. */ + if (orphaned_slot == OCFS2_INVALID_SLOT) + return; + + /* Ok, the responding node knows which slot this inode is + * orphaned in. We verify that the information is correct and + * then record this in the inode. ocfs2_delete_inode will use + * this information to determine which lock to take. 
*/ + spin_lock(&OCFS2_I(inode)->ip_lock); + mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && + OCFS2_I(inode)->ip_orphaned_slot + != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d " + "says it's orphaned in slot %d, we think it's in %d\n", + OCFS2_I(inode)->ip_blkno, + be32_to_cpu(resp->r_hdr.h_node_num), + orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); + + OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +int ocfs2_request_delete_vote(struct inode *inode) +{ + int orphaned_slot, status; + struct ocfs2_net_response_cb delete_cb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + spin_lock(&OCFS2_I(inode)->ip_lock); + orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + delete_cb.rc_cb = ocfs2_delete_response_cb; + delete_cb.rc_priv = inode; + + mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n", + OCFS2_I(inode)->ip_blkno, orphaned_slot); + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_DELETE, orphaned_slot); + if (request) { + status = ocfs2_request_vote(inode, request, &delete_cb); + + kfree(request); + } + + return status; +} + +static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request, + struct dentry *dentry) +{ + struct inode *parent = dentry->d_parent->d_inode; + + /* We need some values which will uniquely identify a dentry + * on the other nodes so that they can find it and run + * d_delete against it. Parent directory block and full name + * should suffice. */ + + mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n", + OCFS2_I(parent)->ip_blkno, dentry->d_name.len, + dentry->d_name.name); + + request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno); + request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len); + memcpy(request->v_unlink_dirent, dentry->d_name.name, + dentry->d_name.len); +} + +int ocfs2_request_unlink_vote(struct inode *inode, + struct dentry *dentry, + unsigned int nlink) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) + return -ENAMETOOLONG; + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_UNLINK, nlink); + if (request) { + ocfs2_setup_unlink_vote(request, dentry); + + status = ocfs2_request_vote(inode, request, NULL); + + kfree(request); + } + return status; +} + +int ocfs2_request_rename_vote(struct inode *inode, + struct dentry *dentry) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) + return -ENAMETOOLONG; + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_RENAME, 0); + if (request) { + ocfs2_setup_unlink_vote(request, dentry); + + status = ocfs2_request_vote(inode, request, NULL); + + kfree(request); + } + return status; +} + +int ocfs2_request_mount_vote(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_vote_msg *request = NULL; + + request = ocfs2_new_vote_request(osb, 0ULL, 0, + OCFS2_VOTE_REQ_MOUNT, 0); + if (!request) { + status = -ENOMEM; + goto bail; + } + + status = -EAGAIN; + while (status == -EAGAIN) { + if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && + signal_pending(current)) { + status = -ERESTARTSYS; 
+ goto bail; + } + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) { + status = 0; + goto bail; + } + + status = ocfs2_do_request_vote(osb, request, NULL); + } + +bail: + if (request) + kfree(request); + + return status; +} + +int ocfs2_request_umount_vote(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_vote_msg *request = NULL; + + request = ocfs2_new_vote_request(osb, 0ULL, 0, + OCFS2_VOTE_REQ_UMOUNT, 0); + if (!request) { + status = -ENOMEM; + goto bail; + } + + status = -EAGAIN; + while (status == -EAGAIN) { + /* Do not check signals on this vote... We really want + * this one to go all the way through. */ + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) { + status = 0; + goto bail; + } + + status = ocfs2_do_request_vote(osb, request, NULL); + } + +bail: + if (request) + kfree(request); + + return status; +} + +/* TODO: This should eventually be a hash table! */ +static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb, + u32 response_id) +{ + struct list_head *p; + struct ocfs2_net_wait_ctxt *w = NULL; + + list_for_each(p, &osb->net_response_list) { + w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); + if (response_id == w->n_response_id) + break; + w = NULL; + } + + return w; +} + +/* Translate response codes into local node errno values */ +static inline int ocfs2_translate_response(int response) +{ + int ret; + + switch (response) { + case OCFS2_RESPONSE_OK: + ret = 0; + break; + + case OCFS2_RESPONSE_BUSY: + ret = -EBUSY; + break; + + default: + ret = -EINVAL; + } + + return ret; +} + +static int ocfs2_handle_response_message(struct o2net_msg *msg, + u32 len, + void *data) +{ + unsigned int response_id, node_num; + int response_status; + struct ocfs2_super *osb = data; + struct ocfs2_response_msg *resp; + struct ocfs2_net_wait_ctxt * w; + struct ocfs2_net_response_cb *resp_cb; + + resp = (struct ocfs2_response_msg *) msg->buf; + + response_id = be32_to_cpu(resp->r_hdr.h_response_id); + node_num = be32_to_cpu(resp->r_hdr.h_node_num); + response_status = + ocfs2_translate_response(be32_to_cpu(resp->r_response)); + + mlog(0, "received response message:\n"); + mlog(0, "h_response_id = %u\n", response_id); + mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request)); + mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno)); + mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation)); + mlog(0, "h_node_num = %u\n", node_num); + mlog(0, "r_response = %d\n", response_status); + + spin_lock(&osb->net_response_lock); + w = __ocfs2_find_net_wait_ctxt(osb, response_id); + if (!w) { + mlog(0, "request not found!\n"); + goto bail; + } + resp_cb = w->n_callback; + + if (response_status && (!w->n_response)) { + /* we only really need one negative response so don't + * set it twice. 
*/ + w->n_response = response_status; + } + + if (resp_cb) { + spin_unlock(&osb->net_response_lock); + + resp_cb->rc_cb(resp_cb->rc_priv, resp); + + spin_lock(&osb->net_response_lock); + } + + __ocfs2_mark_node_responded(osb, w, node_num); +bail: + spin_unlock(&osb->net_response_lock); + + return 0; +} + +static int ocfs2_handle_vote_message(struct o2net_msg *msg, + u32 len, + void *data) +{ + int status; + struct ocfs2_super *osb = data; + struct ocfs2_vote_work *work; + + work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); + if (!work) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + INIT_LIST_HEAD(&work->w_list); + memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg)); + + mlog(0, "scheduling vote request:\n"); + mlog(0, "h_response_id = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_response_id)); + mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request)); + mlog(0, "h_blkno = %"MLFu64"\n", + be64_to_cpu(work->w_msg.v_hdr.h_blkno)); + mlog(0, "h_generation = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_generation)); + mlog(0, "h_node_num = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_node_num)); + mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); + + spin_lock(&osb->vote_task_lock); + list_add_tail(&work->w_list, &osb->vote_list); + osb->vote_count++; + spin_unlock(&osb->vote_task_lock); + + ocfs2_kick_vote_thread(osb); + + status = 0; +bail: + return status; +} + +void ocfs2_unregister_net_handlers(struct ocfs2_super *osb) +{ + if (!osb->net_key) + return; + + o2net_unregister_handler_list(&osb->osb_net_handlers); + + if (!list_empty(&osb->net_response_list)) + mlog(ML_ERROR, "net response list not empty!\n"); + + osb->net_key = 0; +} + +int ocfs2_register_net_handlers(struct ocfs2_super *osb) +{ + int status = 0; + + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, + osb->net_key, + sizeof(struct ocfs2_response_msg), + ocfs2_handle_response_message, + osb, &osb->osb_net_handlers); + if (status) { + mlog_errno(status); + goto bail; + } + + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE, + osb->net_key, + sizeof(struct ocfs2_vote_msg), + ocfs2_handle_vote_message, + osb, &osb->osb_net_handlers); + if (status) { + mlog_errno(status); + goto bail; + } +bail: + if (status < 0) + ocfs2_unregister_net_handlers(osb); + + return status; +} diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h new file mode 100644 index 000000000000..9cce60703466 --- /dev/null +++ b/fs/ocfs2/vote.h @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * vote.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + + +#ifndef VOTE_H +#define VOTE_H + +int ocfs2_vote_thread(void *arg); +static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) +{ + spin_lock(&osb->vote_task_lock); + /* make sure the voting thread gets a swipe at whatever changes + * the caller may have made to the voting state */ + osb->vote_wake_sequence++; + spin_unlock(&osb->vote_task_lock); + wake_up(&osb->vote_event); +} + +int ocfs2_request_delete_vote(struct inode *inode); +int ocfs2_request_unlink_vote(struct inode *inode, + struct dentry *dentry, + unsigned int nlink); +int ocfs2_request_rename_vote(struct inode *inode, + struct dentry *dentry); +int ocfs2_request_mount_vote(struct ocfs2_super *osb); +int ocfs2_request_umount_vote(struct ocfs2_super *osb); +int ocfs2_register_net_handlers(struct ocfs2_super *osb); +void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); + +void ocfs2_mark_inode_remotely_deleted(struct inode *inode); + +void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, + int node_num); +#endif -- cgit v1.2.3 From b4e40a51881931bfcbc78a585e875bb2784d6d10 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 15 Dec 2005 14:31:24 -0800 Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem Link the code into the kernel build system. OCFS2 is marked as experimental. Signed-off-by: Mark Fasheh Signed-off-by: Kurt Hackel --- fs/Kconfig | 53 ++++++++++++++++++++++++++++++++++++++++++----------- fs/Makefile | 1 + 2 files changed, 43 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index ba1dbe2b2202..59b179559312 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -70,6 +70,7 @@ config FS_XIP config EXT3_FS tristate "Ext3 journalling file system support" + select JBD help This is the journaling version of the Second extended file system (often called ext3), the de facto standard Linux file system @@ -138,23 +139,20 @@ config EXT3_FS_SECURITY extended attributes for file security labels, say N. config JBD -# CONFIG_JBD could be its own option (even modular), but until there are -# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS -# dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS tristate - default EXT3_FS help This is a generic journaling layer for block devices. It is - currently used by the ext3 file system, but it could also be used to - add journal support to other file systems or block devices such as - RAID or LVM. + currently used by the ext3 and OCFS2 file systems, but it could + also be used to add journal support to other file systems or block + devices such as RAID or LVM. - If you are using the ext3 file system, you need to say Y here. If - you are not using ext3 then you will probably want to say N. + If you are using the ext3 or OCFS2 file systems, you need to + say Y here. If you are not using ext3 or OCFS2 then you will probably + want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 into the kernel, you cannot - compile this code as a module. + called jbd. If you are compiling ext3 or OCFS2 into the kernel, + you cannot compile this code as a module.
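The ocfs2_kick_vote_thread() / vote_work_sequence pairing in vote.h above is a lost-wakeup guard: the worker samples the wake count before each pass, so a kick that lands mid-pass leaves the counters unequal and forces another pass. A stripped-down illustration (hypothetical names; locking, which the real code does under vote_task_lock, is elided):

#include <stdio.h>

static unsigned long wake_sequence;	/* bumped by the kick helper */
static unsigned long work_sequence;	/* sampled by the vote thread */

static void kick(void)
{
	wake_sequence++;	/* any state change must bump this */
}

static int should_wake(void)
{
	return work_sequence != wake_sequence;
}

static void do_work(void)
{
	work_sequence = wake_sequence;	/* grab this early, as above */
	/* ... drain blocked-lock and vote lists here ... */
}

int main(void)
{
	kick();
	while (should_wake())
		do_work();	/* a kick during do_work() reruns it */
	printf("idle: work_sequence == wake_sequence\n");
	return 0;
}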
config JBD_DEBUG bool "JBD (ext3) debugging support" @@ -326,6 +324,39 @@ config FS_POSIX_ACL source "fs/xfs/Kconfig" +config OCFS2_FS + tristate "OCFS2 file system support (EXPERIMENTAL)" + depends on NET && EXPERIMENTAL + select CONFIGFS_FS + select JBD + select CRC32 + select INET + help + OCFS2 is a general purpose extent based shared disk cluster file + system with many similarities to ext3. It supports 64 bit inode + numbers, and has automatically extending metadata groups which may + also make it attractive for non-clustered use. + + You'll want to install the ocfs2-tools package in order to at least + get "mount.ocfs2". + + Project web page: http://oss.oracle.com/projects/ocfs2 + Tools web page: http://oss.oracle.com/projects/ocfs2-tools + OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + + Note: Features which OCFS2 does not support yet: + - extended attributes + - readonly mount + - shared writeable mmap + - loopback is supported, but data written will not + be cluster coherent. + - quotas + - cluster aware flock + - Directory change notification (F_NOTIFY) + - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) + - POSIX ACLs + - readpages / writepages (not user visible) + config MINIX_FS tristate "Minix fs support" help diff --git a/fs/Makefile b/fs/Makefile index ff3d48a744f5..73676111ebbe 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,3 +102,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ +obj-$(CONFIG_OCFS2_FS) += ocfs2/ -- cgit v1.2.3 From 82353b594c784deabb8d9764b477e65c2b3726f9 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 19 Dec 2005 11:16:07 -0800 Subject: [PATCH] This patch contains the following cleanups: - cluster/sys.c: make needlessly global code static - dlm/: "extern" declarations for variables belong into header files (and in this case, they are already in dlmdomain.h) Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/sys.c | 4 ++-- fs/ocfs2/dlm/dlmmaster.c | 4 +--- fs/ocfs2/dlm/dlmthread.c | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index f1e99461bb0d..1d9f6acafa2e 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -50,7 +50,7 @@ static ssize_t o2cb_interface_revision_show(char *buf) return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } -O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); +static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); static struct attribute *o2cb_attrs[] = { &o2cb_attr_interface_revision.attr, @@ -73,7 +73,7 @@ static struct kobj_type o2cb_subsys_type = { }; /* gives us o2cb_subsys */ -decl_subsys(o2cb, NULL, NULL); +static decl_subsys(o2cb, NULL, NULL); static ssize_t o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 047279546b4f..27e984f7e4cd 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -48,6 +48,7 @@ #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdebug.h" +#include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) #include "cluster/masklog.h" @@ -178,9 +179,6 @@ static void dlm_dump_mles(struct dlm_ctxt *dlm) spin_unlock(&dlm->master_lock); } -extern spinlock_t dlm_domain_lock; -extern struct list_head dlm_domains; - int dlm_dump_all_mles(const char __user 
*data, unsigned int len) { struct list_head *iter; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 92cd5cd66db8..5be9d14f12cb 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -52,9 +52,6 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) #include "cluster/masklog.h" -extern spinlock_t dlm_domain_lock; -extern struct list_head dlm_domains; - static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); -- cgit v1.2.3 From 51e7a5987058c6b4d0c1337587f2ec0c34ffa708 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 19 Dec 2005 11:21:42 -0800 Subject: [PATCH] o Update Kconfig documentation to reflect support for readonly mounts. Signed-off-by: Mark Fasheh --- fs/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 59b179559312..382e3b2883d5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -346,7 +346,6 @@ config OCFS2_FS Note: Features which OCFS2 does not support yet: - extended attributes - - readonly mount - shared writeable mmap - loopback is supported, but data written will not be cluster coherent. -- cgit v1.2.3 From 14c850212ed8f8cbb5972ad6b8812e08a0bc901c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 02:43:12 -0200 Subject: [INET_SOCK]: Move struct inet_sock & helper functions to net/inet_sock.h To help in reducing the number of include dependencies, several files were touched as they were getting needed headers indirectly for stuff they use. Thanks also to Alan Menegotto for pointing out that net/dccp/proto.c had linux/dccp.h include twice. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 + drivers/net/ns83820.c | 1 + drivers/net/sk98lin/skge.c | 1 + drivers/net/skge.c | 1 + drivers/net/tg3.c | 1 + fs/9p/trans_sock.c | 1 + fs/nfs/callback.c | 3 + include/linux/dccp.h | 3 +- include/linux/ip.h | 126 +--------------- include/linux/ipv6.h | 7 +- include/linux/udp.h | 6 +- include/net/atmclip.h | 2 +- include/net/dst.h | 1 + include/net/icmp.h | 9 +- include/net/ieee80211_crypt.h | 9 +- include/net/inet_connection_sock.h | 3 +- include/net/inet_ecn.h | 2 + include/net/inet_hashtables.h | 21 +-- include/net/inet_sock.h | 193 +++++++++++++++++++++++++ include/net/inet_timewait_sock.h | 2 +- include/net/ip.h | 17 ++- include/net/ip_fib.h | 2 + include/net/ip_vs.h | 12 +- include/net/ipv6.h | 3 + include/net/ndisc.h | 17 ++- include/net/neighbour.h | 2 +- include/net/pkt_act.h | 1 - include/net/raw.h | 2 + include/net/udp.h | 4 +- include/net/xfrm.h | 3 +- net/bridge/br_netfilter.c | 4 + net/bridge/netfilter/ebt_log.c | 1 + net/core/netpoll.c | 1 + net/dccp/ccid.h | 2 + net/dccp/ipv4.c | 1 + net/dccp/ipv6.c | 1 + net/dccp/output.c | 1 + net/dccp/proto.c | 3 +- net/econet/af_econet.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ah4.c | 1 + net/ipv4/arp.c | 1 + net/ipv4/devinet.c | 1 + net/ipv4/esp4.c | 1 + net/ipv4/fib_frontend.c | 1 + net/ipv4/fib_hash.c | 1 + net/ipv4/fib_rules.c | 1 + net/ipv4/fib_semantics.c | 2 + net/ipv4/icmp.c | 1 + net/ipv4/igmp.c | 2 + net/ipv4/ip_input.c | 1 + net/ipv4/ip_options.c | 1 + net/ipv4/ip_sockglue.c | 1 + net/ipv4/ipcomp.c | 1 + net/ipv4/ipconfig.c | 2 + net/ipv4/ipmr.c | 1 + net/ipv4/ipvs/ip_vs_conn.c | 2 + net/ipv4/ipvs/ip_vs_ctl.c | 1 + net/ipv4/ipvs/ip_vs_dh.c | 2 + net/ipv4/ipvs/ip_vs_est.c | 3 + net/ipv4/ipvs/ip_vs_lblc.c | 2 + net/ipv4/ipvs/ip_vs_lblcr.c | 2 + 
net/ipv4/ipvs/ip_vs_proto_ah.c | 2 + net/ipv4/ipvs/ip_vs_proto_esp.c | 2 + net/ipv4/ipvs/ip_vs_proto_udp.c | 3 + net/ipv4/ipvs/ip_vs_sh.c | 2 + net/ipv4/ipvs/ip_vs_sync.c | 2 + net/ipv4/netfilter/ip_conntrack_amanda.c | 2 + net/ipv4/netfilter/ip_conntrack_proto_gre.c | 1 + net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1 + net/ipv4/netfilter/ip_conntrack_standalone.c | 1 + net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 + net/ipv4/netfilter/ipt_MASQUERADE.c | 2 + net/ipv4/netfilter/ipt_physdev.c | 1 + net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 1 + net/ipv6/ipcomp6.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + net/ipv6/netfilter/ip6t_LOG.c | 1 + net/ipv6/netfilter/ip6t_ah.c | 1 + net/ipv6/netfilter/ip6t_esp.c | 1 + net/sched/sch_teql.c | 1 + net/sctp/protocol.c | 1 + 87 files changed, 355 insertions(+), 184 deletions(-) create mode 100644 include/net/inet_sock.h (limited to 'fs') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 475d98fa9e26..780009c7eaa6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -47,6 +47,8 @@ #include #include +#include + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ef3ee035bbc8..ed0c2ead8bc1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -43,6 +43,8 @@ #include #include +#include + #include "ipoib.h" #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index f857ae94d261..b0c3b6ab6263 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -115,6 +115,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index ae7343934758..e1a2d52cc1fe 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -107,6 +107,7 @@ #include "h/skversion.h" +#include #include #include #include diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 00d683063c01..d8cc3aea032a 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 2fc9893d69e1..59d916ccc810 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index a93c2bf94c33..6a9a75d40f73 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -26,6 +26,7 @@ */ #include +#include #include #include #include diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f2ca782aba33..30cae3602867 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,9 @@ #include #include #include + +#include + #include "nfs4_fs.h" #include "callback.h" diff --git a/include/linux/dccp.h b/include/linux/dccp.h index d0bdb499cf8d..088529f54965 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -192,10 +192,9 @@ enum { #include #include +#include #include -#include #include -#include enum dccp_state { DCCP_OPEN = TCP_ESTABLISHED, diff --git a/include/linux/ip.h b/include/linux/ip.h index 6ccc596c19c8..9e2eb9a602eb 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -16,6 +16,7 @@ */ #ifndef 
_LINUX_IP_H #define _LINUX_IP_H +#include #include #define IPTOS_TOS_MASK 0x1E @@ -78,131 +79,6 @@ #define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ #define IPOPT_TS_PRESPEC 3 /* specified modules only */ -#ifdef __KERNEL__ -#include -#include -#include -#include -#include -#include - -struct ip_options { - __u32 faddr; /* Saved first hop address */ - unsigned char optlen; - unsigned char srr; - unsigned char rr; - unsigned char ts; - unsigned char is_setbyuser:1, /* Set by setsockopt? */ - is_data:1, /* Options in __data, rather than skb */ - is_strictroute:1, /* Strict source route */ - srr_is_hit:1, /* Packet destination addr was our one */ - is_changed:1, /* IP checksum more not valid */ - rr_needaddr:1, /* Need to record addr of outgoing dev */ - ts_needtime:1, /* Need to record timestamp */ - ts_needaddr:1; /* Need to record addr of outgoing dev */ - unsigned char router_alert; - unsigned char __pad1; - unsigned char __pad2; - unsigned char __data[0]; -}; - -#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) - -struct inet_request_sock { - struct request_sock req; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - u16 inet6_rsk_offset; - /* 2 bytes hole, try to pack */ -#endif - u32 loc_addr; - u32 rmt_addr; - u16 rmt_port; - u16 snd_wscale : 4, - rcv_wscale : 4, - tstamp_ok : 1, - sack_ok : 1, - wscale_ok : 1, - ecn_ok : 1, - acked : 1; - struct ip_options *opt; -}; - -static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) -{ - return (struct inet_request_sock *)sk; -} - -struct ipv6_pinfo; - -struct inet_sock { - /* sk and pinet6 has to be the first two members of inet_sock */ - struct sock sk; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct ipv6_pinfo *pinet6; -#endif - /* Socket demultiplex comparisons on incoming packets. */ - __u32 daddr; /* Foreign IPv4 addr */ - __u32 rcv_saddr; /* Bound local IPv4 addr */ - __u16 dport; /* Destination port */ - __u16 num; /* Local port */ - __u32 saddr; /* Sending source */ - __s16 uc_ttl; /* Unicast TTL */ - __u16 cmsg_flags; - struct ip_options *opt; - __u16 sport; /* Source port */ - __u16 id; /* ID counter for DF pkts */ - __u8 tos; /* TOS */ - __u8 mc_ttl; /* Multicasting TTL */ - __u8 pmtudisc; - unsigned recverr : 1, - is_icsk : 1, /* inet_connection_sock? */ - freebind : 1, - hdrincl : 1, - mc_loop : 1; - int mc_index; /* Multicast device index */ - __u32 mc_addr; - struct ip_mc_socklist *mc_list; /* Group array */ - /* - * Following members are used to retain the infomation to build - * an ip header on each ip fragmentation while the socket is corked. 
- */ - struct { - unsigned int flags; - unsigned int fragsize; - struct ip_options *opt; - struct rtable *rt; - int length; /* Total length of all frames */ - u32 addr; - struct flowi fl; - } cork; -}; - -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ - -static inline struct inet_sock *inet_sk(const struct sock *sk) -{ - return (struct inet_sock *)sk; -} - -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} -#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); -} -#endif -#endif - -extern int inet_sk_rebuild_header(struct sock *sk); - struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a0d04891fe12..93bbed5c6cf4 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -171,12 +171,13 @@ enum { }; #ifdef __KERNEL__ -#include /* struct sockaddr_in6 */ #include -#include /* struct ipv6_mc_socklist */ #include #include +#include /* struct ipv6_mc_socklist */ +#include + /* This structure contains results of exthdrs parsing as offsets from skb->nh. @@ -346,8 +347,6 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) -#include - struct inet6_timewait_sock { struct in6_addr tw_v6_daddr; struct in6_addr tw_v6_rcv_saddr; diff --git a/include/linux/udp.h b/include/linux/udp.h index b60e0b4a25c4..85a55658831c 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -35,10 +35,10 @@ struct udphdr { #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */ #ifdef __KERNEL__ - #include -#include -#include +#include + +#include struct udp_sock { /* inet_sock has to be the first member */ diff --git a/include/net/atmclip.h b/include/net/atmclip.h index 47048b1d179a..90fcc98e676f 100644 --- a/include/net/atmclip.h +++ b/include/net/atmclip.h @@ -7,7 +7,6 @@ #define _ATMCLIP_H #include -#include #include #include #include @@ -18,6 +17,7 @@ #define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back)) #define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key) +struct sk_buff; struct clip_vcc { struct atm_vcc *vcc; /* VCC descriptor */ diff --git a/include/net/dst.h b/include/net/dst.h index 6c196a5baf24..bee8b84d329d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -9,6 +9,7 @@ #define _NET_DST_H #include +#include #include #include #include diff --git a/include/net/icmp.h b/include/net/icmp.h index 6cdebeee5f96..e7c3f20fbafc 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -20,12 +20,9 @@ #include #include -#include -#include -#include +#include #include -#include struct icmp_err { int errno; @@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); #define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) #define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) +struct dst_entry; +struct net_proto_family; +struct sk_buff; + extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info); extern int icmp_rcv(struct sk_buff *skb); extern int 
icmp_ioctl(struct sock *sk, int cmd, unsigned long arg); diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h index 225fc751d464..03b766afdc39 100644 --- a/include/net/ieee80211_crypt.h +++ b/include/net/ieee80211_crypt.h @@ -23,12 +23,17 @@ #ifndef IEEE80211_CRYPT_H #define IEEE80211_CRYPT_H -#include +#include +#include +#include enum { IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), }; +struct sk_buff; +struct module; + struct ieee80211_crypto_ops { const char *name; struct list_head list; @@ -87,6 +92,8 @@ struct ieee80211_crypt_data { atomic_t refcnt; }; +struct ieee80211_device; + int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops); int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops); struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 91888967d3e3..50234fa56a68 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -16,9 +16,10 @@ #define _INET_CONNECTION_SOCK_H #include -#include #include #include + +#include #include #define INET_CSK_DEBUG 1 diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index b0c47e2eccf1..d599c6bfbb86 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -3,6 +3,8 @@ #include #include + +#include #include enum { diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c83baa79f66e..135d80fd658e 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -128,26 +129,6 @@ struct inet_hashinfo { kmem_cache_t *bind_bucket_cachep; }; -static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, - const __u32 faddr, const __u16 fport) -{ - unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h; -} - -static inline int inet_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const __u32 laddr = inet->rcv_saddr; - const __u16 lport = inet->num; - const __u32 faddr = inet->daddr; - const __u16 fport = inet->dport; - - return inet_ehashfn(laddr, lport, faddr, fport); -} - static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h new file mode 100644 index 000000000000..883eb529ef8e --- /dev/null +++ b/include/net/inet_sock.h @@ -0,0 +1,193 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for inet_sock + * + * Authors: Many, reorganised here by + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_SOCK_H +#define _INET_SOCK_H + +#include + +#include +#include + +#include +#include +#include + +/** struct ip_options - IP Options + * + * @faddr - Saved first hop address + * @is_setbyuser - Set by setsockopt? 
+ * @is_data - Options in __data, rather than skb + * @is_strictroute - Strict source route + * @srr_is_hit - Packet destination addr was ours + * @is_changed - IP checksum no longer valid + * @rr_needaddr - Need to record addr of outgoing dev + * @ts_needtime - Need to record timestamp + * @ts_needaddr - Need to record addr of outgoing dev + */ +struct ip_options { + __u32 faddr; + unsigned char optlen; + unsigned char srr; + unsigned char rr; + unsigned char ts; + unsigned char is_setbyuser:1, + is_data:1, + is_strictroute:1, + srr_is_hit:1, + is_changed:1, + rr_needaddr:1, + ts_needtime:1, + ts_needaddr:1; + unsigned char router_alert; + unsigned char __pad1; + unsigned char __pad2; + unsigned char __data[0]; +}; + +#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) + +struct inet_request_sock { + struct request_sock req; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u16 inet6_rsk_offset; + /* 2 bytes hole, try to pack */ +#endif + u32 loc_addr; + u32 rmt_addr; + u16 rmt_port; + u16 snd_wscale : 4, + rcv_wscale : 4, + tstamp_ok : 1, + sack_ok : 1, + wscale_ok : 1, + ecn_ok : 1, + acked : 1; + struct ip_options *opt; +}; + +static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) +{ + return (struct inet_request_sock *)sk; +} + +struct ip_mc_socklist; +struct ipv6_pinfo; +struct rtable; + +/** struct inet_sock - representation of INET sockets + * + * @sk - ancestor class + * @pinet6 - pointer to IPv6 control block + * @daddr - Foreign IPv4 addr + * @rcv_saddr - Bound local IPv4 addr + * @dport - Destination port + * @num - Local port + * @saddr - Sending source + * @uc_ttl - Unicast TTL + * @sport - Source port + * @id - ID counter for DF pkts + * @tos - TOS + * @mc_ttl - Multicasting TTL + * @is_icsk - is this an inet_connection_sock? + * @mc_index - Multicast device index + * @mc_list - Group array + * @cork - info to build ip hdr on each ip frag while socket is corked + */ +struct inet_sock { + /* sk and pinet6 have to be the first two members of inet_sock */ + struct sock sk; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipv6_pinfo *pinet6; +#endif + /* Socket demultiplex comparisons on incoming packets.
*/ + __u32 daddr; + __u32 rcv_saddr; + __u16 dport; + __u16 num; + __u32 saddr; + __s16 uc_ttl; + __u16 cmsg_flags; + struct ip_options *opt; + __u16 sport; + __u16 id; + __u8 tos; + __u8 mc_ttl; + __u8 pmtudisc; + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1; + int mc_index; + __u32 mc_addr; + struct ip_mc_socklist *mc_list; + struct { + unsigned int flags; + unsigned int fragsize; + struct ip_options *opt; + struct rtable *rt; + int length; /* Total length of all frames */ + u32 addr; + struct flowi fl; + } cork; +}; + +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ + +static inline struct inet_sock *inet_sk(const struct sock *sk) +{ + return (struct inet_sock *)sk; +} + +static inline void __inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from, + const int ancestor_size) +{ + memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, + sk_from->sk_prot->obj_size - ancestor_size); +} +#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) +static inline void inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); +} +#endif + +extern int inet_sk_rebuild_header(struct sock *sk); + +static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport) +{ + unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h; +} + +static inline int inet_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport); +} + +#endif /* _INET_SOCK_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index e396a65473d7..1da294c47522 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -17,13 +17,13 @@ #include -#include #include #include #include #include #include +#include #include #include #include diff --git a/include/net/ip.h b/include/net/ip.h index 4d6294ba038e..f7e7fd728b67 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -24,14 +24,10 @@ #include #include -#include #include #include -#include -#include -#include -#include -#include + +#include #include struct sock; @@ -75,6 +71,13 @@ extern rwlock_t ip_ra_lock; #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ +struct msghdr; +struct net_device; +struct packet_type; +struct rtable; +struct sk_buff; +struct sockaddr; + extern void ip_mc_dropsocket(struct sock *); extern void ip_mc_dropdevice(struct net_device *dev); extern int igmp_mc_proc_init(void); @@ -184,6 +187,8 @@ extern int sysctl_ip_dynaddr; extern void ipfrag_init(void); #ifdef CONFIG_INET +#include + /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. 
--ANK(000625) */ static inline diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 14de4ebd1211..e000fa2cd5f6 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag); extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); +struct rtentry; + /* Exported by fib_semantics.c */ extern int ip_fib_check_default(u32 gw, struct net_device *dev); extern int fib_sync_down(u32 local, struct net_device *dev, int force); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3b5559a023a4..7d2674fde19a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -251,16 +251,15 @@ struct ip_vs_daemon_user { #include #include /* for struct list_head */ #include /* for struct rwlock_t */ -#include /* for struct sk_buff */ -#include /* for struct iphdr */ #include /* for struct atomic_t */ -#include /* for struct neighbour */ -#include /* for struct dst_entry */ -#include #include +#include +#include #ifdef CONFIG_IP_VS_DEBUG +#include + extern int ip_vs_get_debug_level(void); #define IP_VS_DBG(level, msg...) \ do { \ @@ -429,8 +428,11 @@ struct ip_vs_stats spinlock_t lock; /* spin lock */ }; +struct dst_entry; +struct iphdr; struct ip_vs_conn; struct ip_vs_app; +struct sk_buff; struct ip_vs_protocol { struct ip_vs_protocol *next; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 11a725662c36..860bbac4c4ee 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -541,6 +541,9 @@ extern int sysctl_ip6frag_secret_interval; extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; +struct group_source_req; +struct group_filter; + extern int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index f85d6e4b7442..bbac87eeb422 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -35,11 +35,20 @@ enum { #ifdef __KERNEL__ -#include -#include +#include +#include #include +#include +#include + #include -#include + +struct ctl_table; +struct file; +struct inet6_dev; +struct net_device; +struct net_proto_family; +struct sk_buff; extern struct neigh_table nd_tbl; @@ -108,7 +117,7 @@ extern int igmp6_event_report(struct sk_buff *skb); extern void igmp6_cleanup(void); #ifdef CONFIG_SYSCTL -extern int ndisc_ifinfo_sysctl_change(ctl_table *ctl, +extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 34c07731933d..6fa9ae190741 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -49,8 +49,8 @@ #ifdef __KERNEL__ #include -#include #include +#include #include #include diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h index bd08964b72c0..b225d8472b7e 100644 --- a/include/net/pkt_act.h +++ b/include/net/pkt_act.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/net/raw.h b/include/net/raw.h index f47917469b12..e67b28a0248c 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -19,6 +19,8 @@ #include +#include + extern struct proto raw_prot; extern void raw_err(struct sock *, struct sk_buff *, u32 info); diff --git a/include/net/udp.h b/include/net/udp.h index 107b9d791a1f..766fba1369ce 100644 --- a/include/net/udp.h +++ 
b/include/net/udp.h @@ -22,9 +22,8 @@ #ifndef _UDP_H #define _UDP_H -#include -#include #include +#include #include #include #include @@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num) extern struct proto udp_prot; +struct sk_buff; extern void udp_err(struct sk_buff *, u32); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 487abca3ca6f..07d7b50cdd76 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2,11 +2,12 @@ #define _NET_XFRM_H #include +#include #include #include #include #include -#include +#include #include #include #include diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 23422bd53a5e..223f8270daee 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,11 @@ #include #include #include + #include #include +#include + #include #include #include "br_private.h" diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index c436e6c6242b..9f6e0193ae10 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -9,6 +9,7 @@ * */ +#include #include #include #include diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 49424a42a2c0..281a632fa6a6 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c37eeeaf5c6e..de681c6ad081 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -21,6 +21,8 @@ #define CCID_MAX 255 +struct tcp_info; + struct ccid { unsigned char ccid_id; const char *ccid_name; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 99e8afa7ba1e..3f244670764a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2e194c8f9953..c609dc78f487 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/output.c b/net/dccp/output.c index 95a3c2c6a3ce..efd7ffb903a1 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -15,6 +15,7 @@ #include #include +#include #include #include "ackvec.h" diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e4e629ed9bf7..65b11ea90d85 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -34,7 +34,6 @@ #include #include #include -#include #include "ccid.h" #include "dccp.h" diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index ff58f49c8b4a..70fb2b88da65 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4ed8a814c6cb..36a6306ca5a3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c9e1ba..aed537fa2c88 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -6,6 +6,7 @@ #include #include #include +#include #include diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748f02d7..37432088fe6d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 
04a6fe3e95a2..7b9bb28e2ee9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -58,6 +58,7 @@ #endif #include +#include #include #include #include diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce66e7b7..73bfcae8af9c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /* decapsulation data for use when post-processing */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b984d687..18f5e509281a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209cb169..e2890ec8159e 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bbc1518..0dd4d06e456d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac070e3..ef4724de7350 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2ad4d2..be5a519cd2f8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c724f01..34758118c10c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,8 @@ #include #include #include + +#include #include #include #include diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2b2e0d..e45846ae570b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -128,6 +128,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da8d8b3..d3f6c468faf4 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Write options to IP header, record destination address to diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index add019c746f8..6986e11d65cc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df17b40..d64e2ec8da7b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -28,6 +28,7 @@ #include #include #include +#include struct ipcomp_tfms { struct list_head list; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674baaa8d9..bb3613ec448c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb507c9..caa3b7d2e48a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c59c655..d35cea31cb55 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -24,7 +24,9 @@ * */ 
+#include #include +#include #include #include /* for proc_net_* */ #include diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31b760e..fe2c39d2a002 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320dce93..9fee19c4c617 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -37,8 +37,10 @@ * */ +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2fc1fa1..e7004741ac73 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -13,7 +13,10 @@ * Changes: * */ +#include #include +#include +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b6dad7e3710c..6e5cb92a5c83 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -41,8 +41,10 @@ * me to write this module. */ +#include #include #include +#include /* for sysctl */ #include diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 8c78ef76c121..32ba37ba72d8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -39,8 +39,10 @@ * */ +#include #include #include +#include /* for sysctl */ #include diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a0bbd7..8b0505b09317 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -12,6 +12,8 @@ * */ +#include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7c7e8e..c36ccf057a19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -12,6 +12,8 @@ * */ +#include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e0aefa..89d9175d8f28 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -15,8 +15,11 @@ * */ +#include +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e44a39..7775e6cc68be 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -34,8 +34,10 @@ * */ +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3d8062..1bca714bda3d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,12 +21,14 @@ #include #include +#include #include #include #include #include #include #include /* for ip_mc_join_group */ +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847fa10f5..0366eedb4d70 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -18,11 +18,13 @@ * */ +#include #include #include #include #include #include +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 744abb9d377a..57956dee60c8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -31,6 +31,7 @@ #include #include #include +#include static DEFINE_RWLOCK(ip_ct_gre_lock); #define ASSERT_READ_LOCK(x) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 
f2dcac7c7660..46becbe4fe58 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b191f4b..a88bcc551244 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -27,6 +27,7 @@ #endif #include #include +#include #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed40b47..4f95d477805c 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -44,6 +44,7 @@ * */ #include +#include #include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174c6fe6..27860510ca6d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c index 1a53924041fc..03f554857a4d 100644 --- a/net/ipv4/netfilter/ipt_physdev.c +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc668db46..39d49dc333a7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index dbf82955aabe..16984d4a8a06 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 67c036384e77..223abaa72bc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f3629730eb15..13cc7f895583 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bfbe9970793..6de8ee1a5ad9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include #include #include +#include #include static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 55917fb17094..626dd39685f2 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dd80020d8740..ea43ef1d94a7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -15,6 +15,7 @@ * - new extension header parser code */ #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0cd1d1bd9033..ae4653bfd654 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_ah.c 
b/net/ipv6/netfilter/ip6t_ah.c index dde37793d20b..268918d5deea 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c index 24bc0cde43a1..65937de1b58c 100644 --- a/net/ipv6/netfilter/ip6t_esp.c +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6cf0342706b5..c4a2a8c4c339 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d1b0747a5b9d..de693b43c8ea 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 033b96fd30db52a710d97b06f87d16fc59fee0f1 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 11 Nov 2005 06:09:55 +0100 Subject: [PATCH] remove mount/umount uevents from superblock handling The names of these events have been confusing from the beginning, as they have been more like claim/release events. We needed these events to notify HAL when storage devices were mounted. Thanks to Al, we have the proper solution now and can poll() /proc/mounts instead to get notified about mount tree changes. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/super.c | 15 +-------------- include/linux/kobject.h | 6 ++---- lib/kobject_uevent.c | 4 ---- 3 files changed, 3 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 6689dded3c84..5a347a4f673a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -665,16 +665,6 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } -static void bdev_uevent(struct block_device *bdev, enum kobject_action action) -{ - if (bdev->bd_disk) { - if (bdev->bd_part) - kobject_uevent(&bdev->bd_part->kobj, action, NULL); - else - kobject_uevent(&bdev->bd_disk->kobj, action, NULL); - } -} - struct super_block *get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) @@ -717,10 +707,8 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, up_write(&s->s_umount); deactivate_super(s); s = ERR_PTR(error); - } else { + } else s->s_flags |= MS_ACTIVE; - bdev_uevent(bdev, KOBJ_MOUNT); - } } return s; @@ -736,7 +724,6 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - bdev_uevent(bdev, KOBJ_UMOUNT); generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_excl(bdev); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index baf5251d9f63..e6926b327538 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -42,10 +42,8 @@ enum kobject_action { KOBJ_ADD = (__force kobject_action_t) 0x01, /* add event, for hotplug */ KOBJ_REMOVE = (__force kobject_action_t) 0x02, /* remove event, for hotplug */ KOBJ_CHANGE = (__force kobject_action_t) 0x03, /* a sysfs attribute file has changed */ - KOBJ_MOUNT = (__force kobject_action_t) 0x04, /* mount event for block devices */ - KOBJ_UMOUNT = (__force kobject_action_t) 0x05, /* umount event for block devices */ - KOBJ_OFFLINE = (__force kobject_action_t) 0x06, /* offline event for hotplug devices */ - KOBJ_ONLINE = (__force kobject_action_t) 0x07,
/* online event for hotplug devices */ + KOBJ_OFFLINE = (__force kobject_action_t) 0x04, /* offline event for hotplug devices */ + KOBJ_ONLINE = (__force kobject_action_t) 0x05, /* online event for hotplug devices */ }; struct kobject { diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 1f90eea7eebc..845bf67d94ca 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -39,10 +39,6 @@ static char *action_to_string(enum kobject_action action) return "remove"; case KOBJ_CHANGE: return "change"; - case KOBJ_MOUNT: - return "mount"; - case KOBJ_UMOUNT: - return "umount"; case KOBJ_OFFLINE: return "offline"; case KOBJ_ONLINE: -- cgit v1.2.3 From 312c004d36ce6c739512bac83b452f4c20ab1f62 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 16 Nov 2005 09:00:00 +0100 Subject: [PATCH] driver core: replace "hotplug" by "uevent" Leave the overloaded "hotplug" word to subsystems which are handling real devices. The driver core does not "plug" anything, it just exports the state to userspace and generates events. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- Documentation/powerpc/eeh-pci-error-recovery.txt | 31 ++++----- arch/powerpc/kernel/vio.c | 2 +- block/genhd.c | 48 ++++++------- drivers/acpi/container.c | 8 +-- drivers/acpi/processor_core.c | 8 +-- drivers/acpi/scan.c | 14 ++-- drivers/base/Kconfig | 4 +- drivers/base/class.c | 66 +++++++++--------- drivers/base/core.c | 42 ++++++------ drivers/base/cpu.c | 4 +- drivers/base/firmware_class.c | 45 ++++++------- drivers/base/memory.c | 12 ++-- drivers/ieee1394/nodemgr.c | 20 +++--- drivers/infiniband/core/sysfs.c | 16 ++--- drivers/input/input.c | 14 ++-- drivers/input/serio/serio.c | 22 +++--- drivers/macintosh/macio_asic.c | 4 +- drivers/mmc/mmc_sysfs.c | 4 +- drivers/pci/hotplug.c | 44 ++++++------ drivers/pci/pci-driver.c | 6 +- drivers/pci/pci.h | 4 +- drivers/pcmcia/cs.c | 10 +-- drivers/pcmcia/ds.c | 50 +++++++------- drivers/scsi/ipr.c | 2 +- drivers/usb/core/usb.c | 86 +++++++++++------------- drivers/usb/host/hc_crisv10.c | 2 +- drivers/w1/w1.c | 14 ++-- fs/partitions/check.c | 6 +- include/linux/device.h | 14 ++-- include/linux/firmware.h | 2 +- include/linux/kobject.h | 40 ++++++----- include/linux/sysctl.h | 2 +- include/linux/usb.h | 2 +- kernel/ksysfs.c | 14 ++-- kernel/sysctl.c | 4 +- lib/kobject.c | 4 +- lib/kobject_uevent.c | 64 +++++++++--------- net/bluetooth/hci_sysfs.c | 4 +- net/bridge/br_sysfs_if.c | 4 +- net/core/net-sysfs.c | 8 +-- 40 files changed, 372 insertions(+), 378 deletions(-) (limited to 'fs') diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt index e75d7474322c..67a11a36270c 100644 --- a/Documentation/powerpc/eeh-pci-error-recovery.txt +++ b/Documentation/powerpc/eeh-pci-error-recovery.txt @@ -115,7 +115,7 @@ Current PPC64 Linux EEH Implementation At this time, a generic EEH recovery mechanism has been implemented, so that individual device drivers do not need to be modified to support EEH recovery. This generic mechanism piggy-backs on the PCI hotplug -infrastructure, and percolates events up through the hotplug/udev +infrastructure, and percolates events up through the userspace/udev infrastructure. Following is a detailed description of how this is accomplished. @@ -172,7 +172,7 @@ A handler for the EEH notifier_block events is implemented in drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events(). It saves the device BARs and then calls rpaphp_unconfig_pci_adapter().
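The commit message of the mount/umount patch above names the replacement mechanism: userspace can watch /proc/mounts with poll(2) and re-read the file whenever the kernel flags a change. What follows is a minimal userspace sketch of such a watcher, assuming a kernel with this /proc/mounts support; it checks both POLLERR and POLLPRI in revents because the exact bit used to signal mount-tree changes has varied between kernel versions.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct pollfd pfd;

	pfd.fd = open("/proc/mounts", O_RDONLY);
	if (pfd.fd < 0) {
		perror("open /proc/mounts");
		return 1;
	}
	pfd.events = POLLPRI;

	for (;;) {
		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		if (pfd.revents & (POLLERR | POLLPRI)) {
			/* The mount tree changed: rewind and re-read
			   /proc/mounts to see what was (un)mounted. */
			lseek(pfd.fd, 0, SEEK_SET);
			puts("mount tree changed");
		}
	}
}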
This last call causes the device driver for the card to be stopped, -which causes hotplug events to go out to user space. This triggers +which causes uevents to go out to user space. This triggers user-space scripts that might issue commands such as "ifdown eth0" for ethernet cards, and so on. This handler then sleeps for 5 seconds, hoping to give the user-space scripts enough time to complete. @@ -258,29 +258,30 @@ rpa_php_unconfig_pci_adapter() { // in rpaphp_pci.c calls pci_destroy_dev (struct pci_dev *) { calls - device_unregister (&dev->dev) { // in /drivers/base/core.c + device_unregister (&dev->dev) { // in /drivers/base/core.c calls - device_del(struct device * dev) { // in /drivers/base/core.c + device_del(struct device * dev) { // in /drivers/base/core.c calls - kobject_del() { //in /libs/kobject.c + kobject_del() { //in /libs/kobject.c calls - kobject_hotplug() { // in /libs/kobject.c + kobject_uevent() { // in /libs/kobject.c calls - kset_hotplug() { // in /lib/kobject.c + kset_uevent() { // in /lib/kobject.c calls - kset->hotplug_ops->hotplug() which is really just + kset->uevent_ops->uevent() // which is really just a call to - dev_hotplug() { // in /drivers/base/core.c + dev_uevent() { // in /drivers/base/core.c calls - dev->bus->hotplug() which is really just a call to - pci_hotplug () { // in drivers/pci/hotplug.c + dev->bus->uevent() which is really just a call to + pci_uevent () { // in drivers/pci/hotplug.c which prints device name, etc.... } } - then kset_hotplug() calls - call_usermodehelper () with - argv[0]=hotplug_path[] which is "/sbin/hotplug" - --> event to userspace, + then kobject_uevent() sends a netlink uevent to userspace + --> userspace uevent + (during early boot, nobody listens to netlink events and + kobject_uevent() executes uevent_helper[], which runs the + event process /sbin/hotplug) } } kobject_del() then calls sysfs_remove_dir(), which would diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 71a6addf9f7f..13c41495fe06 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -293,6 +293,6 @@ static int vio_hotplug(struct device *dev, char **envp, int num_envp, struct bus_type vio_bus_type = { .name = "vio", - .hotplug = vio_hotplug, + .uevent = vio_hotplug, .match = vio_bus_match, }; diff --git a/block/genhd.c b/block/genhd.c index f04609d553b8..f1ed83f3f083 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -358,7 +358,7 @@ static struct sysfs_ops disk_sysfs_ops = { static ssize_t disk_uevent_store(struct gendisk * disk, const char *buf, size_t count) { - kobject_hotplug(&disk->kobj, KOBJ_ADD); + kobject_uevent(&disk->kobj, KOBJ_ADD); return count; } static ssize_t disk_dev_read(struct gendisk * disk, char *page) @@ -455,14 +455,14 @@ static struct kobj_type ktype_block = { extern struct kobj_type ktype_part; -static int block_hotplug_filter(struct kset *kset, struct kobject *kobj) +static int block_uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); return ((ktype == &ktype_block) || (ktype == &ktype_part)); } -static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp, +static int block_uevent(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size) { struct kobj_type *ktype = get_ktype(kobj); @@ -474,40 +474,40 @@ static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp, if (ktype == &ktype_block) { disk = container_of(kobj, struct gendisk, kobj); - add_hotplug_env_var(envp, 
num_envp, &i, buffer, buffer_size, - &length, "MINOR=%u", disk->first_minor); + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, + &length, "MINOR=%u", disk->first_minor); } else if (ktype == &ktype_part) { disk = container_of(kobj->parent, struct gendisk, kobj); part = container_of(kobj, struct hd_struct, kobj); - add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, - &length, "MINOR=%u", - disk->first_minor + part->partno); + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, + &length, "MINOR=%u", + disk->first_minor + part->partno); } else return 0; - add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &length, - "MAJOR=%u", disk->major); + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "MAJOR=%u", disk->major); /* add physical device, backing this device */ physdev = disk->driverfs_dev; if (physdev) { char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL); - add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, - &length, "PHYSDEVPATH=%s", path); + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, + &length, "PHYSDEVPATH=%s", path); kfree(path); if (physdev->bus) - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVBUS=%s", - physdev->bus->name); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PHYSDEVBUS=%s", + physdev->bus->name); if (physdev->driver) - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVDRIVER=%s", - physdev->driver->name); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PHYSDEVDRIVER=%s", + physdev->driver->name); } /* terminate, set to next free slot, shrink available space */ @@ -520,13 +520,13 @@ static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp, return 0; } -static struct kset_hotplug_ops block_hotplug_ops = { - .filter = block_hotplug_filter, - .hotplug = block_hotplug, +static struct kset_uevent_ops block_uevent_ops = { + .filter = block_uevent_filter, + .uevent = block_uevent, }; /* declare block_subsys. 
*/ -static decl_subsys(block, &ktype_block, &block_hotplug_ops); +static decl_subsys(block, &ktype_block, &block_uevent_ops); /* diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c index 27ec12c1fab0..b69a8cad82b7 100644 --- a/drivers/acpi/container.c +++ b/drivers/acpi/container.c @@ -172,21 +172,21 @@ static void container_notify_cb(acpi_handle handle, u32 type, void *context) if (ACPI_FAILURE(status) || !device) { result = container_device_add(&device, handle); if (!result) - kobject_hotplug(&device->kobj, - KOBJ_ONLINE); + kobject_uevent(&device->kobj, + KOBJ_ONLINE); else printk("Failed to add container\n"); } } else { if (ACPI_SUCCESS(status)) { /* device exist and this is a remove request */ - kobject_hotplug(&device->kobj, KOBJ_OFFLINE); + kobject_uevent(&device->kobj, KOBJ_OFFLINE); } } break; case ACPI_NOTIFY_EJECT_REQUEST: if (!acpi_bus_get_device(handle, &device) && device) { - kobject_hotplug(&device->kobj, KOBJ_OFFLINE); + kobject_uevent(&device->kobj, KOBJ_OFFLINE); } break; default: diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 0c561c571f29..1278aca96fe3 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -748,7 +748,7 @@ int acpi_processor_device_add(acpi_handle handle, struct acpi_device **device) return_VALUE(-ENODEV); if ((pr->id >= 0) && (pr->id < NR_CPUS)) { - kobject_hotplug(&(*device)->kobj, KOBJ_ONLINE); + kobject_uevent(&(*device)->kobj, KOBJ_ONLINE); } return_VALUE(0); } @@ -788,13 +788,13 @@ acpi_processor_hotplug_notify(acpi_handle handle, u32 event, void *data) } if (pr->id >= 0 && (pr->id < NR_CPUS)) { - kobject_hotplug(&device->kobj, KOBJ_OFFLINE); + kobject_uevent(&device->kobj, KOBJ_OFFLINE); break; } result = acpi_processor_start(device); if ((!result) && ((pr->id >= 0) && (pr->id < NR_CPUS))) { - kobject_hotplug(&device->kobj, KOBJ_ONLINE); + kobject_uevent(&device->kobj, KOBJ_ONLINE); } else { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Device [%s] failed to start\n", @@ -818,7 +818,7 @@ acpi_processor_hotplug_notify(acpi_handle handle, u32 event, void *data) } if ((pr->id < NR_CPUS) && (cpu_present(pr->id))) - kobject_hotplug(&device->kobj, KOBJ_OFFLINE); + kobject_uevent(&device->kobj, KOBJ_OFFLINE); break; default: ACPI_DEBUG_PRINT((ACPI_DB_INFO, diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 31218e1d2a18..0745d20afb8c 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -78,7 +78,7 @@ static struct kobj_type ktype_acpi_ns = { .release = acpi_device_release, }; -static int namespace_hotplug(struct kset *kset, struct kobject *kobj, +static int namespace_uevent(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size) { @@ -89,8 +89,8 @@ static int namespace_hotplug(struct kset *kset, struct kobject *kobj, if (!dev->driver) return 0; - if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len, - "PHYSDEVDRIVER=%s", dev->driver->name)) + if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len, + "PHYSDEVDRIVER=%s", dev->driver->name)) return -ENOMEM; envp[i] = NULL; @@ -98,8 +98,8 @@ static int namespace_hotplug(struct kset *kset, struct kobject *kobj, return 0; } -static struct kset_hotplug_ops namespace_hotplug_ops = { - .hotplug = &namespace_hotplug, +static struct kset_uevent_ops namespace_uevent_ops = { + .uevent = &namespace_uevent, }; static struct kset acpi_namespace_kset = { @@ -108,7 +108,7 @@ static struct kset acpi_namespace_kset = { }, .subsys = &acpi_subsys, .ktype = &ktype_acpi_ns, - 
.hotplug_ops = &namespace_hotplug_ops, + .uevent_ops = &namespace_uevent_ops, }; static void acpi_device_register(struct acpi_device *device, @@ -347,7 +347,7 @@ static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device) } /* -------------------------------------------------------------------------- - ACPI hotplug sysfs device file support + ACPI sysfs device file support -------------------------------------------------------------------------- */ static ssize_t acpi_eject_store(struct acpi_device *device, const char *buf, size_t count); diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 934149c1512b..f0eff3dac58d 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -19,11 +19,11 @@ config PREVENT_FIRMWARE_BUILD If unsure say Y here. config FW_LOADER - tristate "Hotplug firmware loading support" + tristate "Userspace firmware loading support" select HOTPLUG ---help--- This option is provided for the case where no in-kernel-tree modules - require hotplug firmware loading support, but a module built outside + require userspace firmware loading support, but a module built outside the kernel tree does. config DEBUG_DRIVER diff --git a/drivers/base/class.c b/drivers/base/class.c index db65fd0babe9..df7fdabd0730 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -178,7 +178,7 @@ static void class_device_create_release(struct class_device *class_dev) } /* needed to allow these devices to have parent class devices */ -static int class_device_create_hotplug(struct class_device *class_dev, +static int class_device_create_uevent(struct class_device *class_dev, char **envp, int num_envp, char *buffer, int buffer_size) { @@ -331,7 +331,7 @@ static struct kobj_type ktype_class_device = { .release = class_dev_release, }; -static int class_hotplug_filter(struct kset *kset, struct kobject *kobj) +static int class_uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); @@ -343,14 +343,14 @@ static int class_hotplug_filter(struct kset *kset, struct kobject *kobj) return 0; } -static const char *class_hotplug_name(struct kset *kset, struct kobject *kobj) +static const char *class_uevent_name(struct kset *kset, struct kobject *kobj) { struct class_device *class_dev = to_class_dev(kobj); return class_dev->class->name; } -static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp, +static int class_uevent(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size) { struct class_device *class_dev = to_class_dev(kobj); @@ -365,29 +365,29 @@ static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp, struct device *dev = class_dev->dev; char *path = kobject_get_path(&dev->kobj, GFP_KERNEL); - add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, - &length, "PHYSDEVPATH=%s", path); + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, + &length, "PHYSDEVPATH=%s", path); kfree(path); if (dev->bus) - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVBUS=%s", dev->bus->name); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PHYSDEVBUS=%s", dev->bus->name); if (dev->driver) - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVDRIVER=%s", dev->driver->name); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PHYSDEVDRIVER=%s", dev->driver->name); } if (MAJOR(class_dev->devt)) { - add_hotplug_env_var(envp, num_envp, &i, - buffer, 
buffer_size, &length, - "MAJOR=%u", MAJOR(class_dev->devt)); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MAJOR=%u", MAJOR(class_dev->devt)); - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "MINOR=%u", MINOR(class_dev->devt)); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MINOR=%u", MINOR(class_dev->devt)); } /* terminate, set to next free slot, shrink available space */ @@ -397,30 +397,30 @@ static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp, buffer = &buffer[length]; buffer_size -= length; - if (class_dev->hotplug) { + if (class_dev->uevent) { /* have the class device specific function add its stuff */ - retval = class_dev->hotplug(class_dev, envp, num_envp, + retval = class_dev->uevent(class_dev, envp, num_envp, buffer, buffer_size); if (retval) - pr_debug("class_dev->hotplug() returned %d\n", retval); - } else if (class_dev->class->hotplug) { + pr_debug("class_dev->uevent() returned %d\n", retval); + } else if (class_dev->class->uevent) { /* have the class specific function add its stuff */ - retval = class_dev->class->hotplug(class_dev, envp, num_envp, + retval = class_dev->class->uevent(class_dev, envp, num_envp, buffer, buffer_size); if (retval) - pr_debug("class->hotplug() returned %d\n", retval); + pr_debug("class->uevent() returned %d\n", retval); } return retval; } -static struct kset_hotplug_ops class_hotplug_ops = { - .filter = class_hotplug_filter, - .name = class_hotplug_name, - .hotplug = class_hotplug, +static struct kset_uevent_ops class_uevent_ops = { + .filter = class_uevent_filter, + .name = class_uevent_name, + .uevent = class_uevent, }; -static decl_subsys(class_obj, &ktype_class_device, &class_hotplug_ops); +static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); static int class_device_add_attrs(struct class_device * cd) @@ -464,7 +464,7 @@ static ssize_t show_dev(struct class_device *class_dev, char *buf) static ssize_t store_uevent(struct class_device *class_dev, const char *buf, size_t count) { - kobject_hotplug(&class_dev->kobj, KOBJ_ADD); + kobject_uevent(&class_dev->kobj, KOBJ_ADD); return count; } @@ -559,7 +559,7 @@ int class_device_add(struct class_device *class_dev) class_name); } - kobject_hotplug(&class_dev->kobj, KOBJ_ADD); + kobject_uevent(&class_dev->kobj, KOBJ_ADD); /* notify any interfaces this device is now here */ if (parent_class) { @@ -632,7 +632,7 @@ struct class_device *class_device_create(struct class *cls, class_dev->class = cls; class_dev->parent = parent; class_dev->release = class_device_create_release; - class_dev->hotplug = class_device_create_hotplug; + class_dev->uevent = class_device_create_uevent; va_start(args, fmt); vsnprintf(class_dev->class_id, BUS_ID_SIZE, fmt, args); @@ -674,7 +674,7 @@ void class_device_del(struct class_device *class_dev) class_device_remove_file(class_dev, class_dev->devt_attr); class_device_remove_attrs(class_dev); - kobject_hotplug(&class_dev->kobj, KOBJ_REMOVE); + kobject_uevent(&class_dev->kobj, KOBJ_REMOVE); kobject_del(&class_dev->kobj); class_device_put(parent_device); diff --git a/drivers/base/core.c b/drivers/base/core.c index 8615b42b517a..fd8059920dbf 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -90,7 +90,7 @@ static struct kobj_type ktype_device = { }; -static int dev_hotplug_filter(struct kset *kset, struct kobject *kobj) +static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); @@ -102,14 +102,14 @@ 
static int dev_hotplug_filter(struct kset *kset, struct kobject *kobj) return 0; } -static const char *dev_hotplug_name(struct kset *kset, struct kobject *kobj) +static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj) { struct device *dev = to_dev(kobj); return dev->bus->name; } -static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp, +static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size) { struct device *dev = to_dev(kobj); @@ -119,15 +119,15 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp, /* add bus name of physical device */ if (dev->bus) - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVBUS=%s", dev->bus->name); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PHYSDEVBUS=%s", dev->bus->name); /* add driver name of physical device */ if (dev->driver) - add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVDRIVER=%s", dev->driver->name); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PHYSDEVDRIVER=%s", dev->driver->name); /* terminate, set to next free slot, shrink available space */ envp[i] = NULL; @@ -136,11 +136,11 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp, buffer = &buffer[length]; buffer_size -= length; - if (dev->bus && dev->bus->hotplug) { + if (dev->bus && dev->bus->uevent) { /* have the bus specific function add its stuff */ - retval = dev->bus->hotplug (dev, envp, num_envp, buffer, buffer_size); + retval = dev->bus->uevent(dev, envp, num_envp, buffer, buffer_size); if (retval) { - pr_debug ("%s - hotplug() returned %d\n", + pr_debug ("%s - uevent() returned %d\n", __FUNCTION__, retval); } } @@ -148,16 +148,16 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp, return retval; } -static struct kset_hotplug_ops device_hotplug_ops = { - .filter = dev_hotplug_filter, - .name = dev_hotplug_name, - .hotplug = dev_hotplug, +static struct kset_uevent_ops device_uevent_ops = { + .filter = dev_uevent_filter, + .name = dev_uevent_name, + .uevent = dev_uevent, }; static ssize_t store_uevent(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - kobject_hotplug(&dev->kobj, KOBJ_ADD); + kobject_uevent(&dev->kobj, KOBJ_ADD); return count; } @@ -165,7 +165,7 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr, * device_subsys - structure to be registered with kobject core. 
*/ -decl_subsys(devices, &ktype_device, &device_hotplug_ops); +decl_subsys(devices, &ktype_device, &device_uevent_ops); /** @@ -274,7 +274,7 @@ int device_add(struct device *dev) dev->uevent_attr.store = store_uevent; device_create_file(dev, &dev->uevent_attr); - kobject_hotplug(&dev->kobj, KOBJ_ADD); + kobject_uevent(&dev->kobj, KOBJ_ADD); if ((error = device_pm_add(dev))) goto PMError; if ((error = bus_add_device(dev))) @@ -291,7 +291,7 @@ int device_add(struct device *dev) BusError: device_pm_remove(dev); PMError: - kobject_hotplug(&dev->kobj, KOBJ_REMOVE); + kobject_uevent(&dev->kobj, KOBJ_REMOVE); kobject_del(&dev->kobj); Error: if (parent) @@ -374,7 +374,7 @@ void device_del(struct device * dev) platform_notify_remove(dev); bus_remove_device(dev); device_pm_remove(dev); - kobject_hotplug(&dev->kobj, KOBJ_REMOVE); + kobject_uevent(&dev->kobj, KOBJ_REMOVE); kobject_del(&dev->kobj); if (parent) put_device(parent); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index a95844790f7b..281d26784d25 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -41,14 +41,14 @@ static ssize_t store_online(struct sys_device *dev, const char *buf, case '0': ret = cpu_down(cpu->sysdev.id); if (!ret) - kobject_hotplug(&dev->kobj, KOBJ_OFFLINE); + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); break; case '1': ret = smp_prepare_cpu(cpu->sysdev.id); if (!ret) ret = cpu_up(cpu->sysdev.id); if (!ret) - kobject_hotplug(&dev->kobj, KOBJ_ONLINE); + kobject_uevent(&dev->kobj, KOBJ_ONLINE); break; default: ret = -EINVAL; diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 59dacb6552c0..5b3d5e9ddcb6 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -85,17 +85,17 @@ firmware_timeout_store(struct class *class, const char *buf, size_t count) static CLASS_ATTR(timeout, 0644, firmware_timeout_show, firmware_timeout_store); static void fw_class_dev_release(struct class_device *class_dev); -int firmware_class_hotplug(struct class_device *dev, char **envp, +int firmware_class_uevent(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size); static struct class firmware_class = { .name = "firmware", - .hotplug = firmware_class_hotplug, + .uevent = firmware_class_uevent, .release = fw_class_dev_release, }; int -firmware_class_hotplug(struct class_device *class_dev, char **envp, +firmware_class_uevent(struct class_device *class_dev, char **envp, int num_envp, char *buffer, int buffer_size) { struct firmware_priv *fw_priv = class_get_devdata(class_dev); @@ -104,13 +104,12 @@ firmware_class_hotplug(struct class_device *class_dev, char **envp, if (!test_bit(FW_STATUS_READY, &fw_priv->status)) return -ENODEV; - if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len, - "FIRMWARE=%s", fw_priv->fw_id)) + if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len, + "FIRMWARE=%s", fw_priv->fw_id)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len, - "TIMEOUT=%i", loading_timeout)) + if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len, + "TIMEOUT=%i", loading_timeout)) return -ENOMEM; - envp[i] = NULL; return 0; @@ -352,7 +351,7 @@ error_kfree: static int fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p, - const char *fw_name, struct device *device, int hotplug) + const char *fw_name, struct device *device, int uevent) { struct class_device *class_dev; struct firmware_priv *fw_priv; @@ -384,7 +383,7 @@ fw_setup_class_device(struct firmware *fw, 
struct class_device **class_dev_p, goto error_unreg; } - if (hotplug) + if (uevent) set_bit(FW_STATUS_READY, &fw_priv->status); else set_bit(FW_STATUS_READY_NOHOTPLUG, &fw_priv->status); @@ -399,7 +398,7 @@ out: static int _request_firmware(const struct firmware **firmware_p, const char *name, - struct device *device, int hotplug) + struct device *device, int uevent) { struct class_device *class_dev; struct firmware_priv *fw_priv; @@ -418,19 +417,19 @@ _request_firmware(const struct firmware **firmware_p, const char *name, } retval = fw_setup_class_device(firmware, &class_dev, name, device, - hotplug); + uevent); if (retval) goto error_kfree_fw; fw_priv = class_get_devdata(class_dev); - if (hotplug) { + if (uevent) { if (loading_timeout > 0) { fw_priv->timeout.expires = jiffies + loading_timeout * HZ; add_timer(&fw_priv->timeout); } - kobject_hotplug(&class_dev->kobj, KOBJ_ADD); + kobject_uevent(&class_dev->kobj, KOBJ_ADD); wait_for_completion(&fw_priv->completion); set_bit(FW_STATUS_DONE, &fw_priv->status); del_timer_sync(&fw_priv->timeout); @@ -456,7 +455,7 @@ out: } /** - * request_firmware: - request firmware to hotplug and wait for it + * request_firmware: - send firmware request and wait for it * @firmware_p: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded @@ -466,7 +465,7 @@ out: * * Should be called from user context where sleeping is allowed. * - * @name will be used as $FIRMWARE in the hotplug environment and + * @name will be used as $FIRMWARE in the uevent environment and * should be distinctive enough not to be confused with any other * firmware image for this or any other device. **/ @@ -474,8 +473,8 @@ int request_firmware(const struct firmware **firmware_p, const char *name, struct device *device) { - int hotplug = 1; - return _request_firmware(firmware_p, name, device, hotplug); + int uevent = 1; + return _request_firmware(firmware_p, name, device, uevent); } /** @@ -518,7 +517,7 @@ struct firmware_work { struct device *device; void *context; void (*cont)(const struct firmware *fw, void *context); - int hotplug; + int uevent; }; static int @@ -533,7 +532,7 @@ request_firmware_work_func(void *arg) } daemonize("%s/%s", "firmware", fw_work->name); ret = _request_firmware(&fw, fw_work->name, fw_work->device, - fw_work->hotplug); + fw_work->uevent); if (ret < 0) fw_work->cont(NULL, fw_work->context); else { @@ -548,7 +547,7 @@ request_firmware_work_func(void *arg) /** * request_firmware_nowait: asynchronous version of request_firmware * @module: module requesting the firmware - * @hotplug: invokes hotplug event to copy the firmware image if this flag + * @uevent: sends uevent to copy the firmware image if this flag * is non-zero else the firmware copy must be done manually. 
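For reference, here is a rough sketch of a typical caller of the synchronous request_firmware() interface documented above. The firmware name and the example_load_to_hw() helper are hypothetical stand-ins for device-specific details; release_firmware() is the matching release call from the same API and does not appear in this patch's hunks.

#include <linux/device.h>
#include <linux/firmware.h>
#include <linux/types.h>

/* Hypothetical stand-in for device-specific download code. */
static int example_load_to_hw(struct device *dev, const void *data,
			      size_t size)
{
	return 0;
}

static int example_load_firmware(struct device *dev)
{
	const struct firmware *fw;
	int err;

	/* Sleeps until userspace copies the image in (or the loading
	 * timeout fires), so this must run in user context. */
	err = request_firmware(&fw, "example-fw.bin", dev);
	if (err)
		return err;

	/* fw->data and fw->size describe the loaded image. */
	err = example_load_to_hw(dev, fw->data, fw->size);

	release_firmware(fw);
	return err;
}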
* @name: name of firmware file * @device: device for which firmware is being loaded @@ -562,7 +561,7 @@ request_firmware_work_func(void *arg) **/ int request_firmware_nowait( - struct module *module, int hotplug, + struct module *module, int uevent, const char *name, struct device *device, void *context, void (*cont)(const struct firmware *fw, void *context)) { @@ -583,7 +582,7 @@ request_firmware_nowait( .device = device, .context = context, .cont = cont, - .hotplug = hotplug, + .uevent = uevent, }; ret = kernel_thread(request_firmware_work_func, fw_work, diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bc3ca6a656b2..7e1d077874df 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -29,12 +29,12 @@ static struct sysdev_class memory_sysdev_class = { set_kset_name(MEMORY_CLASS_NAME), }; -static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) { return MEMORY_CLASS_NAME; } -static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, +static int memory_uevent(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size) { int retval = 0; @@ -42,9 +42,9 @@ static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, return retval; } -static struct kset_hotplug_ops memory_hotplug_ops = { - .name = memory_hotplug_name, - .hotplug = memory_hotplug, +static struct kset_uevent_ops memory_uevent_ops = { + .name = memory_uevent_name, + .uevent = memory_uevent, }; static struct notifier_block *memory_chain; @@ -431,7 +431,7 @@ int __init memory_dev_init(void) unsigned int i; int ret; - memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; ret = sysdev_class_register(&memory_sysdev_class); /* diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c index 0ea37b1bccb2..f2453668acf5 100644 --- a/drivers/ieee1394/nodemgr.c +++ b/drivers/ieee1394/nodemgr.c @@ -121,8 +121,8 @@ struct host_info { }; static int nodemgr_bus_match(struct device * dev, struct device_driver * drv); -static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp, - char *buffer, int buffer_size); +static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp, + char *buffer, int buffer_size); static void nodemgr_resume_ne(struct node_entry *ne); static void nodemgr_remove_ne(struct node_entry *ne); static struct node_entry *find_entry_by_guid(u64 guid); @@ -162,7 +162,7 @@ static void ud_cls_release(struct class_device *class_dev) static struct class nodemgr_ud_class = { .name = "ieee1394", .release = ud_cls_release, - .hotplug = nodemgr_hotplug, + .uevent = nodemgr_uevent, }; static struct hpsb_highlevel nodemgr_highlevel; @@ -966,7 +966,7 @@ static struct unit_directory *nodemgr_process_unit_directory if (ud_child == NULL) break; - /* inherit unspecified values so hotplug picks it up */ + /* inherit unspecified values, the driver core picks it up */ if ((ud->flags & UNIT_DIRECTORY_MODEL_ID) && !(ud_child->flags & UNIT_DIRECTORY_MODEL_ID)) { @@ -1062,8 +1062,8 @@ static void nodemgr_process_root_directory(struct host_info *hi, struct node_ent #ifdef CONFIG_HOTPLUG -static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp, - char *buffer, int buffer_size) +static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp, + char *buffer, int buffer_size) { struct unit_directory *ud; int 
i = 0; @@ -1112,8 +1112,8 @@ do { \ #else -static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp, - char *buffer, int buffer_size) +static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp, + char *buffer, int buffer_size) { return -ENODEV; } @@ -1618,8 +1618,8 @@ static int nodemgr_host_thread(void *__hi) /* Scan our nodes to get the bus options and create node * entries. This does not do the sysfs stuff, since that - * would trigger hotplug callbacks and such, which is a - * bad idea at this point. */ + * would trigger uevents and such, which is a bad idea at + * this point. */ nodemgr_node_scan(hi, generation); /* This actually does the full probe, with sysfs diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 08648b1a387e..1f1743c5c9a3 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -434,24 +434,24 @@ static void ib_device_release(struct class_device *cdev) kfree(dev); } -static int ib_device_hotplug(struct class_device *cdev, char **envp, - int num_envp, char *buf, int size) +static int ib_device_uevent(struct class_device *cdev, char **envp, + int num_envp, char *buf, int size) { struct ib_device *dev = container_of(cdev, struct ib_device, class_dev); int i = 0, len = 0; - if (add_hotplug_env_var(envp, num_envp, &i, buf, size, &len, - "NAME=%s", dev->name)) + if (add_uevent_var(envp, num_envp, &i, buf, size, &len, + "NAME=%s", dev->name)) return -ENOMEM; /* - * It might be nice to pass the node GUID to hotplug, but + * It might be nice to pass the node GUID with the event, but * right now the only way to get it is to query the device * provider, and this can crash during device removal because * we will be running after driver removal has started. * We could add a node_guid field to struct ib_device, or we - * could just let the hotplug script read the node GUID from - * sysfs when devices are added. + * could just let userspace read the node GUID from sysfs when + * devices are added. */ envp[i] = NULL; @@ -653,7 +653,7 @@ static struct class_device_attribute *ib_class_attributes[] = { static struct class ib_class = { .name = "infiniband", .release = ib_device_release, - .hotplug = ib_device_hotplug, + .uevent = ib_device_uevent, }; int ib_device_register_sysfs(struct ib_device *device) diff --git a/drivers/input/input.c b/drivers/input/input.c index 43b49ccd7dad..2d37b394e384 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -610,10 +610,10 @@ static void input_dev_release(struct class_device *class_dev) } /* - * Input hotplugging interface - loading event handlers based on + * Input uevent interface - loading event handlers based on * device bitfields. */ -static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index, +static int input_add_uevent_bm_var(char **envp, int num_envp, int *cur_index, char *buffer, int buffer_size, int *cur_len, const char *name, unsigned long *bitmap, int max) { @@ -638,7 +638,7 @@ static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index, #define INPUT_ADD_HOTPLUG_VAR(fmt, val...)
\ do { \ - int err = add_hotplug_env_var(envp, num_envp, &i, \ + int err = add_uevent_var(envp, num_envp, &i, \ buffer, buffer_size, &len, \ fmt, val); \ if (err) \ @@ -647,15 +647,15 @@ static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index, #define INPUT_ADD_HOTPLUG_BM_VAR(name, bm, max) \ do { \ - int err = input_add_hotplug_bm_var(envp, num_envp, &i, \ + int err = input_add_uevent_bm_var(envp, num_envp, &i, \ buffer, buffer_size, &len, \ name, bm, max); \ if (err) \ return err; \ } while (0) -static int input_dev_hotplug(struct class_device *cdev, char **envp, - int num_envp, char *buffer, int buffer_size) +static int input_dev_uevent(struct class_device *cdev, char **envp, + int num_envp, char *buffer, int buffer_size) { struct input_dev *dev = to_input_dev(cdev); int i = 0; @@ -697,7 +697,7 @@ static int input_dev_hotplug(struct class_device *cdev, char **envp, struct class input_class = { .name = "input", .release = input_dev_release, - .hotplug = input_dev_hotplug, + .uevent = input_dev_uevent, }; struct input_dev *input_allocate_device(void) diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c index fbb69ef6a77b..8e530cc970e1 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -800,16 +800,16 @@ static int serio_bus_match(struct device *dev, struct device_driver *drv) #ifdef CONFIG_HOTPLUG -#define SERIO_ADD_HOTPLUG_VAR(fmt, val...) \ +#define SERIO_ADD_UEVENT_VAR(fmt, val...) \ do { \ - int err = add_hotplug_env_var(envp, num_envp, &i, \ + int err = add_uevent_var(envp, num_envp, &i, \ buffer, buffer_size, &len, \ fmt, val); \ if (err) \ return err; \ } while (0) -static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) +static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { struct serio *serio; int i = 0; @@ -820,21 +820,21 @@ static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *bu serio = to_serio_port(dev); - SERIO_ADD_HOTPLUG_VAR("SERIO_TYPE=%02x", serio->id.type); - SERIO_ADD_HOTPLUG_VAR("SERIO_PROTO=%02x", serio->id.proto); - SERIO_ADD_HOTPLUG_VAR("SERIO_ID=%02x", serio->id.id); - SERIO_ADD_HOTPLUG_VAR("SERIO_EXTRA=%02x", serio->id.extra); - SERIO_ADD_HOTPLUG_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X", + SERIO_ADD_UEVENT_VAR("SERIO_TYPE=%02x", serio->id.type); + SERIO_ADD_UEVENT_VAR("SERIO_PROTO=%02x", serio->id.proto); + SERIO_ADD_UEVENT_VAR("SERIO_ID=%02x", serio->id.id); + SERIO_ADD_UEVENT_VAR("SERIO_EXTRA=%02x", serio->id.extra); + SERIO_ADD_UEVENT_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X", serio->id.type, serio->id.proto, serio->id.id, serio->id.extra); envp[i] = NULL; return 0; } -#undef SERIO_ADD_HOTPLUG_VAR +#undef SERIO_ADD_UEVENT_VAR #else -static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) +static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { return -ENODEV; } @@ -908,7 +908,7 @@ static int __init serio_init(void) serio_bus.dev_attrs = serio_device_attrs; serio_bus.drv_attrs = serio_driver_attrs; serio_bus.match = serio_bus_match; - serio_bus.hotplug = serio_hotplug; + serio_bus.uevent = serio_uevent; serio_bus.resume = serio_resume; bus_register(&serio_bus); diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index c34c96d18907..228e1852a836 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -128,7 +128,7 @@ 
static int macio_device_resume(struct device * dev) return 0; } -static int macio_hotplug (struct device *dev, char **envp, int num_envp, +static int macio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { struct macio_dev * macio_dev; @@ -203,7 +203,7 @@ extern struct device_attribute macio_dev_attrs[]; struct bus_type macio_bus_type = { .name = "macio", .match = macio_bus_match, - .hotplug = macio_hotplug, + .uevent = macio_uevent, .suspend = macio_device_suspend, .resume = macio_device_resume, .dev_attrs = macio_dev_attrs, diff --git a/drivers/mmc/mmc_sysfs.c b/drivers/mmc/mmc_sysfs.c index 3f4a66ca9555..ec701667abfc 100644 --- a/drivers/mmc/mmc_sysfs.c +++ b/drivers/mmc/mmc_sysfs.c @@ -80,7 +80,7 @@ static int mmc_bus_match(struct device *dev, struct device_driver *drv) } static int -mmc_bus_hotplug(struct device *dev, char **envp, int num_envp, char *buf, +mmc_bus_uevent(struct device *dev, char **envp, int num_envp, char *buf, int buf_size) { struct mmc_card *card = dev_to_mmc_card(dev); @@ -140,7 +140,7 @@ static struct bus_type mmc_bus_type = { .name = "mmc", .dev_attrs = mmc_dev_attrs, .match = mmc_bus_match, - .hotplug = mmc_bus_hotplug, + .uevent = mmc_bus_uevent, .suspend = mmc_bus_suspend, .resume = mmc_bus_resume, }; diff --git a/drivers/pci/hotplug.c b/drivers/pci/hotplug.c index e1743be31909..1c97e7dd130b 100644 --- a/drivers/pci/hotplug.c +++ b/drivers/pci/hotplug.c @@ -3,8 +3,8 @@ #include #include "pci.h" -int pci_hotplug (struct device *dev, char **envp, int num_envp, - char *buffer, int buffer_size) +int pci_uevent(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size) { struct pci_dev *pdev; int i = 0; @@ -17,34 +17,34 @@ int pci_hotplug (struct device *dev, char **envp, int num_envp, if (!pdev) return -ENODEV; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PCI_CLASS=%04X", pdev->class)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PCI_CLASS=%04X", pdev->class)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PCI_ID=%04X:%04X", pdev->vendor, pdev->device)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PCI_ID=%04X:%04X", pdev->vendor, pdev->device)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor, - pdev->subsystem_device)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor, + pdev->subsystem_device)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PCI_SLOT_NAME=%s", pci_name(pdev))) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PCI_SLOT_NAME=%s", pci_name(pdev))) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x", - pdev->vendor, pdev->device, - pdev->subsystem_vendor, pdev->subsystem_device, - (u8)(pdev->class >> 16), (u8)(pdev->class >> 8), - (u8)(pdev->class))) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x", + pdev->vendor, pdev->device, + pdev->subsystem_vendor, pdev->subsystem_device, + (u8)(pdev->class >> 16), (u8)(pdev->class >> 8), + (u8)(pdev->class))) return -ENOMEM; envp[i] = NULL; diff --git a/drivers/pci/pci-driver.c 
b/drivers/pci/pci-driver.c index a9046d4b8af3..7146b69b812c 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -502,8 +502,8 @@ void pci_dev_put(struct pci_dev *dev) } #ifndef CONFIG_HOTPLUG -int pci_hotplug (struct device *dev, char **envp, int num_envp, - char *buffer, int buffer_size) +int pci_uevent(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size) { return -ENODEV; } @@ -512,7 +512,7 @@ int pci_hotplug (struct device *dev, char **envp, int num_envp, struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, - .hotplug = pci_hotplug, + .uevent = pci_uevent, .suspend = pci_device_suspend, .resume = pci_device_resume, .dev_attrs = pci_dev_attrs, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 6527b36c9a61..294849d24590 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1,7 +1,7 @@ /* Functions internal to the PCI core code */ -extern int pci_hotplug (struct device *dev, char **envp, int num_envp, - char *buffer, int buffer_size); +extern int pci_uevent(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size); extern int pci_create_sysfs_dev_files(struct pci_dev *pdev); extern void pci_remove_sysfs_dev_files(struct pci_dev *pdev); extern void pci_cleanup_rom(struct pci_dev *dev); diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c index a30aa74304a2..7cf09084ef61 100644 --- a/drivers/pcmcia/cs.c +++ b/drivers/pcmcia/cs.c @@ -901,14 +901,14 @@ int pcmcia_insert_card(struct pcmcia_socket *skt) EXPORT_SYMBOL(pcmcia_insert_card); -static int pcmcia_socket_hotplug(struct class_device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) +static int pcmcia_socket_uevent(struct class_device *dev, char **envp, + int num_envp, char *buffer, int buffer_size) { struct pcmcia_socket *s = container_of(dev, struct pcmcia_socket, dev); int i = 0, length = 0; - if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, - &length, "SOCKET_NO=%u", s->sock)) + if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, + &length, "SOCKET_NO=%u", s->sock)) return -ENOMEM; envp[i] = NULL; @@ -927,7 +927,7 @@ static void pcmcia_release_socket_class(struct class *data) struct class pcmcia_socket_class = { .name = "pcmcia_socket", - .hotplug = pcmcia_socket_hotplug, + .uevent = pcmcia_socket_uevent, .release = pcmcia_release_socket, .class_release = pcmcia_release_socket_class, }; diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 7f8219f3fd9e..6fb76399547e 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -779,8 +779,8 @@ static int pcmcia_bus_match(struct device * dev, struct device_driver * drv) { #ifdef CONFIG_HOTPLUG -static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp, - char *buffer, int buffer_size) +static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size) { struct pcmcia_device *p_dev; int i, length = 0; @@ -800,31 +800,31 @@ static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp, i = 0; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "SOCKET_NO=%u", - p_dev->socket->sock)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "SOCKET_NO=%u", + p_dev->socket->sock)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "DEVICE_NO=%02X", - p_dev->device_no)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "DEVICE_NO=%02X", + p_dev->device_no)) 
return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "MODALIAS=pcmcia:m%04Xc%04Xf%02Xfn%02Xpfn%02X" - "pa%08Xpb%08Xpc%08Xpd%08X", - p_dev->has_manf_id ? p_dev->manf_id : 0, - p_dev->has_card_id ? p_dev->card_id : 0, - p_dev->has_func_id ? p_dev->func_id : 0, - p_dev->func, - p_dev->device_no, - hash[0], - hash[1], - hash[2], - hash[3])) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MODALIAS=pcmcia:m%04Xc%04Xf%02Xfn%02Xpfn%02X" + "pa%08Xpb%08Xpc%08Xpd%08X", + p_dev->has_manf_id ? p_dev->manf_id : 0, + p_dev->has_card_id ? p_dev->card_id : 0, + p_dev->has_func_id ? p_dev->func_id : 0, + p_dev->func, + p_dev->device_no, + hash[0], + hash[1], + hash[2], + hash[3])) return -ENOMEM; envp[i] = NULL; @@ -834,7 +834,7 @@ static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp, #else -static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp, +static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { return -ENODEV; @@ -1223,7 +1223,7 @@ static struct class_interface pcmcia_bus_interface = { struct bus_type pcmcia_bus_type = { .name = "pcmcia", - .hotplug = pcmcia_bus_hotplug, + .uevent = pcmcia_bus_uevent, .match = pcmcia_bus_match, .dev_attrs = pcmcia_dev_attrs, }; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index bf44a409ba0d..07ddf9a38758 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -2132,7 +2132,7 @@ restart: } spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); - kobject_hotplug(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE); + kobject_uevent(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE); LEAVE; } diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index e80ef9467825..af2f0941baac 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -363,8 +363,7 @@ void usb_driver_release_interface(struct usb_driver *driver, * Most USB device drivers will use this indirectly, through the usb core, * but some layered driver frameworks use it directly. * These device tables are exported with MODULE_DEVICE_TABLE, through - * modutils and "modules.usbmap", to support the driver loading - * functionality of USB hotplugging. + * modutils, to support the driver loading functionality of USB hotplugging. * * What Matches: * @@ -545,10 +544,7 @@ static int usb_device_match (struct device *dev, struct device_driver *drv) #ifdef CONFIG_HOTPLUG /* - * USB hotplugging invokes what /proc/sys/kernel/hotplug says - * (normally /sbin/hotplug) when USB devices get added or removed. - * - * This invokes a user mode policy agent, typically helping to load driver + * This sends an uevent to userspace, typically helping to load driver * or other modules, configure the device, and more. Drivers can provide * a MODULE_DEVICE_TABLE to help with module loading subtasks. * @@ -557,8 +553,8 @@ static int usb_device_match (struct device *dev, struct device_driver *drv) * delays in event delivery. Use sysfs (and DEVPATH) to make sure the * device (and this configuration!) are still present. 
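Every bus and class converted in this patch follows the same callback pattern: append variables with add_uevent_var(), NULL-terminate the envp table, and return -ENOMEM when the buffer or the pointer table runs out. As a minimal sketch of that contract (a hypothetical "foo" bus, not code from this patch):

static int foo_uevent(struct device *dev, char **envp, int num_envp,
		      char *buffer, int buffer_size)
{
	struct foo_device *fdev = to_foo_device(dev);	/* hypothetical helper */
	int i = 0, len = 0;

	/* each variable is formatted into @buffer; envp[] points into it */
	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
			   "FOO_ID=%04x", fdev->id))
		return -ENOMEM;

	envp[i] = NULL;		/* callers rely on a NULL-terminated table */
	return 0;
}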
*/ -static int usb_hotplug (struct device *dev, char **envp, int num_envp, - char *buffer, int buffer_size) +static int usb_uevent(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size) { struct usb_interface *intf; struct usb_device *usb_dev; @@ -570,7 +566,7 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp, return -ENODEV; /* driver is often null here; dev_dbg() would oops */ - pr_debug ("usb %s: hotplug\n", dev->bus_id); + pr_debug ("usb %s: uevent\n", dev->bus_id); /* Must check driver_data here, as on remove driver is always NULL */ if ((dev->driver == &usb_generic_driver) || @@ -597,51 +593,51 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp, * * FIXME reduce hardwired intelligence here */ - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "DEVICE=/proc/bus/usb/%03d/%03d", - usb_dev->bus->busnum, usb_dev->devnum)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "DEVICE=/proc/bus/usb/%03d/%03d", + usb_dev->bus->busnum, usb_dev->devnum)) return -ENOMEM; #endif /* per-device configurations are common */ - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PRODUCT=%x/%x/%x", - le16_to_cpu(usb_dev->descriptor.idVendor), - le16_to_cpu(usb_dev->descriptor.idProduct), - le16_to_cpu(usb_dev->descriptor.bcdDevice))) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "PRODUCT=%x/%x/%x", + le16_to_cpu(usb_dev->descriptor.idVendor), + le16_to_cpu(usb_dev->descriptor.idProduct), + le16_to_cpu(usb_dev->descriptor.bcdDevice))) return -ENOMEM; /* class-based driver binding models */ - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "TYPE=%d/%d/%d", - usb_dev->descriptor.bDeviceClass, - usb_dev->descriptor.bDeviceSubClass, - usb_dev->descriptor.bDeviceProtocol)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "TYPE=%d/%d/%d", + usb_dev->descriptor.bDeviceClass, + usb_dev->descriptor.bDeviceSubClass, + usb_dev->descriptor.bDeviceProtocol)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "INTERFACE=%d/%d/%d", - alt->desc.bInterfaceClass, - alt->desc.bInterfaceSubClass, - alt->desc.bInterfaceProtocol)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "INTERFACE=%d/%d/%d", + alt->desc.bInterfaceClass, + alt->desc.bInterfaceSubClass, + alt->desc.bInterfaceProtocol)) return -ENOMEM; - if (add_hotplug_env_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "MODALIAS=usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02X", - le16_to_cpu(usb_dev->descriptor.idVendor), - le16_to_cpu(usb_dev->descriptor.idProduct), - le16_to_cpu(usb_dev->descriptor.bcdDevice), - usb_dev->descriptor.bDeviceClass, - usb_dev->descriptor.bDeviceSubClass, - usb_dev->descriptor.bDeviceProtocol, - alt->desc.bInterfaceClass, - alt->desc.bInterfaceSubClass, - alt->desc.bInterfaceProtocol)) + if (add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MODALIAS=usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02X", + le16_to_cpu(usb_dev->descriptor.idVendor), + le16_to_cpu(usb_dev->descriptor.idProduct), + le16_to_cpu(usb_dev->descriptor.bcdDevice), + usb_dev->descriptor.bDeviceClass, + usb_dev->descriptor.bDeviceSubClass, + usb_dev->descriptor.bDeviceProtocol, + alt->desc.bInterfaceClass, + alt->desc.bInterfaceSubClass, + alt->desc.bInterfaceProtocol)) return -ENOMEM; envp[i] = NULL; @@ -651,7 +647,7 @@ 
static int usb_hotplug (struct device *dev, char **envp, int num_envp, #else -static int usb_hotplug (struct device *dev, char **envp, +static int usb_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { return -ENODEV; @@ -1491,7 +1487,7 @@ static int usb_generic_resume(struct device *dev) struct bus_type usb_bus_type = { .name = "usb", .match = usb_device_match, - .hotplug = usb_hotplug, + .uevent = usb_uevent, .suspend = usb_generic_suspend, .resume = usb_generic_resume, }; diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c index 0eaabeb37ac3..641268d7e6f3 100644 --- a/drivers/usb/host/hc_crisv10.c +++ b/drivers/usb/host/hc_crisv10.c @@ -4397,7 +4397,7 @@ static int __init etrax_usb_hc_init(void) device_initialize(&fake_device); kobject_set_name(&fake_device.kobj, "etrax_usb"); kobject_add(&fake_device.kobj); - kobject_hotplug(&fake_device.kobj, KOBJ_ADD); + kobject_uevent(&fake_device.kobj, KOBJ_ADD); hc->bus->controller = &fake_device; usb_register_bus(hc->bus); diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index 14016b1cd948..024206c4a0e4 100644 --- a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -142,12 +142,12 @@ static struct bin_attribute w1_slave_attr_bin_id = { /* Default family */ static struct w1_family w1_default_family; -static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size); +static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size); static struct bus_type w1_bus_type = { .name = "w1", .match = w1_master_match, - .hotplug = w1_hotplug, + .uevent = w1_uevent, }; struct device_driver w1_master_driver = { @@ -361,7 +361,7 @@ void w1_destroy_master_attributes(struct w1_master *master) } #ifdef CONFIG_HOTPLUG -static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) +static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { struct w1_master *md = NULL; struct w1_slave *sl = NULL; @@ -377,7 +377,7 @@ static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffe event_owner = "slave"; name = sl->name; } else { - dev_dbg(dev, "Unknown hotplug event.\n"); + dev_dbg(dev, "Unknown event.\n"); return -EINVAL; } @@ -386,18 +386,18 @@ static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffe if (dev->driver != &w1_slave_driver || !sl) return 0; - err = add_hotplug_env_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_FID=%02X", sl->reg_num.family); + err = add_uevent_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_FID=%02X", sl->reg_num.family); if (err) return err; - err = add_hotplug_env_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_SLAVE_ID=%024LX", (u64)sl->reg_num.id); + err = add_uevent_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_SLAVE_ID=%024LX", (u64)sl->reg_num.id); if (err) return err; return 0; }; #else -static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) +static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size) { return 0; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 8dc1822a7022..7187a57d51e8 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -226,7 +226,7 @@ static struct sysfs_ops part_sysfs_ops = { static ssize_t part_uevent_store(struct hd_struct * p, const char *page, size_t count) { - 
kobject_hotplug(&p->kobj, KOBJ_ADD); + kobject_uevent(&p->kobj, KOBJ_ADD); return count; } static ssize_t part_dev_read(struct hd_struct * p, char *page) @@ -360,7 +360,7 @@ void register_disk(struct gendisk *disk) if ((err = kobject_add(&disk->kobj))) return; disk_sysfs_symlinks(disk); - kobject_hotplug(&disk->kobj, KOBJ_ADD); + kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { @@ -465,6 +465,6 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(&disk->driverfs_dev->kobj, "block"); put_device(disk->driverfs_dev); } - kobject_hotplug(&disk->kobj, KOBJ_REMOVE); + kobject_uevent(&disk->kobj, KOBJ_REMOVE); kobject_del(&disk->kobj); } diff --git a/include/linux/device.h b/include/linux/device.h index 17cbc6db67b4..0cdee78e5ce1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -47,8 +47,8 @@ struct bus_type { struct driver_attribute * drv_attrs; int (*match)(struct device * dev, struct device_driver * drv); - int (*hotplug) (struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); + int (*uevent)(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); int (*suspend)(struct device * dev, pm_message_t state); int (*resume)(struct device * dev); }; @@ -151,7 +151,7 @@ struct class { struct class_attribute * class_attrs; struct class_device_attribute * class_dev_attrs; - int (*hotplug)(struct class_device *dev, char **envp, + int (*uevent)(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size); void (*release)(struct class_device *dev); @@ -209,9 +209,9 @@ extern int class_device_create_file(struct class_device *, * set, this will be called instead of the class specific release function. * Only use this if you want to override the default release function, like * when you are nesting class_device structures. - * @hotplug: pointer to a hotplug function for this struct class_device. If - * set, this will be called instead of the class specific hotplug function. - * Only use this if you want to override the default hotplug function, like + * @uevent: pointer to a uevent function for this struct class_device. If + * set, this will be called instead of the class specific uevent function. + * Only use this if you want to override the default uevent function, like * when you are nesting class_device structures. 
*/ struct class_device { @@ -227,7 +227,7 @@ struct class_device { struct class_device *parent; /* parent of this child device, if there is one */ void (*release)(struct class_device *dev); - int (*hotplug)(struct class_device *dev, char **envp, + int (*uevent)(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size); char class_id[BUS_ID_SIZE]; /* unique to this class */ }; diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 2063c0839d4f..2d716080be4a 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -14,7 +14,7 @@ struct device; int request_firmware(const struct firmware **fw, const char *name, struct device *device); int request_firmware_nowait( - struct module *module, int hotplug, + struct module *module, int uevent, const char *name, struct device *device, void *context, void (*cont)(const struct firmware *fw, void *context)); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 5b08248fba72..8eb21f2f25e1 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -26,15 +26,14 @@ #include #include -#define KOBJ_NAME_LEN 20 - -#define HOTPLUG_PATH_LEN 256 +#define KOBJ_NAME_LEN 20 +#define UEVENT_HELPER_PATH_LEN 256 /* path to the userspace helper executed on an event */ -extern char hotplug_path[]; +extern char uevent_helper[]; -/* counter to tag the hotplug event, read only except for the kobject core */ -extern u64 hotplug_seqnum; +/* counter to tag the uevent, read only except for the kobject core */ +extern u64 uevent_seqnum; /* the actions here must match the proper string in lib/kobject_uevent.c */ typedef int __bitwise kobject_action_t; @@ -101,15 +100,14 @@ struct kobj_type { * of object; multiple ksets can belong to one subsystem. All * ksets of a subsystem share the subsystem's lock. * - * Each kset can support hotplugging; if it does, it will be given - * the opportunity to filter out specific kobjects from being - * reported, as well as to add its own "data" elements to the - * environment being passed to the hotplug helper. + * Each kset can support specific event variables; it can + * suppress the event generation or add subsystem-specific + * variables carried with the event.
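Concretely, a kset that wants to veto events for some of its kobjects, or to report under a fixed subsystem name, would fill in the structure defined just below; all three operations are optional. A hypothetical example, not taken from the patch:

static int foo_uevent_filter(struct kset *kset, struct kobject *kobj)
{
	return 1;	/* returning 0 would suppress the event */
}

static const char *foo_uevent_name(struct kset *kset, struct kobject *kobj)
{
	return "foo";	/* reported to userspace as the subsystem */
}

static struct kset_uevent_ops foo_uevent_ops = {
	.filter	= foo_uevent_filter,
	.name	= foo_uevent_name,
	/* .uevent, if set, may append extra environment variables */
};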
*/ -struct kset_hotplug_ops { +struct kset_uevent_ops { int (*filter)(struct kset *kset, struct kobject *kobj); const char *(*name)(struct kset *kset, struct kobject *kobj); - int (*hotplug)(struct kset *kset, struct kobject *kobj, char **envp, + int (*uevent)(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size); }; @@ -119,7 +117,7 @@ struct kset { struct list_head list; spinlock_t list_lock; struct kobject kobj; - struct kset_hotplug_ops * hotplug_ops; + struct kset_uevent_ops * uevent_ops; }; @@ -167,20 +165,20 @@ struct subsystem { struct rw_semaphore rwsem; }; -#define decl_subsys(_name,_type,_hotplug_ops) \ +#define decl_subsys(_name,_type,_uevent_ops) \ struct subsystem _name##_subsys = { \ .kset = { \ .kobj = { .name = __stringify(_name) }, \ .ktype = _type, \ - .hotplug_ops =_hotplug_ops, \ + .uevent_ops =_uevent_ops, \ } \ } -#define decl_subsys_name(_varname,_name,_type,_hotplug_ops) \ +#define decl_subsys_name(_varname,_name,_type,_uevent_ops) \ struct subsystem _varname##_subsys = { \ .kset = { \ .kobj = { .name = __stringify(_name) }, \ .ktype = _type, \ - .hotplug_ops =_hotplug_ops, \ + .uevent_ops =_uevent_ops, \ } \ } @@ -256,16 +254,16 @@ extern int subsys_create_file(struct subsystem * , struct subsys_attribute *); extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *); #ifdef CONFIG_HOTPLUG -void kobject_hotplug(struct kobject *kobj, enum kobject_action action); +void kobject_uevent(struct kobject *kobj, enum kobject_action action); -int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, +int add_uevent_var(char **envp, int num_envp, int *cur_index, char *buffer, int buffer_size, int *cur_len, const char *format, ...) __attribute__((format (printf, 7, 8))); #else -static inline void kobject_hotplug(struct kobject *kobj, enum kobject_action action) { } +static inline void kobject_uevent(struct kobject *kobj, enum kobject_action action) { } -static inline int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, +static inline int add_uevent_var(char **envp, int num_envp, int *cur_index, char *buffer, int buffer_size, int *cur_len, const char *format, ...) { return 0; } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4be34ef8c2f7..501564264518 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -124,7 +124,7 @@ enum KERN_OVERFLOWUID=46, /* int: overflow UID */ KERN_OVERFLOWGID=47, /* int: overflow GID */ KERN_SHMPATH=48, /* string: path to shm fs */ - KERN_HOTPLUG=49, /* string: path to hotplug policy agent */ + KERN_HOTPLUG=49, /* string: path to uevent helper (deprecated) */ KERN_IEEE_EMULATION_WARNINGS=50, /* int: unimplemented ieee instructions */ KERN_S390_USER_DEBUG_LOGGING=51, /* int: dumps of user faults */ KERN_CORE_USES_PID=52, /* int: use core or core.%pid */ diff --git a/include/linux/usb.h b/include/linux/usb.h index d81b050e5955..7a20997e8071 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -225,7 +225,7 @@ struct usb_interface_cache { * Device drivers should not attempt to activate configurations. The choice * of which configuration to install is a policy decision based on such * considerations as available power, functionality provided, and the user's - * desires (expressed through hotplug scripts). However, drivers can call + * desires (expressed through userspace tools). However, drivers can call * usb_reset_configuration() to reinitialize the current configuration and * all its interfaces. 
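The remaining hunks rename the plumbing behind the userspace helper (still /sbin/hotplug by default, now exposed as /sys/kernel/uevent_helper). The helper contract itself does not change: it is executed with the subsystem as argv[1] and the event details in the environment. A trivial stand-in helper, for illustration only:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	/* set by kobject_uevent(); SEQNUM lets userspace order events */
	const char *action = getenv("ACTION");
	const char *devpath = getenv("DEVPATH");
	const char *seqnum = getenv("SEQNUM");

	fprintf(stderr, "uevent %s: %s %s (subsystem %s)\n",
		seqnum ? seqnum : "?", action ? action : "?",
		devpath ? devpath : "?", argc > 1 ? argv[1] : "?");
	return 0;
}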
*/ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e975a76a9d5b..bfb4a7a54e22 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -26,23 +26,23 @@ static struct subsys_attribute _name##_attr = \ /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); + return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) { - return sprintf(page, "%s\n", hotplug_path); + return sprintf(page, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) { - if (count+1 > HOTPLUG_PATH_LEN) + if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; - memcpy(hotplug_path, page, count); - hotplug_path[count] = '\0'; - if (count && hotplug_path[count-1] == '\n') - hotplug_path[count-1] = '\0'; + memcpy(uevent_helper, page, count); + uevent_helper[count] = '\0'; + if (count && uevent_helper[count-1] == '\n') + uevent_helper[count-1] = '\0'; return count; } KERNEL_ATTR_RW(uevent_helper); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a51e25d4466..345f4a1d533f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -395,8 +395,8 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", - .data = &hotplug_path, - .maxlen = HOTPLUG_PATH_LEN, + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, diff --git a/lib/kobject.c b/lib/kobject.c index a181abed89f6..7a0e6809490d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -207,7 +207,7 @@ int kobject_register(struct kobject * kobj) kobject_name(kobj),error); dump_stack(); } else - kobject_hotplug(kobj, KOBJ_ADD); + kobject_uevent(kobj, KOBJ_ADD); } else error = -EINVAL; return error; @@ -312,7 +312,7 @@ void kobject_del(struct kobject * kobj) void kobject_unregister(struct kobject * kobj) { pr_debug("kobject %s: unregistering\n",kobject_name(kobj)); - kobject_hotplug(kobj, KOBJ_REMOVE); + kobject_uevent(kobj, KOBJ_REMOVE); kobject_del(kobj); kobject_put(kobj); } diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index dd061da3aba9..01479e5c6d18 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -22,12 +22,12 @@ #include #include -#define BUFFER_SIZE 1024 /* buffer for the hotplug env */ +#define BUFFER_SIZE 1024 /* buffer for the variables */ #define NUM_ENVP 32 /* number of env pointers */ #if defined(CONFIG_HOTPLUG) -char hotplug_path[HOTPLUG_PATH_LEN] = "/sbin/hotplug"; -u64 hotplug_seqnum; +char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; +u64 uevent_seqnum; static DEFINE_SPINLOCK(sequence_lock); static struct sock *uevent_sock; @@ -50,12 +50,12 @@ static char *action_to_string(enum kobject_action action) } /** - * kobject_hotplug - notify userspace by executing /sbin/hotplug + * kobject_uevent - notify userspace by sending a uevent * - * @action: action that is happening (usually "ADD" or "REMOVE") + * @action: action that is happening (usually KOBJ_ADD or KOBJ_REMOVE) * @kobj: struct kobject that the action is happening to */ -void kobject_hotplug(struct kobject *kobj, enum kobject_action action) +void kobject_uevent(struct kobject *kobj, enum kobject_action action) { char **envp; char *buffer; @@ -65,7 +65,7 @@ void kobject_hotplug(struct kobject *kobj, enum
kobject_action action) const char *subsystem; struct kobject *top_kobj; struct kset *kset; - struct kset_hotplug_ops *hotplug_ops; + struct kset_uevent_ops *uevent_ops; u64 seq; char *seq_buff; int i = 0; @@ -88,11 +88,11 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action) return; kset = top_kobj->kset; - hotplug_ops = kset->hotplug_ops; + uevent_ops = kset->uevent_ops; /* skip the event, if the filter returns zero. */ - if (hotplug_ops && hotplug_ops->filter) - if (!hotplug_ops->filter(kset, kobj)) + if (uevent_ops && uevent_ops->filter) + if (!uevent_ops->filter(kset, kobj)) return; /* environment index */ @@ -111,8 +111,8 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action) goto exit; /* originating subsystem */ - if (hotplug_ops && hotplug_ops->name) - subsystem = hotplug_ops->name(kset, kobj); + if (uevent_ops && uevent_ops->name) + subsystem = uevent_ops->name(kset, kobj); else subsystem = kobject_name(&kset->kobj); @@ -134,12 +134,12 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action) scratch += strlen("SEQNUM=18446744073709551616") + 1; /* let the kset specific function add its stuff */ - if (hotplug_ops && hotplug_ops->hotplug) { - retval = hotplug_ops->hotplug (kset, kobj, + if (uevent_ops && uevent_ops->uevent) { + retval = uevent_ops->uevent(kset, kobj, &envp[i], NUM_ENVP - i, scratch, BUFFER_SIZE - (scratch - buffer)); if (retval) { - pr_debug ("%s - hotplug() returned %d\n", + pr_debug ("%s - uevent() returned %d\n", __FUNCTION__, retval); goto exit; } @@ -147,7 +147,7 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action) /* we will send an event, request a new sequence number */ spin_lock(&sequence_lock); - seq = ++hotplug_seqnum; + seq = ++uevent_seqnum; spin_unlock(&sequence_lock); sprintf(seq_buff, "SEQNUM=%llu", (unsigned long long)seq); @@ -177,10 +177,10 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action) } /* call uevent_helper, usually only enabled during early boot */ - if (hotplug_path[0]) { + if (uevent_helper[0]) { char *argv [3]; - argv [0] = hotplug_path; + argv [0] = uevent_helper; argv [1] = (char *)subsystem; argv [2] = NULL; call_usermodehelper (argv[0], argv, envp, 0); @@ -192,39 +192,39 @@ exit: kfree(envp); return; } -EXPORT_SYMBOL(kobject_hotplug); +EXPORT_SYMBOL_GPL(kobject_uevent); /** - * add_hotplug_env_var - helper for creating hotplug environment variables + * add_uevent_var - helper for creating event variables * @envp: Pointer to table of environment variables, as passed into - * hotplug() method. + * uevent() method. * @num_envp: Number of environment variable slots available, as - * passed into hotplug() method. + * passed into uevent() method. * @cur_index: Pointer to current index into @envp. It should be - * initialized to 0 before the first call to add_hotplug_env_var(), + * initialized to 0 before the first call to add_uevent_var(), * and will be incremented on success. * @buffer: Pointer to buffer for environment variables, as passed - * into hotplug() method. - * @buffer_size: Length of @buffer, as passed into hotplug() method. + * into uevent() method. + * @buffer_size: Length of @buffer, as passed into uevent() method. * @cur_len: Pointer to current length of space used in @buffer. * Should be initialized to 0 before the first call to - * add_hotplug_env_var(), and will be incremented on success. + * add_uevent_var(), and will be incremented on success. 
* @format: Format for creating environment variable (of the form * "XXX=%x") for snprintf(). * * Returns 0 if environment variable was added successfully or -ENOMEM * if no space was available. */ -int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, - char *buffer, int buffer_size, int *cur_len, - const char *format, ...) +int add_uevent_var(char **envp, int num_envp, int *cur_index, + char *buffer, int buffer_size, int *cur_len, + const char *format, ...) { va_list args; /* * We check against num_envp - 1 to make sure there is at - * least one slot left after we return, since the hotplug - * method needs to set the last slot to NULL. + * least one slot left after we return, since kobject_uevent() + * needs to set the last slot to NULL. */ if (*cur_index >= num_envp - 1) return -ENOMEM; @@ -243,7 +243,7 @@ int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, (*cur_index)++; return 0; } -EXPORT_SYMBOL(add_hotplug_env_var); +EXPORT_SYMBOL_GPL(add_uevent_var); static int __init kobject_uevent_init(void) { diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index bd7568ac87fc..0ed38740388c 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -78,7 +78,7 @@ static struct class_device_attribute *bt_attrs[] = { }; #ifdef CONFIG_HOTPLUG -static int bt_hotplug(struct class_device *cdev, char **envp, int num_envp, char *buf, int size) +static int bt_uevent(struct class_device *cdev, char **envp, int num_envp, char *buf, int size) { struct hci_dev *hdev = class_get_devdata(cdev); int n, i = 0; @@ -107,7 +107,7 @@ struct class bt_class = { .name = "bluetooth", .release = bt_release, #ifdef CONFIG_HOTPLUG - .hotplug = bt_hotplug, + .uevent = bt_uevent, #endif }; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index f6a19d53eaeb..2ebdc23bbe26 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -248,7 +248,7 @@ int br_sysfs_addif(struct net_bridge_port *p) if (err) goto out2; - kobject_hotplug(&p->kobj, KOBJ_ADD); + kobject_uevent(&p->kobj, KOBJ_ADD); return 0; out2: kobject_del(&p->kobj); @@ -260,7 +260,7 @@ void br_sysfs_removeif(struct net_bridge_port *p) { pr_debug("br_sysfs_removeif\n"); sysfs_remove_link(&p->br->ifobj, p->dev->name); - kobject_hotplug(&p->kobj, KOBJ_REMOVE); + kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e2137f3e489d..198655dd9a77 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -369,14 +369,14 @@ static struct attribute_group wireless_group = { #endif #ifdef CONFIG_HOTPLUG -static int netdev_hotplug(struct class_device *cd, char **envp, - int num_envp, char *buf, int size) +static int netdev_uevent(struct class_device *cd, char **envp, + int num_envp, char *buf, int size) { struct net_device *dev = to_net_dev(cd); int i = 0; int n; - /* pass interface in env to hotplug. */ + /* pass interface to uevent. 
*/ envp[i++] = buf; n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1; buf += n; @@ -408,7 +408,7 @@ static struct class net_class = { .name = "net", .release = netdev_release, #ifdef CONFIG_HOTPLUG - .hotplug = netdev_hotplug, + .uevent = netdev_uevent, #endif }; -- cgit v1.2.3 From 8218ef80932aa7e5e3d20c929a640c8d82133a9a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 13 Dec 2005 15:17:34 -0800 Subject: [PATCH] Driver core: Make block devices create the proper symlink name Block devices need to add the block device name to the symlink they put in the device directory, otherwise multiple symlinks of the same name can be created. This matches the class system, which works the same way; we just forgot to convert block at the same time. Cc: Pete Zaitcev Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7187a57d51e8..7881ce05daef 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -336,12 +336,31 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) disk->part[part-1] = p; } +static char *make_block_name(struct gendisk *disk) +{ + char *name; + static char *block_str = "block:"; + int size; + + size = strlen(block_str) + strlen(disk->disk_name) + 1; + name = kmalloc(size, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name, block_str); + strcat(name, disk->disk_name); + return name; +} + static void disk_sysfs_symlinks(struct gendisk *disk) { struct device *target = get_device(disk->driverfs_dev); if (target) { + char *disk_name = make_block_name(disk); sysfs_create_link(&disk->kobj,&target->kobj,"device"); - sysfs_create_link(&target->kobj,&disk->kobj,"block"); + if (disk_name) { + sysfs_create_link(&target->kobj,&disk->kobj,disk_name); + kfree(disk_name); + } } } @@ -461,8 +480,12 @@ void del_gendisk(struct gendisk *disk) devfs_remove_disk(disk); if (disk->driverfs_dev) { + char *disk_name = make_block_name(disk); sysfs_remove_link(&disk->kobj, "device"); - sysfs_remove_link(&disk->driverfs_dev->kobj, "block"); + if (disk_name) { + sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name); + kfree(disk_name); + } put_device(disk->driverfs_dev); } kobject_uevent(&disk->kobj, KOBJ_REMOVE); -- cgit v1.2.3 From e80a5dea8e056d8f398be1900d61c581d379f02f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 23 Nov 2005 09:15:44 -0500 Subject: [PATCH] sysfs: handle failures in sysfs_make_dirent I noticed that if sysfs_make_dirent fails to allocate the sd, then NULL will be passed to sysfs_put(). Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 59734ba1ee60..d36780382176 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -112,7 +112,11 @@ static int create_dir(struct kobject * k, struct dentry * p, } } if (error && (error != -EEXIST)) { - sysfs_put((*d)->d_fsdata); + struct sysfs_dirent *sd = (*d)->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + sysfs_put(sd); + } d_drop(*d); } dput(*d); -- cgit v1.2.3 From e28cc71572da38a5a12c1cfe4d7032017adccf69 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Jan 2006 16:20:40 -0800 Subject: Relax the rw_verify_area() error checking.
In particular, allow over-large read- or write-requests to be downgraded to a more reasonable range, rather than considering them outright errors. We want to protect lower layers from (the sadly all too common) overflow conditions, but prefer to do so by chopping the requests up, rather than just refusing them outright. Cc: Peter Anvin Cc: Ulrich Drepper Cc: Andi Kleen Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/mips/kernel/linux32.c | 4 ++-- fs/compat.c | 2 +- fs/read_write.c | 34 +++++++++++++++++++++++++--------- 3 files changed, 28 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 330cf84d21fe..60353f5acc48 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -420,7 +420,7 @@ asmlinkage ssize_t sys32_pread(unsigned int fd, char * buf, goto out; pos = merge_64(a4, a5); ret = rw_verify_area(READ, file, &pos, count); - if (ret) + if (ret < 0) goto out; ret = -EINVAL; if (!file->f_op || !(read = file->f_op->read)) @@ -455,7 +455,7 @@ asmlinkage ssize_t sys32_pwrite(unsigned int fd, const char * buf, goto out; pos = merge_64(a4, a5); ret = rw_verify_area(WRITE, file, &pos, count); - if (ret) + if (ret < 0) goto out; ret = -EINVAL; if (!file->f_op || !(write = file->f_op->write)) diff --git a/fs/compat.c b/fs/compat.c index 818634120b69..55ac0324aaf1 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1170,7 +1170,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, } ret = rw_verify_area(type, file, pos, tot_len); - if (ret) + if (ret < 0) goto out; fnv = NULL; diff --git a/fs/read_write.c b/fs/read_write.c index a091ee4f430d..df3468a22fea 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -182,22 +183,33 @@ bad: } #endif +/* + * rw_verify_area doesn't like huge counts. We limit + * them to something that fits in "int" so that others + * won't have to do range checks all the time. + */ +#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) { struct inode *inode; loff_t pos; - if (unlikely(count > INT_MAX)) + if (unlikely((ssize_t) count < 0)) goto Einval; pos = *ppos; if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) - return locks_mandatory_area(read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); - return 0; + if (inode->i_flock && MANDATORY_LOCK(inode)) { + int retval = locks_mandatory_area( + read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, + inode, file, pos, count); + if (retval < 0) + return retval; + } + return count > MAX_RW_COUNT ? 
MAX_RW_COUNT : count; Einval: return -EINVAL; @@ -244,7 +256,8 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); - if (!ret) { + if (ret >= 0) { + count = ret; ret = security_file_permission (file, MAY_READ); if (!ret) { if (file->f_op->read) @@ -295,7 +308,8 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); - if (!ret) { + if (ret >= 0) { + count = ret; ret = security_file_permission (file, MAY_WRITE); if (!ret) { if (file->f_op->write) @@ -497,7 +511,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_verify_area(type, file, pos, tot_len); - if (ret) + if (ret < 0) goto out; ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE); if (ret) @@ -653,8 +667,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in_file->f_mode & FMODE_PREAD)) goto fput_in; retval = rw_verify_area(READ, in_file, ppos, count); - if (retval) + if (retval < 0) goto fput_in; + count = retval; retval = security_file_permission (in_file, MAY_READ); if (retval) @@ -674,8 +689,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto fput_out; out_inode = out_file->f_dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); - if (retval) + if (retval < 0) goto fput_out; + count = retval; retval = security_file_permission (out_file, MAY_WRITE); if (retval) -- cgit v1.2.3 From 80cfd548eed68cf90c5ae9cfcd6b02230cece756 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jan 2006 09:43:28 +0100 Subject: [BLOCK] bio: check for same page merge possibilities in __bio_add_page() For filesystems with a blocksize < page size, we can merge same page calls into the bio_vec at the end of the bio. This saves segments on systems with a page size > the "normal" 4kb fs block size. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/bio.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 38d3e8023a07..dfe242a21eb4 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,10 +325,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (bio->bi_vcnt >= bio->bi_max_vecs) + if (((bio->bi_size + len) >> 9) > max_sectors) return 0; - if (((bio->bi_size + len) >> 9) > max_sectors) + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. 
+ */ + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == prev->bv_page && + offset == prev->bv_offset + prev->bv_len) { + prev->bv_len += len; + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, bio, prev) < len) { + prev->bv_len -= len; + return 0; + } + + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; /* @@ -382,6 +403,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page bio->bi_vcnt++; bio->bi_phys_segments++; bio->bi_hw_segments++; + done: bio->bi_size += len; return len; } -- cgit v1.2.3 From bd6a59b22fd3bd044bb14978b885bcd042a10e8e Mon Sep 17 00:00:00 2001 From: Joshua Kwan Date: Fri, 6 Jan 2006 00:09:45 -0800 Subject: [PATCH] hfsplus oops fix The check in hfsplus_fill_super checks the wrong pointer for NULLness (it tests the saved nls, not the newly loaded one it needs to use), so mounting oopses when nls_utf8 is not available. Signed-off-by: Joshua Kwan Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 8093351bd7c3..6daaf7c755a6 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -320,7 +320,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* temporarily use utf8 to correctly find the hidden dir below */ nls = sbi->nls; sbi->nls = load_nls("utf8"); - if (!nls) { + if (!sbi->nls) { printk("HFS+: unable to load nls for utf8\n"); err = -EINVAL; goto cleanup; -- cgit v1.2.3 From 1e8f889b10d8d2223105719e36ce45688fedbd59 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 6 Jan 2006 00:10:44 -0800 Subject: [PATCH] Hugetlb: Copy on Write support Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be supported. This helps us to safely use hugetlb pages in many more applications. The patch makes the following changes. If needed, I also have it broken out according to the following paragraphs. 1. Add a pair of functions to set/clear write access on huge ptes. The writable check in make_huge_pte is moved out to the caller for use by COW later. 2. Hugetlb copy-on-write requires special case handling in the following situations: - copy_hugetlb_page_range() - Copied pages must be write-protected so a COW fault will be triggered (if necessary) if those pages are written to. - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the page cache. MAP_PRIVATE pages still need to be locked, however. 3. Provide hugetlb_cow() and calls from hugetlb_fault() and hugetlb_no_page() which handles the COW fault by making the actual copy. 4. Remove the check in hugetlbfs_file_mmap() so that MAP_PRIVATE mmaps will be allowed (a userspace sketch follows below). Make MAP_HUGETLB exempt from the deprecated VM_RESERVED mapping check.
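To make point 4 concrete, here is the userspace sketch referenced above: a writable MAP_PRIVATE mapping of a hugetlbfs file, which the removed check used to reject with -EINVAL, plus a fork to exercise the new hugetlb_cow() path. The mount point and the 2 MB huge page size are assumptions for the example, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define HPAGE_2M (2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0)
		return 1;
	/* hugetlbfs_file_mmap() refused writable MAP_PRIVATE before this patch */
	p = mmap(NULL, HPAGE_2M, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'a';		/* instantiate the parent's private page */
	if (fork() == 0) {
		p[0] = 'b';	/* child's write faults into hugetlb_cow() */
		_exit(0);
	}
	wait(NULL);
	printf("parent sees '%c'\n", p[0]);	/* still 'a' */
	return 0;
}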
Signed-off-by: David Gibson Signed-off-by: Adam Litke Cc: William Lee Irwin III Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 3 -- mm/hugetlb.c | 127 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 108 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c1cef3bb677..8c41315a6e42 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; - if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE) - return -EINVAL; - if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1)) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf8225108b2f..da8a211414c9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = { .nopage = hugetlb_nopage, }; -static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) { pte_t entry; - if (vma->vm_flags & VM_WRITE) { + if (writable) { entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); } else { @@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) return entry; } +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = pte_mkwrite(pte_mkdirty(*ptep)); + ptep_set_access_flags(vma, address, ptep, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +} + + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; + int cow; + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { src_pte = huge_pte_offset(src, addr); @@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if (!pte_none(*src_pte)) { + if (cow) + ptep_set_wrprotect(src, addr, src_pte); entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } static struct page *find_or_alloc_huge_page(struct address_space *mapping, - unsigned long idx) + unsigned long idx, int shared) { struct page *page; int err; @@ -364,26 +382,80 @@ retry: goto out; } - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); - if (err) { - put_page(page); - hugetlb_put_quota(mapping); - if (err == -EEXIST) - goto retry; - page = NULL; + if (shared) { + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + page = NULL; + } + } else { + /* Caller expects a locked page */ + lock_page(page); } out: return page; } +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte) +{ + struct page *old_page, *new_page; + int i, avoidcopy; + + old_page = pte_page(pte); + + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + avoidcopy = (page_count(old_page) == 1); + if (avoidcopy) { + set_huge_ptep_writable(vma, address, ptep); + return 
VM_FAULT_MINOR; + } + + page_cache_get(old_page); + new_page = alloc_huge_page(); + + if (!new_page) { + page_cache_release(old_page); + + /* Logically this is OOM, not a SIGBUS, but an OOM + * could cause the kernel to go killing other + * processes which won't help the hugepage situation + * at all (?) */ + return VM_FAULT_SIGBUS; + } + + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) + copy_user_highpage(new_page + i, old_page + i, + address + i*PAGE_SIZE); + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, address & HPAGE_MASK); + if (likely(pte_same(*ptep, pte))) { + /* Break COW */ + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + /* Make the old page be freed below */ + new_page = old_page; + } + page_cache_release(new_page); + page_cache_release(old_page); + return VM_FAULT_MINOR; +} + int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) + unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; unsigned long idx; unsigned long size; struct page *page; struct address_space *mapping; + pte_t new_pte; mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) @@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - page = find_or_alloc_huge_page(mapping, idx); + page = find_or_alloc_huge_page(mapping, idx, + vma->vm_flags & VM_SHARED); if (!page) goto out; + BUG_ON(!PageLocked(page)); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) @@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, goto backout; add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page)); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if (write_access && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + } + spin_unlock(&mm->page_table_lock); unlock_page(page); out: @@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + int ret; ptep = huge_pte_alloc(mm, address); if (!ptep) @@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = *ptep; if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep); + return hugetlb_no_page(mm, vma, address, ptep, write_access); - /* - * We could get here if another thread instantiated the pte - * before the test above. - */ - return VM_FAULT_MINOR; + ret = VM_FAULT_MINOR; + + spin_lock(&mm->page_table_lock); + /* Check for a racing update before calling hugetlb_cow */ + if (likely(pte_same(entry, *ptep))) + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry); + spin_unlock(&mm->page_table_lock); + + return ret; } int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 9617d95e6e9ffd883cf90a89724fe60d7ab22f9a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:12 -0800 Subject: [PATCH] mm: rmap optimisation Optimise rmap functions by minimising atomic operations when we know there will be no concurrent modifications. 
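The trick in miniature, as an illustrative standalone C11 analogue (fake_page and both helpers are made up for this sketch; only the atomic usage mirrors the patch): when an object is not yet visible to any other thread, its first "mapping" can be recorded with a plain atomic store instead of an atomic read-modify-write.

/* rmap-sketch.c: why page_add_new_anon_rmap can skip inc-and-test */
#include <stdatomic.h>
#include <stdio.h>

struct fake_page {
	atomic_int mapcount;	/* like page->_mapcount, starts at -1 */
};

/* general path: the page may already be mapped somewhere else */
static void add_anon_rmap(struct fake_page *p)
{
	/* atomic RMW; previous value -1 means this is the first mapping */
	if (atomic_fetch_add(&p->mapcount, 1) == -1)
		puts("first mapping: set up anon rmap fields");
}

/* new-page path: no concurrent user can exist, so a store suffices */
static void add_new_anon_rmap(struct fake_page *p)
{
	atomic_store(&p->mapcount, 0);	/* elevate count by 1 (starts at -1) */
	puts("first mapping: set up anon rmap fields");
}

int main(void)
{
	struct fake_page a = { -1 }, b = { -1 };
	add_anon_rmap(&a);
	add_new_anon_rmap(&b);
	return 0;
}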
Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/rmap.h | 1 + mm/memory.c | 6 +++--- mm/rmap.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 43 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 22533cce0611..e75a9548da8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma, lru_cache_add_active(page); set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); - page_add_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address); pte_unmap_unlock(pte, ptl); /* no need for flush_tlb */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 33261f1d2239..9d6fbeef2104 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -71,6 +71,7 @@ void __anon_vma_link(struct vm_area_struct *); * rmap interfaces called when adding or removing pte of page */ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); void page_remove_rmap(struct page *); diff --git a/mm/memory.c b/mm/memory.c index e249088908c4..d7ca7de10f4d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1498,7 +1498,7 @@ gotten: update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); /* Free the old page.. */ new_page = old_page; @@ -1978,7 +1978,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address); } else { /* Map the ZERO_PAGE - vm_page_prot is readonly */ page = ZERO_PAGE(address); @@ -2109,7 +2109,7 @@ retry: if (anon) { inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); + page_add_new_anon_rmap(new_page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index f853c6def159..4107f64ff749 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -434,6 +434,26 @@ int page_referenced(struct page *page, int is_locked) return referenced; } +/** + * page_set_anon_rmap - setup new anonymous rmap + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + + page->index = linear_page_index(vma, address); + + inc_page_state(nr_mapped); +} + /** * page_add_anon_rmap - add pte mapping to an anonymous page * @page: the page to add the mapping to @@ -445,20 +465,27 @@ int page_referenced(struct page *page, int is_locked) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - if (atomic_inc_and_test(&page->_mapcount)) { - struct anon_vma *anon_vma = vma->anon_vma; - - BUG_ON(!anon_vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; - 
- page->index = linear_page_index(vma, address); - - inc_page_state(nr_mapped); - } + if (atomic_inc_and_test(&page->_mapcount)) + __page_set_anon_rmap(page, vma, address); /* else checking page index and mapping is racy */ } +/* + * page_add_new_anon_rmap - add pte mapping to a new anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * Same as page_add_anon_rmap but must only be called on *new* pages. + * This means the inc-and-test can be bypassed. + */ +void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + __page_set_anon_rmap(page, vma, address); +} + /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to -- cgit v1.2.3 From 642fb4d1f1dd2417aa69189fe5ceb81e4fb72900 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:41 -0800 Subject: [PATCH] NOMMU: Provide shared-writable mmap support on ramfs The attached patch makes ramfs support shared-writable mmaps by: (1) Attempting to perform a contiguous block allocation to the requested size when truncate attempts to increase the file from zero size, such as happens when: fd = shm_open("/file/on/ramfs", ...): ftruncate(fd, size_requested); addr = mmap(NULL, subsize, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_SHARED, fd, offset); (2) Permitting any shared-writable mapping over any contiguous set of extant pages. get_unmapped_area() will return the address into the actual ramfs pages. The mapping may start anywhere and be of any size, but may not go over the end of file. Multiple mappings may overlap in any way. (3) Not permitting a file to be shrunk if it would truncate any shared mappings (private mappings are copied). Thus this patch provides support for POSIX shared memory on NOMMU kernels, with certain limitations such as there being a large enough block of pages available to support the allocation and it only working on directly mappable filesystems. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/Makefile | 4 +- fs/ramfs/file-mmu.c | 57 ++++++++++ fs/ramfs/file-nommu.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ramfs/inode.c | 22 +--- fs/ramfs/internal.h | 15 +++ include/linux/ramfs.h | 10 ++ 6 files changed, 378 insertions(+), 22 deletions(-) create mode 100644 fs/ramfs/file-mmu.c create mode 100644 fs/ramfs/file-nommu.c create mode 100644 fs/ramfs/internal.h (limited to 'fs') diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile index f096f3007091..5a0236e02ee1 100644 --- a/fs/ramfs/Makefile +++ b/fs/ramfs/Makefile @@ -4,4 +4,6 @@ obj-$(CONFIG_RAMFS) += ramfs.o -ramfs-objs := inode.o +file-mmu-y := file-nommu.o +file-mmu-$(CONFIG_MMU) := file-mmu.o +ramfs-objs += inode.o $(file-mmu-y) diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c new file mode 100644 index 000000000000..2115383dcc8d --- /dev/null +++ b/fs/ramfs/file-mmu.c @@ -0,0 +1,57 @@ +/* file-mmu.c: ramfs MMU-based file operations + * + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. 
+ * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. + * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, + .commit_write = simple_commit_write +}; + +struct file_operations ramfs_file_operations = { + .read = generic_file_read, + .write = generic_file_write, + .mmap = generic_file_mmap, + .fsync = simple_sync_file, + .sendfile = generic_file_sendfile, + .llseek = generic_file_llseek, +}; + +struct inode_operations ramfs_file_inode_operations = { + .getattr = simple_getattr, +}; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c new file mode 100644 index 000000000000..3f810acd0bfa --- /dev/null +++ b/fs/ramfs/file-nommu.c @@ -0,0 +1,292 @@ +/* file-nommu.c: no-MMU version of ramfs + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +static int ramfs_nommu_setattr(struct dentry *, struct iattr *); + +struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, + .commit_write = simple_commit_write +}; + +struct file_operations ramfs_file_operations = { + .mmap = ramfs_nommu_mmap, + .get_unmapped_area = ramfs_nommu_get_unmapped_area, + .read = generic_file_read, + .write = generic_file_write, + .fsync = simple_sync_file, + .sendfile = generic_file_sendfile, + .llseek = generic_file_llseek, +}; + +struct inode_operations ramfs_file_inode_operations = { + .setattr = ramfs_nommu_setattr, + .getattr = simple_getattr, +}; + +/*****************************************************************************/ +/* + * add a contiguous set of pages into a ramfs inode when it's truncated from + * size 0 on the assumption that it's going to be used for an mmap of shared + * memory + */ +static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + struct pagevec lru_pvec; + unsigned long npages, xpages, loop, limit; + struct page *pages; + unsigned order; + void *data; + int ret; + + /* make various checks */ + order = get_order(newsize); + if (unlikely(order >= MAX_ORDER)) + goto too_big; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && newsize > limit) + goto fsize_exceeded; + + if (newsize > inode->i_sb->s_maxbytes) + goto too_big; + + i_size_write(inode, newsize); + + /* allocate enough contiguous pages to be able to satisfy the + * request */ + pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); + if (!pages) + return -ENOMEM; + + /* split the high-order page into an array of single pages */ + xpages = 1UL << order; + npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + + for (loop = 0; loop < npages; loop++) 
+ set_page_count(pages + loop, 1); + + /* trim off any pages we don't actually require */ + for (loop = npages; loop < xpages; loop++) + __free_page(pages + loop); + + /* clear the memory we allocated */ + newsize = PAGE_SIZE * npages; + data = page_address(pages); + memset(data, 0, newsize); + + /* attach all the pages to the inode's address space */ + pagevec_init(&lru_pvec, 0); + for (loop = 0; loop < npages; loop++) { + struct page *page = pages + loop; + + ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); + if (ret < 0) + goto add_error; + + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + + unlock_page(page); + } + + pagevec_lru_add(&lru_pvec); + return 0; + + fsize_exceeded: + send_sig(SIGXFSZ, current, 0); + too_big: + return -EFBIG; + + add_error: + page_cache_release(pages + loop); + for (loop++; loop < npages; loop++) + __free_page(pages + loop); + return ret; +} + +/*****************************************************************************/ +/* + * check that file shrinkage doesn't leave any VMAs dangling in midair + */ +static int ramfs_nommu_check_mappings(struct inode *inode, + size_t newsize, size_t size) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + newsize >> PAGE_SHIFT, + (size + PAGE_SIZE - 1) >> PAGE_SHIFT + ) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) + return -ETXTBSY; /* not quite true, but near enough */ + } + + return 0; +} + +/*****************************************************************************/ +/* + * + */ +static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) +{ + int ret; + + /* assume a truncate from zero size is going to be for the purposes of + * shared mmap */ + if (size == 0) { + if (unlikely(newsize >> 32)) + return -EFBIG; + + return ramfs_nommu_expand_for_mapping(inode, newsize); + } + + /* check that a decrease in size doesn't cut off any shared mappings */ + if (newsize < size) { + ret = ramfs_nommu_check_mappings(inode, newsize, size); + if (ret < 0) + return ret; + } + + ret = vmtruncate(inode, size); + + return ret; +} + +/*****************************************************************************/ +/* + * handle a change of attributes + * - we're specifically interested in a change of size + */ +static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *inode = dentry->d_inode; + unsigned int old_ia_valid = ia->ia_valid; + int ret = 0; + + /* by providing our own setattr() method, we skip this quotaism */ + if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) || + (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid)) + ret = DQUOT_TRANSFER(inode, ia) ? 
-EDQUOT : 0; + + /* pick out size-changing events */ + if (ia->ia_valid & ATTR_SIZE) { + loff_t size = i_size_read(inode); + if (ia->ia_size != size) { + ret = ramfs_nommu_resize(inode, ia->ia_size, size); + if (ret < 0 || ia->ia_valid == ATTR_SIZE) + goto out; + } else { + /* we skipped the truncate but must still update + * timestamps + */ + ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + } + + ret = inode_setattr(inode, ia); + out: + ia->ia_valid = old_ia_valid; + return ret; +} + +/*****************************************************************************/ +/* + * try to determine where a shared mapping can be made + * - we require that: + * - the pages to be mapped must exist + * - the pages be physically contiguous in sequence + */ +unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long maxpages, lpages, nr, loop, ret; + struct inode *inode = file->f_dentry->d_inode; + struct page **pages = NULL, **ptr, *page; + loff_t isize; + + if (!(flags & MAP_SHARED)) + return addr; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + + ret = -EINVAL; + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= maxpages) + goto out; + + if (maxpages - pgoff < lpages) + goto out; + + /* gang-find the pages */ + ret = -ENOMEM; + pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out; + + nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + if (nr != lpages) + goto out; /* leave if some pages were missing */ + + /* check the pages for physical adjacency */ + ptr = pages; + page = *ptr++; + page++; + for (loop = lpages; loop > 1; loop--) + if (*ptr++ != page++) + goto out; + + /* okay - all conditions fulfilled */ + ret = (unsigned long) page_address(pages[0]); + + out: + if (pages) { + ptr = pages; + for (loop = lpages; loop > 0; loop--) + put_page(*ptr++); + kfree(pages); + } + + return ret; +} + +/*****************************************************************************/ +/* + * set up a mapping + */ +int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +{ + return 0; +} diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0a88917605ae..c66bd5e4c05c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -34,13 +34,12 @@ #include #include +#include "internal.h" /* some random number */ #define RAMFS_MAGIC 0x858458f6 static struct super_operations ramfs_ops; -static struct address_space_operations ramfs_aops; -static struct inode_operations ramfs_file_inode_operations; static struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { @@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } -static struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write -}; - -struct file_operations ramfs_file_operations = { - .read = generic_file_read, - .write = generic_file_write, - .mmap = generic_file_mmap, - .fsync = simple_sync_file, - .sendfile = generic_file_sendfile, - .llseek = generic_file_llseek, -}; - -static struct inode_operations ramfs_file_inode_operations = { - .getattr = simple_getattr, -}; - static struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, diff --git 
a/fs/ramfs/internal.h b/fs/ramfs/internal.h new file mode 100644 index 000000000000..272c8a7120b0 --- /dev/null +++ b/fs/ramfs/internal.h @@ -0,0 +1,15 @@ +/* internal.h: ramfs internal definitions + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +extern struct address_space_operations ramfs_aops; +extern struct file_operations ramfs_file_operations; +extern struct inode_operations ramfs_file_inode_operations; diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index e0a4faa9610c..953b6df5d037 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -5,6 +5,16 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev); struct super_block *ramfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +#ifndef CONFIG_MMU +extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags); + +extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); +#endif + extern struct file_operations ramfs_file_operations; extern struct vm_operations_struct generic_file_vm_ops; -- cgit v1.2.3 From 56dc6a88ec76019e0d0729165cb5b98536270e1d Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Fri, 6 Jan 2006 00:19:09 -0800 Subject: [PATCH] s390: cms volume label definitions Move the definition of the CMS volume label to vtoc.h and modify partitions/ibm.c to use this volume label definition instead of an anonymous array.
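As a cross-check on the ibm.c conversion below, a throwaway standalone program (the field layout is copied from the new struct vtoc_cms_label; everything else is made up for this sketch) confirming that the old anonymous int indices and the new field names address the same bytes:

/* cms-offsets.c: label[n] in the old code is byte offset 4*n */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct cms_label {			/* mirrors struct vtoc_cms_label */
	uint8_t  label_id[4];
	uint8_t  vol_id[6];
	uint16_t version_id;
	uint32_t block_size;		/* old code: label[3] */
	uint32_t origin_ptr;
	uint32_t usable_count;
	uint32_t formatted_count;
	uint32_t block_count;		/* old code: label[7] */
	uint32_t used_count;
	uint32_t fst_size;
	uint32_t fst_count;
	uint8_t  format_date[6];
	uint8_t  reserved1[2];
	uint32_t disk_offset;		/* old code: label[13] */
} __attribute__((packed));

int main(void)
{
	printf("block_size  at byte %zu (= 4 * 3)\n",
	       offsetof(struct cms_label, block_size));
	printf("block_count at byte %zu (= 4 * 7)\n",
	       offsetof(struct cms_label, block_count));
	printf("disk_offset at byte %zu (= 4 * 13)\n",
	       offsetof(struct cms_label, disk_offset));
	return 0;
}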
Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/s390/block/dasd_diag.c | 7 ++++--- drivers/s390/block/dasd_diag.h | 23 +---------------------- fs/partitions/ibm.c | 30 +++++++++++++++--------------- include/asm-s390/vtoc.h | 24 ++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index ab8754e566bc..16c4b7d94bd7 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -6,7 +6,7 @@ * Bugreports.to..: * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 * - * $Revision: 1.51 $ + * $Revision: 1.52 $ */ #include @@ -25,6 +25,7 @@ #include #include #include +#include #include "dasd_int.h" #include "dasd_diag.h" @@ -329,7 +330,7 @@ dasd_diag_check_device(struct dasd_device *device) struct dasd_diag_private *private; struct dasd_diag_characteristics *rdc_data; struct dasd_diag_bio bio; - struct dasd_diag_cms_label *label; + struct vtoc_cms_label *label; blocknum_t end_block; unsigned int sb, bsize; int rc; @@ -380,7 +381,7 @@ dasd_diag_check_device(struct dasd_device *device) mdsk_term_io(device); /* figure out blocksize of device */ - label = (struct dasd_diag_cms_label *) get_zeroed_page(GFP_KERNEL); + label = (struct vtoc_cms_label *) get_zeroed_page(GFP_KERNEL); if (label == NULL) { DEV_MESSAGE(KERN_WARNING, device, "%s", "No memory to allocate initialization request"); diff --git a/drivers/s390/block/dasd_diag.h b/drivers/s390/block/dasd_diag.h index df31484d73a7..37edf6e91715 100644 --- a/drivers/s390/block/dasd_diag.h +++ b/drivers/s390/block/dasd_diag.h @@ -6,7 +6,7 @@ * Bugreports.to..: * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 * - * $Revision: 1.8 $ + * $Revision: 1.9 $ */ #define MDSK_WRITE_REQ 0x01 @@ -44,27 +44,6 @@ struct dasd_diag_characteristics { u8 rdev_features; } __attribute__ ((packed, aligned(4))); -struct dasd_diag_cms_label { - u8 label_id[4]; - u8 vol_id[6]; - u16 version_id; - u32 block_size; - u32 origin_ptr; - u32 usable_count; - u32 formatted_count; - u32 block_count; - u32 used_count; - u32 fst_size; - u32 fst_count; - u8 format_date[6]; - u8 reserved1[2]; - u32 disk_offset; - u32 map_block; - u32 hblk_disp; - u32 user_disp; - u8 reserved2[4]; - u8 segment_name[8]; -} __attribute__ ((packed)); #ifdef CONFIG_ARCH_S390X #define DASD_DIAG_FLAGA_DEFAULT DASD_DIAG_FLAGA_FORMAT_64BIT diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 6327bcb2d73d..78010ad60e47 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) struct hd_geometry *geo; char type[5] = {0,}; char name[7] = {0,}; - struct vtoc_volume_label *vlabel; + union label_t { + struct vtoc_volume_label vol; + struct vtoc_cms_label cms; + } *label; unsigned char *data; Sector sect; @@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) goto out_noinfo; if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) goto out_nogeo; - if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label), - GFP_KERNEL)) == NULL) - goto out_novlab; + if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL) + goto out_nolab; if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) @@ -87,7 +89,7 @@ ibm_partition(struct parsed_partitions *state, 
struct block_device *bdev) strncpy(name, data + 8, 6); else strncpy(name, data + 4, 6); - memcpy (vlabel, data, sizeof(struct vtoc_volume_label)); + memcpy(label, data, sizeof(union label_t)); put_dev_sector(sect); EBCASC(type, 4); @@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) /* * VM style CMS1 labeled disk */ - int *label = (int *) vlabel; - - if (label[13] != 0) { + if (label->cms.disk_offset != 0) { printk("CMS1/%8s(MDSK):", name); /* disk is reserved minidisk */ - blocksize = label[3]; - offset = label[13]; - size = (label[7] - 1)*(blocksize >> 9); + blocksize = label->cms.block_size; + offset = label->cms.disk_offset; + size = (label->cms.block_count - 1) * (blocksize >> 9); } else { printk("CMS1/%8s:", name); offset = (info->label_block + 1); @@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) printk("VOL1/%8s:", name); /* get block number and read then go through format1 labels */ - blk = cchhb2blk(&vlabel->vtoc, geo) + 1; + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; counter = 0; while ((data = read_dev_sector(bdev, blk*(blocksize/512), &sect)) != NULL) { @@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) } printk("\n"); - kfree(vlabel); + kfree(label); kfree(geo); kfree(info); return 1; @@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) out_readerr: out_badsect: out_noioctl: - kfree(vlabel); -out_novlab: + kfree(label); -out_nolab: + kfree(geo); out_nogeo: kfree(info); diff --git a/include/asm-s390/vtoc.h b/include/asm-s390/vtoc.h index 41d369f38b0e..d1de5b7ebb0b 100644 --- a/include/asm-s390/vtoc.h +++ b/include/asm-s390/vtoc.h @@ -176,4 +176,28 @@ struct vtoc_format7_label struct vtoc_cchhb DS7PTRDS; /* pointer to next FMT7 DSCB */ } __attribute__ ((packed)); +struct vtoc_cms_label { + u8 label_id[4]; /* Label identifier */ + u8 vol_id[6]; /* Volid */ + u16 version_id; /* Version identifier */ + u32 block_size; /* Disk block size */ + u32 origin_ptr; /* Disk origin pointer */ + u32 usable_count; /* Number of usable cylinders/blocks */ + u32 formatted_count; /* Maximum number of formatted cylinders/ + * blocks */ + u32 block_count; /* Disk size in CMS blocks */ + u32 used_count; /* Number of CMS blocks in use */ + u32 fst_size; /* File Status Table (FST) size */ + u32 fst_count; /* Number of FSTs per CMS block */ + u8 format_date[6]; /* Disk FORMAT date */ + u8 reserved1[2]; + u32 disk_offset; /* Disk offset when reserved */ + u32 map_block; /* Allocation Map Block with next hole */ + u32 hblk_disp; /* Displacement into HBLK data of next hole */ + u32 user_disp; /* Displacement into user part of Allocation + * map */ + u8 reserved2[4]; + u8 segment_name[8]; /* Name of shared segment */ +} __attribute__ ((packed)); + #endif /* _ASM_S390_VTOC_H */ -- cgit v1.2.3 From 347a8dc3b815f0c0fa62a1df075184ffe4cbdcf1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 6 Jan 2006 00:19:28 -0800 Subject: [PATCH] s390: cleanup Kconfig Sanitize some s390 Kconfig options. We have ARCH_S390, ARCH_S390X, ARCH_S390_31, 64BIT, S390_SUPPORT and COMPAT. Replace these 6 options by S390, 64BIT and COMPAT.
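To make the effect concrete, a made-up standalone example (not from the patch; the CONFIG_* symbols here stand in for Kconfig output and would be supplied with e.g. -DCONFIG_64BIT -DCONFIG_COMPAT at compile time): code that used to pick between ARCH_S390X, ARCH_S390_31 and S390_SUPPORT now keys off just 64BIT and COMPAT.

/* kconfig-demo.c: one symbol per concept after the cleanup */
#include <stdio.h>

#ifdef CONFIG_64BIT			/* formerly CONFIG_ARCH_S390X */
#define ADDR_MODE "64 bit"
#else					/* formerly CONFIG_ARCH_S390_31 */
#define ADDR_MODE "31 bit"
#endif

int main(void)
{
	printf("built for %s addressing\n", ADDR_MODE);
#ifdef CONFIG_COMPAT			/* formerly CONFIG_S390_SUPPORT */
	puts("31 bit emulation support compiled in");
#endif
	return 0;
}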
Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 27 +++++++-------------------- arch/s390/Makefile | 6 ++---- arch/s390/appldata/appldata_base.c | 8 ++++---- arch/s390/crypto/crypt_s390.h | 10 +++++----- arch/s390/defconfig | 4 +--- arch/s390/kernel/Makefile | 15 +++++---------- arch/s390/kernel/cpcmd.c | 16 ++++++++-------- arch/s390/kernel/entry64.S | 18 +++++++++--------- arch/s390/kernel/head.S | 4 ++-- arch/s390/kernel/module.c | 12 ++++++------ arch/s390/kernel/process.c | 12 ++++++------ arch/s390/kernel/ptrace.c | 24 ++++++++++++------------ arch/s390/kernel/reipl_diag.c | 2 +- arch/s390/kernel/setup.c | 14 +++++++------- arch/s390/kernel/signal.c | 2 +- arch/s390/kernel/smp.c | 8 ++++---- arch/s390/kernel/sys_s390.c | 12 +++++------- arch/s390/kernel/traps.c | 10 +++++----- arch/s390/kernel/vmlinux.lds.S | 2 +- arch/s390/lib/Makefile | 5 ++--- arch/s390/lib/spinlock.c | 2 +- arch/s390/mm/extmem.c | 2 +- arch/s390/mm/fault.c | 18 +++++++++--------- arch/s390/mm/init.c | 8 ++++---- arch/s390/mm/mmap.c | 2 +- block/Kconfig | 2 +- crypto/Kconfig | 8 ++++---- drivers/char/Kconfig | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/watchdog/Kconfig | 2 +- drivers/input/evdev.c | 2 +- drivers/net/phy/Kconfig | 2 +- drivers/s390/block/Kconfig | 8 ++++---- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/dasd_diag.c | 2 +- drivers/s390/block/dasd_diag.h | 6 +++--- drivers/s390/block/dasd_eckd.c | 2 +- drivers/s390/block/dasd_fba.c | 2 +- drivers/s390/block/xpram.c | 4 ++-- drivers/s390/char/vmwatchdog.c | 2 +- drivers/s390/cio/cio.c | 2 +- drivers/s390/cio/device_id.c | 2 +- drivers/s390/cio/ioasm.h | 4 ++-- drivers/s390/cio/qdio.c | 2 +- drivers/s390/cio/qdio.h | 34 +++++++++++++++++----------------- drivers/s390/crypto/z90hardware.c | 8 ++++---- drivers/s390/net/Kconfig | 2 +- drivers/s390/net/claw.c | 6 +++--- drivers/s390/s390mach.c | 10 +++++----- drivers/s390/sysinfo.c | 2 +- drivers/scsi/Kconfig | 2 +- fs/partitions/Kconfig | 2 +- fs/proc/array.c | 2 +- include/asm-s390/unistd.h | 2 +- include/linux/irq.h | 2 +- init/Kconfig | 2 +- init/do_mounts_rd.c | 4 ++-- kernel/panic.c | 4 ++-- kernel/sysctl.c | 6 +++--- lib/Kconfig.debug | 2 +- 60 files changed, 183 insertions(+), 208 deletions(-) (limited to 'fs') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 1846fbfd6bf2..6fe532d82417 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -23,14 +23,14 @@ config GENERIC_BUST_SPINLOCK mainmenu "Linux Kernel Configuration" -config ARCH_S390 +config S390 bool default y config UID16 bool default y - depends on ARCH_S390X = 'n' + depends on !64BIT source "init/Kconfig" @@ -38,20 +38,12 @@ menu "Base setup" comment "Processor type and features" -config ARCH_S390X +config 64BIT bool "64 bit kernel" help Select this option if you have a 64 bit IBM zSeries machine and want to use the 64 bit addressing mode. -config 64BIT - def_bool ARCH_S390X - -config ARCH_S390_31 - bool - depends on ARCH_S390X = 'n' - default y - config SMP bool "Symmetric multi-processing support" ---help--- @@ -101,20 +93,15 @@ config MATHEMU on older S/390 machines. Say Y unless you know your machine doesn't need this. -config S390_SUPPORT +config COMPAT bool "Kernel support for 31 bit emulation" - depends on ARCH_S390X + depends on 64BIT help Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. 
This option (and some other stuff like libraries and such) is needed for executing 31 bit applications. It is safe to say "Y". -config COMPAT - bool - depends on S390_SUPPORT - default y - config SYSVIPC_COMPAT bool depends on COMPAT && SYSVIPC @@ -122,7 +109,7 @@ config SYSVIPC_COMPAT config BINFMT_ELF32 tristate "Kernel support for 31 bit ELF binaries" - depends on S390_SUPPORT + depends on COMPAT help This allows you to run 32-bit Linux/ELF binaries on your zSeries in 64 bit mode. Everybody wants this; say Y. @@ -135,7 +122,7 @@ choice config MARCH_G5 bool "S/390 model G5 and G6" - depends on ARCH_S390_31 + depends on !64BIT help Select this to build a 31 bit kernel that works on all S/390 and zSeries machines. diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 73a09a6ee6c8..6c6b197898d0 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -13,16 +13,14 @@ # Copyright (C) 1994 by Linus Torvalds # -ifdef CONFIG_ARCH_S390_31 +ifndef CONFIG_64BIT LDFLAGS := -m elf_s390 CFLAGS += -m31 AFLAGS += -m31 UTS_MACHINE := s390 STACK_SIZE := 8192 CHECKFLAGS += -D__s390__ -endif - -ifdef CONFIG_ARCH_S390X +else LDFLAGS := -m elf64_s390 MODFLAGS += -fpic -D__PIC__ CFLAGS += -m64 diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index dee6ab54984d..d06a8d71c71d 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -40,7 +40,7 @@ #define TOD_MICRO 0x01000 /* nr. of TOD clock units for 1 microsecond */ -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT #define APPLDATA_START_INTERVAL_REC 0x00 /* Function codes for */ #define APPLDATA_STOP_REC 0x01 /* DIAG 0xDC */ @@ -54,13 +54,13 @@ #define APPLDATA_GEN_EVENT_RECORD 0x82 #define APPLDATA_START_CONFIG_REC 0x83 -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ /* * Parameter list for DIAGNOSE X'DC' */ -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT struct appldata_parameter_list { u16 diag; /* The DIAGNOSE code X'00DC' */ u8 function; /* The function code for the DIAGNOSE */ @@ -82,7 +82,7 @@ struct appldata_parameter_list { u64 product_id_addr; u64 buffer_addr; }; -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ /* * /proc entries (sysctl) diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h index d6712cfa6def..d1c259a7fe33 100644 --- a/arch/s390/crypto/crypt_s390.h +++ b/arch/s390/crypto/crypt_s390.h @@ -112,7 +112,7 @@ struct crypt_s390_query_status { * [ret] is the variable to receive the error code * [ERR] is the error code value */ -#ifndef __s390x__ +#ifndef CONFIG_64BIT #define __crypt_s390_fixup \ ".section .fixup,\"ax\" \n" \ "7: lhi %0,%h[e1] \n" \ @@ -129,7 +129,7 @@ struct crypt_s390_query_status { " .long 0b,7b \n" \ " .long 1b,8b \n" \ ".previous" -#else /* __s390x__ */ +#else /* CONFIG_64BIT */ #define __crypt_s390_fixup \ ".section .fixup,\"ax\" \n" \ "7: lhi %0,%h[e1] \n" \ @@ -142,7 +142,7 @@ struct crypt_s390_query_status { " .quad 0b,7b \n" \ " .quad 1b,8b \n" \ ".previous" -#endif /* __s390x__ */ +#endif /* CONFIG_64BIT */ /* * Standard code for setting the result of s390 crypto instructions. @@ -150,10 +150,10 @@ struct crypt_s390_query_status { * [result]: the register containing the result (e.g. second operand length * to compute number of processed bytes]. 
*/ -#ifndef __s390x__ +#ifndef CONFIG_64BIT #define __crypt_s390_set_result \ " lr %0,%[result] \n" -#else /* __s390x__ */ +#else /* CONFIG_64BIT */ #define __crypt_s390_set_result \ " lgr %0,%[result] \n" #endif diff --git a/arch/s390/defconfig b/arch/s390/defconfig index f195c7ea1d7b..7d23edc6facb 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -6,7 +6,7 @@ CONFIG_MMU=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_S390=y +CONFIG_S390=y CONFIG_UID16=y # @@ -89,9 +89,7 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features # -# CONFIG_ARCH_S390X is not set # CONFIG_64BIT is not set -CONFIG_ARCH_S390_31=y CONFIG_SMP=y CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 7434c32bc631..4865e4b49464 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -8,31 +8,26 @@ obj-y := bitmap.o traps.o time.o process.o \ setup.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \ semaphore.o s390_ext.o debug.o profile.o irq.o reipl_diag.o +obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o) +obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o) + extra-y += head.o init_task.o vmlinux.lds obj-$(CONFIG_MODULES) += s390_ksyms.o module.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_S390_SUPPORT) += compat_linux.o compat_signal.o \ +obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o \ compat_ioctl.o compat_wrapper.o \ compat_exec_domain.o obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o -obj-$(CONFIG_ARCH_S390_31) += entry.o reipl.o -obj-$(CONFIG_ARCH_S390X) += entry64.o reipl64.o - obj-$(CONFIG_VIRT_TIMER) += vtime.o # Kexec part S390_KEXEC_OBJS := machine_kexec.o crash.o -ifeq ($(CONFIG_ARCH_S390X),y) -S390_KEXEC_OBJS += relocate_kernel64.o -else -S390_KEXEC_OBJS += relocate_kernel.o -endif +S390_KEXEC_OBJS += $(if $(CONFIG_64BIT),relocate_kernel64.o,relocate_kernel.o) obj-$(CONFIG_KEXEC) += $(S390_KEXEC_OBJS) - # # This is just to get the dependencies... 
# diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index d47fecb42cc5..4ef44e536b2c 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -39,7 +39,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) if (response != NULL && rlen > 0) { memset(response, 0, rlen); -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asm volatile ( "lra 2,0(%2)\n" "lr 4,%3\n" "o 4,%6\n" @@ -55,7 +55,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) : "a" (cpcmd_buf), "d" (cmdlen), "a" (response), "d" (rlen), "m" (mask) : "cc", "2", "3", "4", "5" ); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ asm volatile ( "lrag 2,0(%2)\n" "lgr 4,%3\n" "o 4,%6\n" @@ -73,11 +73,11 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) : "a" (cpcmd_buf), "d" (cmdlen), "a" (response), "d" (rlen), "m" (mask) : "cc", "2", "3", "4", "5" ); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ EBCASC(response, rlen); } else { return_len = 0; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asm volatile ( "lra 2,0(%1)\n" "lr 3,%2\n" "diag 2,3,0x8\n" @@ -85,7 +85,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) : "=d" (return_code) : "a" (cpcmd_buf), "d" (cmdlen) : "2", "3" ); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ asm volatile ( "lrag 2,0(%1)\n" "lgr 3,%2\n" "sam31\n" @@ -95,7 +95,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) : "=d" (return_code) : "a" (cpcmd_buf), "d" (cmdlen) : "2", "3" ); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ } spin_unlock_irqrestore(&cpcmd_lock, flags); if (response_code != NULL) @@ -105,7 +105,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) EXPORT_SYMBOL(__cpcmd); -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT int cpcmd(const char *cmd, char *response, int rlen, int *response_code) { char *lowbuf; @@ -129,4 +129,4 @@ int cpcmd(const char *cmd, char *response, int rlen, int *response_code) } EXPORT_SYMBOL(cpcmd); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 4eb71ffcf484..369ab4413ec7 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -213,7 +213,7 @@ sysc_nr_ok: mvc SP_ARGS(8,%r15),SP_R7(%r15) sysc_do_restart: larl %r10,sys_call_table -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT tm __TI_flags+5(%r9),(_TIF_31BIT>>16) # running in 31 bit mode ? 
jno sysc_noemu larl %r10,sys_call_table_emu # use 31 bit emulation system calls @@ -361,7 +361,7 @@ sys_clone_glue: la %r2,SP_PTREGS(%r15) # load pt_regs jg sys_clone # branch to sys_clone -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_clone_glue: la %r2,SP_PTREGS(%r15) # load pt_regs jg sys32_clone # branch to sys32_clone @@ -383,7 +383,7 @@ sys_execve_glue: bnz 0(%r12) # it did fail -> store result in gpr2 b 6(%r12) # SKIP STG 2,SP_R2(15) in # system_call/sysc_tracesys -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_execve_glue: la %r2,SP_PTREGS(%r15) # load pt_regs lgr %r12,%r14 # save return address @@ -398,7 +398,7 @@ sys_sigreturn_glue: la %r2,SP_PTREGS(%r15) # load pt_regs as parameter jg sys_sigreturn # branch to sys_sigreturn -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_sigreturn_glue: la %r2,SP_PTREGS(%r15) # load pt_regs as parameter jg sys32_sigreturn # branch to sys32_sigreturn @@ -408,7 +408,7 @@ sys_rt_sigreturn_glue: la %r2,SP_PTREGS(%r15) # load pt_regs as parameter jg sys_rt_sigreturn # branch to sys_sigreturn -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_rt_sigreturn_glue: la %r2,SP_PTREGS(%r15) # load pt_regs as parameter jg sys32_rt_sigreturn # branch to sys32_sigreturn @@ -429,7 +429,7 @@ sys_sigsuspend_glue: la %r14,6(%r14) # skip store of return value jg sys_sigsuspend # branch to sys_sigsuspend -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_sigsuspend_glue: llgfr %r4,%r4 # unsigned long lgr %r5,%r4 # move mask back @@ -449,7 +449,7 @@ sys_rt_sigsuspend_glue: la %r14,6(%r14) # skip store of return value jg sys_rt_sigsuspend # branch to sys_rt_sigsuspend -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_rt_sigsuspend_glue: llgfr %r3,%r3 # size_t lgr %r4,%r3 # move sigsetsize parameter @@ -464,7 +464,7 @@ sys_sigaltstack_glue: la %r4,SP_PTREGS(%r15) # load pt_regs as parameter jg sys_sigaltstack # branch to sys_sigreturn -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT sys32_sigaltstack_glue: la %r4,SP_PTREGS(%r15) # load pt_regs as parameter jg sys32_sigaltstack_wrapper # branch to sys_sigreturn @@ -1009,7 +1009,7 @@ sys_call_table: #include "syscalls.S" #undef SYSCALL -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT #define SYSCALL(esa,esame,emu) .long emu .globl sys_call_table_emu diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index d31a97c89f68..ea88d066bf04 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -30,7 +30,7 @@ #include #include -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT #define ARCH_OFFSET 4 #else #define ARCH_OFFSET 0 @@ -539,7 +539,7 @@ ipl_devno: .word 0 .endm -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT #include "head64.S" #else #include "head31.S" diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 607d506689c8..c271cdab58e2 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -37,11 +37,11 @@ #define DEBUGP(fmt , ...) 
#endif -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT #define PLT_ENTRY_SIZE 12 -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ #define PLT_ENTRY_SIZE 20 -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ void *module_alloc(unsigned long size) { @@ -294,17 +294,17 @@ apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, unsigned int *ip; ip = me->module_core + me->arch.plt_offset + info->plt_offset; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */ ip[1] = 0x100607f1; ip[2] = val; -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ ip[1] = 0x100a0004; ip[2] = 0x07f10000; ip[3] = (unsigned int) (val >> 32); ip[4] = (unsigned int) val; -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ info->plt_initialized = 1; } if (r_type == R_390_PLTOFF16 || diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 78b64fe5e7c2..a942bf2d58e9 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -235,7 +235,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, /* Save access registers to new thread structure. */ save_access_regs(&p->thread.acrs[0]); -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT /* * save fprs to current->thread.fp_regs to merge them with * the emulated registers and then copy the result to the child. @@ -247,7 +247,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) p->thread.acrs[0] = regs->gprs[6]; -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ /* Save the fpu registers to new thread structure. */ save_fp_regs(&p->thread.fp_regs); p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _REGION_TABLE; @@ -260,7 +260,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, p->thread.acrs[1] = (unsigned int) regs->gprs[6]; } } -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ /* start new process with ar4 pointing to the correct address space */ p->thread.mm_segment = get_fs(); /* Don't copy debug registers */ @@ -339,16 +339,16 @@ out: */ int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) { -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT /* * save fprs to current->thread.fp_regs to merge them with * the emulated registers and then copy the result to the dump. */ save_fp_regs(¤t->thread.fp_regs); memcpy(fpregs, ¤t->thread.fp_regs, sizeof(s390_fp_regs)); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ save_fp_regs(fpregs); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ return 1; } diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 06afa3103ace..8ecda6d66de4 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -42,7 +42,7 @@ #include #include -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT #include "compat_ptrace.h" #endif @@ -59,7 +59,7 @@ FixPerRegisters(struct task_struct *task) if (per_info->single_step) { per_info->control_regs.bits.starting_addr = 0; -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT if (test_thread_flag(TIF_31BIT)) per_info->control_regs.bits.ending_addr = 0x7fffffffUL; else @@ -112,7 +112,7 @@ ptrace_disable(struct task_struct *child) clear_single_step(child); } -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT # define __ADDR_MASK 3 #else # define __ADDR_MASK 7 @@ -138,7 +138,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. 
Programmers from hell... */ mask = __ADDR_MASK; -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT if (addr >= (addr_t) &dummy->regs.acrs && addr < (addr_t) &dummy->regs.orig_gpr2) mask = 3; @@ -160,7 +160,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data) * access registers are stored in the thread structure */ offset = addr - (addr_t) &dummy->regs.acrs; -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT /* * Very special case: old & broken 64 bit gdb reading * from acrs[15]. Result is a 64 bit value. Read the @@ -218,7 +218,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell indeed... */ mask = __ADDR_MASK; -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT if (addr >= (addr_t) &dummy->regs.acrs && addr < (addr_t) &dummy->regs.orig_gpr2) mask = 3; @@ -231,13 +231,13 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data) * psw and gprs are stored on the stack */ if (addr == (addr_t) &dummy->regs.psw.mask && -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT data != PSW_MASK_MERGE(PSW_USER32_BITS, data) && #endif data != PSW_MASK_MERGE(PSW_USER_BITS, data)) /* Invalid psw mask. */ return -EINVAL; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT if (addr == (addr_t) &dummy->regs.psw.addr) /* I'd like to reject addresses without the high order bit but older gdb's rely on it */ @@ -250,7 +250,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data) * access registers are stored in the thread structure */ offset = addr - (addr_t) &dummy->regs.acrs; -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT /* * Very special case: old & broken 64 bit gdb writing * to acrs[15] with a 64 bit value. Ignore the lower @@ -357,7 +357,7 @@ do_ptrace_normal(struct task_struct *child, long request, long addr, long data) return ptrace_request(child, request, addr, data); } -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT /* * Now the fun part starts... a 31 bit program running in the * 31 bit emulation tracing another program. 
PTRACE_PEEKTEXT, @@ -629,7 +629,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data) return peek_user(child, addr, data); if (request == PTRACE_POKEUSR && addr == PT_IEEE_IP) return poke_user(child, addr, data); -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT if (request == PTRACE_PEEKUSR && addr == PT32_IEEE_IP && test_thread_flag(TIF_31BIT)) return peek_user_emu31(child, addr, data); @@ -695,7 +695,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data) /* Do requests that differ for 31/64 bit */ default: -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT if (test_thread_flag(TIF_31BIT)) return do_ptrace_emu31(child, request, addr, data); #endif diff --git a/arch/s390/kernel/reipl_diag.c b/arch/s390/kernel/reipl_diag.c index 83cb42bc0b76..1f33951ba439 100644 --- a/arch/s390/kernel/reipl_diag.c +++ b/arch/s390/kernel/reipl_diag.c @@ -26,7 +26,7 @@ void reipl_diag(void) " st %%r4,%0\n" " st %%r5,%1\n" ".section __ex_table,\"a\"\n" -#ifdef __s390x__ +#ifdef CONFIG_64BIT " .align 8\n" " .quad 0b, 0b\n" #else diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 31e7b19348b7..b03847d100d9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -427,7 +427,7 @@ setup_lowcore(void) __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + PAGE_SIZE; lc->current_task = (unsigned long) init_thread_union.thread_info.task; lc->thread_info = (unsigned long) &init_thread_union; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) { lc->extended_save_area_addr = (__u32) __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0); @@ -562,21 +562,21 @@ setup_arch(char **cmdline_p) /* * print what head.S has found out about the machine */ -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT printk((MACHINE_IS_VM) ? "We are running under VM (31 bit mode)\n" : "We are running native (31 bit mode)\n"); printk((MACHINE_HAS_IEEE) ? "This machine has an IEEE fpu\n" : "This machine has no IEEE fpu\n"); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ printk((MACHINE_IS_VM) ? "We are running under VM (64 bit mode)\n" : "We are running native (64 bit mode)\n"); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ ROOT_DEV = Root_RAM0; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT memory_end = memory_size & ~0x400000UL; /* align memory end to 4MB */ /* * We need some free virtual space to be able to do vmalloc. @@ -585,9 +585,9 @@ setup_arch(char **cmdline_p) */ if (memory_end > 1920*1024*1024) memory_end = 1920*1024*1024; -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ memory_end = memory_size & ~0x200000UL; /* detected in head.s */ -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ init_mm.start_code = PAGE_OFFSET; init_mm.end_code = (unsigned long) &_etext; diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 13592d00a10f..6ae4a77270b5 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -501,7 +501,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) if (signr > 0) { /* Whee! Actually deliver the signal. 
*/ -#ifdef CONFIG_S390_SUPPORT +#ifdef CONFIG_COMPAT if (test_thread_flag(TIF_31BIT)) { extern void handle_signal32(unsigned long sig, struct k_sigaction *ka, diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index bd5b311006be..e10f4ca00499 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -402,7 +402,7 @@ static void smp_ext_bitcall_others(ec_bit_sig sig) } } -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT /* * this function sends a 'purge tlb' signal to another CPU. */ @@ -416,7 +416,7 @@ void smp_ptlb_all(void) on_each_cpu(smp_ptlb_callback, NULL, 0, 1); } EXPORT_SYMBOL(smp_ptlb_all); -#endif /* ! CONFIG_ARCH_S390X */ +#endif /* ! CONFIG_64BIT */ /* * this function sends a 'reschedule' IPI to another CPU. @@ -783,7 +783,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (stack == 0ULL) panic("smp_boot_cpus failed to allocate memory\n"); lowcore_ptr[i]->panic_stack = stack + (PAGE_SIZE); -#ifndef __s390x__ +#ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) { lowcore_ptr[i]->extended_save_area_addr = (__u32) __get_free_pages(GFP_KERNEL,0); @@ -793,7 +793,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } #endif } -#ifndef __s390x__ +#ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) ctl_set_bit(14, 29); /* enable extended save area */ #endif diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index efe6b83b53f7..6a63553493c5 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -26,9 +26,7 @@ #include #include #include -#ifdef CONFIG_ARCH_S390X #include -#endif /* CONFIG_ARCH_S390X */ #include #include @@ -121,7 +119,7 @@ out: return error; } -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT struct sel_arg_struct { unsigned long n; fd_set *inp, *outp, *exp; @@ -138,7 +136,7 @@ asmlinkage long old_select(struct sel_arg_struct __user *arg) return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); } -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. 
@@ -211,7 +209,7 @@ asmlinkage long sys_ipc(uint call, int first, unsigned long second, return -EINVAL; } -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT asmlinkage long s390x_newuname(struct new_utsname __user *name) { int ret = sys_newuname(name); @@ -235,12 +233,12 @@ asmlinkage long s390x_personality(unsigned long personality) return ret; } -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ /* * Wrapper function for sys_fadvise64/fadvise64_64 */ -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asmlinkage long s390_fadvise64(int fd, u32 offset_high, u32 offset_low, size_t len, int advice) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index c5bd36fae56b..95d109968619 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -67,13 +67,13 @@ extern pgm_check_handler_t do_monitor_call; #define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; }) -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT #define FOURLONG "%08lx %08lx %08lx %08lx\n" static int kstack_depth_to_print = 12; -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ #define FOURLONG "%016lx %016lx %016lx %016lx\n" static int kstack_depth_to_print = 20; -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ /* * For show_trace we have tree different stack to consider: @@ -702,12 +702,12 @@ void __init trap_init(void) pgm_check_table[0x11] = &do_dat_exception; pgm_check_table[0x12] = &translation_exception; pgm_check_table[0x13] = &special_op_exception; -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT pgm_check_table[0x38] = &do_dat_exception; pgm_check_table[0x39] = &do_dat_exception; pgm_check_table[0x3A] = &do_dat_exception; pgm_check_table[0x3B] = &do_dat_exception; -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ pgm_check_table[0x15] = &operand_exception; pgm_check_table[0x1C] = &space_switch_exception; pgm_check_table[0x1D] = &hfp_sqrt_exception; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 89fdb3808bc0..9289face3027 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -5,7 +5,7 @@ #include #include -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") OUTPUT_ARCH(s390) ENTRY(_start) diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index b701efa1f00e..d9b97b3c597f 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -4,6 +4,5 @@ EXTRA_AFLAGS := -traditional -lib-y += delay.o string.o -lib-$(CONFIG_ARCH_S390_31) += uaccess.o spinlock.o -lib-$(CONFIG_ARCH_S390X) += uaccess64.o spinlock.o +lib-y += delay.o string.o spinlock.o +lib-y += $(if $(CONFIG_64BIT),uaccess64.o,uaccess.o) diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 2dc14e9c8327..68d79c502081 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -29,7 +29,7 @@ __setup("spin_retry=", spin_retry_setup); static inline void _diag44(void) { -#ifdef __s390x__ +#ifdef CONFIG_64BIT if (MACHINE_HAS_DIAG44) #endif asm volatile("diag 0,0,0x44"); diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 506a33b51e4f..a9566bcab682 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -143,7 +143,7 @@ dcss_diag (__u8 func, void *parameter, rx = (unsigned long) parameter; ry = (unsigned long) func; __asm__ __volatile__( -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT " sam31\n" // switch to 31 bit " diag %0,%1,0x64\n" " sam64\n" // switch back to 64 bit diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c 
index fb2607c369ed..81ade401b073 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -31,17 +31,17 @@ #include #include -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT #define __FAIL_ADDR_MASK 0x7ffff000 #define __FIXUP_MASK 0x7fffffff #define __SUBCODE_MASK 0x0200 #define __PF_RES_FIELD 0ULL -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ #define __FAIL_ADDR_MASK -4096L #define __FIXUP_MASK ~0L #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ #ifdef CONFIG_SYSCTL extern int sysctl_userprocess_debug; @@ -393,11 +393,11 @@ int pfault_init(void) "2:\n" ".section __ex_table,\"a\"\n" " .align 4\n" -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT " .long 0b,1b\n" -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ " .quad 0b,1b\n" -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ ".previous" : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc" ); __ctl_set_bit(0, 9); @@ -417,11 +417,11 @@ void pfault_fini(void) "0:\n" ".section __ex_table,\"a\"\n" " .align 4\n" -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT " .long 0b,0b\n" -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ " .quad 0b,0b\n" -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ ".previous" : : "a" (&refbk), "m" (refbk) : "cc" ); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 6ec5cd981e74..df953383724d 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -44,7 +44,7 @@ void diag10(unsigned long addr) { if (addr >= 0x7ff00000) return; -#ifdef __s390x__ +#ifdef CONFIG_64BIT asm volatile ( " sam31\n" " diag %0,%0,0x10\n" @@ -106,7 +106,7 @@ extern unsigned long __initdata zholes_size[]; * paging_init() sets up the page tables */ -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT void __init paging_init(void) { pgd_t * pg_dir; @@ -175,7 +175,7 @@ void __init paging_init(void) return; } -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ void __init paging_init(void) { pgd_t * pg_dir; @@ -256,7 +256,7 @@ void __init paging_init(void) return; } -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ void __init mem_init(void) { diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index fb187e5a54b4..356257c171de 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -50,7 +50,7 @@ static inline unsigned long mmap_base(void) static inline int mmap_is_legacy(void) { -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT /* * Force standard allocation for 64 bit programs. */ diff --git a/block/Kconfig b/block/Kconfig index eb48edb80c1d..377f6dd20e17 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,7 +5,7 @@ #for instance. config LBD bool "Support for Large Block Devices" - depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML + depends on X86 || (MIPS && 32BIT) || PPC32 || (S390 && !64BIT) || SUPERH || UML help Say Y here if you want to attach large (bigger than 2TB) discs to your machine, or if you want to have a raid or loopback device diff --git a/crypto/Kconfig b/crypto/Kconfig index c696f7ab729e..52e1d4108a99 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -42,7 +42,7 @@ config CRYPTO_SHA1 config CRYPTO_SHA1_S390 tristate "SHA1 digest algorithm (s390)" - depends on CRYPTO && ARCH_S390 + depends on CRYPTO && S390 help This is the s390 hardware accelerated implementation of the SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2). 
@@ -58,7 +58,7 @@ config CRYPTO_SHA256 config CRYPTO_SHA256_S390 tristate "SHA256 digest algorithm (s390)" - depends on CRYPTO && ARCH_S390 + depends on CRYPTO && S390 help This is the s390 hardware accelerated implementation of the SHA256 secure hash standard (DFIPS 180-2). @@ -111,7 +111,7 @@ config CRYPTO_DES config CRYPTO_DES_S390 tristate "DES and Triple DES cipher algorithms (s390)" - depends on CRYPTO && ARCH_S390 + depends on CRYPTO && S390 help DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3). @@ -217,7 +217,7 @@ config CRYPTO_AES_X86_64 config CRYPTO_AES_S390 tristate "AES cipher algorithms (s390)" - depends on CRYPTO && ARCH_S390 + depends on CRYPTO && S390 help This is the s390 hardware accelerated implementation of the AES cipher algorithms (FIPS-197). AES uses the Rijndael diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 84e68cdd451b..5ebd06b1b4ca 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -985,7 +985,7 @@ config HPET_MMAP config HANGCHECK_TIMER tristate "Hangcheck timer" - depends on X86 || IA64 || PPC64 || ARCH_S390 + depends on X86 || IA64 || PPC64 || S390 help The hangcheck-timer module detects when the system has gone out to lunch past a certain margin. It can reboot the system diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index 66e53dd450ff..40a67c86420c 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -120,7 +120,7 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #if defined(CONFIG_X86) # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL -#elif defined(CONFIG_ARCH_S390) +#elif defined(CONFIG_S390) /* FA240000 is 1 Second in the IBM time universe (Page 4-38 Principles of Op for zSeries */ # define TIMER_FREQ 0xFA240000ULL #elif defined(CONFIG_IA64) diff --git a/drivers/char/watchdog/Kconfig b/drivers/char/watchdog/Kconfig index 344001b45af9..a6544790af60 100644 --- a/drivers/char/watchdog/Kconfig +++ b/drivers/char/watchdog/Kconfig @@ -438,7 +438,7 @@ config INDYDOG config ZVM_WATCHDOG tristate "z/VM Watchdog Timer" - depends on WATCHDOG && ARCH_S390 + depends on WATCHDOG && S390 help IBM s/390 and zSeries machines running under z/VM 5.1 or later provide a virtual watchdog timer to their guest that cause a diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 9f2352bd8348..a1e660e3531d 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -157,7 +157,7 @@ struct input_event_compat { # define COMPAT_TEST test_thread_flag(TIF_IA32) #elif defined(CONFIG_IA64) # define COMPAT_TEST IS_IA32_PROCESS(ia64_task_regs(current)) -#elif defined(CONFIG_ARCH_S390) +#elif defined(CONFIG_S390) # define COMPAT_TEST test_thread_flag(TIF_31BIT) #elif defined(CONFIG_MIPS) # define COMPAT_TEST (current->thread.mflags & MF_32BIT_ADDR) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index c782a6329805..fa39b944bc46 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -6,7 +6,7 @@ menu "PHY device support" config PHYLIB tristate "PHY Device support and infrastructure" - depends on NET_ETHERNET && (BROKEN || !ARCH_S390) + depends on NET_ETHERNET && (BROKEN || !S390) help Ethernet controllers are usually attached to PHY devices. 
This option provides infrastructure for diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 6e7d7b06421d..6f50cc9323d9 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -1,11 +1,11 @@ -if ARCH_S390 +if S390 comment "S/390 block device drivers" - depends on ARCH_S390 + depends on S390 config BLK_DEV_XPRAM tristate "XPRAM disk support" - depends on ARCH_S390 + depends on S390 help Select this option if you want to use your expanded storage on S/390 or zSeries as a disk. This is useful as a _fast_ swap device if you @@ -49,7 +49,7 @@ config DASD_FBA config DASD_DIAG tristate "Support for DIAG access to Disks" - depends on DASD && ( ARCH_S390X = 'n' || EXPERIMENTAL) + depends on DASD && ( 64BIT = 'n' || EXPERIMENTAL) help Select this option if you want to use Diagnose250 command to access Disks under VM. If you are not running under VM or unsure what it is, diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 1141a5963b67..041e1a621885 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -604,7 +604,7 @@ dasd_smalloc_request(char *magic, int cplength, int datasize, void dasd_kfree_request(struct dasd_ccw_req * cqr, struct dasd_device * device) { -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT struct ccw1 *ccw; /* Clear any idals used for the request. */ diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index a33d4064b537..ba80fdea7ebf 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -75,7 +75,7 @@ dia250(void *iob, int cmd) int rc; __asm__ __volatile__( -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT " lghi %0,3\n" " lgr 0,%3\n" " diag 0,%2,0x250\n" diff --git a/drivers/s390/block/dasd_diag.h b/drivers/s390/block/dasd_diag.h index 37edf6e91715..a4f80bd735f1 100644 --- a/drivers/s390/block/dasd_diag.h +++ b/drivers/s390/block/dasd_diag.h @@ -45,7 +45,7 @@ struct dasd_diag_characteristics { } __attribute__ ((packed, aligned(4))); -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT #define DASD_DIAG_FLAGA_DEFAULT DASD_DIAG_FLAGA_FORMAT_64BIT typedef u64 blocknum_t; @@ -86,7 +86,7 @@ struct dasd_diag_rw_io { struct dasd_diag_bio *bio_list; u8 spare4[8]; } __attribute__ ((packed, aligned(8))); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ #define DASD_DIAG_FLAGA_DEFAULT 0x0 typedef u32 blocknum_t; @@ -125,4 +125,4 @@ struct dasd_diag_rw_io { u32 interrupt_params; u8 spare3[20]; } __attribute__ ((packed, aligned(8))); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index efc4cf62496e..96eb48258580 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1041,7 +1041,7 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req) /* Eckd can only do full blocks. */ return ERR_PTR(-EINVAL); count += bv->bv_len >> (device->s2b_shift + 9); -#if defined(CONFIG_ARCH_S390X) +#if defined(CONFIG_64BIT) if (idal_is_needed (page_address(bv->bv_page), bv->bv_len)) cidaw += bv->bv_len >> (device->s2b_shift + 9); diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 9bac8d87a9cc..8ec75dc08e2c 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -271,7 +271,7 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req) /* Fba can only do full blocks. 
*/ return ERR_PTR(-EINVAL); count += bv->bv_len >> (device->s2b_shift + 9); -#if defined(CONFIG_ARCH_S390X) +#if defined(CONFIG_64BIT) if (idal_is_needed (page_address(bv->bv_page), bv->bv_len)) cidaw += bv->bv_len / blksize; diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index d428c909b8a0..bf3a67c3cc5e 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -160,7 +160,7 @@ static int xpram_page_in (unsigned long page_addr, unsigned int xpage_index) "0: ipm %0\n" " srl %0,28\n" "1:\n" -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT ".section __ex_table,\"a\"\n" " .align 4\n" " .long 0b,1b\n" @@ -208,7 +208,7 @@ static long xpram_page_out (unsigned long page_addr, unsigned int xpage_index) "0: ipm %0\n" " srl %0,28\n" "1:\n" -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT ".section __ex_table,\"a\"\n" " .align 4\n" " .long 0b,1b\n" diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c index 5473c23fcb52..5acc0ace3d7d 100644 --- a/drivers/s390/char/vmwatchdog.c +++ b/drivers/s390/char/vmwatchdog.c @@ -66,7 +66,7 @@ static int __diag288(enum vmwdt_func func, unsigned int timeout, __cmdl = len; err = 0; asm volatile ( -#ifdef __s390x__ +#ifdef CONFIG_64BIT "diag %2,%4,0x288\n" "1: \n" ".section .fixup,\"ax\"\n" diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 6f274f4f92eb..7376bc87206d 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -195,7 +195,7 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */ sch->orb.spnd = sch->options.suspend; sch->orb.ssic = sch->options.suspend && sch->options.inter; sch->orb.lpm = (lpm != 0) ? (lpm & sch->opm) : sch->lpm; -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT /* * for 64 bit we always support 64 bit IDAWs with 4k page size only */ diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c index 3c77c3fd461d..04ceba343db8 100644 --- a/drivers/s390/cio/device_id.c +++ b/drivers/s390/cio/device_id.c @@ -27,7 +27,7 @@ /* * diag210 is used under VM to get information about a virtual device */ -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT int diag210(struct diag210 * addr) { diff --git a/drivers/s390/cio/ioasm.h b/drivers/s390/cio/ioasm.h index 62b0e2ad507f..95a9462f9a91 100644 --- a/drivers/s390/cio/ioasm.h +++ b/drivers/s390/cio/ioasm.h @@ -50,7 +50,7 @@ static inline int stsch_err(struct subchannel_id schid, "0: ipm %0\n" " srl %0,28\n" "1:\n" -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT ".section __ex_table,\"a\"\n" " .align 8\n" " .quad 0b,1b\n" @@ -95,7 +95,7 @@ static inline int msch_err(struct subchannel_id schid, "0: ipm %0\n" " srl %0,28\n" "1:\n" -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT ".section __ex_table,\"a\"\n" " .align 8\n" " .quad 0b,1b\n" diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c index 035c77af9cd3..30a836ffc31f 100644 --- a/drivers/s390/cio/qdio.c +++ b/drivers/s390/cio/qdio.c @@ -2394,7 +2394,7 @@ tiqdio_check_chsc_availability(void) sprintf(dbf_text,"hydrati%1x", hydra_thinints); QDIO_DBF_TEXT0(0,setup,dbf_text); -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT /* Check for QEBSM support in general (bit 58). 
*/ is_passthrough = css_general_characteristics.qebsm; #endif diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 43b840af5300..fa385e761fe1 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -271,7 +271,7 @@ static inline int do_sqbs(unsigned long sch, unsigned char state, int queue, unsigned int *start, unsigned int *count) { -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT register unsigned long _ccq asm ("0") = *count; register unsigned long _sch asm ("1") = sch; unsigned long _queuestart = ((unsigned long)queue << 32) | *start; @@ -295,7 +295,7 @@ static inline int do_eqbs(unsigned long sch, unsigned char *state, int queue, unsigned int *start, unsigned int *count) { -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT register unsigned long _ccq asm ("0") = *count; register unsigned long _sch asm ("1") = sch; unsigned long _queuestart = ((unsigned long)queue << 32) | *start; @@ -323,7 +323,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2) { int cc; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asm volatile ( "lhi 0,2 \n\t" "lr 1,%1 \n\t" @@ -336,7 +336,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2) : "d" (schid), "d" (mask1), "d" (mask2) : "cc", "0", "1", "2", "3" ); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ asm volatile ( "lghi 0,2 \n\t" "llgfr 1,%1 \n\t" @@ -349,7 +349,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2) : "d" (schid), "d" (mask1), "d" (mask2) : "cc", "0", "1", "2", "3" ); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ return cc; } @@ -358,7 +358,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask) { int cc; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asm volatile ( "lhi 0,1 \n\t" "lr 1,%1 \n\t" @@ -370,7 +370,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask) : "d" (schid), "d" (mask) : "cc", "0", "1", "2", "memory" ); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ asm volatile ( "lghi 0,1 \n\t" "llgfr 1,%1 \n\t" @@ -382,7 +382,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask) : "d" (schid), "d" (mask) : "cc", "0", "1", "2", "memory" ); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ return cc; } @@ -394,7 +394,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb, int cc; __u32 busy_bit; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asm volatile ( "lhi 0,0 \n\t" "lr 1,%2 \n\t" @@ -424,7 +424,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb, "i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION) : "cc", "0", "1", "2", "memory" ); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ asm volatile ( "llgfr 0,%5 \n\t" "lgr 1,%2 \n\t" @@ -449,7 +449,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb, "i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION), "d" (fc) : "cc", "0", "1", "2", "memory" ); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ (*bb) = busy_bit; return cc; @@ -461,21 +461,21 @@ do_clear_global_summary(void) unsigned long time; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT asm volatile ( "lhi 1,3 \n\t" ".insn rre,0xb2650000,2,0 \n\t" "lr %0,3 \n\t" : "=d" (time) : : "cc", "1", "2", "3" ); -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ asm volatile ( "lghi 1,3 \n\t" ".insn rre,0xb2650000,2,0 \n\t" "lgr %0,3 \n\t" : "=d" (time) : : "cc", "1", "2", "3" ); -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ return time; } @@ -542,11 +542,11 @@ struct 
qdio_perf_stats { #define MY_MODULE_STRING(x) #x -#ifdef CONFIG_ARCH_S390X +#ifdef CONFIG_64BIT #define QDIO_GET_ADDR(x) ((__u32)(unsigned long)x) -#else /* CONFIG_ARCH_S390X */ +#else /* CONFIG_64BIT */ #define QDIO_GET_ADDR(x) ((__u32)(long)x) -#endif /* CONFIG_ARCH_S390X */ +#endif /* CONFIG_64BIT */ struct qdio_q { volatile struct slsb slsb; diff --git a/drivers/s390/crypto/z90hardware.c b/drivers/s390/crypto/z90hardware.c index 7c3ed52e03e1..d7f7494a0cbe 100644 --- a/drivers/s390/crypto/z90hardware.c +++ b/drivers/s390/crypto/z90hardware.c @@ -785,7 +785,7 @@ testq(int q_nr, int *q_depth, int *dev_type, struct ap_status_word *stat) int ccode; asm volatile -#ifdef __s390x__ +#ifdef CONFIG_64BIT (" llgfr 0,%4 \n" " slgr 1,1 \n" " lgr 2,1 \n" @@ -855,7 +855,7 @@ resetq(int q_nr, struct ap_status_word *stat_p) int ccode; asm volatile -#ifdef __s390x__ +#ifdef CONFIG_64BIT (" llgfr 0,%2 \n" " lghi 1,1 \n" " sll 1,24 \n" @@ -921,7 +921,7 @@ sen(int msg_len, unsigned char *msg_ext, struct ap_status_word *stat) int ccode; asm volatile -#ifdef __s390x__ +#ifdef CONFIG_64BIT (" lgr 6,%3 \n" " llgfr 7,%2 \n" " llgt 0,0(6) \n" @@ -1000,7 +1000,7 @@ rec(int q_nr, int buff_l, unsigned char *rsp, unsigned char *id, int ccode; asm volatile -#ifdef __s390x__ +#ifdef CONFIG_64BIT (" llgfr 0,%2 \n" " lgr 3,%4 \n" " lgr 6,%3 \n" diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index a7efc394515e..548854754921 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -1,5 +1,5 @@ menu "S/390 network device drivers" - depends on NETDEVICES && ARCH_S390 + depends on NETDEVICES && S390 config LCS tristate "Lan Channel Station Interface" diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c index 6b63d21612ec..e70af7f39946 100644 --- a/drivers/s390/net/claw.c +++ b/drivers/s390/net/claw.c @@ -1603,7 +1603,7 @@ dumpit(char* buf, int len) __u32 ct, sw, rm, dup; char *ptr, *rptr; char tbuf[82], tdup[82]; -#if (CONFIG_ARCH_S390X) +#if (CONFIG_64BIT) char addr[22]; #else char addr[12]; @@ -1619,7 +1619,7 @@ dumpit(char* buf, int len) dup = 0; for ( ct=0; ct < len; ct++, ptr++, rptr++ ) { if (sw == 0) { -#if (CONFIG_ARCH_S390X) +#if (CONFIG_64BIT) sprintf(addr, "%16.16lX",(unsigned long)rptr); #else sprintf(addr, "%8.8X",(__u32)rptr); @@ -1634,7 +1634,7 @@ dumpit(char* buf, int len) if (sw == 8) { strcat(bhex, " "); } -#if (CONFIG_ARCH_S390X) +#if (CONFIG_64BIT) sprintf(tbuf,"%2.2lX", (unsigned long)*ptr); #else sprintf(tbuf,"%2.2X", (__u32)*ptr); diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c index 7dad597ff86e..3bf466603512 100644 --- a/drivers/s390/s390mach.c +++ b/drivers/s390/s390mach.c @@ -246,7 +246,7 @@ s390_revalidate_registers(struct mci *mci) */ kill_task = 1; -#ifndef __s390x__ +#ifndef CONFIG_64BIT asm volatile("ld 0,0(%0)\n" "ld 2,8(%0)\n" "ld 4,16(%0)\n" @@ -255,7 +255,7 @@ s390_revalidate_registers(struct mci *mci) #endif if (MACHINE_HAS_IEEE) { -#ifdef __s390x__ +#ifdef CONFIG_64BIT fpt_save_area = &S390_lowcore.floating_pt_save_area; fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area; #else @@ -314,7 +314,7 @@ s390_revalidate_registers(struct mci *mci) */ s390_handle_damage("invalid control registers."); else -#ifdef __s390x__ +#ifdef CONFIG_64BIT asm volatile("lctlg 0,15,0(%0)" : : "a" (&S390_lowcore.cregs_save_area)); #else @@ -327,7 +327,7 @@ s390_revalidate_registers(struct mci *mci) * can't write something sensible into that register. 
*/ -#ifdef __s390x__ +#ifdef CONFIG_64BIT /* * See if we can revalidate the TOD programmable register with its * old contents (should be zero) otherwise set it to zero. @@ -384,7 +384,7 @@ s390_do_machine_check(struct pt_regs *regs) if (mci->b) { /* Processing backup -> verify if we can survive this */ u64 z_mcic, o_mcic, t_mcic; -#ifdef __s390x__ +#ifdef CONFIG_64BIT z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29); o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | diff --git a/drivers/s390/sysinfo.c b/drivers/s390/sysinfo.c index 87c2db1bd4f5..66da840c9316 100644 --- a/drivers/s390/sysinfo.c +++ b/drivers/s390/sysinfo.c @@ -106,7 +106,7 @@ static inline int stsi (void *sysinfo, { int cc, retv; -#ifndef CONFIG_ARCH_S390X +#ifndef CONFIG_64BIT __asm__ __volatile__ ( "lr\t0,%2\n" "\tlr\t1,%3\n" "\tstsi\t0(%4)\n" diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 4c42065dea88..9e8254f0256c 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1815,7 +1815,7 @@ config SCSI_SUNESP config ZFCP tristate "FCP host bus adapter driver for IBM eServer zSeries" - depends on ARCH_S390 && QDIO && SCSI + depends on S390 && QDIO && SCSI select SCSI_FC_ATTRS help If you want to access SCSI devices attached to your IBM eServer diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index 656bc43431b9..e227a04261ab 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -85,7 +85,7 @@ config ATARI_PARTITION config IBM_PARTITION bool "IBM disk label and partition support" - depends on PARTITION_ADVANCED && ARCH_S390 + depends on PARTITION_ADVANCED && S390 help Say Y here if you would like to be able to read the hard disk partition table format used by IBM DASD disks operating under CMS. diff --git a/fs/proc/array.c b/fs/proc/array.c index 3e1239e4b303..5e9251f65317 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer) buffer = task_sig(task, buffer); buffer = task_cap(task, buffer); buffer = cpuset_task_status_allowed(task, buffer); -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) buffer = task_show_regs(task, buffer); #endif return buffer - orig; diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h index f97d92691f17..2861cdc243ad 100644 --- a/include/asm-s390/unistd.h +++ b/include/asm-s390/unistd.h @@ -539,7 +539,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -# ifdef CONFIG_ARCH_S390_31 +# ifndef CONFIG_64BIT # define __ARCH_WANT_STAT64 # define __ARCH_WANT_SYS_TIME # endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 60f8bc78a35a..6c5d4c898ccb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -12,7 +12,7 @@ #include #include -#if !defined(CONFIG_ARCH_S390) +#if !defined(CONFIG_S390) #include #include diff --git a/init/Kconfig b/init/Kconfig index 24e0f7c756c0..ba42f3793a84 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -189,7 +189,7 @@ config AUDIT config AUDITSYSCALL bool "Enable system-call auditing support" - depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML || SPARC64) + depends on AUDIT && (X86 || PPC || PPC64 || S390 || IA64 || UML || SPARC64) default y if SECURITY_SELINUX help Enable low-overhead system-call auditing infrastructure that diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index c10b08a80982..c2683fcd792d 100644 --- 
a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -145,7 +145,7 @@ int __init rd_load_image(char *from) int nblocks, i, disk; char *buf = NULL; unsigned short rotate = 0; -#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES) +#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES) char rotator[4] = { '|' , '/' , '-' , '\\' }; #endif @@ -237,7 +237,7 @@ int __init rd_load_image(char *from) } sys_read(in_fd, buf, BLOCK_SIZE); sys_write(out_fd, buf, BLOCK_SIZE); -#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES) +#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES) if (!(i % 16)) { printk("%c\b", rotator[rotate & 0x3]); rotate++; diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f86fa3f..c5c4ab255834 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) long i; static char buf[1024]; va_list args; -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif @@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); } #endif -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) disabled_wait(caller); #endif local_irq_enable(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 345f4a1d533f..a85047bb5739 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,7 +108,7 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU extern int sysctl_ieee_emulation_warnings; #endif @@ -542,7 +542,7 @@ static ctl_table kern_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { .ctl_name = KERN_IEEE_EMULATION_WARNINGS, @@ -644,7 +644,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) { .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1cedc2356b78..80598cfd728c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -32,7 +32,7 @@ config MAGIC_SYSRQ config LOG_BUF_SHIFT int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL range 12 21 - default 17 if ARCH_S390 + default 17 if S390 default 16 if X86_NUMAQ || IA64 default 15 if SMP default 14 -- cgit v1.2.3 From 0aa7c6990e7de06131cdc14ef4abfcab017c24a0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:34 -0800 Subject: [PATCH] fuse: clean up fuse_lookup() Simplify fuse_lookup() and related functions. 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 75 ++++++++++++++++++----------------------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 51f5da652771..0d1438a9dab3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -13,7 +13,6 @@ #include #include #include -#include static inline unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) @@ -22,6 +21,13 @@ static inline unsigned long time_to_jiffies(unsigned long sec, return jiffies + timespec_to_jiffies(&ts); } +static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) +{ + struct fuse_inode *fi = get_fuse_inode(entry->d_inode); + entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); + fi->i_time = time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, struct dentry *entry, struct fuse_entry_out *outarg) @@ -66,10 +72,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fuse_change_attributes(inode, &outarg.attr); - entry->d_time = time_to_jiffies(outarg.entry_valid, - outarg.entry_valid_nsec); - fi->i_time = time_to_jiffies(outarg.attr_valid, - outarg.attr_valid_nsec); + fuse_change_timeout(entry, &outarg); } return 1; } @@ -96,8 +99,8 @@ static struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; -static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, - struct inode **inodep) +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) { int err; struct fuse_entry_out outarg; @@ -106,11 +109,11 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, struct fuse_req *req; if (entry->d_name.len > FUSE_NAME_MAX) - return -ENAMETOOLONG; + return ERR_PTR(-ENAMETOOLONG); req = fuse_get_request(fc); if (!req) - return -EINTR; + return ERR_PTR(-EINTR); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -122,24 +125,22 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, &outarg.attr); if (!inode) { fuse_send_forget(fc, req, outarg.nodeid, 1); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } } fuse_put_request(fc, req); if (err && err != -ENOENT) - return err; + return ERR_PTR(err); - if (inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - entry->d_time = time_to_jiffies(outarg.entry_valid, - outarg.entry_valid_nsec); - fi->i_time = time_to_jiffies(outarg.attr_valid, - outarg.attr_valid_nsec); + if (inode && dir_alias(inode)) { + iput(inode); + return ERR_PTR(-EIO); } - + d_add(entry, inode); entry->d_op = &fuse_dentry_operations; - *inodep = inode; - return 0; + if (inode) + fuse_change_timeout(entry, &outarg); + return NULL; } void fuse_invalidate_attr(struct inode *inode) @@ -163,7 +164,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct fuse_open_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; - struct fuse_inode *fi; struct fuse_file *ff; struct file *file; int flags = nd->intent.open.flags - 1; @@ -224,13 +224,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, goto out_put_request; } fuse_put_request(fc, req); - entry->d_time = time_to_jiffies(outentry.entry_valid, - outentry.entry_valid_nsec); - fi = get_fuse_inode(inode); - fi->i_time = time_to_jiffies(outentry.attr_valid, - 
outentry.attr_valid_nsec); - d_instantiate(entry, inode); + fuse_change_timeout(entry, &outentry); file = lookup_instantiate_filp(nd, entry, generic_file_open); if (IS_ERR(file)) { ff->fh = outopen.fh; @@ -254,7 +249,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, { struct fuse_entry_out outarg; struct inode *inode; - struct fuse_inode *fi; int err; req->in.h.nodeid = get_node_id(dir); @@ -286,14 +280,8 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, return -EIO; } - entry->d_time = time_to_jiffies(outarg.entry_valid, - outarg.entry_valid_nsec); - - fi = get_fuse_inode(inode); - fi->i_time = time_to_jiffies(outarg.attr_valid, - outarg.attr_valid_nsec); - d_instantiate(entry, inode); + fuse_change_timeout(entry, &outarg); fuse_invalidate_attr(dir); return 0; } @@ -883,23 +871,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, return err; } -static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) -{ - struct inode *inode; - int err; - - err = fuse_lookup_iget(dir, entry, &inode); - if (err) - return ERR_PTR(err); - if (inode && dir_alias(inode)) { - iput(inode); - return ERR_PTR(-EIO); - } - d_add(entry, inode); - return NULL; -} - static int fuse_setxattr(struct dentry *entry, const char *name, const void *value, size_t size, int flags) { -- cgit v1.2.3 From 4633a22e7added835fd1d4b072dbcc4474aa3017 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:36 -0800 Subject: [PATCH] fuse: clean up page offset calculation Use page_offset() instead of doing page offset calculation by hand. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2ca86141d13a..18aafa6c9af4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -272,7 +272,6 @@ static int fuse_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT; struct fuse_req *req = fuse_get_request(fc); int err = -EINTR; if (!req) @@ -281,7 +280,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->out.page_zeroing = 1; req->num_pages = 1; req->pages[0] = page; - fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE); + fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE); err = req->out.h.error; fuse_put_request(fc, req); if (!err) @@ -295,7 +294,7 @@ static int fuse_readpage(struct file *file, struct page *page) static int fuse_send_readpages(struct fuse_req *req, struct file *file, struct inode *inode) { - loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT; + loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; unsigned i; req->out.page_zeroing = 1; @@ -402,7 +401,7 @@ static int fuse_commit_write(struct file *file, struct page *page, unsigned count = to - offset; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset; + loff_t pos = page_offset(page) + offset; struct fuse_req *req = fuse_get_request(fc); if (!req) return -EINTR; -- cgit v1.2.3 From 45714d65618407bce1fd0271bc58303ce14b0785 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:36 -0800 Subject: [PATCH] fuse: bump interface 
version Change interface version to 7.4. Following changes will need backward compatibility support, so store the minor version returned by userspace. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 2 ++ fs/fuse/fuse_i.h | 3 +++ include/linux/fuse.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8f873e621f41..e5bc3f8eebd0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -178,6 +178,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION) fc->conn_error = 1; + fc->minor = req->misc.init_in_out.minor; + /* After INIT reply is received other requests can go out. So do (FUSE_MAX_OUTSTANDING - 1) number of up()s on outstanding_sem. The last up() is done in diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0ea5301f86be..2d4835e54c90 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,9 @@ struct fuse_conn { /** Is create not implemented by fs? */ unsigned no_create : 1; + /** Negotiated minor version */ + unsigned minor; + /** Backing dev info */ struct backing_dev_info bdi; }; diff --git a/include/linux/fuse.h b/include/linux/fuse.h index b76b558b03d4..3c85f1a422cc 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -14,7 +14,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 3 +#define FUSE_KERNEL_MINOR_VERSION 4 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 -- cgit v1.2.3 From de5f12025572ef8fcffa4be5453061725acfb754 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:37 -0800 Subject: [PATCH] fuse: add frsize to statfs reply Add 'frsize' member to the statfs reply. I'm not sure if sending f_fsid will ever be needed, but just in case leave some space at the end of the structure, so less compatibility mess would be required. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/inode.c | 5 ++++- include/linux/fuse.h | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e69a546844d0..3b928a02af04 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -218,6 +218,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr { stbuf->f_type = FUSE_SUPER_MAGIC; stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; stbuf->f_blocks = attr->blocks; stbuf->f_bfree = attr->bfree; stbuf->f_bavail = attr->bavail; @@ -238,10 +239,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) if (!req) return -EINTR; + memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; req->in.h.opcode = FUSE_STATFS; req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); + req->out.args[0].size = + fc->minor < 4 ? 
FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); req->out.args[0].value = &outarg; request_send(fc, req); err = req->out.h.error; diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 3c85f1a422cc..9d5177c356cc 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -53,6 +53,9 @@ struct fuse_kstatfs { __u64 ffree; __u32 bsize; __u32 namelen; + __u32 frsize; + __u32 padding; + __u32 spare[6]; }; #define FATTR_MODE (1 << 0) @@ -213,6 +216,8 @@ struct fuse_write_out { __u32 padding; }; +#define FUSE_COMPAT_STATFS_SIZE 48 + struct fuse_statfs_out { struct fuse_kstatfs st; }; -- cgit v1.2.3 From 8cbdf1e6f6876b37d2a0d96fd15ea9f90f7d51c1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:38 -0800 Subject: [PATCH] fuse: support caching negative dentries Add support for caching negative dentries. Up till now, ->d_revalidate() always forced a new lookup on these. Now let the lookup method return a zero node ID (not used for anything else) meaning a negative entry, but with a positive cache timeout. The old way of signaling negative entry (replying ENOENT) still works. Userspace should check the ABI minor version to see whether sending a zero ID is allowed by the kernel or not. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 64 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0d1438a9dab3..4c127f2bc814 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -23,9 +23,26 @@ static inline unsigned long time_to_jiffies(unsigned long sec, static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) { - struct fuse_inode *fi = get_fuse_inode(entry->d_inode); entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); - fi->i_time = time_to_jiffies(o->attr_valid, o->attr_valid_nsec); + if (entry->d_inode) + get_fuse_inode(entry->d_inode)->i_time = + time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +void fuse_invalidate_attr(struct inode *inode) +{ + get_fuse_inode(inode)->i_time = jiffies - 1; +} + +static void fuse_invalidate_entry_cache(struct dentry *entry) +{ + entry->d_time = jiffies - 1; +} + +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); } static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, @@ -45,15 +62,22 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { - if (!entry->d_inode || is_bad_inode(entry->d_inode)) + struct inode *inode = entry->d_inode; + + if (inode && is_bad_inode(inode)) return 0; else if (time_after(jiffies, entry->d_time)) { int err; struct fuse_entry_out outarg; - struct inode *inode = entry->d_inode; - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_conn *fc; + struct fuse_req *req; + + fuse_invalidate_entry_cache(entry); + if (!inode) + return 0; + + fc = get_fuse_conn(inode); + req = fuse_get_request(fc); if (!req) return 0; @@ -61,6 +85,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) request_send(fc, req); err = req->out.h.error; if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_send_forget(fc, req, outarg.nodeid, 1); return 0; @@ -118,9 
+143,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; - if (!err && invalid_nodeid(outarg.nodeid)) + if (!err && outarg.nodeid && invalid_nodeid(outarg.nodeid)) err = -EIO; - if (!err) { + if (!err && outarg.nodeid) { inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr); if (!inode) { @@ -138,22 +163,13 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, } d_add(entry, inode); entry->d_op = &fuse_dentry_operations; - if (inode) + if (!err) fuse_change_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); return NULL; } -void fuse_invalidate_attr(struct inode *inode) -{ - get_fuse_inode(inode)->i_time = jiffies - 1; -} - -static void fuse_invalidate_entry(struct dentry *entry) -{ - d_invalidate(entry); - entry->d_time = jiffies - 1; -} - static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct nameidata *nd) { @@ -387,6 +403,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) inode->i_nlink = 0; fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -412,6 +429,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (!err) { entry->d_inode->i_nlink = 0; fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -447,6 +465,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); + + /* newent will end up negative */ + if (newent->d_inode) + fuse_invalidate_entry_cache(newent); } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation -- cgit v1.2.3 From 6f9f11806af8ad3a107714a3ece56c1c4fafd047 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:39 -0800 Subject: [PATCH] fuse: add code documentation Document some not-so-trivial functions. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4c127f2bc814..fead7f49e2ca 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,15 @@ #include #include +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_time and fuse_inode->i_time respectively. 
+ */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ static inline unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) { @@ -21,6 +30,10 @@ static inline unsigned long time_to_jiffies(unsigned long sec, return jiffies + timespec_to_jiffies(&ts); } +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) { entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); @@ -29,16 +42,32 @@ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ void fuse_invalidate_attr(struct inode *inode) { get_fuse_inode(inode)->i_time = jiffies - 1; } +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ static void fuse_invalidate_entry_cache(struct dentry *entry) { entry->d_time = jiffies - 1; } +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ static void fuse_invalidate_entry(struct dentry *entry) { d_invalidate(entry); @@ -60,6 +89,15 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, req->out.args[0].value = outarg; } +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + * positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. + */ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { struct inode *inode = entry->d_inode; @@ -72,6 +110,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) struct fuse_conn *fc; struct fuse_req *req; + /* Doesn't hurt to "reset" the validity timeout */ fuse_invalidate_entry_cache(entry); if (!inode) return 0; @@ -102,10 +141,13 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 1; } +/* + * Check if there's already a hashed alias of this directory inode. + * If yes, then lookup and mkdir must not create a new alias. + */ static int dir_alias(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { - /* Don't allow creating an alias to a directory */ struct dentry *alias = d_find_alias(inode); if (alias) { dput(alias); @@ -170,6 +212,12 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, return NULL; } +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. 
+ */ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct nameidata *nd) { @@ -236,6 +284,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); ff->fh = outopen.fh; + /* Special release, with inode = NULL, this will + trigger a 'forget' request when the release is + complete */ fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0); goto out_put_request; } @@ -259,6 +310,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, return err; } +/* + * Code shared between mknod, mkdir, symlink and link + */ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct inode *dir, struct dentry *entry, int mode) @@ -576,6 +630,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) return 0; } +/* + * Check whether the inode attributes are still valid + * + * If the attribute validity timeout has expired, then fetch the fresh + * attributes with a 'getattr' request + * + * I'm not sure why cached attributes are never returned for the root + * inode, this is probably being too cautious. + */ static int fuse_revalidate(struct dentry *entry) { struct inode *inode = entry->d_inode; @@ -623,6 +686,19 @@ static int fuse_access(struct inode *inode, int mask) return err; } +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * model. + * + * 2) "Remote" access checking, where the server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -641,14 +717,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) err = generic_permission(inode, mask, NULL); } - /* FIXME: Need some mechanism to revoke permissions: - currently if the filesystem suddenly changes the - file mode, we will not be informed about it, and - continue to allow access to the file/directory. - - This is actually not so grave, since the user can - simply keep access to the file/directory anyway by - keeping it open... */ + /* Note: the opposite of the above test does not + exist. So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ return err; } else { @@ -816,6 +888,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) } } +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case. So do the rlimit + * checking by hand and call vmtruncate() only after the file has + * actually been truncated. + */ static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = entry->d_inode; -- cgit v1.2.3 From 248d86e87d12da19eee602075f05a49a5215288b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:39 -0800 Subject: [PATCH] fuse: fail file operations on bad inode Make file operations on a bad inode fail.
This just makes things a bit more consistent. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 7 ++++++- fs/fuse/file.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fead7f49e2ca..9a6075de961f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -773,7 +773,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) struct page *page; struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_request(fc); if (!req) return -EINTR; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 18aafa6c9af4..c989f0e9456b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -163,6 +163,9 @@ static int fuse_flush(struct file *file) struct fuse_flush_in inarg; int err; + if (is_bad_inode(inode)) + return -EIO; + if (fc->no_flush) return 0; @@ -199,6 +202,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, struct fuse_fsync_in inarg; int err; + if (is_bad_inode(inode)) + return -EIO; + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; @@ -272,8 +278,15 @@ static int fuse_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - int err = -EINTR; + struct fuse_req *req; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = -EINTR; + req = fuse_get_request(fc); if (!req) goto out; @@ -344,6 +357,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_readpages_data data; int err; + + if (is_bad_inode(inode)) + return -EIO; + data.file = file; data.inode = inode; data.req = fuse_get_request(fc); @@ -402,7 +419,12 @@ static int fuse_commit_write(struct file *file, struct page *page, struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); loff_t pos = page_offset(page) + offset; - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_request(fc); if (!req) return -EINTR; @@ -474,7 +496,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; ssize_t res = 0; - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_request(fc); if (!req) return -EINTR; -- cgit v1.2.3 From 1d3d752b471d2a3a1d5e4fe177e5e7d52abb4e4c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:40 -0800 Subject: [PATCH] fuse: clean up request size limit checking Change the way a too large request is handled. Until now in this case the device read returned -EINVAL and the operation returned -EIO. Make it more flexible by not returning -EINVAL from the read, but restarting it instead. Also remove the fixed limit on setxattr data and let the filesystem provide as large a read buffer as it needs to handle the extended attribute data. The symbolic link length is already checked by VFS to be less than PATH_MAX, so the extra check against FUSE_SYMLINK_MAX is not needed.
The check in fuse_create_open() against FUSE_NAME_MAX is not needed, since the dentry has already been looked up, and hence the name already checked. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 26 ++++++++++++++++---------- fs/fuse/dir.c | 14 +------------- fs/fuse/fuse_i.h | 9 ++++++--- fs/fuse/inode.c | 2 +- include/linux/fuse.h | 8 ++------ 5 files changed, 26 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e5bc3f8eebd0..1afdffdf80db 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -617,6 +617,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; unsigned reqsize; + restart: spin_lock(&fuse_lock); fc = file->private_data; err = -EPERM; @@ -632,20 +633,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, req = list_entry(fc->pending.next, struct fuse_req, list); list_del_init(&req->list); - spin_unlock(&fuse_lock); in = &req->in; - reqsize = req->in.h.len; - fuse_copy_init(&cs, 1, req, iov, nr_segs); - err = -EINVAL; - if (iov_length(iov, nr_segs) >= reqsize) { - err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); - if (!err) - err = fuse_copy_args(&cs, in->numargs, in->argpages, - (struct fuse_arg *) in->args, 0); + reqsize = in->h.len; + /* If request is too large, reply with an error and restart the read */ + if (iov_length(iov, nr_segs) < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + request_end(fc, req); + goto restart; } + spin_unlock(&fuse_lock); + fuse_copy_init(&cs, 1, req, iov, nr_segs); + err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); + if (!err) + err = fuse_copy_args(&cs, in->numargs, in->argpages, + (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); req->locked = 0; if (!err && req->interrupted) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9a6075de961f..f156392d019e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -236,10 +236,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (fc->no_create) goto out; - err = -ENAMETOOLONG; - if (entry->d_name.len > FUSE_NAME_MAX) - goto out; - err = -EINTR; req = fuse_get_request(fc); if (!req) @@ -413,12 +409,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req; - - if (len > FUSE_SYMLINK_MAX) - return -ENAMETOOLONG; - - req = fuse_get_request(fc); + struct fuse_req *req = fuse_get_request(fc); if (!req) return -EINTR; @@ -988,9 +979,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name, struct fuse_setxattr_in inarg; int err; - if (size > FUSE_XATTR_SIZE_MAX) - return -E2BIG; - if (fc->no_setxattr) return -EOPNOTSUPP; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2d4835e54c90..17fd368559cd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,12 @@ /** If more requests are outstanding, then the operation will block */ #define FUSE_MAX_OUTSTANDING 10 +/** Maximum size of data in a write request */ +#define FUSE_MAX_WRITE 4096 + +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem module will check permissions based on the file mode. 
Otherwise no permission checking is done in the kernel */ @@ -108,9 +114,6 @@ struct fuse_out { struct fuse_arg args[3]; }; -struct fuse_req; -struct fuse_conn; - /** * A request to the client */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3b928a02af04..3580b9e12345 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -485,7 +485,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->max_read = d.max_read; if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; - fc->max_write = FUSE_MAX_IN / 2; + fc->max_write = FUSE_MAX_WRITE; err = -ENOMEM; root = get_root_inode(sb, d.rootmode); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 9d5177c356cc..8f64cc2205b0 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -108,12 +108,8 @@ enum fuse_opcode { FUSE_CREATE = 35 }; -/* Conservative buffer size for the client */ -#define FUSE_MAX_IN 8192 - -#define FUSE_NAME_MAX 1024 -#define FUSE_SYMLINK_MAX 4096 -#define FUSE_XATTR_SIZE_MAX 4096 +/* The read buffer is required to be at least 8k, but may be much larger */ +#define FUSE_MIN_READ_BUFFER 8192 struct fuse_entry_out { __u64 nodeid; /* Inode ID */ -- cgit v1.2.3 From 3ec870d524c9150add120475c8ddcfa50574f98e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:41 -0800 Subject: [PATCH] fuse: make maximum write data configurable Make the maximum size of write data configurable by the filesystem. The previous fixed 4096 limit only worked on architectures where the page size is less or equal to this. This change make writing work on other architectures too, and also lets the filesystem receive bigger write requests in direct_io mode. Normal writes which go through the page cache are still limited to a page sized chunk per request. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 48 ++++++++++++++++++++++++++++++------------------ fs/fuse/fuse_i.h | 6 ++---- fs/fuse/inode.c | 1 - include/linux/fuse.h | 11 +++++++++-- 4 files changed, 41 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1afdffdf80db..e08ab4702d97 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req) spin_unlock(&fuse_lock); } +static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + struct fuse_init_out *arg = &req->misc.init_out; + + if (arg->major != FUSE_KERNEL_VERSION) + fc->conn_error = 1; + else { + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + } + + /* After INIT reply is received other requests can go + out. So do (FUSE_MAX_OUTSTANDING - 1) number of + up()s on outstanding_sem. The last up() is done in + fuse_putback_request() */ + for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) + up(&fc->outstanding_sem); +} + /* * This function is called when a request is finished. Either a reply * has arrived or it was interrupted (and not yet sent) or some error @@ -172,21 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) up_read(&fc->sbput_sem); } wake_up(&req->waitq); - if (req->in.h.opcode == FUSE_INIT) { - int i; - - if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; - - fc->minor = req->misc.init_in_out.minor; - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. 
The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) { + if (req->in.h.opcode == FUSE_INIT) + process_init_reply(fc, req); + else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) { /* Special case for failed iget in CREATE */ u64 nodeid = req->in.h.nodeid; __fuse_get_request(req); @@ -359,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc) /* This is called from fuse_read_super() so there's guaranteed to be a request available */ struct fuse_req *req = do_get_request(fc); - struct fuse_init_in_out *arg = &req->misc.init_in_out; + struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; req->in.h.opcode = FUSE_INIT; @@ -367,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc) req->in.args[0].size = sizeof(*arg); req->in.args[0].value = arg; req->out.numargs = 1; - req->out.args[0].size = sizeof(*arg); - req->out.args[0].value = arg; + /* Variable length arguement used for backward compatibility + with interface version < 7.5. Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + req->out.argvar = 1; + req->out.args[0].size = sizeof(struct fuse_init_out); + req->out.args[0].value = &req->misc.init_out; request_send_background(fc, req); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 17fd368559cd..74c8d098a14a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,9 +21,6 @@ /** If more requests are outstanding, then the operation will block */ #define FUSE_MAX_OUTSTANDING 10 -/** Maximum size of data in a write request */ -#define FUSE_MAX_WRITE 4096 - /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -162,7 +159,8 @@ struct fuse_req { union { struct fuse_forget_in forget_in; struct fuse_release_in release_in; - struct fuse_init_in_out init_in_out; + struct fuse_init_in init_in; + struct fuse_init_out init_out; } misc; /** page vector */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3580b9e12345..e4541869831e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -485,7 +485,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->max_read = d.max_read; if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; - fc->max_write = FUSE_MAX_WRITE; err = -ENOMEM; root = get_root_inode(sb, d.rootmode); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 8f64cc2205b0..528959c52f1b 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -14,7 +14,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 4 +#define FUSE_KERNEL_MINOR_VERSION 5 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -244,11 +244,18 @@ struct fuse_access_in { __u32 padding; }; -struct fuse_init_in_out { +struct fuse_init_in { __u32 major; __u32 minor; }; +struct fuse_init_out { + __u32 major; + __u32 minor; + __u32 unused[3]; + __u32 max_write; +}; + struct fuse_in_header { __u32 len; __u32 opcode; -- cgit v1.2.3 From 6ad84acab972f4dfc78e6fdb04c419f82c497d29 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:42 -0800 Subject: [PATCH] fuse: ensure progress in read and write In direct_io mode, send at least one page per request.
Previously it was possible that requests with zero data were sent, and hence the read/write didn't make any progress, resulting in an infinite (though interruptible) loop. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c989f0e9456b..05dedddf4289 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -475,7 +475,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = min(npages, FUSE_MAX_PAGES_PER_REQ); + npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ); down_read(&current->mm->mmap_sem); npages = get_user_pages(current, current->mm, user_addr, npages, write, 0, req->pages, NULL); @@ -506,7 +506,6 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, return -EINTR; while (count) { - size_t tmp; size_t nres; size_t nbytes = min(count, nmax); int err = fuse_get_user_pages(req, buf, nbytes, !write); @@ -514,8 +513,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, res = err; break; } - tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset; - nbytes = min(nbytes, tmp); + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + nbytes = min(count, nbytes); if (write) nres = fuse_send_write(req, file, inode, pos, nbytes); else -- cgit v1.2.3 From 39ee059affaf57a152c64cd3a0adc3f48f02ed71 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jan 2006 00:19:43 -0800 Subject: [PATCH] fuse: check file type in lookup Previously invalid types were quietly changed to regular files, but at revalidation the inode was changed to bad. This was rather inconsistent behavior. Now check if the type is valid on initial lookup, and return -EIO if not.
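Condensed, the checks this adds look as follows (a sketch extracted from the dir.c diff that follows, shown only to make the intent explicit):

	static inline int valid_mode(int m)
	{
		return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
		       S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
	}

	/* and in create_new_entry(), the type the filesystem returned
	   must match the type that was requested: */
	if ((outarg.attr.mode ^ mode) & S_IFMT)
		goto out_put_request;	/* fails the operation with -EIO */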
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 27 ++++++++++++++++++++------- fs/fuse/inode.c | 8 ++------ 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f156392d019e..417bcee466f6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -166,6 +166,12 @@ static struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; +static inline int valid_mode(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) { @@ -185,7 +191,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; - if (!err && outarg.nodeid && invalid_nodeid(outarg.nodeid)) + if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) || + !valid_mode(outarg.attr.mode))) err = -EIO; if (!err && outarg.nodeid) { inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, @@ -328,10 +335,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, fuse_put_request(fc, req); return err; } - if (invalid_nodeid(outarg.nodeid)) { - fuse_put_request(fc, req); - return -EIO; - } + err = -EIO; + if (invalid_nodeid(outarg.nodeid)) + goto out_put_request; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_request; + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr); if (!inode) { @@ -340,8 +350,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, fuse_put_request(fc, req); - /* Don't allow userspace to do really stupid things... */ - if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) { + if (dir_alias(inode)) { iput(inode); return -EIO; } @@ -350,6 +359,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, fuse_change_timeout(entry, &outarg); fuse_invalidate_attr(dir); return 0; + + out_put_request: + fuse_put_request(fc, req); + return err; } static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e4541869831e..04c80cc957a3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -135,12 +135,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) fuse_init_common(inode); init_special_inode(inode, inode->i_mode, new_decode_dev(attr->rdev)); - } else { - /* Don't let user create weird files */ - inode->i_mode = S_IFREG; - fuse_init_common(inode); - fuse_init_file_inode(inode); - } + } else + BUG(); } static int fuse_inode_eq(struct inode *inode, void *_nodeidp) -- cgit v1.2.3 From f93ea411b73594f7d144855fd34278bcf34a9afc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 6 Jan 2006 00:19:55 -0800 Subject: [PATCH] jbd: split checkpoint lists Split the checkpoint list of the transaction into two lists. In the first list we keep the buffers that still need to be submitted for IO. In the second list we keep the buffers that were already submitted, for which we just have to wait for the IO to complete. This should simplify the handling of checkpoint lists a bit and can eventually also be a performance gain.
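Schematically, each transaction now carries two circular lists of journal heads (a condensed view of the include/linux/jbd.h change in the diff below):

	struct transaction_s {
		/* ... */
		/* buffers still waiting to be submitted for checkpoint IO
		   [j_list_lock] */
		struct journal_head *t_checkpoint_list;
		/* buffers already submitted for IO, waiting for completion
		   [j_list_lock] */
		struct journal_head *t_checkpoint_io_list;
		/* ... */
	};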
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++++++++++---------------------- include/linux/jbd.h | 8 +- 2 files changed, 248 insertions(+), 178 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 014a51fd00d7..cb3cef525c3b 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,29 +24,75 @@ #include /* - * Unlink a buffer from a transaction. + * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ -static inline void __buffer_unlink(struct journal_head *jh) +static void __buffer_unlink_first(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; - jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) + if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ + +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ + +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - __journal_remove_checkpoint(jh); + ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up a transaction's checkpoint list. - * - * We wait for any pending IO to complete and make sure any clean - * buffers are removed from the transaction. - * - * Return 1 if we performed any actions which might have destroyed the - * checkpoint. (journal_remove_checkpoint() deletes the transaction when - * the last checkpoint buffer is cleansed) + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. 
Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. * * Called with j_list_lock held. */ -static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) + +static void __wait_cp_io(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh, *next_jh, *last_jh; + struct journal_head *jh; struct buffer_head *bh; - int ret = 0; - - assert_spin_locked(&journal->j_list_lock); - jh = transaction->t_checkpoint_list; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - next_jh = jh; - do { - jh = next_jh; + tid_t this_tid; + int released = 0; + + this_tid = transaction->t_tid; +restart: + /* Didn't somebody clean up the transaction in the meanwhile */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - goto out_return_1; - } - - /* - * This is foul - */ - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - goto out_return_1; + spin_lock(&journal->j_list_lock); + goto restart; } - - if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - goto out_return_1; - } - /* - * AKPM: I think the buffer_jbddirty test is redundant - it - * shouldn't have NULL b_transaction? + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list */ - next_jh = jh->b_cpnext; - if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - __brelse(bh); - ret = 1; - } else { - jbd_unlock_bh_state(bh); - } - } while (jh != last_jh); - - return ret; -out_return_1: - spin_lock(&journal->j_list_lock); - return 1; + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + } } #define NR_BATCH 64 @@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; - spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); - spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held. 
+ * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __flush_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - int *drop_count) +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { - J_ASSERT_JH(jh, jh->b_transaction == NULL); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + put_bh(bh); + ret = 1; + } + else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } + else if (!buffer_dirty(bh)) { + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + put_bh(bh); + ret = 1; + } + else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; + __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } - } else { - int last_buffer = 0; - if (jh->b_cpnext == jh) { - /* We may be about to drop the transaction. Tell the - * caller that the lists have changed. - */ - last_buffer = 1; - } - if (__try_to_free_cp_buf(jh)) { - (*drop_count)++; - ret = last_buffer; - } } return ret; } /* - * Perform an actual checkpoint. We don't write out only enough to - * satisfy the current blocked requests: rather we submit a reasonably - * sized chunk of the outstanding data to disk at once for - * efficiency. __log_wait_for_space() will retry if we didn't free enough. + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. * - * However, we _do_ take into account the amount requested so that once - * the IO has been queued, we can return as soon as enough of it has - * completed to disk. - * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { + transaction_t *transaction; + tid_t this_tid; int result; - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Try to free up a - * quarter of the log in a single checkpoint if we can. + * OK, we need to start writing disk blocks. Take one transaction + * and write it. */ + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: /* - * AKPM: check this code. 
I had a feeling a while back that it - * degenerates into a busy loop at unmount time. + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). */ - spin_lock(&journal->j_list_lock); - while (journal->j_checkpoint_transactions) { - transaction_t *transaction; - struct journal_head *jh, *last_jh, *next_jh; - int drop_count = 0; - int cleanup_ret, retry = 0; - tid_t this_tid; - - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; - jh = transaction->t_checkpoint_list; - last_jh = jh->b_cpprev; - next_jh = jh; - do { + if (journal->j_checkpoint_transactions == transaction || + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0; + + while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; - jh = next_jh; - next_jh = jh->b_cpnext; + jh = transaction->t_checkpoint_list; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); - if (cond_resched_lock(&journal->j_list_lock)) { + retry = __process_buffer(journal, jh, bhs, + &batch_count); + if (!retry && + lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); retry = 1; break; } - } while (jh != last_jh && !retry); + } if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } __flush_batch(journal, bhs, &batch_count); - retry = 1; } + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } /* - * If someone cleaned up this transaction while we slept, we're - * done - */ - if (journal->j_checkpoint_transactions != transaction) - break; - if (retry) - continue; - /* - * Maybe it's a new transaction, but it fell at the same - * address - */ - if (transaction->t_tid != this_tid) - continue; - /* - * We have walked the whole transaction list without - * finding anything to write to disk. We had better be - * able to make some progress or we are in trouble. + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one. */ - cleanup_ret = __cleanup_transaction(journal, transaction); - J_ASSERT(drop_count != 0 || cleanup_ret != 0); - if (journal->j_checkpoint_transactions != transaction) - break; + __wait_cp_io(journal, transaction); } +out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; - return 0; } @@ -455,6 +471,53 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. 
+ * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + /* * journal_clean_checkpoint_list * @@ -462,46 +525,38 @@ int cleanup_journal_tail(journal_t *journal) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; + int ret = 0, released; transaction = journal->j_checkpoint_transactions; - if (transaction == 0) + if (!transaction) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { - struct journal_head *jh; - transaction = next_transaction; next_transaction = transaction->t_cpnext; - jh = transaction->t_checkpoint_list; - if (jh) { - struct journal_head *last_jh = jh->b_cpprev; - struct journal_head *next_jh = jh; - - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranknig */ - if (jbd_trylock_bh_state(jh2bh(jh))) - ret += __try_to_free_cp_buf(jh); - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - goto out; - } while (jh != last_jh); - } + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; } while (transaction != last_transaction); out: return ret; @@ -516,18 +571,22 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * list until they have been rewritten, at which point this function is + * lists until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint list. + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. * * This function is called with the journal locked. * This function is called with j_list_lock held. 
+ * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -void __journal_remove_checkpoint(struct journal_head *jh) +int __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; + int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); + jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL) + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); + ret = 1; out: JBUFFER_TRACE(jh, "exit"); + return ret; } /* @@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index dcde7adfdce5..558cb4c26ec9 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -497,6 +497,12 @@ struct transaction_s */ struct journal_head *t_checkpoint_list; + /* + * Doubly-linked circular list of all buffers submitted for IO while + * checkpointing. [j_list_lock] + */ + struct journal_head *t_checkpoint_io_list; + /* * Doubly-linked circular list of temporary buffers currently undergoing * IO in the log [j_list_lock] @@ -843,7 +849,7 @@ extern void journal_commit_transaction(journal_t *); /* Checkpoint list management */ int __journal_clean_checkpoint_list(journal_t *journal); -void __journal_remove_checkpoint(struct journal_head *); +int __journal_remove_checkpoint(struct journal_head *); void __journal_insert_checkpoint(struct journal_head *, transaction_t *); /* Buffer IO */ -- cgit v1.2.3 From a334de28665b14f0a33df82699fa9a78cfeedf31 Mon Sep 17 00:00:00 2001 From: David Shaw Date: Fri, 6 Jan 2006 00:19:58 -0800 Subject: [PATCH] knfsd: check error status from vfs_getattr and i_op->fsync Both vfs_getattr and i_op->fsync return error statuses which nfsd was largely ignoring. This was noticed when exporting directories using fuse. This patch cleans up most of the offences, which involves moving the call to vfs_getattr out of the xdr encoding routines (where it is too late to report an error) into the main NFS procedure handling routines. There is still a call to vfs_getattr (related to the ACL code) where the status is ignored, and calls to nfsd_sync_dir don't check the return status either.
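The shape of the fix, sketched with the names from the nfs3proc.c hunk below: the procedure handler now performs the getattr itself, where a failure can still be mapped to an NFS status, instead of leaving it to the xdr encoder, which had no way to report the error.

	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
	if (nfserr)
		RETURN_STATUS(nfserr);

	/* the vfs_getattr status is checked and converted here */
	err = vfs_getattr(resp->fh.fh_export->ex_mnt,
			  resp->fh.fh_dentry, &resp->stat);
	nfserr = nfserrno(err);
	RETURN_STATUS(nfserr);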
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3proc.c | 11 +++++++++-- fs/nfsd/nfs3xdr.c | 47 ++++++++++++++++++++++++---------------------- fs/nfsd/nfsxdr.c | 48 +++++++++++++++++++++++------------------------ fs/nfsd/vfs.c | 20 +++++++++++++------- include/linux/nfsd/xdr.h | 3 +++ include/linux/nfsd/xdr3.h | 1 + 6 files changed, 75 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 041380fe667b..6d2dfed1de08 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -56,13 +56,20 @@ static int nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, struct nfsd3_attrstat *resp) { - int nfserr; + int err, nfserr; dprintk("nfsd: GETATTR(3) %s\n", - SVCFH_fmt(&argp->fh)); + SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + err = vfs_getattr(resp->fh.fh_export->ex_mnt, + resp->fh.fh_dentry, &resp->stat); + nfserr = nfserrno(err); + RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9147b8524d05..243d94b9653a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap) } static inline u32 * -encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) +encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp, + struct kstat *stat) { - struct vfsmount *mnt = fhp->fh_export->ex_mnt; struct dentry *dentry = fhp->fh_dentry; - struct kstat stat; struct timespec time; - vfs_getattr(mnt, dentry, &stat); - - *p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]); - *p++ = htonl((u32) stat.mode); - *p++ = htonl((u32) stat.nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid)); - if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) { + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); } else { - p = xdr_encode_hyper(p, (u64) stat.size); + p = xdr_encode_hyper(p, (u64) stat->size); } - p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9); - *p++ = htonl((u32) MAJOR(stat.rdev)); - *p++ = htonl((u32) MINOR(stat.rdev)); + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + *p++ = htonl((u32) MAJOR(stat->rdev)); + *p++ = htonl((u32) MINOR(stat->rdev)); if (is_fsid(fhp, rqstp->rq_reffh)) p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); else - p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev)); - p = xdr_encode_hyper(p, (u64) stat.ino); - p = encode_time3(p, &stat.atime); + p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev)); + p = xdr_encode_hyper(p, (u64) stat->ino); + p = encode_time3(p, &stat->atime); lease_get_mtime(dentry->d_inode, &time); p = encode_time3(p, &time); - p = encode_time3(p, &stat.ctime); + p = encode_time3(p, &stat->ctime); return p; } @@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; if (dentry && dentry->d_inode != NULL) { - *p++ = xdr_one; /* attributes follow */ - return encode_fattr3(rqstp, p, fhp); + int err; + struct kstat stat; + + err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat); + if (!err) { + *p++ = xdr_one; /* 
attributes follow */ + return encode_fattr3(rqstp, p, fhp, &stat); + } } *p++ = xdr_zero; return p; @@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p, struct nfsd3_attrstat *resp) { if (resp->status == 0) - p = encode_fattr3(rqstp, p, &resp->fh); + p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index b45999ff33e6..aa7bb41b293d 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap) } static inline u32 * -encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) +encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp, + struct kstat *stat) { - struct vfsmount *mnt = fhp->fh_export->ex_mnt; struct dentry *dentry = fhp->fh_dentry; - struct kstat stat; int type; struct timespec time; - vfs_getattr(mnt, dentry, &stat); - type = (stat.mode & S_IFMT); + type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat.mode); - *p++ = htonl((u32) stat.nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid)); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); - if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) { + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { *p++ = htonl(NFS_MAXPATHLEN); } else { - *p++ = htonl((u32) stat.size); + *p++ = htonl((u32) stat->size); } - *p++ = htonl((u32) stat.blksize); + *p++ = htonl((u32) stat->blksize); if (S_ISCHR(type) || S_ISBLK(type)) - *p++ = htonl(new_encode_dev(stat.rdev)); + *p++ = htonl(new_encode_dev(stat->rdev)); else *p++ = htonl(0xffffffff); - *p++ = htonl((u32) stat.blocks); + *p++ = htonl((u32) stat->blocks); if (is_fsid(fhp, rqstp->rq_reffh)) *p++ = htonl((u32) fhp->fh_export->ex_fsid); else - *p++ = htonl(new_encode_dev(stat.dev)); - *p++ = htonl((u32) stat.ino); - *p++ = htonl((u32) stat.atime.tv_sec); - *p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0); + *p++ = htonl(new_encode_dev(stat->dev)); + *p++ = htonl((u32) stat->ino); + *p++ = htonl((u32) stat->atime.tv_sec); + *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); lease_get_mtime(dentry->d_inode, &time); *p++ = htonl((u32) time.tv_sec); *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); - *p++ = htonl((u32) stat.ctime.tv_sec); - *p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0); + *p++ = htonl((u32) stat->ctime.tv_sec); + *p++ = htonl(stat->ctime.tv_nsec ? 
stat->ctime.tv_nsec / 1000 : 0); return p; } @@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) /* Helper function for NFSv2 ACL code */ u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) { - return encode_fattr(rqstp, p, fhp); + struct kstat stat; + vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat); + return encode_fattr(rqstp, p, fhp, &stat); } /* @@ -394,7 +394,7 @@ int nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p, struct nfsd_attrstat *resp) { - p = encode_fattr(rqstp, p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } @@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p, struct nfsd_diropres *resp) { p = encode_fh(p, &resp->fh); - p = encode_fattr(rqstp, p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } @@ -428,7 +428,7 @@ int nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p, struct nfsd_readres *resp) { - p = encode_fattr(rqstp, p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->count); xdr_ressize_check(rqstp, p); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index af7c3c3074b0..f83ab4cf4265 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -717,27 +717,33 @@ nfsd_close(struct file *filp) * As this calls fsync (not fdatasync) there is no need for a write_inode * after it. */ -static inline void nfsd_dosync(struct file *filp, struct dentry *dp, - struct file_operations *fop) +static inline int nfsd_dosync(struct file *filp, struct dentry *dp, + struct file_operations *fop) { struct inode *inode = dp->d_inode; int (*fsync) (struct file *, struct dentry *, int); + int err = nfs_ok; filemap_fdatawrite(inode->i_mapping); if (fop && (fsync = fop->fsync)) - fsync(filp, dp, 0); + err=fsync(filp, dp, 0); filemap_fdatawait(inode->i_mapping); + + return nfserrno(err); } -static void +static int nfsd_sync(struct file *filp) { + int err; struct inode *inode = filp->f_dentry->d_inode; dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name); down(&inode->i_sem); - nfsd_dosync(filp, filp->f_dentry, filp->f_op); + err=nfsd_dosync(filp, filp->f_dentry, filp->f_op); up(&inode->i_sem); + + return err; } void @@ -962,7 +968,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", current->pid); - nfsd_sync(file); + err=nfsd_sync(file); } #if 0 wake_up(&inode->i_wait); @@ -1066,7 +1072,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { - nfsd_sync(file); + err = nfsd_sync(file); } else { err = nfserr_notsupp; } diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index 130d4f588a37..3f4f7142bbe3 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -88,10 +88,12 @@ struct nfsd_readdirargs { struct nfsd_attrstat { struct svc_fh fh; + struct kstat stat; }; struct nfsd_diropres { struct svc_fh fh; + struct kstat stat; }; struct nfsd_readlinkres { @@ -101,6 +103,7 @@ struct nfsd_readlinkres { struct nfsd_readres { struct svc_fh fh; unsigned long count; + struct kstat stat; }; struct nfsd_readdirres { diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 3c2a71b43bac..a4322741f8b9 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -126,6 +126,7 @@ struct nfsd3_setaclargs { struct nfsd3_attrstat { __u32 
status; struct svc_fh fh; + struct kstat stat; }; /* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ -- cgit v1.2.3 From 9f708e40fe040e79f6c393a282f0701c9f8dc174 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Fri, 6 Jan 2006 00:19:59 -0800 Subject: [PATCH] knfsd: reduce stack consumption A typical nfsd call trace is nfsd -> svc_process -> nfsd_dispatch -> nfsd3_proc_write -> nfsd_write ->nfsd_vfs_write -> vfs_writev These add up to over 300 bytes on the stack. Looking at each of these, I see that nfsd_write (which includes nfsd_vfs_write) contributes 0x8c to stack usage itself!! It turns out this is because it puts a 'struct iattr' on the stack so it can kill suid if needed. The following patch saves about 50 bytes off the stack in this call path. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f83ab4cf4265..df4019f04560 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -880,6 +880,16 @@ out: return err; } +static void kill_suid(struct dentry *dentry) +{ + struct iattr ia; + ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; + + down(&dentry->d_inode->i_sem); + notify_change(dentry, &ia); + up(&dentry->d_inode->i_sem); +} + static inline int nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, @@ -933,14 +943,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } /* clear setuid/setgid flag after write */ - if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) { - struct iattr ia; - ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; - - down(&inode->i_sem); - notify_change(dentry, &ia); - up(&inode->i_sem); - } + if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) + kill_suid(dentry); if (err >= 0 && stable) { static ino_t last_ino; -- cgit v1.2.3 From abd3e641d5ef9f836ab2f2b04d80b8619b051531 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:02 +0100 Subject: NFS: Work correctly with single-page ->writepage() calls Ensure that we always initiate flushing of data before we exit a single-page ->writepage() call. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3107908e5f3f..95d00f9132d0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -232,19 +232,16 @@ static int nfs_writepage_async(struct nfs_open_context *ctx, unsigned int offset, unsigned int count) { struct nfs_page *req; - int status; req = nfs_update_request(ctx, inode, page, offset, count); - status = (IS_ERR(req)) ? PTR_ERR(req) : 0; - if (status < 0) - goto out; + if (IS_ERR(req)) + return PTR_ERR(req); /* Update file length */ nfs_grow_file(page, offset, count); /* Set the PG_uptodate flag? 
*/ nfs_mark_uptodate(page, offset, count); nfs_unlock_request(req); - out: - return status; + return 0; } static int wb_priority(struct writeback_control *wbc) @@ -304,11 +301,8 @@ do_it: lock_kernel(); if (!IS_SYNC(inode) && inode_referenced) { err = nfs_writepage_async(ctx, inode, page, 0, offset); - if (err >= 0) { - err = 0; - if (wbc->for_reclaim) - nfs_flush_inode(inode, 0, 0, FLUSH_STABLE); - } + if (!wbc->for_writepages) + nfs_flush_inode(inode, 0, 0, wb_priority(wbc)); } else { err = nfs_writepage_sync(ctx, inode, page, 0, offset, priority); -- cgit v1.2.3 From 963d8fe53339128ee46a7701f2e36305f0ccff8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:04 +0100 Subject: RPC: Clean up RPC task structure Shrink the RPC task structure. Instead of storing separate pointers for task->tk_exit and task->tk_release, put them in a structure. Also pass the user data pointer as a parameter instead of passing it via task->tk_calldata. This enables us to nest callbacks. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 38 ++++++++------- fs/lockd/svc4proc.c | 15 +++--- fs/lockd/svclock.c | 14 ++++-- fs/lockd/svcproc.c | 14 ++++-- fs/nfs/direct.c | 1 - fs/nfs/nfs3proc.c | 44 +++++++++++------- fs/nfs/nfs4proc.c | 107 +++++++++++++++++++++++++------------------ fs/nfs/proc.c | 28 +++++++---- fs/nfs/read.c | 10 ++-- fs/nfs/unlink.c | 19 ++++---- fs/nfs/write.c | 21 +++------ fs/nfsd/nfs4callback.c | 10 ++-- include/linux/lockd/lockd.h | 2 +- include/linux/nfs_fs.h | 12 +++-- include/linux/sunrpc/clnt.h | 3 +- include/linux/sunrpc/sched.h | 20 +++++--- net/sunrpc/clnt.c | 15 +++--- net/sunrpc/sched.c | 53 +++++++++++---------- 18 files changed, 241 insertions(+), 185 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index c5a33648e9fd..816333cd377b 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -26,11 +26,12 @@ static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); -static void nlmclnt_unlock_callback(struct rpc_task *); -static void nlmclnt_cancel_callback(struct rpc_task *); static int nlm_stat_to_errno(u32 stat); static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static const struct rpc_call_ops nlmclnt_unlock_ops; +static const struct rpc_call_ops nlmclnt_cancel_ops; + /* * Cookie counter for NLM requests */ @@ -399,8 +400,7 @@ in_grace_period: /* * Generic NLM call, async version. 
*/ -int -nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) +int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; @@ -419,13 +419,12 @@ nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) msg.rpc_proc = &clnt->cl_procinfo[proc]; /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req); + status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req); return status; } -static int -nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) +static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; @@ -448,7 +447,7 @@ nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) /* Increment host refcount */ nlm_get_host(host); /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req); + status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req); if (status < 0) nlm_release_host(host); return status; @@ -664,7 +663,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (req->a_flags & RPC_TASK_ASYNC) { status = nlmclnt_async_call(req, NLMPROC_UNLOCK, - nlmclnt_unlock_callback); + &nlmclnt_unlock_ops); /* Hrmf... Do the unlock early since locks_remove_posix() * really expects us to free the lock synchronously */ do_vfs_lock(fl); @@ -692,10 +691,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) return -ENOLCK; } -static void -nlmclnt_unlock_callback(struct rpc_task *task) +static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { - struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *req = data; int status = req->a_res.status; if (RPC_ASSASSINATED(task)) @@ -722,6 +720,10 @@ die: rpc_restart_call(task); } +static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_done = nlmclnt_unlock_callback, +}; + /* * Cancel a blocked lock request. 
* We always use an async RPC call for this in order not to hang a @@ -750,8 +752,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) nlmclnt_setlockargs(req, fl); - status = nlmclnt_async_call(req, NLMPROC_CANCEL, - nlmclnt_cancel_callback); + status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status < 0) { nlmclnt_release_lockargs(req); kfree(req); @@ -765,10 +766,9 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) return status; } -static void -nlmclnt_cancel_callback(struct rpc_task *task) +static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) { - struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *req = data; if (RPC_ASSASSINATED(task)) goto die; @@ -807,6 +807,10 @@ retry_cancel: rpc_delay(task, 30 * HZ); } +static const struct rpc_call_ops nlmclnt_cancel_ops = { + .rpc_call_done = nlmclnt_cancel_callback, +}; + /* * Convert an NLM status code to a generic kernel errno */ diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 489670e21769..4063095d849e 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -22,7 +22,8 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT static u32 nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *); -static void nlm4svc_callback_exit(struct rpc_task *); + +static const struct rpc_call_ops nlm4svc_callback_ops; /* * Obtain client and file from arguments @@ -470,7 +471,6 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, } - /* * This is the generic lockd callback for async RPC calls */ @@ -494,7 +494,7 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) call->a_host = host; memcpy(&call->a_args, resp, sizeof(*resp)); - if (nlmsvc_async_call(call, proc, nlm4svc_callback_exit) < 0) + if (nlmsvc_async_call(call, proc, &nlm4svc_callback_ops) < 0) goto error; return rpc_success; @@ -504,10 +504,9 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) return rpc_system_err; } -static void -nlm4svc_callback_exit(struct rpc_task *task) +static void nlm4svc_callback_exit(struct rpc_task *task, void *data) { - struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *call = data; if (task->tk_status < 0) { dprintk("lockd: %4d callback failed (errno = %d)\n", @@ -517,6 +516,10 @@ nlm4svc_callback_exit(struct rpc_task *task) kfree(call); } +static const struct rpc_call_ops nlm4svc_callback_ops = { + .rpc_call_done = nlm4svc_callback_exit, +}; + /* * NLM Server procedures. */ diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 49f959796b66..87d09a0d8f64 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -41,7 +41,8 @@ static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); static int nlmsvc_remove_block(struct nlm_block *block); -static void nlmsvc_grant_callback(struct rpc_task *task); + +static const struct rpc_call_ops nlmsvc_grant_ops; /* * The list of blocked locks to retry @@ -562,7 +563,7 @@ callback: /* Call the client */ nlm_get_host(block->b_call.a_host); if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG, - nlmsvc_grant_callback) < 0) + &nlmsvc_grant_ops) < 0) nlm_release_host(block->b_call.a_host); up(&file->f_sema); } @@ -575,10 +576,9 @@ callback: * chain once more in order to have it removed by lockd itself (which can * then sleep on the file semaphore without disrupting e.g. the nfs client). 
*/ -static void -nlmsvc_grant_callback(struct rpc_task *task) +static void nlmsvc_grant_callback(struct rpc_task *task, void *data) { - struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *call = data; struct nlm_block *block; unsigned long timeout; struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client); @@ -614,6 +614,10 @@ nlmsvc_grant_callback(struct rpc_task *task) nlm_release_host(call->a_host); } +static const struct rpc_call_ops nlmsvc_grant_ops = { + .rpc_call_done = nlmsvc_grant_callback, +}; + /* * We received a GRANT_RES callback. Try to find the corresponding * block. diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 757e344cf200..3bc437e0cf5b 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -23,7 +23,8 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT static u32 nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *); -static void nlmsvc_callback_exit(struct rpc_task *); + +static const struct rpc_call_ops nlmsvc_callback_ops; #ifdef CONFIG_LOCKD_V4 static u32 @@ -518,7 +519,7 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) call->a_host = host; memcpy(&call->a_args, resp, sizeof(*resp)); - if (nlmsvc_async_call(call, proc, nlmsvc_callback_exit) < 0) + if (nlmsvc_async_call(call, proc, &nlmsvc_callback_ops) < 0) goto error; return rpc_success; @@ -528,10 +529,9 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) return rpc_system_err; } -static void -nlmsvc_callback_exit(struct rpc_task *task) +static void nlmsvc_callback_exit(struct rpc_task *task, void *data) { - struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *call = data; if (task->tk_status < 0) { dprintk("lockd: %4d callback failed (errno = %d)\n", @@ -541,6 +541,10 @@ nlmsvc_callback_exit(struct rpc_task *task) kfree(call); } +static const struct rpc_call_ops nlmsvc_callback_ops = { + .rpc_call_done = nlmsvc_callback_exit, +}; + /* * NLM Server procedures. */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 079228817603..a834423942c7 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -269,7 +269,6 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, data->task.tk_cookie = (unsigned long) inode; data->task.tk_calldata = data; - data->task.tk_release = nfs_readdata_release; data->complete = nfs_direct_read_result; lock_kernel(); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 92c870d19ccd..c172a7584646 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -732,19 +732,23 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); -static void -nfs3_read_done(struct rpc_task *task) +static void nfs3_read_done(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; + struct nfs_read_data *data = calldata; if (nfs3_async_handle_jukebox(task)) return; /* Call back common NFS readpage processing */ if (task->tk_status >= 0) nfs_refresh_inode(data->inode, &data->fattr); - nfs_readpage_result(task); + nfs_readpage_result(task, calldata); } +static const struct rpc_call_ops nfs3_read_ops = { + .rpc_call_done = nfs3_read_done, + .rpc_release = nfs_readdata_release, +}; + static void nfs3_proc_read_setup(struct nfs_read_data *data) { @@ -762,23 +766,26 @@ nfs3_proc_read_setup(struct nfs_read_data *data) flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), nfs3_read_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_read_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs3_write_done(struct rpc_task *task) +static void nfs3_write_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data; + struct nfs_write_data *data = calldata; if (nfs3_async_handle_jukebox(task)) return; - data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_writeback_done(task); + nfs_writeback_done(task, calldata); } +static const struct rpc_call_ops nfs3_write_ops = { + .rpc_call_done = nfs3_write_done, + .rpc_release = nfs_writedata_release, +}; + static void nfs3_proc_write_setup(struct nfs_write_data *data, int how) { @@ -806,23 +813,26 @@ nfs3_proc_write_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs3_write_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_write_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs3_commit_done(struct rpc_task *task) +static void nfs3_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data; + struct nfs_write_data *data = calldata; if (nfs3_async_handle_jukebox(task)) return; - data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_commit_done(task); + nfs_commit_done(task, calldata); } +static const struct rpc_call_ops nfs3_commit_ops = { + .rpc_call_done = nfs3_commit_done, + .rpc_release = nfs_commit_release, +}; + static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how) { @@ -840,7 +850,7 @@ nfs3_proc_commit_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), nfs3_commit_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_commit_ops, data); rpc_call_setup(task, &msg, 0); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f988a9417b13..3d5d3c07d621 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -196,14 +196,12 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf /* Helper for asynchronous RPC calls */ static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin, - rpc_action tk_exit, void *calldata) + const struct rpc_call_ops *tk_ops, void *calldata) { struct rpc_task *task; - if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC))) + if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata))) return -ENOMEM; - - task->tk_calldata = calldata; task->tk_action = tk_begin; rpc_execute(task); return 0; @@ -867,10 +865,10 @@ struct nfs4_closedata { struct nfs_fattr fattr; }; -static void nfs4_free_closedata(struct nfs4_closedata *calldata) +static void nfs4_free_closedata(void *data) { - struct nfs4_state *state = calldata->state; - struct nfs4_state_owner *sp = state->owner; + struct nfs4_closedata *calldata = data; + struct nfs4_state_owner *sp = calldata->state->owner; nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); @@ -878,9 +876,9 @@ static void nfs4_free_closedata(struct nfs4_closedata *calldata) kfree(calldata); } -static void nfs4_close_done(struct rpc_task *task) +static void nfs4_close_done(struct rpc_task *task, void *data) { - struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; + struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); @@ -904,7 +902,6 @@ static void nfs4_close_done(struct rpc_task *task) } } nfs_refresh_inode(calldata->inode, calldata->res.fattr); - nfs4_free_closedata(calldata); } static void nfs4_close_begin(struct rpc_task *task) @@ -918,10 +915,8 @@ static void nfs4_close_begin(struct rpc_task *task) .rpc_cred = state->owner->so_cred, }; int mode = 0, old_mode; - int status; - status = nfs_wait_on_sequence(calldata->arg.seqid, task); - if (status != 0) + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; /* Recalculate the new open mode in case someone reopened the file * while we were waiting in line to be scheduled. @@ -937,9 +932,8 @@ static void nfs4_close_begin(struct rpc_task *task) spin_unlock(&calldata->inode->i_lock); spin_unlock(&state->owner->so_lock); if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) { - nfs4_free_closedata(calldata); - task->tk_exit = NULL; - rpc_exit(task, 0); + /* Note: exit _without_ calling nfs4_close_done */ + task->tk_action = NULL; return; } nfs_fattr_init(calldata->res.fattr); @@ -949,6 +943,11 @@ static void nfs4_close_begin(struct rpc_task *task) rpc_call_setup(task, &msg, 0); } +static const struct rpc_call_ops nfs4_close_ops = { + .rpc_call_done = nfs4_close_done, + .rpc_release = nfs4_free_closedata, +}; + /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). 
@@ -982,7 +981,7 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state) calldata->res.server = server; status = nfs4_call_async(server->client, nfs4_close_begin, - nfs4_close_done, calldata); + &nfs4_close_ops, calldata); if (status == 0) goto out; @@ -2125,10 +2124,9 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static void -nfs4_read_done(struct rpc_task *task) +static void nfs4_read_done(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; + struct nfs_read_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { @@ -2138,9 +2136,14 @@ nfs4_read_done(struct rpc_task *task) if (task->tk_status > 0) renew_lease(NFS_SERVER(inode), data->timestamp); /* Call back common NFS readpage processing */ - nfs_readpage_result(task); + nfs_readpage_result(task, calldata); } +static const struct rpc_call_ops nfs4_read_ops = { + .rpc_call_done = nfs4_read_done, + .rpc_release = nfs_readdata_release, +}; + static void nfs4_proc_read_setup(struct nfs_read_data *data) { @@ -2160,14 +2163,13 @@ nfs4_proc_read_setup(struct nfs_read_data *data) flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs4_read_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_read_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs4_write_done(struct rpc_task *task) +static void nfs4_write_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { @@ -2179,9 +2181,14 @@ nfs4_write_done(struct rpc_task *task) nfs_post_op_update_inode(inode, data->res.fattr); } /* Call back common NFS writeback processing */ - nfs_writeback_done(task); + nfs_writeback_done(task, calldata); } +static const struct rpc_call_ops nfs4_write_ops = { + .rpc_call_done = nfs4_write_done, + .rpc_release = nfs_writedata_release, +}; + static void nfs4_proc_write_setup(struct nfs_write_data *data, int how) { @@ -2214,14 +2221,13 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs4_write_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_write_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs4_commit_done(struct rpc_task *task) +static void nfs4_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { @@ -2231,9 +2237,14 @@ nfs4_commit_done(struct rpc_task *task) if (task->tk_status >= 0) nfs_post_op_update_inode(inode, data->res.fattr); /* Call back common NFS writeback processing */ - nfs_commit_done(task); + nfs_commit_done(task, calldata); } +static const struct rpc_call_ops nfs4_commit_ops = { + .rpc_call_done = nfs4_commit_done, + .rpc_release = nfs_commit_release, +}; + static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how) { @@ -2255,7 +2266,7 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 
0 : RPC_TASK_ASYNC; /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs4_commit_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_commit_ops, data); rpc_call_setup(task, &msg, 0); } @@ -2263,11 +2274,10 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how) * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special * standalone procedure for queueing an asynchronous RENEW. */ -static void -renew_done(struct rpc_task *task) +static void nfs4_renew_done(struct rpc_task *task, void *data) { struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; - unsigned long timestamp = (unsigned long)task->tk_calldata; + unsigned long timestamp = (unsigned long)data; if (task->tk_status < 0) { switch (task->tk_status) { @@ -2284,6 +2294,10 @@ renew_done(struct rpc_task *task) spin_unlock(&clp->cl_lock); } +static const struct rpc_call_ops nfs4_renew_ops = { + .rpc_call_done = nfs4_renew_done, +}; + int nfs4_proc_async_renew(struct nfs4_client *clp) { @@ -2294,7 +2308,7 @@ nfs4_proc_async_renew(struct nfs4_client *clp) }; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - renew_done, (void *)jiffies); + &nfs4_renew_ops, (void *)jiffies); } int @@ -2866,15 +2880,16 @@ static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata) } } -static void nfs4_locku_complete(struct nfs4_unlockdata *calldata) +static void nfs4_locku_complete(void *data) { + struct nfs4_unlockdata *calldata = data; complete(&calldata->completion); nfs4_locku_release_calldata(calldata); } -static void nfs4_locku_done(struct rpc_task *task) +static void nfs4_locku_done(struct rpc_task *task, void *data) { - struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; + struct nfs4_unlockdata *calldata = data; nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid); switch (task->tk_status) { @@ -2890,10 +2905,8 @@ static void nfs4_locku_done(struct rpc_task *task) default: if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) { rpc_restart_call(task); - return; } } - nfs4_locku_complete(calldata); } static void nfs4_locku_begin(struct rpc_task *task) @@ -2911,14 +2924,18 @@ static void nfs4_locku_begin(struct rpc_task *task) if (status != 0) return; if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { - nfs4_locku_complete(calldata); - task->tk_exit = NULL; - rpc_exit(task, 0); + /* Note: exit _without_ running nfs4_locku_done */ + task->tk_action = NULL; return; } rpc_call_setup(task, &msg, 0); } +static const struct rpc_call_ops nfs4_locku_ops = { + .rpc_call_done = nfs4_locku_done, + .rpc_release = nfs4_locku_complete, +}; + static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs4_unlockdata *calldata; @@ -2963,7 +2980,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * init_completion(&calldata->completion); status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin, - nfs4_locku_done, calldata); + &nfs4_locku_ops, calldata); if (status == 0) wait_for_completion_interruptible(&calldata->completion); do_vfs_lock(request->fl_file, request); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e1e3ca5d746b..6145e82b45e8 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -547,10 +547,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); -static void -nfs_read_done(struct rpc_task *task) +static void 
nfs_read_done(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; + struct nfs_read_data *data = calldata; if (task->tk_status >= 0) { nfs_refresh_inode(data->inode, data->res.fattr); @@ -560,9 +559,14 @@ nfs_read_done(struct rpc_task *task) if (data->args.offset + data->args.count >= data->res.fattr->size) data->res.eof = 1; } - nfs_readpage_result(task); + nfs_readpage_result(task, calldata); } +static const struct rpc_call_ops nfs_read_ops = { + .rpc_call_done = nfs_read_done, + .rpc_release = nfs_readdata_release, +}; + static void nfs_proc_read_setup(struct nfs_read_data *data) { @@ -580,20 +584,24 @@ nfs_proc_read_setup(struct nfs_read_data *data) flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs_read_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_read_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs_write_done(struct rpc_task *task) +static void nfs_write_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_writeback_done(task); + nfs_writeback_done(task, calldata); } +static const struct rpc_call_ops nfs_write_ops = { + .rpc_call_done = nfs_write_done, + .rpc_release = nfs_writedata_release, +}; + static void nfs_proc_write_setup(struct nfs_write_data *data, int how) { @@ -614,7 +622,7 @@ nfs_proc_write_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs_write_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_write_ops, data); rpc_call_setup(task, &msg, 0); } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5f20eafba8ec..21486242c3d3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -42,9 +42,8 @@ mempool_t *nfs_rdata_mempool; #define MIN_POOL_READ (32) -void nfs_readdata_release(struct rpc_task *task) +void nfs_readdata_release(void *data) { - struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; nfs_readdata_free(data); } @@ -220,9 +219,6 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, NFS_PROTO(inode)->read_setup(data); data->task.tk_cookie = (unsigned long)inode; - data->task.tk_calldata = data; - /* Release requests */ - data->task.tk_release = nfs_readdata_release; dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, @@ -452,9 +448,9 @@ static void nfs_readpage_result_full(struct nfs_read_data *data, int status) * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). */ -void nfs_readpage_result(struct rpc_task *task) +void nfs_readpage_result(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; + struct nfs_read_data *data = calldata; struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; int status = task->tk_status; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index d639d172d568..1494484ba86d 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -116,10 +116,9 @@ nfs_async_unlink_init(struct rpc_task *task) * * Do the directory attribute update. 
*/ -static void -nfs_async_unlink_done(struct rpc_task *task) +static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) { - struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; + struct nfs_unlinkdata *data = calldata; struct dentry *dir = data->dir; struct inode *dir_i; @@ -141,13 +140,17 @@ nfs_async_unlink_done(struct rpc_task *task) * We need to call nfs_put_unlinkdata as a 'tk_release' task since the * rpc_task would be freed too. */ -static void -nfs_async_unlink_release(struct rpc_task *task) +static void nfs_async_unlink_release(void *calldata) { - struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; + struct nfs_unlinkdata *data = calldata; nfs_put_unlinkdata(data); } +static const struct rpc_call_ops nfs_unlink_ops = { + .rpc_call_done = nfs_async_unlink_done, + .rpc_release = nfs_async_unlink_release, +}; + /** * nfs_async_unlink - asynchronous unlinking of a file * @dentry: dentry to unlink @@ -179,10 +182,8 @@ nfs_async_unlink(struct dentry *dentry) data->count = 1; task = &data->task; - rpc_init_task(task, clnt, nfs_async_unlink_done , RPC_TASK_ASYNC); - task->tk_calldata = data; + rpc_init_task(task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data); task->tk_action = nfs_async_unlink_init; - task->tk_release = nfs_async_unlink_release; spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NFSFS_RENAMED; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 95d00f9132d0..80bc4ea1b824 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -104,9 +104,8 @@ static inline void nfs_commit_free(struct nfs_write_data *p) mempool_free(p, nfs_commit_mempool); } -static void nfs_writedata_release(struct rpc_task *task) +void nfs_writedata_release(void *wdata) { - struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; nfs_writedata_free(wdata); } @@ -871,9 +870,6 @@ static void nfs_write_rpcsetup(struct nfs_page *req, data->task.tk_priority = flush_task_priority(how); data->task.tk_cookie = (unsigned long)inode; - data->task.tk_calldata = data; - /* Release requests */ - data->task.tk_release = nfs_writedata_release; dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, @@ -1131,9 +1127,9 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status) /* * This function is called when the WRITE call is complete. 
*/ -void nfs_writeback_done(struct rpc_task *task) +void nfs_writeback_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; @@ -1200,9 +1196,8 @@ void nfs_writeback_done(struct rpc_task *task) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -static void nfs_commit_release(struct rpc_task *task) +void nfs_commit_release(void *wdata) { - struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; nfs_commit_free(wdata); } @@ -1238,9 +1233,6 @@ static void nfs_commit_rpcsetup(struct list_head *head, data->task.tk_priority = flush_task_priority(how); data->task.tk_cookie = (unsigned long)inode; - data->task.tk_calldata = data; - /* Release requests */ - data->task.tk_release = nfs_commit_release; dprintk("NFS: %4d initiated commit call\n", data->task.tk_pid); } @@ -1277,10 +1269,9 @@ nfs_commit_list(struct list_head *head, int how) /* * COMMIT call returned */ -void -nfs_commit_done(struct rpc_task *task) +void nfs_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata; + struct nfs_write_data *data = calldata; struct nfs_page *req; int res = 0; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 583c0710e45e..cf92008f219a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -53,7 +53,7 @@ #define NFSPROC4_CB_COMPOUND 1 /* declarations */ -static void nfs4_cb_null(struct rpc_task *task); +static const struct rpc_call_ops nfs4_cb_null_ops; /* Index of predefined Linux callback client operations */ @@ -447,7 +447,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) msg.rpc_cred = nfsd4_lookupcred(clp,0); if (IS_ERR(msg.rpc_cred)) goto out_rpciod; - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL); + status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); put_rpccred(msg.rpc_cred); if (status != 0) { @@ -469,7 +469,7 @@ out_err: } static void -nfs4_cb_null(struct rpc_task *task) +nfs4_cb_null(struct rpc_task *task, void *dummy) { struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; struct nfs4_callback *cb = &clp->cl_callback; @@ -488,6 +488,10 @@ out: put_nfs4_client(clp); } +static const struct rpc_call_ops nfs4_cb_null_ops = { + .rpc_call_done = nfs4_cb_null, +}; + /* * called with dp->dl_count inc'ed. * nfs4_lock_state() may or may not have been called. 
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 16d4e5a08e1d..95c8fea293ba 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -172,7 +172,7 @@ extern struct nlm_host *nlm_find_client(void); /* * Server-side lock handling */ -int nlmsvc_async_call(struct nlm_rqst *, u32, rpc_action); +int nlmsvc_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *); u32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_lock *, int, struct nlm_cookie *); u32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2516adeccecf..4dff705d2ff2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -406,10 +406,12 @@ extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -extern void nfs_writeback_done(struct rpc_task *task); +extern void nfs_writeback_done(struct rpc_task *task, void *data); +extern void nfs_writedata_release(void *data); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -extern void nfs_commit_done(struct rpc_task *); +extern void nfs_commit_done(struct rpc_task *, void *data); +extern void nfs_commit_release(void *data); #endif /* @@ -481,7 +483,9 @@ static inline void nfs_writedata_free(struct nfs_write_data *p) extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); -extern void nfs_readpage_result(struct rpc_task *); +extern void nfs_readpage_result(struct rpc_task *, void *); +extern void nfs_readdata_release(void *data); + /* * Allocate and free nfs_read_data structures @@ -501,8 +505,6 @@ static inline void nfs_readdata_free(struct nfs_read_data *p) mempool_free(p, nfs_rdata_mempool); } -extern void nfs_readdata_release(struct rpc_task *task); - /* * linux/fs/nfs3proc.c */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ab151bbb66df..b0ab959eca65 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -126,7 +126,8 @@ int rpc_register(u32, u32, int, unsigned short, int *); void rpc_call_setup(struct rpc_task *, struct rpc_message *, int); int rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, - int flags, rpc_action callback, void *clntdata); + int flags, const struct rpc_call_ops *tk_ops, + void *calldata); int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags); void rpc_restart_call(struct rpc_task *); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 4c4b2dc8aca5..581d8cdc3b86 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -27,6 +27,7 @@ struct rpc_message { struct rpc_cred * rpc_cred; /* Credentials */ }; +struct rpc_call_ops; struct rpc_wait_queue; struct rpc_wait { struct list_head list; /* wait queue links */ @@ -61,13 +62,12 @@ struct rpc_task { * timeout_fn to be executed by timer bottom half * callback to be executed after waking up * action next procedure for async tasks - * exit exit async task and report to caller + * tk_ops caller callbacks */ void (*tk_timeout_fn)(struct rpc_task *); void (*tk_callback)(struct rpc_task *); void (*tk_action)(struct rpc_task *); - void (*tk_exit)(struct rpc_task *); - void (*tk_release)(struct 
rpc_task *); + const struct rpc_call_ops *tk_ops; void * tk_calldata; /* @@ -111,6 +111,12 @@ struct rpc_task { typedef void (*rpc_action)(struct rpc_task *); +struct rpc_call_ops { + void (*rpc_call_done)(struct rpc_task *, void *); + void (*rpc_release)(void *); +}; + + /* * RPC task flags */ @@ -228,10 +234,12 @@ struct rpc_wait_queue { /* * Function prototypes */ -struct rpc_task *rpc_new_task(struct rpc_clnt *, rpc_action, int flags); +struct rpc_task *rpc_new_task(struct rpc_clnt *, int flags, + const struct rpc_call_ops *ops, void *data); struct rpc_task *rpc_new_child(struct rpc_clnt *, struct rpc_task *parent); -void rpc_init_task(struct rpc_task *, struct rpc_clnt *, - rpc_action exitfunc, int flags); +void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, + int flags, const struct rpc_call_ops *ops, + void *data); void rpc_release_task(struct rpc_task *); void rpc_exit_task(struct rpc_task *); void rpc_killall_tasks(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6ab4cbd8a901..8b2f75bc006d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -374,10 +374,14 @@ out: * Default callback for async RPC calls */ static void -rpc_default_callback(struct rpc_task *task) +rpc_default_callback(struct rpc_task *task, void *data) { } +static const struct rpc_call_ops rpc_default_ops = { + .rpc_call_done = rpc_default_callback, +}; + /* * Export the signal mask handling for synchronous code that * sleeps on RPC calls @@ -432,7 +436,7 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) BUG_ON(flags & RPC_TASK_ASYNC); status = -ENOMEM; - task = rpc_new_task(clnt, NULL, flags); + task = rpc_new_task(clnt, flags, &rpc_default_ops, NULL); if (task == NULL) goto out; @@ -459,7 +463,7 @@ out: */ int rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, - rpc_action callback, void *data) + const struct rpc_call_ops *tk_ops, void *data) { struct rpc_task *task; sigset_t oldset; @@ -472,12 +476,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, flags |= RPC_TASK_ASYNC; /* Create/initialize a new RPC task */ - if (!callback) - callback = rpc_default_callback; status = -ENOMEM; - if (!(task = rpc_new_task(clnt, callback, flags))) + if (!(task = rpc_new_task(clnt, flags, tk_ops, data))) goto out; - task->tk_calldata = data; /* Mask signals on GSS_AUTH upcalls */ rpc_task_sigmask(task, &oldset); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3fcf7b0e1f6c..8d6233d3248b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -555,13 +555,13 @@ __rpc_atrun(struct rpc_task *task) } /* - * Helper that calls task->tk_exit if it exists + * Helper that calls task->tk_ops->rpc_call_done if it exists */ void rpc_exit_task(struct rpc_task *task) { task->tk_action = NULL; - if (task->tk_exit != NULL) { - task->tk_exit(task); + if (task->tk_ops->rpc_call_done != NULL) { + task->tk_ops->rpc_call_done(task, task->tk_calldata); if (task->tk_action != NULL) { WARN_ON(RPC_ASSASSINATED(task)); /* Always release the RPC slot and buffer memory */ @@ -747,7 +747,7 @@ rpc_free(struct rpc_task *task) /* * Creation and deletion of RPC task structures */ -void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action callback, int flags) +void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata) { memset(task, 0, sizeof(*task)); init_timer(&task->tk_timer); @@ -755,7 +755,8 @@ void rpc_init_task(struct rpc_task 
*task, struct rpc_clnt *clnt, rpc_action call task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer; task->tk_client = clnt; task->tk_flags = flags; - task->tk_exit = callback; + task->tk_ops = tk_ops; + task->tk_calldata = calldata; /* Initialize retry counters */ task->tk_garb_retry = 2; @@ -784,6 +785,8 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call list_add_tail(&task->tk_task, &all_tasks); spin_unlock(&rpc_sched_lock); + BUG_ON(task->tk_ops == NULL); + dprintk("RPC: %4d new task procpid %d\n", task->tk_pid, current->pid); } @@ -794,8 +797,7 @@ rpc_alloc_task(void) return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); } -static void -rpc_default_free_task(struct rpc_task *task) +static void rpc_free_task(struct rpc_task *task) { dprintk("RPC: %4d freeing task\n", task->tk_pid); mempool_free(task, rpc_task_mempool); @@ -806,8 +808,7 @@ rpc_default_free_task(struct rpc_task *task) * clean up after an allocation failure, as the client may * have specified "oneshot". */ -struct rpc_task * -rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) +struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata) { struct rpc_task *task; @@ -815,10 +816,7 @@ rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) if (!task) goto cleanup; - rpc_init_task(task, clnt, callback, flags); - - /* Replace tk_release */ - task->tk_release = rpc_default_free_task; + rpc_init_task(task, clnt, flags, tk_ops, calldata); dprintk("RPC: %4d allocated task\n", task->tk_pid); task->tk_flags |= RPC_TASK_DYNAMIC; @@ -838,6 +836,8 @@ cleanup: void rpc_release_task(struct rpc_task *task) { + const struct rpc_call_ops *tk_ops = task->tk_ops; + void *calldata = task->tk_calldata; dprintk("RPC: %4d release task\n", task->tk_pid); #ifdef RPC_DEBUG @@ -869,8 +869,10 @@ void rpc_release_task(struct rpc_task *task) #ifdef RPC_DEBUG task->tk_magic = 0; #endif - if (task->tk_release) - task->tk_release(task); + if (task->tk_flags & RPC_TASK_DYNAMIC) + rpc_free_task(task); + if (tk_ops->rpc_release) + tk_ops->rpc_release(calldata); } /** @@ -883,12 +885,11 @@ void rpc_release_task(struct rpc_task *task) * * Caller must hold childq.lock */ -static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) +static inline struct rpc_task *rpc_find_parent(struct rpc_task *child, struct rpc_task *parent) { - struct rpc_task *task, *parent; + struct rpc_task *task; struct list_head *le; - parent = (struct rpc_task *) child->tk_calldata; task_for_each(task, le, &childq.tasks[0]) if (task == parent) return parent; @@ -896,18 +897,22 @@ static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) return NULL; } -static void rpc_child_exit(struct rpc_task *child) +static void rpc_child_exit(struct rpc_task *child, void *calldata) { struct rpc_task *parent; spin_lock_bh(&childq.lock); - if ((parent = rpc_find_parent(child)) != NULL) { + if ((parent = rpc_find_parent(child, calldata)) != NULL) { parent->tk_status = child->tk_status; __rpc_wake_up_task(parent); } spin_unlock_bh(&childq.lock); } +static const struct rpc_call_ops rpc_child_ops = { + .rpc_call_done = rpc_child_exit, +}; + /* * Note: rpc_new_task releases the client after a failure. 
*/ @@ -916,11 +921,9 @@ rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent) { struct rpc_task *task; - task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC | RPC_TASK_CHILD); + task = rpc_new_task(clnt, RPC_TASK_ASYNC | RPC_TASK_CHILD, &rpc_child_ops, parent); if (!task) goto fail; - task->tk_exit = rpc_child_exit; - task->tk_calldata = parent; return task; fail: @@ -1056,7 +1059,7 @@ void rpc_show_tasks(void) return; } printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " - "-rpcwait -action- --exit--\n"); + "-rpcwait -action- ---ops--\n"); alltask_for_each(t, le, &all_tasks) { const char *rpc_waitq = "none"; @@ -1071,7 +1074,7 @@ void rpc_show_tasks(void) (t->tk_client ? t->tk_client->cl_prog : 0), t->tk_rqstp, t->tk_timeout, rpc_waitq, - t->tk_action, t->tk_exit); + t->tk_action, t->tk_ops); } spin_unlock(&rpc_sched_lock); } -- cgit v1.2.3 From 4ce70ada1ff1d0b80916ec9ec5764ce44a50a54f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:05 +0100 Subject: SUNRPC: Further cleanups Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 21 +++++++++++---------- fs/nfs/unlink.c | 13 +++++-------- include/linux/sunrpc/sched.h | 1 + net/sunrpc/sched.c | 10 ++++++++++ 4 files changed, 27 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3d5d3c07d621..368b75b3bcba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -195,14 +195,13 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf } /* Helper for asynchronous RPC calls */ -static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin, +static int nfs4_call_async(struct rpc_clnt *clnt, const struct rpc_call_ops *tk_ops, void *calldata) { struct rpc_task *task; if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata))) return -ENOMEM; - task->tk_action = tk_begin; rpc_execute(task); return 0; } @@ -882,6 +881,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); + if (RPC_ASSASSINATED(task)) + return; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. 
we keep this around to process errors */ @@ -904,9 +905,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) nfs_refresh_inode(calldata->inode, calldata->res.fattr); } -static void nfs4_close_begin(struct rpc_task *task) +static void nfs4_close_prepare(struct rpc_task *task, void *data) { - struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; + struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], @@ -944,6 +945,7 @@ static void nfs4_close_begin(struct rpc_task *task) } static const struct rpc_call_ops nfs4_close_ops = { + .rpc_call_prepare = nfs4_close_prepare, .rpc_call_done = nfs4_close_done, .rpc_release = nfs4_free_closedata, }; @@ -980,8 +982,7 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state) calldata->res.fattr = &calldata->fattr; calldata->res.server = server; - status = nfs4_call_async(server->client, nfs4_close_begin, - &nfs4_close_ops, calldata); + status = nfs4_call_async(server->client, &nfs4_close_ops, calldata); if (status == 0) goto out; @@ -2909,9 +2910,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) } } -static void nfs4_locku_begin(struct rpc_task *task) +static void nfs4_locku_prepare(struct rpc_task *task, void *data) { - struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; + struct nfs4_unlockdata *calldata = data; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], .rpc_argp = &calldata->arg, @@ -2932,6 +2933,7 @@ static void nfs4_locku_begin(struct rpc_task *task) } static const struct rpc_call_ops nfs4_locku_ops = { + .rpc_call_prepare = nfs4_locku_prepare, .rpc_call_done = nfs4_locku_done, .rpc_release = nfs4_locku_complete, }; @@ -2979,8 +2981,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * atomic_set(&calldata->refcount, 2); init_completion(&calldata->completion); - status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin, - &nfs4_locku_ops, calldata); + status = nfs4_call_async(NFS_SERVER(inode)->client, &nfs4_locku_ops, calldata); if (status == 0) wait_for_completion_interruptible(&calldata->completion); do_vfs_lock(request->fl_file, request); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 1494484ba86d..a65c7b53d558 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,10 +87,9 @@ nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) * We delay initializing RPC info until after the call to dentry_iput() * in order to minimize races against rename(). 
*/ -static void -nfs_async_unlink_init(struct rpc_task *task) +static void nfs_async_unlink_init(struct rpc_task *task, void *calldata) { - struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; + struct nfs_unlinkdata *data = calldata; struct dentry *dir = data->dir; struct rpc_message msg = { .rpc_cred = data->cred, @@ -147,6 +146,7 @@ static void nfs_async_unlink_release(void *calldata) } static const struct rpc_call_ops nfs_unlink_ops = { + .rpc_call_prepare = nfs_async_unlink_init, .rpc_call_done = nfs_async_unlink_done, .rpc_release = nfs_async_unlink_release, }; @@ -160,7 +160,6 @@ nfs_async_unlink(struct dentry *dentry) { struct dentry *dir = dentry->d_parent; struct nfs_unlinkdata *data; - struct rpc_task *task; struct rpc_clnt *clnt = NFS_CLIENT(dir->d_inode); int status = -ENOMEM; @@ -181,15 +180,13 @@ nfs_async_unlink(struct dentry *dentry) nfs_deletes = data; data->count = 1; - task = &data->task; - rpc_init_task(task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data); - task->tk_action = nfs_async_unlink_init; + rpc_init_task(&data->task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data); spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NFSFS_RENAMED; spin_unlock(&dentry->d_lock); - rpc_sleep_on(&nfs_delete_queue, task, NULL, NULL); + rpc_sleep_on(&nfs_delete_queue, &data->task, NULL, NULL); status = 0; out: return status; diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 581d8cdc3b86..ac1326fc3e1a 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -112,6 +112,7 @@ struct rpc_task { typedef void (*rpc_action)(struct rpc_task *); struct rpc_call_ops { + void (*rpc_call_prepare)(struct rpc_task *, void *); void (*rpc_call_done)(struct rpc_task *, void *); void (*rpc_release)(void *); }; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 8d6233d3248b..2d74a1672028 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -554,6 +554,14 @@ __rpc_atrun(struct rpc_task *task) rpc_wake_up_task(task); } +/* + * Helper to call task->tk_ops->rpc_call_prepare + */ +static void rpc_prepare_task(struct rpc_task *task) +{ + task->tk_ops->rpc_call_prepare(task, task->tk_calldata); +} + /* * Helper that calls task->tk_ops->rpc_call_done if it exists */ @@ -756,6 +764,8 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons task->tk_client = clnt; task->tk_flags = flags; task->tk_ops = tk_ops; + if (tk_ops->rpc_call_prepare != NULL) + task->tk_action = rpc_prepare_task; task->tk_calldata = calldata; /* Initialize retry counters */ -- cgit v1.2.3 From 44c288732fdbd7e38460d156a40d29590bf93bce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:06 +0100 Subject: NFSv4: stateful NFSv4 RPC call interface The NFSv4 model requires us to complete all RPC calls that might establish state on the server whether or not the user wants to interrupt it. We may also need to schedule new work (including new RPC calls) in order to cancel the new state. The asynchronous RPC model will allow us to ensure that RPC calls always complete, but in order to allow for "synchronous" RPC, we want to add the ability to wait for completion. The waits are, of course, interruptible. 
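As a rough sketch of the calling convention this enables (illustrative only; "clnt", "my_ops" and "my_data" stand in for a real RPC client, rpc_call_ops table and calldata):

	struct rpc_task *task;
	int status;

	/* Start the call; it runs to completion even if we are signalled */
	task = rpc_run_task(clnt, RPC_TASK_ASYNC, &my_ops, my_data);
	if (IS_ERR(task))
		return PTR_ERR(task);
	/* Interruptible wait for the RPC_TASK_ACTIVE bit to clear */
	status = rpc_wait_for_completion_task(task);
	/* Drop the extra reference that rpc_run_task() took for us */
	rpc_release_task(task);
	return status;

If the wait is interrupted, the task itself keeps running to completion, and its rpc_call_ops callbacks still fire, so any state cleanup can be driven from there.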
Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 - include/linux/sunrpc/sched.h | 21 ++++++++++-- net/sunrpc/sched.c | 78 +++++++++++++++++++++++++++++++++----------- 3 files changed, 78 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index a834423942c7..ae2be0744191 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -268,7 +268,6 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, NFS_PROTO(inode)->read_setup(data); data->task.tk_cookie = (unsigned long) inode; - data->task.tk_calldata = data; data->complete = nfs_direct_read_result; lock_kernel(); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ac1326fc3e1a..94b0afa4ab05 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -42,6 +42,7 @@ struct rpc_task { #ifdef RPC_DEBUG unsigned long tk_magic; /* 0xf00baa */ #endif + atomic_t tk_count; /* Reference count */ struct list_head tk_task; /* global list of tasks */ struct rpc_clnt * tk_client; /* RPC client */ struct rpc_rqst * tk_rqstp; /* RPC request */ @@ -78,7 +79,6 @@ struct rpc_task { struct timer_list tk_timer; /* kernel timer */ unsigned long tk_timeout; /* timeout for rpc_sleep() */ unsigned short tk_flags; /* misc flags */ - unsigned char tk_active : 1;/* Task has been activated */ unsigned char tk_priority : 2;/* Task priority */ unsigned long tk_runstate; /* Task run status */ struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could @@ -136,7 +136,6 @@ struct rpc_call_ops { #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) #define RPC_DO_ROOTOVERRIDE(t) ((t)->tk_flags & RPC_TASK_ROOTCREDS) #define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED) -#define RPC_IS_ACTIVATED(t) ((t)->tk_active) #define RPC_DO_CALLBACK(t) ((t)->tk_callback != NULL) #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) #define RPC_TASK_UNINTERRUPTIBLE(t) ((t)->tk_flags & RPC_TASK_NOINTR) @@ -145,6 +144,7 @@ struct rpc_call_ops { #define RPC_TASK_QUEUED 1 #define RPC_TASK_WAKEUP 2 #define RPC_TASK_HAS_TIMER 3 +#define RPC_TASK_ACTIVE 4 #define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) @@ -175,6 +175,15 @@ struct rpc_call_ops { smp_mb__after_clear_bit(); \ } while (0) +#define RPC_IS_ACTIVATED(t) (test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)) +#define rpc_set_active(t) (set_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)) +#define rpc_clear_active(t) \ + do { \ + smp_mb__before_clear_bit(); \ + clear_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ + } while(0) + /* * Task priorities. 
* Note: if you change these, you must also change @@ -237,6 +246,8 @@ struct rpc_wait_queue { */ struct rpc_task *rpc_new_task(struct rpc_clnt *, int flags, const struct rpc_call_ops *ops, void *data); +struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags, + const struct rpc_call_ops *ops, void *data); struct rpc_task *rpc_new_child(struct rpc_clnt *, struct rpc_task *parent); void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *ops, @@ -260,6 +271,7 @@ void * rpc_malloc(struct rpc_task *, size_t); int rpciod_up(void); void rpciod_down(void); void rpciod_wake_up(void); +int __rpc_wait_for_completion_task(struct rpc_task *task, int (*)(void *)); #ifdef RPC_DEBUG void rpc_show_tasks(void); #endif @@ -272,6 +284,11 @@ static inline void rpc_exit(struct rpc_task *task, int status) task->tk_action = rpc_exit_task; } +static inline int rpc_wait_for_completion_task(struct rpc_task *task) +{ + return __rpc_wait_for_completion_task(task, NULL); +} + #ifdef RPC_DEBUG static inline const char * rpc_qname(struct rpc_wait_queue *q) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 2d74a1672028..82d158dad16d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -264,6 +264,35 @@ void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) } EXPORT_SYMBOL(rpc_init_wait_queue); +static int rpc_wait_bit_interruptible(void *word) +{ + if (signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +/* + * Mark an RPC call as having completed by clearing the 'active' bit + */ +static inline void rpc_mark_complete_task(struct rpc_task *task) +{ + rpc_clear_active(task); + wake_up_bit(&task->tk_runstate, RPC_TASK_ACTIVE); +} + +/* + * Allow callers to wait for completion of an RPC call + */ +int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) +{ + if (action == NULL) + action = rpc_wait_bit_interruptible; + return wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, + action, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(__rpc_wait_for_completion_task); + /* * Make an RPC task runnable. * @@ -299,10 +328,7 @@ static void rpc_make_runnable(struct rpc_task *task) static inline void rpc_schedule_run(struct rpc_task *task) { - /* Don't run a child twice! */ - if (RPC_IS_ACTIVATED(task)) - return; - task->tk_active = 1; + rpc_set_active(task); rpc_make_runnable(task); } @@ -324,8 +350,7 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, } /* Mark the task as being activated if so needed */ - if (!RPC_IS_ACTIVATED(task)) - task->tk_active = 1; + rpc_set_active(task); __rpc_add_wait_queue(q, task); @@ -580,14 +605,6 @@ void rpc_exit_task(struct rpc_task *task) } EXPORT_SYMBOL(rpc_exit_task); -static int rpc_wait_bit_interruptible(void *word) -{ - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - /* * This is the RPC `scheduler' (or rather, the finite state machine). 
*/ @@ -680,6 +697,8 @@ static int __rpc_execute(struct rpc_task *task) dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); status = task->tk_status; + /* Wake up anyone who is waiting for task completion */ + rpc_mark_complete_task(task); /* Release all resources associated with the task */ rpc_release_task(task); return status; @@ -697,9 +716,7 @@ static int __rpc_execute(struct rpc_task *task) int rpc_execute(struct rpc_task *task) { - BUG_ON(task->tk_active); - - task->tk_active = 1; + rpc_set_active(task); rpc_set_running(task); return __rpc_execute(task); } @@ -761,6 +778,7 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons init_timer(&task->tk_timer); task->tk_timer.data = (unsigned long) task; task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer; + atomic_set(&task->tk_count, 1); task->tk_client = clnt; task->tk_flags = flags; task->tk_ops = tk_ops; @@ -848,11 +866,13 @@ void rpc_release_task(struct rpc_task *task) { const struct rpc_call_ops *tk_ops = task->tk_ops; void *calldata = task->tk_calldata; - dprintk("RPC: %4d release task\n", task->tk_pid); #ifdef RPC_DEBUG BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); #endif + if (!atomic_dec_and_test(&task->tk_count)) + return; + dprintk("RPC: %4d release task\n", task->tk_pid); /* Remove from global task list */ spin_lock(&rpc_sched_lock); @@ -860,7 +880,6 @@ void rpc_release_task(struct rpc_task *task) spin_unlock(&rpc_sched_lock); BUG_ON (RPC_IS_QUEUED(task)); - task->tk_active = 0; /* Synchronously delete any running timer */ rpc_delete_timer(task); @@ -885,6 +904,27 @@ void rpc_release_task(struct rpc_task *task) tk_ops->rpc_release(calldata); } +/** + * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it + * @clnt - pointer to RPC client + * @flags - RPC flags + * @ops - RPC call ops + * @data - user call data + */ +struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags, + const struct rpc_call_ops *ops, + void *data) +{ + struct rpc_task *task; + task = rpc_new_task(clnt, flags, ops, data); + if (task == NULL) + return ERR_PTR(-ENOMEM); + atomic_inc(&task->tk_count); + rpc_execute(task); + return task; +} +EXPORT_SYMBOL(rpc_run_task); + /** * rpc_find_parent - find the parent of a child task. * @child: child task -- cgit v1.2.3 From 06f814a3ad0ddfe19e6e4f44e3da5d490547faf3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:07 +0100 Subject: NFSv4: Make locku use the new RPC "wait on completion" interface. 
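Instead of juggling a private refcount and completion, the unlock path can now start the task with rpc_run_task() and wait on it. A minimal sketch of the wait helper this patch adds (see nfs4_wait_for_completion_rpc_task() in the diff below; "task" is the running unlock task):

	sigset_t oldset;
	int ret;

	/* Let the usual signals interrupt the wait... */
	rpc_clnt_sigmask(task->tk_client, &oldset);
	ret = rpc_wait_for_completion_task(task);
	/* ...then restore the caller's signal mask */
	rpc_clnt_sigunmask(task->tk_client, &oldset);

An interrupted wait only abandons the waiter; the LOCKU call itself still completes and frees its calldata through the rpc_release callback.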
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 65 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 368b75b3bcba..c7bec4319236 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -206,6 +206,17 @@ static int nfs4_call_async(struct rpc_clnt *clnt, return 0; } +static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) +{ + sigset_t oldset; + int ret; + + rpc_clnt_sigmask(task->tk_client, &oldset); + ret = rpc_wait_for_completion_task(task); + rpc_clnt_sigunmask(task->tk_client, &oldset); + return ret; +} + static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) { struct inode *inode = state->inode; @@ -2867,31 +2878,23 @@ struct nfs4_unlockdata { struct nfs_lockres res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; - atomic_t refcount; - struct completion completion; }; -static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata) -{ - if (atomic_dec_and_test(&calldata->refcount)) { - nfs_free_seqid(calldata->luargs.seqid); - nfs4_put_lock_state(calldata->lsp); - put_nfs_open_context(calldata->ctx); - kfree(calldata); - } -} - -static void nfs4_locku_complete(void *data) +static void nfs4_locku_release_calldata(void *data) { struct nfs4_unlockdata *calldata = data; - complete(&calldata->completion); - nfs4_locku_release_calldata(calldata); + nfs_free_seqid(calldata->luargs.seqid); + nfs4_put_lock_state(calldata->lsp); + put_nfs_open_context(calldata->ctx); + kfree(calldata); } static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + if (RPC_ASSASSINATED(task)) + return; nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid); switch (task->tk_status) { case 0: @@ -2935,7 +2938,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) static const struct rpc_call_ops nfs4_locku_ops = { .rpc_call_prepare = nfs4_locku_prepare, .rpc_call_done = nfs4_locku_done, - .rpc_release = nfs4_locku_complete, + .rpc_release = nfs4_locku_release_calldata, }; static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) @@ -2944,26 +2947,28 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); struct nfs4_lock_state *lsp; - int status; + struct rpc_task *task; + int status = 0; /* Is this a delegated lock? */ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - return do_vfs_lock(request->fl_file, request); + goto out; status = nfs4_set_lock_state(state, request); if (status != 0) - return status; + goto out; lsp = request->fl_u.nfs4_fl.owner; /* We might have lost the locks! */ if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) - return 0; + goto out; + status = -ENOMEM; calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); if (calldata == NULL) - return -ENOMEM; + goto out; calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); if (calldata->luargs.seqid == NULL) { kfree(calldata); - return -ENOMEM; + goto out; } calldata->luargs.stateid = &lsp->ls_stateid; calldata->arg.fh = NFS_FH(inode); @@ -2978,14 +2983,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * /* Ensure we don't close file until we're done freeing locks! 
*/ calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data); - atomic_set(&calldata->refcount, 2); - init_completion(&calldata->completion); - - status = nfs4_call_async(NFS_SERVER(inode)->client, &nfs4_locku_ops, calldata); - if (status == 0) - wait_for_completion_interruptible(&calldata->completion); + task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_locku_ops, calldata); + if (!IS_ERR(task)) { + status = nfs4_wait_for_completion_rpc_task(task); + rpc_release_task(task); + } else { + status = PTR_ERR(task); + nfs4_locku_release_calldata(calldata); + } +out: do_vfs_lock(request->fl_file, request); - nfs4_locku_release_calldata(calldata); return status; } -- cgit v1.2.3 From e56e0b78eb1097a8e06512b9ed4be94d7538e7ac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:08 +0100 Subject: NFSv4: Allocate OPEN call RPC arguments using kmalloc() Cleanup in preparation for making OPEN calls interruptible by the user. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 213 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 117 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c7bec4319236..4a5cc8402110 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -194,6 +194,76 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf spin_unlock(&inode->i_lock); } +struct nfs4_opendata { + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_fattr f_attr; + struct nfs_fattr dir_attr; + struct dentry *dentry; + struct dentry *dir; + struct nfs4_state_owner *owner; + struct iattr attrs; +}; + +static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, + struct nfs4_state_owner *sp, int flags, + const struct iattr *attrs) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + goto err; + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + if (p->o_arg.seqid == NULL) + goto err_free; + p->dentry = dget(dentry); + p->dir = parent; + p->owner = sp; + atomic_inc(&sp->so_count); + p->o_arg.fh = NFS_FH(dir); + p->o_arg.open_flags = flags, + p->o_arg.clientid = server->nfs4_state->cl_clientid; + p->o_arg.id = sp->so_id; + p->o_arg.name = &dentry->d_name; + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + p->o_res.f_attr = &p->f_attr; + p->o_res.dir_attr = &p->dir_attr; + p->o_res.server = server; + nfs_fattr_init(&p->f_attr); + nfs_fattr_init(&p->dir_attr); + if (flags & O_EXCL) { + u32 *s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; + } else if (flags & O_CREAT) { + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + } + return p; +err_free: + kfree(p); +err: + dput(parent); + return NULL; +} + +static void nfs4_opendata_free(struct nfs4_opendata *p) +{ + if (p != NULL) { + nfs_free_seqid(p->o_arg.seqid); + nfs4_put_state_owner(p->owner); + dput(p->dir); + dput(p->dentry); + kfree(p); + } +} + /* Helper for asynchronous RPC calls */ static int nfs4_call_async(struct rpc_clnt *clnt, const struct rpc_call_ops *tk_ops, void *calldata) @@ -314,57 +384,45 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state struct nfs4_state_owner *sp = state->owner; struct inode *inode = dentry->d_inode; struct 
nfs_server *server = NFS_SERVER(inode); - struct dentry *parent = dget_parent(dentry); - struct nfs_openargs arg = { - .fh = NFS_FH(parent->d_inode), - .clientid = server->nfs4_state->cl_clientid, - .name = &dentry->d_name, - .id = sp->so_id, - .server = server, - .bitmask = server->attr_bitmask, - .claim = NFS4_OPEN_CLAIM_DELEGATE_CUR, - }; - struct nfs_openres res = { - .server = server, - }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], - .rpc_argp = &arg, - .rpc_resp = &res, .rpc_cred = sp->so_cred, }; + struct nfs4_opendata *opendata; int status = 0; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) goto out; if (state->state == 0) goto out; - arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + opendata = nfs4_opendata_alloc(dentry, sp, state->state, NULL); status = -ENOMEM; - if (arg.seqid == NULL) + if (opendata == NULL) goto out; - arg.open_flags = state->state; - memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; + msg.rpc_argp = &opendata->o_arg; + msg.rpc_resp = &opendata->o_res; + memcpy(opendata->o_arg.u.delegation.data, state->stateid.data, + sizeof(opendata->o_arg.u.delegation.data)); status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs_increment_open_seqid(status, arg.seqid); + nfs_increment_open_seqid(status, opendata->o_arg.seqid); if (status != 0) goto out_free; - if(res.rflags & NFS4_OPEN_RESULT_CONFIRM) { + if(opendata->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode), - sp, &res.stateid, arg.seqid); + sp, &opendata->o_res.stateid, opendata->o_arg.seqid); if (status != 0) goto out_free; } nfs_confirm_seqid(&sp->so_seqid, 0); if (status >= 0) { - memcpy(state->stateid.data, res.stateid.data, + memcpy(state->stateid.data, opendata->o_res.stateid.data, sizeof(state->stateid.data)); clear_bit(NFS_DELEGATED_STATE, &state->flags); } out_free: - nfs_free_seqid(arg.seqid); + nfs4_opendata_free(opendata); out: - dput(parent); return status; } @@ -506,21 +564,8 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(dir); struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct nfs_fattr f_attr, dir_attr; - struct nfs_openargs o_arg = { - .fh = NFS_FH(dir), - .open_flags = state->state, - .name = &dentry->d_name, - .bitmask = server->attr_bitmask, - .claim = NFS4_OPEN_CLAIM_NULL, - }; - struct nfs_openres o_res = { - .f_attr = &f_attr, - .dir_attr = &dir_attr, - .server = server, - }; + struct nfs4_opendata *opendata; int status = 0; if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { @@ -531,38 +576,38 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st set_bit(NFS_DELEGATED_STATE, &state->flags); goto out; } - o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); status = -ENOMEM; - if (o_arg.seqid == NULL) + opendata = nfs4_opendata_alloc(dentry, sp, state->state, NULL); + if (opendata == NULL) goto out; - nfs_fattr_init(&f_attr); - nfs_fattr_init(&dir_attr); - status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); + status = _nfs4_proc_open(dir, sp, &opendata->o_arg, &opendata->o_res); if (status != 0) goto out_nodeleg; /* Check if files differ */ - if ((f_attr.mode & S_IFMT) != (inode->i_mode & S_IFMT)) + if ((opendata->f_attr.mode & 
S_IFMT) != (inode->i_mode & S_IFMT)) goto out_stale; /* Has the file handle changed? */ - if (nfs_compare_fh(&o_res.fh, NFS_FH(inode)) != 0) { + if (nfs_compare_fh(&opendata->o_res.fh, NFS_FH(inode)) != 0) { /* Verify if the change attributes are the same */ - if (f_attr.change_attr != NFS_I(inode)->change_attr) + if (opendata->f_attr.change_attr != NFS_I(inode)->change_attr) goto out_stale; - if (nfs_size_to_loff_t(f_attr.size) != inode->i_size) + if (nfs_size_to_loff_t(opendata->f_attr.size) != inode->i_size) goto out_stale; /* Lets just pretend that this is the same file */ - nfs_copy_fh(NFS_FH(inode), &o_res.fh); - NFS_I(inode)->fileid = f_attr.fileid; + nfs_copy_fh(NFS_FH(inode), &opendata->o_res.fh); + NFS_I(inode)->fileid = opendata->f_attr.fileid; } - memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); - if (o_res.delegation_type != 0) { + memcpy(&state->stateid, &opendata->o_res.stateid, sizeof(state->stateid)); + if (opendata->o_res.delegation_type != 0) { if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) - nfs_inode_set_delegation(inode, sp->so_cred, &o_res); + nfs_inode_set_delegation(inode, sp->so_cred, + &opendata->o_res); else - nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); + nfs_inode_reclaim_delegation(inode, sp->so_cred, + &opendata->o_res); } out_nodeleg: - nfs_free_seqid(o_arg.seqid); + nfs4_opendata_free(opendata); clear_bit(NFS_DELEGATED_STATE, &state->flags); out: dput(parent); @@ -706,21 +751,8 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st struct nfs_server *server = NFS_SERVER(dir); struct nfs4_client *clp = server->nfs4_state; struct inode *inode = NULL; + struct nfs4_opendata *opendata; int status; - struct nfs_fattr f_attr, dir_attr; - struct nfs_openargs o_arg = { - .fh = NFS_FH(dir), - .open_flags = flags, - .name = &dentry->d_name, - .server = server, - .bitmask = server->attr_bitmask, - .claim = NFS4_OPEN_CLAIM_NULL, - }; - struct nfs_openres o_res = { - .f_attr = &f_attr, - .dir_attr = &dir_attr, - .server = server, - }; /* Protect against reboot recovery conflicts */ down_read(&clp->cl_sem); @@ -729,45 +761,34 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } - if (flags & O_EXCL) { - u32 *p = (u32 *) o_arg.u.verifier.data; - p[0] = jiffies; - p[1] = current->pid; - } else - o_arg.u.attrs = sattr; - /* Serialization for the sequence id */ + opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr); + if (opendata == NULL) + goto err_put_state_owner; - o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); - if (o_arg.seqid == NULL) - return -ENOMEM; - nfs_fattr_init(&f_attr); - nfs_fattr_init(&dir_attr); - status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); + status = _nfs4_proc_open(dir, sp, &opendata->o_arg, &opendata->o_res); if (status != 0) - goto out_err; + goto err_opendata_free; status = -ENOMEM; - inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr); + inode = nfs_fhget(dir->i_sb, &opendata->o_res.fh, &opendata->f_attr); if (!inode) - goto out_err; + goto err_opendata_free; state = nfs4_get_open_state(inode, sp); if (!state) - goto out_err; - update_open_stateid(state, &o_res.stateid, flags); - if (o_res.delegation_type != 0) - nfs_inode_set_delegation(inode, cred, &o_res); - nfs_free_seqid(o_arg.seqid); + goto err_opendata_free; + update_open_stateid(state, &opendata->o_res.stateid, flags); + if (opendata->o_res.delegation_type != 0) + nfs_inode_set_delegation(inode, cred, 
&opendata->o_res); + nfs4_opendata_free(opendata); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); *res = state; return 0; +err_opendata_free: + nfs4_opendata_free(opendata); +err_put_state_owner: + nfs4_put_state_owner(sp); out_err: - if (sp != NULL) { - if (state != NULL) - nfs4_put_open_state(state); - nfs_free_seqid(o_arg.seqid); - nfs4_put_state_owner(sp); - } /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ up_read(&clp->cl_sem); if (inode != NULL) -- cgit v1.2.3 From 24ac23ab88df5b21b5b2df8cde748bf99b289099 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:11 +0100 Subject: NFSv4: Convert open() into an asynchronous RPC call OPEN is a stateful operation, so we must ensure that it always completes. In order to allow users to interrupt the operation, we need to make the RPC call asynchronous, and then wait on completion (or cancel). Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 193 ++++++++++++++++++++++++++++++++++++------------------ fs/nfs/nfs4xdr.c | 3 - 2 files changed, 130 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4a5cc8402110..aed8701c1a36 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -195,6 +195,7 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf } struct nfs4_opendata { + atomic_t count; struct nfs_openargs o_arg; struct nfs_openres o_res; struct nfs_fattr f_attr; @@ -203,6 +204,8 @@ struct nfs4_opendata { struct dentry *dir; struct nfs4_state_owner *owner; struct iattr attrs; + int rpc_status; + int cancelled; }; static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, @@ -220,6 +223,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); if (p->o_arg.seqid == NULL) goto err_free; + atomic_set(&p->count, 1); p->dentry = dget(dentry); p->dir = parent; p->owner = sp; @@ -255,7 +259,7 @@ err: static void nfs4_opendata_free(struct nfs4_opendata *p) { - if (p != NULL) { + if (p != NULL && atomic_dec_and_test(&p->count)) { nfs_free_seqid(p->o_arg.seqid); nfs4_put_state_owner(p->owner); dput(p->dir); @@ -305,6 +309,26 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, spin_unlock(&state->owner->so_lock); } +static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode; + struct nfs4_state *state = NULL; + + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + goto out; + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + if (inode == NULL) + goto out; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto put_inode; + update_open_stateid(state, &data->o_res.stateid, data->o_arg.open_flags); +put_inode: + iput(inode); +out: + return state; +} + /* * OPEN_RECLAIM: * reclaim state on the server after a reboot. 
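The atomic_t count and cancelled fields added in this hunk carry the whole async conversion: the waiting caller and the RPC state machine each hold a reference to the opendata, whichever side finishes last frees it, and an interrupted caller merely sets cancelled and walks away while the release callback closes any state the OPEN managed to establish. A minimal userspace sketch of that lifetime rule, using C11 atomics and invented names (opendata_alloc, rpc_release and friends are illustrative stand-ins, not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct opendata {
	atomic_int count;	/* references: caller + async task */
	int rpc_status;
	int cancelled;
};

static struct opendata *opendata_alloc(void)
{
	struct opendata *p = calloc(1, sizeof(*p));
	if (p != NULL)
		atomic_init(&p->count, 1);	/* the caller's reference */
	return p;
}

static void opendata_free(struct opendata *p)
{
	/* free only when the last reference is dropped */
	if (p != NULL && atomic_fetch_sub(&p->count, 1) == 1)
		free(p);
}

/* models the rpc_release callback: runs when the async task finishes */
static void rpc_release(struct opendata *p)
{
	if (p->cancelled && p->rpc_status == 0)
		puts("caller gave up: release path must clean up the open");
	opendata_free(p);
}

int main(void)
{
	struct opendata *p = opendata_alloc();

	if (p == NULL)
		return 1;
	atomic_fetch_add(&p->count, 1);	/* extra reference for the task */
	p->cancelled = 1;		/* caller interrupted mid-wait */
	opendata_free(p);		/* caller drops its reference... */
	rpc_release(p);			/* ...the task side drops the last one */
	return 0;
}

Because atomic_fetch_sub() returns the old value, a result of 1 means this call dropped the final reference, so the free happens exactly once no matter which side finishes last.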
@@ -473,41 +497,105 @@ static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *f return status; } -static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, struct nfs_openargs *o_arg, struct nfs_openres *o_res) +static void nfs4_open_prepare(struct rpc_task *task, void *calldata) { - struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *data = calldata; + struct nfs4_state_owner *sp = data->owner; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN], - .rpc_argp = o_arg, - .rpc_resp = o_res, + .rpc_argp = &data->o_arg, + .rpc_resp = &data->o_res, .rpc_cred = sp->so_cred, }; - int status; + + if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) + return; + /* Update sequence id. */ + data->o_arg.id = sp->so_id; + data->o_arg.clientid = sp->so_client->cl_clientid; + rpc_call_setup(task, &msg, 0); +} - /* Update sequence id. The caller must serialize! */ - o_arg->id = sp->so_id; - o_arg->clientid = sp->so_client->cl_clientid; +static void nfs4_open_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - if (status == 0) { - /* OPEN on anything except a regular file is disallowed in NFSv4 */ - switch (o_res->f_attr->mode & S_IFMT) { + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) + return; + if (task->tk_status == 0) { + switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: break; case S_IFLNK: - status = -ELOOP; + data->rpc_status = -ELOOP; break; case S_IFDIR: - status = -EISDIR; + data->rpc_status = -EISDIR; break; default: - status = -ENOTDIR; + data->rpc_status = -ENOTDIR; } } + nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); +} + +static void nfs4_open_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0) + goto out_free; + /* In case we need an open_confirm, no cleanup! 
*/ + if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) + goto out_free; + nfs_confirm_seqid(&data->owner->so_seqid, 0); + state = nfs4_opendata_to_nfs4_state(data); + if (state != NULL) + nfs4_close_state(state, data->o_arg.open_flags); +out_free: + nfs4_opendata_free(data); +} + +static const struct rpc_call_ops nfs4_open_ops = { + .rpc_call_prepare = nfs4_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; - nfs_increment_open_seqid(status, o_arg->seqid); +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + struct rpc_task *task; + int status; + + atomic_inc(&data->count); + task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data); + if (IS_ERR(task)) { + nfs4_opendata_free(data); + return PTR_ERR(task); + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_release_task(task); if (status != 0) - goto out; + return status; + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); nfs_post_op_update_inode(dir, o_res->dir_attr); @@ -515,15 +603,14 @@ static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, stru nfs_refresh_inode(dir, o_res->dir_attr); if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(server->client, &o_res->fh, - sp, &o_res->stateid, o_arg->seqid); + data->owner, &o_res->stateid, o_arg->seqid); if (status != 0) - goto out; + return status; } - nfs_confirm_seqid(&sp->so_seqid, 0); + nfs_confirm_seqid(&data->owner->so_seqid, 0); if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); -out: - return status; + return server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); + return 0; } static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags) @@ -562,14 +649,15 @@ out: static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; struct inode *inode = state->inode; struct nfs_delegation *delegation = NFS_I(inode)->delegation; struct nfs4_opendata *opendata; + struct nfs4_state *newstate; + int openflags = state->state & (FMODE_READ|FMODE_WRITE); int status = 0; if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { - status = _nfs4_do_access(inode, sp->so_cred, state->state); + status = _nfs4_do_access(inode, sp->so_cred, openflags); if (status < 0) goto out; memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid)); @@ -577,27 +665,15 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st goto out; } status = -ENOMEM; - opendata = nfs4_opendata_alloc(dentry, sp, state->state, NULL); + opendata = nfs4_opendata_alloc(dentry, sp, openflags, NULL); if (opendata == NULL) goto out; - status = _nfs4_proc_open(dir, sp, &opendata->o_arg, &opendata->o_res); + status = _nfs4_proc_open(opendata); if (status != 0) goto out_nodeleg; - /* Check if files differ */ - if ((opendata->f_attr.mode & S_IFMT) != (inode->i_mode & S_IFMT)) + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (newstate != state) goto out_stale; - 
/* Has the file handle changed? */ - if (nfs_compare_fh(&opendata->o_res.fh, NFS_FH(inode)) != 0) { - /* Verify if the change attributes are the same */ - if (opendata->f_attr.change_attr != NFS_I(inode)->change_attr) - goto out_stale; - if (nfs_size_to_loff_t(opendata->f_attr.size) != inode->i_size) - goto out_stale; - /* Lets just pretend that this is the same file */ - nfs_copy_fh(NFS_FH(inode), &opendata->o_res.fh); - NFS_I(inode)->fileid = opendata->f_attr.fileid; - } - memcpy(&state->stateid, &opendata->o_res.stateid, sizeof(state->stateid)); if (opendata->o_res.delegation_type != 0) { if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) nfs_inode_set_delegation(inode, sp->so_cred, @@ -606,6 +682,8 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st nfs_inode_reclaim_delegation(inode, sp->so_cred, &opendata->o_res); } +out_close_state: + nfs4_close_state(newstate, openflags); out_nodeleg: nfs4_opendata_free(opendata); clear_bit(NFS_DELEGATED_STATE, &state->flags); @@ -618,7 +696,7 @@ out_stale: nfs4_drop_state_owner(sp); d_drop(dentry); /* Should we be trying to close that stateid? */ - goto out_nodeleg; + goto out_close_state; } static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) @@ -656,7 +734,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta } /* - * Returns an nfs4_state + an extra reference to the inode + * Returns a referenced nfs4_state if there is an open delegation on the file */ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res) { @@ -709,7 +787,6 @@ out_ok: nfs4_put_state_owner(sp); up_read(&nfsi->rwsem); up_read(&clp->cl_sem); - igrab(inode); *res = state; return 0; out_err: @@ -742,7 +819,7 @@ static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, st } /* - * Returns an nfs4_state + an referenced inode + * Returns a referenced nfs4_state */ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) { @@ -750,7 +827,6 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_client *clp = server->nfs4_state; - struct inode *inode = NULL; struct nfs4_opendata *opendata; int status; @@ -765,20 +841,16 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st if (opendata == NULL) goto err_put_state_owner; - status = _nfs4_proc_open(dir, sp, &opendata->o_arg, &opendata->o_res); + status = _nfs4_proc_open(opendata); if (status != 0) goto err_opendata_free; status = -ENOMEM; - inode = nfs_fhget(dir->i_sb, &opendata->o_res.fh, &opendata->f_attr); - if (!inode) - goto err_opendata_free; - state = nfs4_get_open_state(inode, sp); - if (!state) + state = nfs4_opendata_to_nfs4_state(opendata); + if (state == NULL) goto err_opendata_free; - update_open_stateid(state, &opendata->o_res.stateid, flags); if (opendata->o_res.delegation_type != 0) - nfs_inode_set_delegation(inode, cred, &opendata->o_res); + nfs_inode_set_delegation(state->inode, cred, &opendata->o_res); nfs4_opendata_free(opendata); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); @@ -791,8 +863,6 @@ err_put_state_owner: out_err: /* Note: clp->cl_sem must be released before nfs4_put_open_state()! 
*/ up_read(&clp->cl_sem); - if (inode != NULL) - iput(inode); *res = NULL; return status; } @@ -1066,7 +1136,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) d_add(dentry, NULL); return (struct dentry *)state; } - res = d_add_unique(dentry, state->inode); + res = d_add_unique(dentry, igrab(state->inode)); if (res != NULL) dentry = res; nfs4_intent_set_file(nd, dentry, state); @@ -1078,7 +1148,6 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st { struct rpc_cred *cred; struct nfs4_state *state; - struct inode *inode; cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); if (IS_ERR(cred)) @@ -1102,9 +1171,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st } goto out_drop; } - inode = state->inode; - iput(inode); - if (inode == dentry->d_inode) { + if (state->inode == dentry->d_inode) { nfs4_intent_set_file(nd, dentry, state); return 1; } @@ -1633,7 +1700,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = PTR_ERR(state); goto out; } - d_instantiate(dentry, state->inode); + d_instantiate(dentry, igrab(state->inode)); if (flags & O_EXCL) { struct nfs_fattr fattr; status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fbbace8a30c4..db2bcf722f91 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1499,9 +1499,6 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); -- cgit v1.2.3 From cdd4e68b5f0ed12c64b3e2be83655d2a47588a74 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:12 +0100 Subject: NFSv4: Make open_confirm() asynchronous too Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 101 +++++++++++++++++++++++++++++++++++++----------- fs/nfs/nfs4xdr.c | 7 +--- include/linux/nfs_xdr.h | 2 +- 3 files changed, 81 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index aed8701c1a36..8154f2579469 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -57,7 +57,8 @@ #define NFS4_POLL_RETRY_MIN (1*HZ) #define NFS4_POLL_RETRY_MAX (15*HZ) -static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid); +struct nfs4_opendata; +static int _nfs4_proc_open_confirm(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); @@ -198,6 +199,8 @@ struct nfs4_opendata { atomic_t count; struct nfs_openargs o_arg; struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; struct nfs_fattr f_attr; struct nfs_fattr dir_attr; struct dentry *dentry; @@ -249,6 +252,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; + p->c_arg.seqid = p->o_arg.seqid; return p; err_free: kfree(p); @@ -433,8 +439,7 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, 
struct nfs4_state if (status != 0) goto out_free; if(opendata->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) { - status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode), - sp, &opendata->o_res.stateid, opendata->o_arg.seqid); + status = _nfs4_proc_open_confirm(opendata); if (status != 0) goto out_free; } @@ -472,28 +477,79 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) return err; } -static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid) +static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { - struct nfs_open_confirmargs arg = { - .fh = fh, - .seqid = seqid, - .stateid = *stateid, - }; - struct nfs_open_confirmres res; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = sp->so_cred, + struct nfs4_opendata *data = calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], + .rpc_argp = &data->c_arg, + .rpc_resp = &data->c_res, + .rpc_cred = data->owner->so_cred, }; + rpc_call_setup(task, &msg, 0); +} + +static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) + return; + if (data->rpc_status == 0) + memcpy(data->o_res.stateid.data, data->c_res.stateid.data, + sizeof(data->o_res.stateid.data)); + nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); + nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status); +} + +static void nfs4_open_confirm_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! 
*/ + if (data->rpc_status != 0) + goto out_free; + nfs_confirm_seqid(&data->owner->so_seqid, 0); + state = nfs4_opendata_to_nfs4_state(data); + if (state != NULL) + nfs4_close_state(state, data->o_arg.open_flags); +out_free: + nfs4_opendata_free(data); +} + +static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_prepare = nfs4_open_confirm_prepare, + .rpc_call_done = nfs4_open_confirm_done, + .rpc_release = nfs4_open_confirm_release, +}; + +/* + * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata + */ +static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) +{ + struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct rpc_task *task; int status; - status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR); - /* Confirm the sequence as being established */ - nfs_confirm_seqid(&sp->so_seqid, status); - nfs_increment_open_seqid(status, seqid); - if (status >= 0) - memcpy(stateid, &res.stateid, sizeof(*stateid)); + atomic_inc(&data->count); + task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data); + if (IS_ERR(task)) { + nfs4_opendata_free(data); + return PTR_ERR(task); + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_release_task(task); return status; } @@ -602,8 +658,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) } else nfs_refresh_inode(dir, o_res->dir_attr); if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { - status = _nfs4_proc_open_confirm(server->client, &o_res->fh, - data->owner, &o_res->stateid, o_arg->seqid); + status = _nfs4_proc_open_confirm(data); if (status != 0) return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index db2bcf722f91..2ba9906f2a51 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -964,9 +964,9 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con { uint32_t *p; - RESERVE_SPACE(8+sizeof(arg->stateid.data)); + RESERVE_SPACE(8+sizeof(arg->stateid->data)); WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); + WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data)); WRITE32(arg->seqid->sequence->counter); return 0; @@ -1535,9 +1535,6 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 40718669b9c8..518cfa5cd024 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -137,7 +137,7 @@ struct nfs_openres { */ struct nfs_open_confirmargs { const struct nfs_fh * fh; - nfs4_stateid stateid; + nfs4_stateid * stateid; struct nfs_seqid * seqid; }; -- cgit v1.2.3 From e761692381f294ea079d2e869fcd7c0afc79e394 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:13 +0100 Subject: NFSv4: Make nfs4_state track O_RDWR, O_RDONLY and O_WRONLY separately A closer reading of RFC3530 reveals that OPEN_DOWNGRADE must always specify access modes that have been the argument of a previous OPEN operation. IOW: doing OPEN(O_RDWR) and then OPEN_DOWNGRADE(O_WRONLY) is forbidden unless the user called OPEN(O_WRONLY). In order to fix that, we really need to track the three possible open states separately.
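The downgrade rule is mechanical enough to show in isolation. A compilable userspace sketch with simplified FMODE_* stand-ins and a hypothetical remaining_mode() helper, mirroring the n_rdonly/n_wronly/n_rdwr logic this patch adds to nfs4_close_prepare and nfs4_close_state below:

#include <stdio.h>

#define FMODE_READ  0x1
#define FMODE_WRITE 0x2

struct open_counts {
	unsigned int n_rdonly;	/* OPEN(O_RDONLY) instances */
	unsigned int n_wronly;	/* OPEN(O_WRONLY) instances */
	unsigned int n_rdwr;	/* OPEN(O_RDWR) instances */
};

/* which share modes may remain after a close, given the open counts */
static int remaining_mode(const struct open_counts *c, int mode)
{
	/* an O_RDWR opener pins both bits, so check n_rdwr first */
	if (c->n_rdwr == 0) {
		if (c->n_rdonly == 0)
			mode &= ~FMODE_READ;
		if (c->n_wronly == 0)
			mode &= ~FMODE_WRITE;
	}
	return mode;
}

int main(void)
{
	/* one O_WRONLY opener remains after an O_RDWR close */
	struct open_counts c = { .n_rdonly = 0, .n_wronly = 1, .n_rdwr = 0 };
	int mode = remaining_mode(&c, FMODE_READ | FMODE_WRITE);

	printf("downgrade to:%s%s\n",
	       mode & FMODE_READ ? " READ" : "",
	       mode & FMODE_WRITE ? " WRITE" : "");
	return 0;
}

With a single O_WRONLY opener left, the computed mode keeps only FMODE_WRITE, which is exactly the OPEN_DOWNGRADE argument RFC 3530 permits here, since the user did issue an OPEN(O_WRONLY).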
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 5 +++-- fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++------------ fs/nfs/nfs4state.c | 31 +++++++++++++++++-------------- 3 files changed, 42 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b7f262dcb6e3..4ad5981f9375 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -182,8 +182,9 @@ struct nfs4_state { nfs4_stateid stateid; - unsigned int nreaders; - unsigned int nwriters; + unsigned int n_rdonly; + unsigned int n_wronly; + unsigned int n_rdwr; int state; /* State on the server (R,W, or RW) */ atomic_t count; }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8154f2579469..e494cc2ea986 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -297,6 +297,20 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } +static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) +{ + switch (open_flags) { + case FMODE_WRITE: + state->n_wronly++; + break; + case FMODE_READ: + state->n_rdonly++; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr++; + } +} + static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) { struct inode *inode = state->inode; @@ -306,10 +320,7 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, spin_lock(&state->owner->so_lock); spin_lock(&inode->i_lock); memcpy(&state->stateid, stateid, sizeof(state->stateid)); - if ((open_flags & FMODE_WRITE)) - state->nwriters++; - if (open_flags & FMODE_READ) - state->nreaders++; + update_open_stateflags(state, open_flags); nfs4_state_set_mode_locked(state, state->state | open_flags); spin_unlock(&inode->i_lock); spin_unlock(&state->owner->so_lock); @@ -822,10 +833,7 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred err = -ENOENT; if ((state->state & open_flags) == open_flags) { spin_lock(&inode->i_lock); - if (open_flags & FMODE_READ) - state->nreaders++; - if (open_flags & FMODE_WRITE) - state->nwriters++; + update_open_stateflags(state, open_flags); spin_unlock(&inode->i_lock); goto out_ok; } else if (state->state != 0) @@ -1082,10 +1090,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) spin_lock(&state->owner->so_lock); spin_lock(&calldata->inode->i_lock); mode = old_mode = state->state; - if (state->nreaders == 0) - mode &= ~FMODE_READ; - if (state->nwriters == 0) - mode &= ~FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) + mode &= ~FMODE_READ; + if (state->n_wronly == 0) + mode &= ~FMODE_WRITE; + } nfs4_state_set_mode_locked(state, mode); spin_unlock(&calldata->inode->i_lock); spin_unlock(&state->owner->so_lock); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ef4c57618fe..41a5f1a8a984 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -349,14 +349,9 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kmalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - state->state = 0; - state->nreaders = 0; - state->nwriters = 0; - state->flags = 0; - memset(state->stateid.data, 0, sizeof(state->stateid.data)); atomic_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); @@ -475,15 +470,23 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) /* Protect against nfs4_find_state() */ spin_lock(&owner->so_lock); spin_lock(&inode->i_lock); - if (mode & FMODE_READ) - 
state->nreaders--; - if (mode & FMODE_WRITE) - state->nwriters--; + switch (mode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + state->n_rdonly--; + break; + case FMODE_WRITE: + state->n_wronly--; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr--; + } oldstate = newstate = state->state; - if (state->nreaders == 0) - newstate &= ~FMODE_READ; - if (state->nwriters == 0) - newstate &= ~FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) + newstate &= ~FMODE_READ; + if (state->n_wronly == 0) + newstate &= ~FMODE_WRITE; + } if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { nfs4_state_set_mode_locked(state, newstate); oldstate = newstate; -- cgit v1.2.3 From 864472e9b8fa76ffaad17dfcb84d79e16df6828c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:15 +0100 Subject: NFSv4: Make open recovery track O_RDWR, O_RDONLY and O_WRONLY correctly When recovering from a delegation recall or a network partition, we need to replay open(O_RDWR), open(O_RDONLY) and open(O_WRONLY) separately. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 276 +++++++++++++++++++++++++++++------------------------- fs/nfs/nfs4xdr.c | 11 ++- 2 files changed, 156 insertions(+), 131 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e494cc2ea986..3ecb7da220f5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -58,7 +58,7 @@ #define NFS4_POLL_RETRY_MAX (15*HZ) struct nfs4_opendata; -static int _nfs4_proc_open_confirm(struct nfs4_opendata *data); +static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); @@ -346,32 +346,108 @@ out: return state; } +static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_open_context *ctx; + + spin_lock(&state->inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + get_nfs_open_context(ctx); + spin_unlock(&state->inode->i_lock); + return ctx; + } + spin_unlock(&state->inode->i_lock); + return ERR_PTR(-ENOENT); +} + +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, nfs4_stateid *stateid) +{ + int ret; + + opendata->o_arg.open_flags = openflags; + ret = _nfs4_proc_open(opendata); + if (ret != 0) + return ret; + memcpy(stateid->data, opendata->o_res.stateid.data, + sizeof(stateid->data)); + return 0; +} + +static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) +{ + nfs4_stateid stateid; + struct nfs4_state *newstate; + int mode = 0; + int delegation = 0; + int ret; + + /* memory barrier prior to reading state->n_* */ + smp_rmb(); + if (state->n_rdwr != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &stateid); + if (ret != 0) + return ret; + mode |= FMODE_READ|FMODE_WRITE; + if (opendata->o_res.delegation_type != 0) + delegation = opendata->o_res.delegation_type; + smp_rmb(); + } + if (state->n_wronly != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &stateid); + if (ret != 0) + return ret; + mode |= FMODE_WRITE; + if (opendata->o_res.delegation_type != 0) + delegation = opendata->o_res.delegation_type; + smp_rmb(); + } + if (state->n_rdonly != 0) { + ret = nfs4_open_recover_helper(opendata, 
FMODE_READ, &stateid); + if (ret != 0) + return ret; + mode |= FMODE_READ; + } + clear_bit(NFS_DELEGATED_STATE, &state->flags); + if (mode == 0) + return 0; + if (opendata->o_res.delegation_type == 0) + opendata->o_res.delegation_type = delegation; + opendata->o_arg.open_flags |= mode; + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (newstate != NULL) { + if (opendata->o_res.delegation_type != 0) { + struct nfs_inode *nfsi = NFS_I(newstate->inode); + int delegation_flags = 0; + if (nfsi->delegation) + delegation_flags = nfsi->delegation->flags; + if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) + nfs_inode_set_delegation(newstate->inode, + opendata->owner->so_cred, + &opendata->o_res); + else + nfs_inode_reclaim_delegation(newstate->inode, + opendata->owner->so_cred, + &opendata->o_res); + } + nfs4_close_state(newstate, opendata->o_arg.open_flags); + } + if (newstate != state) + return -ESTALE; + return 0; +} + /* * OPEN_RECLAIM: * reclaim state on the server after a reboot. */ -static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { - struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct nfs_openargs o_arg = { - .fh = NFS_FH(inode), - .id = sp->so_id, - .open_flags = state->state, - .clientid = server->nfs4_state->cl_clientid, - .claim = NFS4_OPEN_CLAIM_PREVIOUS, - .bitmask = server->attr_bitmask, - }; - struct nfs_openres o_res = { - .server = server, /* Grrr */ - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], - .rpc_argp = &o_arg, - .rpc_resp = &o_res, - .rpc_cred = sp->so_cred, - }; + struct nfs_delegation *delegation = NFS_I(state->inode)->delegation; + struct nfs4_opendata *opendata; + int delegation_type = 0; int status; if (delegation != NULL) { @@ -381,38 +457,27 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st set_bit(NFS_DELEGATED_STATE, &state->flags); return 0; } - o_arg.u.delegation_type = delegation->type; + delegation_type = delegation->type; } - o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); - if (o_arg.seqid == NULL) + opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL); + if (opendata == NULL) return -ENOMEM; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* Confirm the sequence as being established */ - nfs_confirm_seqid(&sp->so_seqid, status); - nfs_increment_open_seqid(status, o_arg.seqid); - if (status == 0) { - memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); - if (o_res.delegation_type != 0) { - nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); - /* Did the server issue an immediate delegation recall? 
*/ - if (o_res.do_recall) - nfs_async_inode_return_delegation(inode, &o_res.stateid); - } - } - nfs_free_seqid(o_arg.seqid); - clear_bit(NFS_DELEGATED_STATE, &state->flags); - /* Ensure we update the inode attributes */ - NFS_CACHEINV(inode); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; + opendata->o_arg.fh = NFS_FH(state->inode); + nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh); + opendata->o_arg.u.delegation_type = delegation_type; + status = nfs4_open_recover(opendata, state); + nfs4_opendata_free(opendata); return status; } -static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +static int nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; int err; do { - err = _nfs4_open_reclaim(sp, state); + err = _nfs4_do_open_reclaim(sp, state, dentry); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -420,50 +485,36 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return err; } +static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_reclaim(sp, state, ctx->dentry); + put_nfs_open_context(ctx); + return ret; +} + static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) { struct nfs4_state_owner *sp = state->owner; - struct inode *inode = dentry->d_inode; - struct nfs_server *server = NFS_SERVER(inode); - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], - .rpc_cred = sp->so_cred, - }; struct nfs4_opendata *opendata; - int status = 0; + int ret; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; - if (state->state == 0) - goto out; - opendata = nfs4_opendata_alloc(dentry, sp, state->state, NULL); - status = -ENOMEM; + return 0; + opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL); if (opendata == NULL) - goto out; + return -ENOMEM; opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - msg.rpc_argp = &opendata->o_arg; - msg.rpc_resp = &opendata->o_res; memcpy(opendata->o_arg.u.delegation.data, state->stateid.data, sizeof(opendata->o_arg.u.delegation.data)); - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs_increment_open_seqid(status, opendata->o_arg.seqid); - if (status != 0) - goto out_free; - if(opendata->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) { - status = _nfs4_proc_open_confirm(opendata); - if (status != 0) - goto out_free; - } - nfs_confirm_seqid(&sp->so_seqid, 0); - if (status >= 0) { - memcpy(state->stateid.data, opendata->o_res.stateid.data, - sizeof(state->stateid.data)); - clear_bit(NFS_DELEGATED_STATE, &state->flags); - } -out_free: + ret = nfs4_open_recover(opendata, state); nfs4_opendata_free(opendata); -out: - return status; + return ret; } int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) @@ -580,6 +631,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) /* Update sequence id. 
*/ data->o_arg.id = sp->so_id; data->o_arg.clientid = sp->so_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) + msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; rpc_call_setup(task, &msg, 0); } @@ -714,55 +767,31 @@ out: */ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { - struct dentry *parent = dget_parent(dentry); struct inode *inode = state->inode; struct nfs_delegation *delegation = NFS_I(inode)->delegation; struct nfs4_opendata *opendata; - struct nfs4_state *newstate; int openflags = state->state & (FMODE_READ|FMODE_WRITE); - int status = 0; + int ret; if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { - status = _nfs4_do_access(inode, sp->so_cred, openflags); - if (status < 0) - goto out; + ret = _nfs4_do_access(inode, sp->so_cred, openflags); + if (ret < 0) + return ret; memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid)); set_bit(NFS_DELEGATED_STATE, &state->flags); - goto out; + return 0; } - status = -ENOMEM; opendata = nfs4_opendata_alloc(dentry, sp, openflags, NULL); if (opendata == NULL) - goto out; - status = _nfs4_proc_open(opendata); - if (status != 0) - goto out_nodeleg; - newstate = nfs4_opendata_to_nfs4_state(opendata); - if (newstate != state) - goto out_stale; - if (opendata->o_res.delegation_type != 0) { - if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) - nfs_inode_set_delegation(inode, sp->so_cred, - &opendata->o_res); - else - nfs_inode_reclaim_delegation(inode, sp->so_cred, - &opendata->o_res); + return -ENOMEM; + ret = nfs4_open_recover(opendata, state); + if (ret == -ESTALE) { + /* Invalidate the state owner so we don't ever use it again */ + nfs4_drop_state_owner(sp); + d_drop(dentry); } -out_close_state: - nfs4_close_state(newstate, openflags); -out_nodeleg: nfs4_opendata_free(opendata); - clear_bit(NFS_DELEGATED_STATE, &state->flags); -out: - dput(parent); - return status; -out_stale: - status = -ESTALE; - /* Invalidate the state owner so we don't ever use it again */ - nfs4_drop_state_owner(sp); - d_drop(dentry); - /* Should we be trying to close that stateid? 
*/ - goto out_close_state; + return ret; } static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) @@ -781,22 +810,15 @@ static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) { - struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_open_context *ctx; - int status; + int ret; - spin_lock(&state->inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { - if (ctx->state != state) - continue; - get_nfs_open_context(ctx); - spin_unlock(&state->inode->i_lock); - status = nfs4_do_open_expired(sp, state, ctx->dentry); - put_nfs_open_context(ctx); - return status; - } - spin_unlock(&state->inode->i_lock); - return -ENOENT; + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_expired(sp, state, ctx->dentry); + put_nfs_open_context(ctx); + return ret; } /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2ba9906f2a51..3100172822c9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1552,19 +1552,19 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nf { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); if (status) goto out; status = encode_open(&xdr, args); + if (status) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -3825,6 +3825,9 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct if (status) goto out; status = decode_open(&xdr, res); + if (status) + goto out; + decode_getfattr(&xdr, res->f_attr, res->server); out: return status; } -- cgit v1.2.3 From 911d1aaf26fc4d771174d98fcab710a44e2a5fa0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:16 +0100 Subject: NFSv4: locking XDR cleanup Get rid of some unnecessary intermediate structures Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 192 ++++++++++++++++++++---------------------------- fs/nfs/nfs4xdr.c | 131 +++++++++++++++++++-------------- include/linux/nfs_xdr.h | 52 ++++++------- 3 files changed, 179 insertions(+), 196 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ecb7da220f5..857125705b6f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2935,43 +2935,17 @@ nfs4_set_lock_task_retry(unsigned long timeout) return timeout; } -static inline int -nfs4_lck_type(int cmd, struct file_lock *request) -{ - /* set lock type */ - switch (request->fl_type) { - case F_RDLCK: - return IS_SETLKW(cmd) ? NFS4_READW_LT : NFS4_READ_LT; - case F_WRLCK: - return IS_SETLKW(cmd) ? 
NFS4_WRITEW_LT : NFS4_WRITE_LT; - case F_UNLCK: - return NFS4_WRITE_LT; - } - BUG(); - return 0; -} - -static inline uint64_t -nfs4_lck_length(struct file_lock *request) -{ - if (request->fl_end == OFFSET_MAX) - return ~(uint64_t)0; - return request->fl_end - request->fl_start + 1; -} - static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); struct nfs4_client *clp = server->nfs4_state; - struct nfs_lockargs arg = { + struct nfs_lockt_args arg = { .fh = NFS_FH(inode), - .type = nfs4_lck_type(cmd, request), - .offset = request->fl_start, - .length = nfs4_lck_length(request), + .fl = request, }; - struct nfs_lockres res = { - .server = server, + struct nfs_lockt_res res = { + .denied = request, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], @@ -2979,36 +2953,23 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock .rpc_resp = &res, .rpc_cred = state->owner->so_cred, }; - struct nfs_lowner nlo; struct nfs4_lock_state *lsp; int status; down_read(&clp->cl_sem); - nlo.clientid = clp->cl_clientid; + arg.lock_owner.clientid = clp->cl_clientid; status = nfs4_set_lock_state(state, request); if (status != 0) goto out; lsp = request->fl_u.nfs4_fl.owner; - nlo.id = lsp->ls_id; - arg.u.lockt = &nlo; + arg.lock_owner.id = lsp->ls_id; status = rpc_call_sync(server->client, &msg, 0); - if (!status) { - request->fl_type = F_UNLCK; - } else if (status == -NFS4ERR_DENIED) { - int64_t len, start, end; - start = res.u.denied.offset; - len = res.u.denied.length; - end = start + len - 1; - if (end < 0 || len == 0) - request->fl_end = OFFSET_MAX; - else - request->fl_end = (loff_t)end; - request->fl_start = (loff_t)start; - request->fl_type = F_WRLCK; - if (res.u.denied.type & 1) - request->fl_type = F_RDLCK; - request->fl_pid = 0; - status = 0; + switch (status) { + case 0: + request->fl_type = F_UNLCK; + break; + case -NFS4ERR_DENIED: + status = 0; } out: up_read(&clp->cl_sem); @@ -3048,17 +3009,42 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) } struct nfs4_unlockdata { - struct nfs_lockargs arg; - struct nfs_locku_opargs luargs; - struct nfs_lockres res; + struct nfs_locku_args arg; + struct nfs_locku_res res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; + struct file_lock fl; + const struct nfs_server *server; }; +static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.seqid = seqid; + p->arg.stateid = &lsp->ls_stateid; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + /* Ensure we don't close file until we're done freeing locks! 
*/ + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + p->server = NFS_SERVER(inode); + return p; +} + static void nfs4_locku_release_calldata(void *data) { struct nfs4_unlockdata *calldata = data; - nfs_free_seqid(calldata->luargs.seqid); + nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); put_nfs_open_context(calldata->ctx); kfree(calldata); @@ -3070,19 +3056,19 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (RPC_ASSASSINATED(task)) return; - nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid); + nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: memcpy(calldata->lsp->ls_stateid.data, - calldata->res.u.stateid.data, + calldata->res.stateid.data, sizeof(calldata->lsp->ls_stateid.data)); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(calldata->res.server->nfs4_state); + nfs4_schedule_state_recovery(calldata->server->nfs4_state); break; default: - if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) { + if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) { rpc_restart_call(task); } } @@ -3097,10 +3083,8 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) .rpc_resp = &calldata->res, .rpc_cred = calldata->lsp->ls_state->owner->so_cred, }; - int status; - status = nfs_wait_on_sequence(calldata->luargs.seqid, task); - if (status != 0) + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { /* Note: exit _without_ running nfs4_locku_done */ @@ -3121,43 +3105,32 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct nfs4_unlockdata *calldata; struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); + struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; int status = 0; /* Is this a delegated lock? */ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; + goto out_unlock; + /* Is this open_owner holding any locks on the server? */ + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) + goto out_unlock; status = nfs4_set_lock_state(state, request); if (status != 0) - goto out; + goto out_unlock; lsp = request->fl_u.nfs4_fl.owner; - /* We might have lost the locks! */ - if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) - goto out; status = -ENOMEM; - calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); + seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (seqid == NULL) + goto out_unlock; + calldata = nfs4_alloc_unlockdata(request, request->fl_file->private_data, + lsp, seqid); if (calldata == NULL) - goto out; - calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); - if (calldata->luargs.seqid == NULL) { - kfree(calldata); - goto out; - } - calldata->luargs.stateid = &lsp->ls_stateid; - calldata->arg.fh = NFS_FH(inode); - calldata->arg.type = nfs4_lck_type(cmd, request); - calldata->arg.offset = request->fl_start; - calldata->arg.length = nfs4_lck_length(request); - calldata->arg.u.locku = &calldata->luargs; - calldata->res.server = server; - calldata->lsp = lsp; - atomic_inc(&lsp->ls_count); - - /* Ensure we don't close file until we're done freeing locks! 
*/ - calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data); - + goto out_free_seqid; + /* Unlock _before_ we do the RPC call */ + do_vfs_lock(request->fl_file, request); task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_locku_ops, calldata); if (!IS_ERR(task)) { status = nfs4_wait_for_completion_rpc_task(task); @@ -3166,7 +3139,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = PTR_ERR(task); nfs4_locku_release_calldata(calldata); } -out: + return status; +out_free_seqid: + nfs_free_seqid(seqid); +out_unlock: do_vfs_lock(request->fl_file, request); return status; } @@ -3176,27 +3152,19 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; - struct nfs_lock_opargs largs = { + struct nfs_lock_args arg = { + .fh = NFS_FH(inode), + .fl = request, .lock_stateid = &lsp->ls_stateid, .open_stateid = &state->stateid, .lock_owner = { .clientid = server->nfs4_state->cl_clientid, .id = lsp->ls_id, }, + .block = (IS_SETLKW(cmd)) ? 1 : 0, .reclaim = reclaim, }; - struct nfs_lockargs arg = { - .fh = NFS_FH(inode), - .type = nfs4_lck_type(cmd, request), - .offset = request->fl_start, - .length = nfs4_lck_length(request), - .u = { - .lock = &largs, - }, - }; - struct nfs_lockres res = { - .server = server, - }; + struct nfs_lock_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], .rpc_argp = &arg, @@ -3205,37 +3173,37 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r }; int status = -ENOMEM; - largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); - if (largs.lock_seqid == NULL) + arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (arg.lock_seqid == NULL) return -ENOMEM; if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) { struct nfs4_state_owner *owner = state->owner; - largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid); - if (largs.open_seqid == NULL) + arg.open_seqid = nfs_alloc_seqid(&owner->so_seqid); + if (arg.open_seqid == NULL) goto out; - largs.new_lock_owner = 1; + arg.new_lock_owner = 1; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); /* increment open seqid on success, and seqid mutating errors */ - if (largs.new_lock_owner != 0) { - nfs_increment_open_seqid(status, largs.open_seqid); + if (arg.new_lock_owner != 0) { + nfs_increment_open_seqid(status, arg.open_seqid); if (status == 0) nfs_confirm_seqid(&lsp->ls_seqid, 0); } - nfs_free_seqid(largs.open_seqid); + nfs_free_seqid(arg.open_seqid); } else status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); /* increment lock seqid on success, and seqid mutating errors*/ - nfs_increment_lock_seqid(status, largs.lock_seqid); + nfs_increment_lock_seqid(status, arg.lock_seqid); /* save the returned stateid. 
*/ if (status == 0) { - memcpy(lsp->ls_stateid.data, res.u.stateid.data, + memcpy(lsp->ls_stateid.data, res.stateid.data, sizeof(lsp->ls_stateid.data)); lsp->ls_flags |= NFS_LOCK_INITIALIZED; } else if (status == -NFS4ERR_DENIED) status = -EAGAIN; out: - nfs_free_seqid(largs.lock_seqid); + nfs_free_seqid(arg.lock_seqid); return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3100172822c9..a7b5de899c6d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -742,69 +742,80 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name) return 0; } +static inline int nfs4_lock_type(struct file_lock *fl, int block) +{ + if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) + return block ? NFS4_READW_LT : NFS4_READ_LT; + return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; +} + +static inline uint64_t nfs4_lock_length(struct file_lock *fl) +{ + if (fl->fl_end == OFFSET_MAX) + return ~(uint64_t)0; + return fl->fl_end - fl->fl_start + 1; +} + /* * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 */ -static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg) +static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) { uint32_t *p; - struct nfs_lock_opargs *opargs = arg->u.lock; RESERVE_SPACE(32); WRITE32(OP_LOCK); - WRITE32(arg->type); - WRITE32(opargs->reclaim); - WRITE64(arg->offset); - WRITE64(arg->length); - WRITE32(opargs->new_lock_owner); - if (opargs->new_lock_owner){ + WRITE32(nfs4_lock_type(args->fl, args->block)); + WRITE32(args->reclaim); + WRITE64(args->fl->fl_start); + WRITE64(nfs4_lock_length(args->fl)); + WRITE32(args->new_lock_owner); + if (args->new_lock_owner){ RESERVE_SPACE(40); - WRITE32(opargs->open_seqid->sequence->counter); - WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data)); - WRITE32(opargs->lock_seqid->sequence->counter); - WRITE64(opargs->lock_owner.clientid); + WRITE32(args->open_seqid->sequence->counter); + WRITEMEM(args->open_stateid->data, sizeof(args->open_stateid->data)); + WRITE32(args->lock_seqid->sequence->counter); + WRITE64(args->lock_owner.clientid); WRITE32(4); - WRITE32(opargs->lock_owner.id); + WRITE32(args->lock_owner.id); } else { RESERVE_SPACE(20); - WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data)); - WRITE32(opargs->lock_seqid->sequence->counter); + WRITEMEM(args->lock_stateid->data, sizeof(args->lock_stateid->data)); + WRITE32(args->lock_seqid->sequence->counter); } return 0; } -static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockargs *arg) +static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) { uint32_t *p; - struct nfs_lowner *opargs = arg->u.lockt; RESERVE_SPACE(40); WRITE32(OP_LOCKT); - WRITE32(arg->type); - WRITE64(arg->offset); - WRITE64(arg->length); - WRITE64(opargs->clientid); + WRITE32(nfs4_lock_type(args->fl, 0)); + WRITE64(args->fl->fl_start); + WRITE64(nfs4_lock_length(args->fl)); + WRITE64(args->lock_owner.clientid); WRITE32(4); - WRITE32(opargs->id); + WRITE32(args->lock_owner.id); return 0; } -static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg) +static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) { uint32_t *p; - struct nfs_locku_opargs *opargs = arg->u.locku; RESERVE_SPACE(44); WRITE32(OP_LOCKU); - WRITE32(arg->type); - WRITE32(opargs->seqid->sequence->counter); - WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data)); - 
WRITE64(arg->offset); - WRITE64(arg->length); + WRITE32(nfs4_lock_type(args->fl, 0)); + WRITE32(args->seqid->sequence->counter); + WRITEMEM(args->stateid->data, sizeof(args->stateid->data)); + WRITE64(args->fl->fl_start); + WRITE64(nfs4_lock_length(args->fl)); return 0; } @@ -1596,21 +1607,20 @@ out: /* * Encode a LOCK request */ -static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) +static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args) { struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 2, }; - struct nfs_lock_opargs *opargs = args->u.lock; int status; - status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task); + status = nfs_wait_on_sequence(args->lock_seqid, req->rq_task); if (status != 0) goto out; /* Do we need to do an open_to_lock_owner? */ - if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED) - opargs->new_lock_owner = 0; + if (args->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED) + args->new_lock_owner = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1624,7 +1634,7 @@ out: /* * Encode a LOCKT request */ -static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) +static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -1645,7 +1655,7 @@ out: /* * Encode a LOCKU request */ -static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) +static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -2949,55 +2959,64 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) /* * We create the owner, so we know a proper owner.id length is 4. 
*/ -static int decode_lock_denied (struct xdr_stream *xdr, struct nfs_lock_denied *denied) +static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) { + uint64_t offset, length, clientid; uint32_t *p; - uint32_t namelen; + uint32_t namelen, type; READ_BUF(32); - READ64(denied->offset); - READ64(denied->length); - READ32(denied->type); - READ64(denied->owner.clientid); + READ64(offset); + READ64(length); + READ32(type); + if (fl != NULL) { + fl->fl_start = (loff_t)offset; + fl->fl_end = fl->fl_start + (loff_t)length - 1; + if (length == ~(uint64_t)0) + fl->fl_end = OFFSET_MAX; + fl->fl_type = F_WRLCK; + if (type & 1) + fl->fl_type = F_RDLCK; + fl->fl_pid = 0; + } + READ64(clientid); READ32(namelen); READ_BUF(namelen); - if (namelen == 4) - READ32(denied->owner.id); return -NFS4ERR_DENIED; } -static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res) +static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { uint32_t *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == 0) { - READ_BUF(sizeof(res->u.stateid.data)); - COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); + READ_BUF(sizeof(res->stateid.data)); + COPYMEM(res->stateid.data, sizeof(res->stateid.data)); } else if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, &res->u.denied); + return decode_lock_denied(xdr, NULL); return status; } -static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockres *res) +static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) { int status; status = decode_op_hdr(xdr, OP_LOCKT); if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, &res->u.denied); + return decode_lock_denied(xdr, res->denied); return status; } -static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res) +static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { uint32_t *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status == 0) { - READ_BUF(sizeof(res->u.stateid.data)); - COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); + READ_BUF(sizeof(res->stateid.data)); + COPYMEM(res->stateid.data, sizeof(res->stateid.data)); } return status; } @@ -3861,7 +3880,7 @@ out: /* * Decode LOCK response */ -static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -3882,7 +3901,7 @@ out: /* * Decode LOCKT response */ -static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -3903,7 +3922,7 @@ out: /* * Decode LOCKU response */ -static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 518cfa5cd024..b8b0eed98ec9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -165,50 +165,46 @@ struct nfs_closeres { * * Arguments to the lock,lockt, and locku call. 
* */ struct nfs_lowner { - __u64 clientid; - u32 id; + __u64 clientid; + u32 id; }; -struct nfs_lock_opargs { +struct nfs_lock_args { + struct nfs_fh * fh; + struct file_lock * fl; struct nfs_seqid * lock_seqid; nfs4_stateid * lock_stateid; struct nfs_seqid * open_seqid; nfs4_stateid * open_stateid; - struct nfs_lowner lock_owner; - __u32 reclaim; - __u32 new_lock_owner; + struct nfs_lowner lock_owner; + unsigned char block : 1; + unsigned char reclaim : 1; + unsigned char new_lock_owner : 1; +}; + +struct nfs_lock_res { + nfs4_stateid stateid; }; -struct nfs_locku_opargs { +struct nfs_locku_args { + struct nfs_fh * fh; + struct file_lock * fl; struct nfs_seqid * seqid; nfs4_stateid * stateid; }; -struct nfs_lockargs { - struct nfs_fh * fh; - __u32 type; - __u64 offset; - __u64 length; - union { - struct nfs_lock_opargs *lock; /* LOCK */ - struct nfs_lowner *lockt; /* LOCKT */ - struct nfs_locku_opargs *locku; /* LOCKU */ - } u; +struct nfs_locku_res { + nfs4_stateid stateid; }; -struct nfs_lock_denied { - __u64 offset; - __u64 length; - __u32 type; - struct nfs_lowner owner; +struct nfs_lockt_args { + struct nfs_fh * fh; + struct file_lock * fl; + struct nfs_lowner lock_owner; }; -struct nfs_lockres { - union { - nfs4_stateid stateid;/* LOCK success, LOCKU */ - struct nfs_lock_denied denied; /* LOCK failed, LOCKT success */ - } u; - const struct nfs_server * server; +struct nfs_lockt_res { + struct file_lock * denied; /* LOCK, LOCKT failed */ }; struct nfs4_delegreturnargs { -- cgit v1.2.3 From a5d16a4d090bd2af86e648ed9bb205903fcf1e86 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:17 +0100 Subject: NFSv4: Convert LOCK rpc call into an asynchronous RPC call In order to allow users to interrupt/cancel it. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 244 +++++++++++++++++++++++++++++++++++++++--------------- fs/nfs/nfs4xdr.c | 6 -- 2 files changed, 175 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 857125705b6f..718fcc722556 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3100,11 +3100,30 @@ static const struct rpc_call_ops nfs4_locku_ops = { .rpc_release = nfs4_locku_release_calldata, }; +static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *data; + struct rpc_task *task; + + data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); + if (data == NULL) { + nfs_free_seqid(seqid); + return ERR_PTR(-ENOMEM); + } + + /* Unlock _before_ we do the RPC call */ + do_vfs_lock(fl->fl_file, fl); + task = rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data); + if (IS_ERR(task)) + nfs4_locku_release_calldata(data); + return task; +} + static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_unlockdata *calldata; - struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; @@ -3125,86 +3144,173 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * seqid = nfs_alloc_seqid(&lsp->ls_seqid); if (seqid == NULL) goto out_unlock; - calldata = nfs4_alloc_unlockdata(request, request->fl_file->private_data, - lsp, seqid); - if (calldata == NULL) - goto out_free_seqid; - /* Unlock _before_ we do the RPC call */ - do_vfs_lock(request->fl_file, request); - task = 
rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_locku_ops, calldata); - if (!IS_ERR(task)) { - status = nfs4_wait_for_completion_rpc_task(task); - rpc_release_task(task); - } else { - status = PTR_ERR(task); - nfs4_locku_release_calldata(calldata); - } + task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); + status = PTR_ERR(task); + if (IS_ERR(task)) + goto out_unlock; + status = nfs4_wait_for_completion_rpc_task(task); + rpc_release_task(task); return status; -out_free_seqid: - nfs_free_seqid(seqid); out_unlock: do_vfs_lock(request->fl_file, request); return status; } -static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim) +struct nfs4_lockdata { + struct nfs_lock_args arg; + struct nfs_lock_res res; + struct nfs4_lock_state *lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + int rpc_status; + int cancelled; +}; + +static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) { - struct inode *inode = state->inode; + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; - struct nfs_lock_args arg = { - .fh = NFS_FH(inode), - .fl = request, - .lock_stateid = &lsp->ls_stateid, - .open_stateid = &state->stateid, - .lock_owner = { - .clientid = server->nfs4_state->cl_clientid, - .id = lsp->ls_id, - }, - .block = (IS_SETLKW(cmd)) ? 1 : 0, - .reclaim = reclaim, - }; - struct nfs_lock_res res; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (p->arg.lock_seqid == NULL) + goto out_free; + p->arg.lock_stateid = &lsp->ls_stateid; + p->arg.lock_owner.clientid = server->nfs4_state->cl_clientid; + p->arg.lock_owner.id = lsp->ls_id; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + return p; +out_free: + kfree(p); + return NULL; +} + +static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + struct nfs4_state *state = data->lsp->ls_state; + struct nfs4_state_owner *sp = state->owner; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = state->owner->so_cred, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], + .rpc_argp = &data->arg, + .rpc_resp = &data->res, + .rpc_cred = sp->so_cred, }; - int status = -ENOMEM; - - arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); - if (arg.lock_seqid == NULL) - return -ENOMEM; - if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) { - struct nfs4_state_owner *owner = state->owner; - arg.open_seqid = nfs_alloc_seqid(&owner->so_seqid); - if (arg.open_seqid == NULL) + if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) + return; + dprintk("%s: begin!\n", __FUNCTION__); + /* Do we need to do an open_to_lock_owner? 
*/ + if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + data->arg.open_seqid = nfs_alloc_seqid(&sp->so_seqid); + if (data->arg.open_seqid == NULL) { + data->rpc_status = -ENOMEM; + task->tk_action = NULL; goto out; - arg.new_lock_owner = 1; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment open seqid on success, and seqid mutating errors */ - if (arg.new_lock_owner != 0) { - nfs_increment_open_seqid(status, arg.open_seqid); - if (status == 0) - nfs_confirm_seqid(&lsp->ls_seqid, 0); } - nfs_free_seqid(arg.open_seqid); - } else - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment lock seqid on success, and seqid mutating errors*/ - nfs_increment_lock_seqid(status, arg.lock_seqid); - /* save the returned stateid. */ - if (status == 0) { - memcpy(lsp->ls_stateid.data, res.stateid.data, - sizeof(lsp->ls_stateid.data)); - lsp->ls_flags |= NFS_LOCK_INITIALIZED; - } else if (status == -NFS4ERR_DENIED) - status = -EAGAIN; + data->arg.open_stateid = &state->stateid; + data->arg.new_lock_owner = 1; + } + rpc_call_setup(task, &msg, 0); out: - nfs_free_seqid(arg.lock_seqid); - return status; + dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status); +} + +static void nfs4_lock_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __FUNCTION__); + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) + goto out; + if (data->arg.new_lock_owner != 0) { + nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid); + if (data->rpc_status == 0) + nfs_confirm_seqid(&data->lsp->ls_seqid, 0); + else + goto out; + } + if (data->rpc_status == 0) { + memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, + sizeof(data->lsp->ls_stateid.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + } + nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); +out: + dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status); +} + +static void nfs4_lock_release(void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __FUNCTION__); + if (data->arg.open_seqid != NULL) + nfs_free_seqid(data->arg.open_seqid); + if (data->cancelled != 0) { + struct rpc_task *task; + task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, + data->arg.lock_seqid); + if (!IS_ERR(task)) + rpc_release_task(task); + dprintk("%s: cancelling lock!\n", __FUNCTION__); + } else + nfs_free_seqid(data->arg.lock_seqid); + nfs4_put_lock_state(data->lsp); + put_nfs_open_context(data->ctx); + kfree(data); + dprintk("%s: done!\n", __FUNCTION__); +} + +static const struct rpc_call_ops nfs4_lock_ops = { + .rpc_call_prepare = nfs4_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) +{ + struct nfs4_lockdata *data; + struct rpc_task *task; + int ret; + + dprintk("%s: begin!\n", __FUNCTION__); + data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data, + fl->fl_u.nfs4_fl.owner); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) + data->arg.block = 1; + if (reclaim != 0) + data->arg.reclaim = 1; + task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC, + &nfs4_lock_ops, data); + if (IS_ERR(task)) { + nfs4_lock_release(data); + return PTR_ERR(task); + } + ret = nfs4_wait_for_completion_rpc_task(task); + if (ret == 0) { + ret = data->rpc_status; + if (ret == -NFS4ERR_DENIED) + ret = -EAGAIN; + } else + 
data->cancelled = 1; + rpc_release_task(task); + dprintk("%s: done, ret = %d!\n", __FUNCTION__, ret); + return ret; } static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a7b5de899c6d..5d6bda43dfaa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1615,12 +1615,6 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_ }; int status; - status = nfs_wait_on_sequence(args->lock_seqid, req->rq_task); - if (status != 0) - goto out; - /* Do we need to do an open_to_lock_owner? */ - if (args->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED) - args->new_lock_owner = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); -- cgit v1.2.3 From fe650407a86823bcafbfbee96c7bc6a1b5cd1c76 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:18 +0100 Subject: NFSv4: Make DELEGRETURN an interruptible operation. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 718fcc722556..f060e6538edc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2885,19 +2885,71 @@ nfs4_proc_setclientid_confirm(struct nfs4_client *clp) return status; } -static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) +struct nfs4_delegreturndata { + struct nfs4_delegreturnargs args; + struct nfs_fh fh; + nfs4_stateid stateid; + struct rpc_cred *cred; + int rpc_status; +}; + +static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata) { - struct nfs4_delegreturnargs args = { - .fhandle = NFS_FH(inode), - .stateid = stateid, - }; + struct nfs4_delegreturndata *data = calldata; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], - .rpc_argp = &args, - .rpc_cred = cred, + .rpc_argp = &data->args, + .rpc_cred = data->cred, }; + rpc_call_setup(task, &msg, 0); +} - return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + data->rpc_status = task->tk_status; +} + +static void nfs4_delegreturn_release(void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + + put_rpccred(data->cred); + kfree(calldata); +} + +const static struct rpc_call_ops nfs4_delegreturn_ops = { + .rpc_call_prepare = nfs4_delegreturn_prepare, + .rpc_call_done = nfs4_delegreturn_done, + .rpc_release = nfs4_delegreturn_release, +}; + +static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) +{ + struct nfs4_delegreturndata *data; + struct rpc_task *task; + int status; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; + data->args.stateid = &data->stateid; + nfs_copy_fh(&data->fh, NFS_FH(inode)); + memcpy(&data->stateid, stateid, sizeof(data->stateid)); + data->cred = get_rpccred(cred); + data->rpc_status = 0; + + task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data); + if (IS_ERR(task)) { + nfs4_delegreturn_release(data); + return PTR_ERR(task); + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = data->rpc_status; + rpc_release_task(task); + return status; } int 
nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) -- cgit v1.2.3 From 2bd615797ef32ec06ef0ee44198a7aecc21ffd8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:19 +0100 Subject: SUNRPC: Ensure that SIGKILL will always terminate a synchronous RPC call. ...and make sure that the "intr" flag also enables SIGHUP and SIGTERM to interrupt RPC calls too (as per the Solaris implementation). Signed-off-by: Trond Myklebust --- fs/lockd/svc.c | 4 ++-- net/sunrpc/clnt.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 12a857c29e25..71a30b416d1a 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -178,6 +178,8 @@ lockd(struct svc_rqst *rqstp) } + flush_signals(current); + /* * Check whether there's a new lockd process before * shutting down the hosts and clearing the slot. @@ -192,8 +194,6 @@ lockd(struct svc_rqst *rqstp) "lockd: new process, skipping host shutdown\n"); wake_up(&lockd_exit); - flush_signals(current); - /* Exit the RPC thread */ svc_exit_thread(rqstp); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f025b7e72353..b23c0d328c9c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -386,11 +386,11 @@ static const struct rpc_call_ops rpc_default_ops = { * Export the signal mask handling for synchronous code that * sleeps on RPC calls */ -#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL)) +#define RPC_INTR_SIGNALS (sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGTERM)) static void rpc_save_sigmask(sigset_t *oldset, int intr) { - unsigned long sigallow = 0; + unsigned long sigallow = sigmask(SIGKILL); sigset_t sigmask; /* Block all signals except those listed in sigallow */ -- cgit v1.2.3 From 26e976a884be9aa08f8ff906372f25f68df0d948 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:21 +0100 Subject: NFSv4: OPEN/LOCK/LOCKU/CLOSE will automatically renew the NFSv4 lease Cut down on the number of unnecessary RENEW requests on the wire. 
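The mechanism is worth spelling out: each stateful operation records the time at which its RPC was prepared, and on success feeds that timestamp back into the lease bookkeeping. Because the timestamp is sampled before the request goes on the wire, the client never over-estimates how recently the server saw it. A minimal sketch of the pattern, using illustrative names (struct stateful_op, op_prepare and op_done are placeholders, not kernel interfaces; renew_lease() is the helper this patch reworks):

struct stateful_op {
        unsigned long timestamp;        /* jiffies when the RPC was set up */
};

static void op_prepare(struct stateful_op *op)
{
        /* Sample the clock _before_ the request is transmitted. */
        op->timestamp = jiffies;
        /* ... marshal and send the RPC ... */
}

static void op_done(struct stateful_op *op, int status,
                    const struct nfs_server *server)
{
        /* A successful stateful op proves the lease was alive at op->timestamp. */
        if (status == 0)
                renew_lease(server, op->timestamp);
}

Sampling before transmission is the conservative choice: if the timestamp were taken in the completion path, a long round trip could make the lease look fresher than it really is.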
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f060e6538edc..76d7d0259b6c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -174,8 +174,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, kunmap_atomic(start, KM_USER0); } -static void -renew_lease(struct nfs_server *server, unsigned long timestamp) +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { struct nfs4_client *clp = server->nfs4_state; spin_lock(&clp->cl_lock); @@ -207,6 +206,7 @@ struct nfs4_opendata { struct dentry *dir; struct nfs4_state_owner *owner; struct iattr attrs; + unsigned long timestamp; int rpc_status; int cancelled; }; @@ -548,6 +548,7 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) .rpc_resp = &data->c_res, .rpc_cred = data->owner->so_cred, }; + data->timestamp = jiffies; rpc_call_setup(task, &msg, 0); } @@ -558,9 +559,11 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; if (RPC_ASSASSINATED(task)) return; - if (data->rpc_status == 0) + if (data->rpc_status == 0) { memcpy(data->o_res.stateid.data, data->c_res.stateid.data, sizeof(data->o_res.stateid.data)); + renew_lease(data->o_res.server, data->timestamp); + } nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status); } @@ -633,6 +636,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.clientid = sp->so_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->timestamp = jiffies; rpc_call_setup(task, &msg, 0); } @@ -656,6 +660,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) default: data->rpc_status = -ENOTDIR; } + renew_lease(data->o_res.server, data->timestamp); } nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); } @@ -1014,6 +1019,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, .rpc_argp = &arg, .rpc_resp = &res, }; + unsigned long timestamp = jiffies; int status; nfs_fattr_init(fattr); @@ -1025,6 +1031,8 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); status = rpc_call_sync(server->client, &msg, 0); + if (status == 0 && state != NULL) + renew_lease(server, timestamp); return status; } @@ -1049,6 +1057,7 @@ struct nfs4_closedata { struct nfs_closeargs arg; struct nfs_closeres res; struct nfs_fattr fattr; + unsigned long timestamp; }; static void nfs4_free_closedata(void *data) @@ -1078,6 +1087,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case 0: memcpy(&state->stateid, &calldata->res.stateid, sizeof(state->stateid)); + renew_lease(server, calldata->timestamp); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -1130,6 +1140,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (mode != 0) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.open_flags = mode; + calldata->timestamp = jiffies; rpc_call_setup(task, &msg, 0); } @@ -1694,11 +1705,13 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata) wdata->args.bitmask = server->attr_bitmask; wdata->res.server = server; + wdata->timestamp = jiffies; 
nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, rpcflags); dprintk("NFS reply write: %d\n", status); if (status < 0) return status; + renew_lease(server, wdata->timestamp); nfs_post_op_update_inode(inode, fattr); return wdata->res.count; } @@ -1733,8 +1746,11 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata) cdata->args.bitmask = server->attr_bitmask; cdata->res.server = server; + cdata->timestamp = jiffies; nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, 0); + if (status >= 0) + renew_lease(server, cdata->timestamp); dprintk("NFS reply commit: %d\n", status); if (status >= 0) nfs_post_op_update_inode(inode, fattr); @@ -2890,6 +2906,8 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; struct rpc_cred *cred; + unsigned long timestamp; + const struct nfs_server *server; int rpc_status; }; @@ -2908,6 +2926,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; data->rpc_status = task->tk_status; + if (data->rpc_status == 0) + renew_lease(data->server, data->timestamp); } static void nfs4_delegreturn_release(void *calldata) @@ -2938,6 +2958,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs_copy_fh(&data->fh, NFS_FH(inode)); memcpy(&data->stateid, stateid, sizeof(data->stateid)); data->cred = get_rpccred(cred); + data->timestamp = jiffies; + data->server = NFS_SERVER(inode); data->rpc_status = 0; task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data); @@ -3067,6 +3089,7 @@ struct nfs4_unlockdata { struct nfs_open_context *ctx; struct file_lock fl; const struct nfs_server *server; + unsigned long timestamp; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -3114,6 +3137,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) memcpy(calldata->lsp->ls_stateid.data, calldata->res.stateid.data, sizeof(calldata->lsp->ls_stateid.data)); + renew_lease(calldata->server, calldata->timestamp); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -3143,6 +3167,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) task->tk_action = NULL; return; } + calldata->timestamp = jiffies; rpc_call_setup(task, &msg, 0); } @@ -3214,6 +3239,7 @@ struct nfs4_lockdata { struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; struct file_lock fl; + unsigned long timestamp; int rpc_status; int cancelled; }; @@ -3273,6 +3299,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; } + data->timestamp = jiffies; rpc_call_setup(task, &msg, 0); out: dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status); @@ -3298,6 +3325,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, sizeof(data->lsp->ls_stateid.data)); data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); } nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); out: -- cgit v1.2.3 From 433fbe4c8837e3cc2ba6a6374edf28737d01a2e9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:22 +0100 Subject: NFSv4: State recovery cleanup Use wait_on_bit() when waiting for state recovery to complete. 
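The wait_on_bit() idiom replaces an open-coded waitqueue and has two halves that must stay in sync: an interruptible action callback on the waiter side, and a clear-then-wake sequence with memory barriers on the recovery side. A sketch under simplified assumptions (FLAG_RECOVER and the function names are illustrative; the barrier calls match the 2.6-era bitops API used in this patch):

static int my_wait_bit_interruptible(void *word)
{
        if (signal_pending(current))
                return -ERESTARTSYS;
        schedule();
        return 0;
}

/* waiter side: sleep until the recovery bit clears */
static int wait_for_recovery(unsigned long *state)
{
        return wait_on_bit(state, FLAG_RECOVER,
                           my_wait_bit_interruptible, TASK_INTERRUPTIBLE);
}

/* recovery-thread side: clear the bit and wake all waiters */
static void recovery_finished(unsigned long *state)
{
        smp_mb__before_clear_bit();
        clear_bit(FLAG_RECOVER, state);
        smp_mb__after_clear_bit();
        wake_up_bit(state, FLAG_RECOVER);
}

The barriers around clear_bit() matter: wake_up_bit() checks the bit without holding a lock, so the clear must be visible to any CPU about to test it, otherwise a waiter could miss the wakeup and sleep forever.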
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 3 +-- fs/nfs/nfs4proc.c | 27 ++++++++++++++------------- fs/nfs/nfs4state.c | 23 +++++++++++++---------- 3 files changed, 28 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4ad5981f9375..1d4c5b339b4d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -38,7 +38,7 @@ struct idmap; ((err) != NFSERR_NOFILEHANDLE)) enum nfs4_client_state { - NFS4CLNT_OK = 0, + NFS4CLNT_STATE_RECOVER = 0, }; /* @@ -76,7 +76,6 @@ struct nfs4_client { struct work_struct cl_renewd; struct work_struct cl_recoverd; - wait_queue_head_t cl_waitq; struct rpc_wait_queue cl_rpcwaitq; /* used for the setclientid verifier */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 76d7d0259b6c..46623ac3ce86 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2736,7 +2736,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) case -NFS4ERR_EXPIRED: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_OK, &clp->cl_state)) + if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) rpc_wake_up_task(task); task->tk_status = 0; return -EAGAIN; @@ -2753,25 +2753,25 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) return 0; } +static int nfs4_wait_bit_interruptible(void *word) +{ + if (signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) { - DEFINE_WAIT(wait); sigset_t oldset; - int interruptible, res = 0; + int res; might_sleep(); rpc_clnt_sigmask(clnt, &oldset); - interruptible = TASK_UNINTERRUPTIBLE; - if (clnt->cl_intr) - interruptible = TASK_INTERRUPTIBLE; - prepare_to_wait(&clp->cl_waitq, &wait, interruptible); - nfs4_schedule_state_recovery(clp); - if (clnt->cl_intr && signalled()) - res = -ERESTARTSYS; - else if (!test_bit(NFS4CLNT_OK, &clp->cl_state)) - schedule(); - finish_wait(&clp->cl_waitq, &wait); + res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER, + nfs4_wait_bit_interruptible, + TASK_INTERRUPTIBLE); rpc_clnt_sigunmask(clnt, &oldset); return res; } @@ -2814,6 +2814,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: + nfs4_schedule_state_recovery(clp); ret = nfs4_wait_clnt_recover(server->client, clp); if (ret == 0) exception->retry = 1; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 41a5f1a8a984..4ebf4df7f191 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -106,11 +106,10 @@ nfs4_alloc_client(struct in_addr *addr) INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp); INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); INIT_LIST_HEAD(&clp->cl_superblocks); - init_waitqueue_head(&clp->cl_waitq); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_boot_time = CURRENT_TIME; - clp->cl_state = 1 << NFS4CLNT_OK; + clp->cl_state = 0; return clp; } @@ -193,7 +192,6 @@ nfs4_put_client(struct nfs4_client *clp) list_del(&clp->cl_servers); spin_unlock(&state_spinlock); BUG_ON(!list_empty(&clp->cl_superblocks)); - wake_up_all(&clp->cl_waitq); rpc_wake_up(&clp->cl_rpcwaitq); nfs4_kill_renewd(clp); nfs4_free_client(clp); @@ -741,6 +739,15 @@ struct reclaimer_args { struct completion complete; }; +static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) +{ + 
smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); + smp_mb__after_clear_bit(); + wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER); + rpc_wake_up(&clp->cl_rpcwaitq); +} + /* * State recovery routine */ @@ -760,9 +767,7 @@ nfs4_recover_state(void *data) wait_for_completion(&args.complete); return; out_failed_clear: - set_bit(NFS4CLNT_OK, &clp->cl_state); - wake_up_all(&clp->cl_waitq); - rpc_wake_up(&clp->cl_rpcwaitq); + nfs4_clear_recover_bit(clp); } /* @@ -773,7 +778,7 @@ nfs4_schedule_state_recovery(struct nfs4_client *clp) { if (!clp) return; - if (test_and_clear_bit(NFS4CLNT_OK, &clp->cl_state)) + if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) schedule_work(&clp->cl_recoverd); } @@ -943,13 +948,11 @@ restart_loop: } nfs_delegation_reap_unclaimed(clp); out: - set_bit(NFS4CLNT_OK, &clp->cl_state); up_write(&clp->cl_sem); unlock_kernel(); - wake_up_all(&clp->cl_waitq); - rpc_wake_up(&clp->cl_rpcwaitq); if (status == -NFS4ERR_CB_PATH_DOWN) nfs_handle_cb_pathdown(clp); + nfs4_clear_recover_bit(clp); nfs4_put_client(clp); return 0; out_error: -- cgit v1.2.3 From 5043e900f5404c01864fbeb5826aa7de3981bbc1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:23 +0100 Subject: NFS: Convert instances of kernel_thread() to kthread() Convert private implementations in NFSv4 state recovery and delegation code to use kthreads. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 46 ++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4ebf4df7f191..f2148376f1a7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include @@ -57,8 +59,6 @@ const nfs4_stateid zero_stateid; static DEFINE_SPINLOCK(state_spinlock); static LIST_HEAD(nfs4_clientid_list); -static void nfs4_recover_state(void *); - void init_nfsv4_state(struct nfs_server *server) { @@ -103,7 +103,6 @@ nfs4_alloc_client(struct in_addr *addr) INIT_LIST_HEAD(&clp->cl_unused); spin_lock_init(&clp->cl_lock); atomic_set(&clp->cl_count, 1); - INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp); INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); INIT_LIST_HEAD(&clp->cl_superblocks); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); @@ -734,10 +733,6 @@ out: } static int reclaimer(void *); -struct reclaimer_args { - struct nfs4_client *clp; - struct completion complete; -}; static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) { @@ -751,35 +746,30 @@ static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) /* * State recovery routine */ -void -nfs4_recover_state(void *data) +static void nfs4_recover_state(struct nfs4_client *clp) { - struct nfs4_client *clp = (struct nfs4_client *)data; - struct reclaimer_args args = { - .clp = clp, - }; - might_sleep(); - - init_completion(&args.complete); + struct task_struct *task; - if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0) - goto out_failed_clear; - wait_for_completion(&args.complete); - return; -out_failed_clear: + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim", + NIPQUAD(clp->cl_addr)); + if (!IS_ERR(task)) + return; nfs4_clear_recover_bit(clp); + nfs4_put_client(clp); + module_put(THIS_MODULE); } /* * Schedule a state recovery attempt */ -void -nfs4_schedule_state_recovery(struct nfs4_client *clp) +void 
nfs4_schedule_state_recovery(struct nfs4_client *clp) { if (!clp) return; if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) - schedule_work(&clp->cl_recoverd); + nfs4_recover_state(clp); } static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) @@ -895,18 +885,13 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp) static int reclaimer(void *ptr) { - struct reclaimer_args *args = (struct reclaimer_args *)ptr; - struct nfs4_client *clp = args->clp; + struct nfs4_client *clp = ptr; struct nfs4_state_owner *sp; struct nfs4_state_recovery_ops *ops; int status = 0; - daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr)); allow_signal(SIGKILL); - atomic_inc(&clp->cl_count); - complete(&args->complete); - /* Ensure exclusive access to NFSv4 state */ lock_kernel(); down_write(&clp->cl_sem); @@ -954,6 +939,7 @@ out: nfs_handle_cb_pathdown(clp); nfs4_clear_recover_bit(clp); nfs4_put_client(clp); + module_put_and_exit(0); return 0; out_error: printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", -- cgit v1.2.3 From 58d9714a44a79bba9b414da3ffbf3c753dc5915f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:24 +0100 Subject: NFSv4: Send RENEW requests to the server only when we're holding state Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/delegation.h | 1 + fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 20 +++++++++++++++++++- fs/nfs/nfs4renewd.c | 9 ++++++++- 5 files changed, 75 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 618a327027b3..75dfb1c717a0 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -231,6 +232,51 @@ restart: spin_unlock(&clp->cl_lock); } +int nfs_do_expire_all_delegations(void *ptr) +{ + struct nfs4_client *clp = ptr; + struct nfs_delegation *delegation; + struct inode *inode; + int err = 0; + + allow_signal(SIGKILL); +restart: + spin_lock(&clp->cl_lock); + if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0) + goto out; + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) + goto out; + list_for_each_entry(delegation, &clp->cl_delegations, super_list) { + inode = igrab(delegation->inode); + if (inode == NULL) + continue; + spin_unlock(&clp->cl_lock); + err = nfs_inode_return_delegation(inode); + iput(inode); + if (!err) + goto restart; + } +out: + spin_unlock(&clp->cl_lock); + nfs4_put_client(clp); + module_put_and_exit(0); +} + +void nfs_expire_all_delegations(struct nfs4_client *clp) +{ + struct task_struct *task; + + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + task = kthread_run(nfs_do_expire_all_delegations, clp, + "%u.%u.%u.%u-delegreturn", + NIPQUAD(clp->cl_addr)); + if (!IS_ERR(task)) + return; + nfs4_put_client(clp); + module_put(THIS_MODULE); +} + /* * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 
*/ diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2fcc30de924b..fbc50ec271c5 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -30,6 +30,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); void nfs_return_all_delegations(struct super_block *sb); +void nfs_expire_all_delegations(struct nfs4_client *clp); void nfs_handle_cb_pathdown(struct nfs4_client *clp); void nfs_delegation_mark_reclaim(struct nfs4_client *clp); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1d4c5b339b4d..75fe646c28ab 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -39,6 +39,7 @@ struct idmap; enum nfs4_client_state { NFS4CLNT_STATE_RECOVER = 0, + NFS4CLNT_LEASE_EXPIRED, }; /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 46623ac3ce86..cc33a1c32cfb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -63,6 +63,7 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); +static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; @@ -765,6 +766,15 @@ out: return -EACCES; } +int nfs4_recover_expired_lease(struct nfs_server *server) +{ + struct nfs4_client *clp = server->nfs4_state; + + if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + nfs4_schedule_state_recovery(clp); + return nfs4_wait_clnt_recover(server->client, clp); +} + /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -840,6 +850,9 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred int open_flags = flags & (FMODE_READ|FMODE_WRITE); int err; + err = nfs4_recover_expired_lease(server); + if (err != 0) + return err; /* Protect against reboot recovery - NOTE ORDER! */ down_read(&clp->cl_sem); /* Protect against delegation recall */ @@ -921,12 +934,16 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st int status; /* Protect against reboot recovery conflicts */ - down_read(&clp->cl_sem); status = -ENOMEM; if (!(sp = nfs4_get_state_owner(server, cred))) { dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } + status = nfs4_recover_expired_lease(server); + if (status != 0) + goto out_err; + down_read(&clp->cl_sem); + status = -ENOMEM; opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr); if (opendata == NULL) goto err_put_state_owner; @@ -2897,6 +2914,7 @@ nfs4_proc_setclientid_confirm(struct nfs4_client *clp) spin_lock(&clp->cl_lock); clp->cl_lease_time = fsinfo.lease_time * HZ; clp->cl_last_renewal = now; + clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); spin_unlock(&clp->cl_lock); } return status; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index a3001628ad32..f62c2f7a4ffb 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -54,6 +54,7 @@ #include #include #include "nfs4_fs.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -68,7 +69,7 @@ nfs4_renew_state(void *data) dprintk("%s: start\n", __FUNCTION__); /* Are there any active superblocks? 
*/ if (list_empty(&clp->cl_superblocks)) - goto out; + goto out; spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; last = clp->cl_last_renewal; @@ -76,6 +77,12 @@ nfs4_renew_state(void *data) timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? */ if (time_after(now, last + lease/3)) { + if (list_empty(&clp->cl_state_owners)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + spin_unlock(&clp->cl_lock); + nfs_expire_all_delegations(clp); + goto out; + } spin_unlock(&clp->cl_lock); /* Queue an asynchronous RENEW. */ nfs4_proc_async_renew(clp); -- cgit v1.2.3 From b4454fe1a7cb76a248d0641c9d68a44a1b8d9a1f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:25 +0100 Subject: NFSv4: Remove requirement for machine creds for the "renew" operation In RFC3530, the RENEW operation is allowed to use either the same principal, RPC security flavour and (if RPCSEC_GSS), the same mechanism and service that was used for SETCLIENTID_CONFIRM OR Any principal, RPC security flavour and service combination that currently has an OPEN file on the server. Choose the latter since that doesn't require us to keep credentials for the same principal for the entire duration of the mount. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 5 +++-- fs/nfs/nfs4proc.c | 38 ++++++++++++++++++-------------------- fs/nfs/nfs4renewd.c | 7 +++++-- fs/nfs/nfs4state.c | 16 +++++++++++++++- 4 files changed, 41 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 75fe646c28ab..4a9c53e0793c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -213,8 +213,8 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); extern int nfs4_map_errors(int err); extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); -extern int nfs4_proc_async_renew(struct nfs4_client *); -extern int nfs4_proc_renew(struct nfs4_client *); +extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *); +extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); @@ -240,6 +240,7 @@ extern struct nfs4_client *nfs4_get_client(struct in_addr *); extern void nfs4_put_client(struct nfs4_client *clp); extern int nfs4_init_client(struct nfs4_client *clp); extern struct nfs4_client *nfs4_find_client(struct in_addr *); +struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp); extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cc33a1c32cfb..e38401931291 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -850,9 +850,14 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred int open_flags = flags & (FMODE_READ|FMODE_WRITE); int err; + err = -ENOMEM; + if (!(sp = nfs4_get_state_owner(server, cred))) { + dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); + return err; + } err = nfs4_recover_expired_lease(server); if (err != 0) - return err; + goto out_put_state_owner; /* Protect against reboot recovery - NOTE ORDER! 
*/ down_read(&clp->cl_sem); /* Protect against delegation recall */ @@ -862,10 +867,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred if (delegation == NULL || (delegation->type & open_flags) != open_flags) goto out_err; err = -ENOMEM; - if (!(sp = nfs4_get_state_owner(server, cred))) { - dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); - goto out_err; - } state = nfs4_get_open_state(inode, sp); if (state == NULL) goto out_err; @@ -877,13 +878,13 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred spin_unlock(&inode->i_lock); goto out_ok; } else if (state->state != 0) - goto out_err; + goto out_put_open_state; lock_kernel(); err = _nfs4_do_access(inode, cred, open_flags); unlock_kernel(); if (err != 0) - goto out_err; + goto out_put_open_state; set_bit(NFS_DELEGATED_STATE, &state->flags); update_open_stateid(state, &delegation->stateid, open_flags); out_ok: @@ -891,17 +892,16 @@ out_ok: up_read(&nfsi->rwsem); up_read(&clp->cl_sem); *res = state; - return 0; + return 0; +out_put_open_state: + nfs4_put_open_state(state); out_err: - if (sp != NULL) { - if (state != NULL) - nfs4_put_open_state(state); - nfs4_put_state_owner(sp); - } up_read(&nfsi->rwsem); up_read(&clp->cl_sem); if (err != -EACCES) nfs_inode_return_delegation(inode); +out_put_state_owner: + nfs4_put_state_owner(sp); return err; } @@ -941,7 +941,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st } status = nfs4_recover_expired_lease(server); if (status != 0) - goto out_err; + goto err_put_state_owner; down_read(&clp->cl_sem); status = -ENOMEM; opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr); @@ -2518,26 +2518,24 @@ static const struct rpc_call_ops nfs4_renew_ops = { .rpc_call_done = nfs4_renew_done, }; -int -nfs4_proc_async_renew(struct nfs4_client *clp) +int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], .rpc_argp = clp, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, &nfs4_renew_ops, (void *)jiffies); } -int -nfs4_proc_renew(struct nfs4_client *clp) +int nfs4_proc_renew(struct nfs4_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], .rpc_argp = clp, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; unsigned long now = jiffies; int status; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index f62c2f7a4ffb..5d764d8e6d8a 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -62,6 +62,7 @@ void nfs4_renew_state(void *data) { struct nfs4_client *clp = (struct nfs4_client *)data; + struct rpc_cred *cred; long lease, timeout; unsigned long last, now; @@ -77,7 +78,8 @@ nfs4_renew_state(void *data) timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? */ if (time_after(now, last + lease/3)) { - if (list_empty(&clp->cl_state_owners)) { + cred = nfs4_get_renew_cred(clp); + if (cred == NULL) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); spin_unlock(&clp->cl_lock); nfs_expire_all_delegations(clp); @@ -85,7 +87,8 @@ nfs4_renew_state(void *data) } spin_unlock(&clp->cl_lock); /* Queue an asynchronous RENEW. 
*/ - nfs4_proc_async_renew(clp); + nfs4_proc_async_renew(clp, cred); + put_rpccred(cred); timeout = (2 * lease) / 3; spin_lock(&clp->cl_lock); } else diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f2148376f1a7..18f6ed1a0b54 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -232,6 +232,20 @@ nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred) return sp; } +struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp) +{ + struct nfs4_state_owner *sp; + struct rpc_cred *cred = NULL; + + list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + if (list_empty(&sp->so_states)) + continue; + cred = get_rpccred(sp->so_cred); + break; + } + return cred; +} + static struct nfs4_state_owner * nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) { @@ -899,7 +913,7 @@ static int reclaimer(void *ptr) if (list_empty(&clp->cl_superblocks)) goto out; restart_loop: - status = nfs4_proc_renew(clp); + status = nfs4_proc_renew(clp, clp->cl_cred); switch (status) { case 0: case -NFS4ERR_CB_PATH_DOWN: -- cgit v1.2.3 From 286d7d6a0cb38d3d4316a1dfea9b0c0fc5a6455b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:26 +0100 Subject: NFSv4: Remove requirement for machine creds for the "setclientid" operation Use a cred from the nfs4_client->cl_state_owners list. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 14 ----------- fs/nfs/nfs4_fs.h | 6 ++--- fs/nfs/nfs4proc.c | 10 ++++---- fs/nfs/nfs4state.c | 71 ++++++++++++++++++++++++++++++++++-------------------- 4 files changed, 52 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 432f41cd75e6..740d3cd40246 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1822,23 +1822,9 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, clnt->cl_softrtry = 1; clnt->cl_chatty = 1; clp->cl_rpcclient = clnt; - clp->cl_cred = rpcauth_lookupcred(clnt->cl_auth, 0); - if (IS_ERR(clp->cl_cred)) { - up_write(&clp->cl_sem); - err = PTR_ERR(clp->cl_cred); - clp->cl_cred = NULL; - goto out_fail; - } memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); nfs_idmap_new(clp); } - if (list_empty(&clp->cl_superblocks)) { - err = nfs4_init_client(clp); - if (err != 0) { - up_write(&clp->cl_sem); - goto out_fail; - } - } list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); clnt = rpc_clone_client(clp->cl_rpcclient); if (!IS_ERR(clnt)) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4a9c53e0793c..0f5e4e7cddec 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -68,7 +68,6 @@ struct nfs4_client { atomic_t cl_count; struct rpc_clnt * cl_rpcclient; - struct rpc_cred * cl_cred; struct list_head cl_superblocks; /* List of nfs_server structs */ @@ -211,8 +210,8 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ extern int nfs4_map_errors(int err); -extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); -extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); +extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short, struct rpc_cred *); +extern int nfs4_proc_setclientid_confirm(struct nfs4_client *, struct rpc_cred *); extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); @@ -238,7 +237,6 @@ extern void init_nfsv4_state(struct nfs_server *); extern void 
destroy_nfsv4_state(struct nfs_server *); extern struct nfs4_client *nfs4_get_client(struct in_addr *); extern void nfs4_put_client(struct nfs4_client *clp); -extern int nfs4_init_client(struct nfs4_client *clp); extern struct nfs4_client *nfs4_find_client(struct in_addr *); struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp); extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e38401931291..b3349154994b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2846,7 +2846,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct return nfs4_map_errors(ret); } -int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port) +int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) { nfs4_verifier sc_verifier; struct nfs4_setclientid setclientid = { @@ -2857,7 +2857,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], .rpc_argp = &setclientid, .rpc_resp = clp, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; u32 *p; int loop = 0; @@ -2871,7 +2871,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p setclientid.sc_name_len = scnprintf(setclientid.sc_name, sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr), - clp->cl_cred->cr_ops->cr_name, + cred->cr_ops->cr_name, clp->cl_id_uniquifier); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), "tcp"); @@ -2894,14 +2894,14 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p } int -nfs4_proc_setclientid_confirm(struct nfs4_client *clp) +nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) { struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], .rpc_argp = clp, .rpc_resp = &fsinfo, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; unsigned long now; int status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 18f6ed1a0b54..afad0255e7db 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -91,11 +91,10 @@ nfs4_alloc_client(struct in_addr *addr) if (nfs_callback_up() < 0) return NULL; - if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { nfs_callback_down(); return NULL; } - memset(clp, 0, sizeof(*clp)); memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); init_rwsem(&clp->cl_sem); INIT_LIST_HEAD(&clp->cl_delegations); @@ -108,7 +107,7 @@ nfs4_alloc_client(struct in_addr *addr) rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_boot_time = CURRENT_TIME; - clp->cl_state = 0; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; return clp; } @@ -125,8 +124,6 @@ nfs4_free_client(struct nfs4_client *clp) kfree(sp); } BUG_ON(!list_empty(&clp->cl_state_owners)); - if (clp->cl_cred) - put_rpccred(clp->cl_cred); nfs_idmap_delete(clp); if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); @@ -196,21 +193,17 @@ nfs4_put_client(struct nfs4_client *clp) nfs4_free_client(clp); } -static int __nfs4_init_client(struct nfs4_client *clp) +static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred) { - int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport); + int status = nfs4_proc_setclientid(clp, 
NFS4_CALLBACK, + nfs_callback_tcpport, cred); if (status == 0) - status = nfs4_proc_setclientid_confirm(clp); + status = nfs4_proc_setclientid_confirm(clp, cred); if (status == 0) nfs4_schedule_state_renewal(clp); return status; } -int nfs4_init_client(struct nfs4_client *clp) -{ - return nfs4_map_errors(__nfs4_init_client(clp)); -} - u32 nfs4_alloc_lockowner_id(struct nfs4_client *clp) { @@ -246,6 +239,18 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp) return cred; } +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp) +{ + struct nfs4_state_owner *sp; + + if (!list_empty(&clp->cl_state_owners)) { + sp = list_entry(clp->cl_state_owners.next, + struct nfs4_state_owner, so_list); + return get_rpccred(sp->so_cred); + } + return NULL; +} + static struct nfs4_state_owner * nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) { @@ -902,6 +907,7 @@ static int reclaimer(void *ptr) struct nfs4_client *clp = ptr; struct nfs4_state_owner *sp; struct nfs4_state_recovery_ops *ops; + struct rpc_cred *cred; int status = 0; allow_signal(SIGKILL); @@ -913,20 +919,33 @@ static int reclaimer(void *ptr) if (list_empty(&clp->cl_superblocks)) goto out; restart_loop: - status = nfs4_proc_renew(clp, clp->cl_cred); - switch (status) { - case 0: - case -NFS4ERR_CB_PATH_DOWN: - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: - ops = &nfs4_reboot_recovery_ops; - break; - default: - ops = &nfs4_network_partition_recovery_ops; - }; + ops = &nfs4_network_partition_recovery_ops; + /* Are there any open files on this volume? */ + cred = nfs4_get_renew_cred(clp); + if (cred != NULL) { + /* Yes there are: try to renew the old lease */ + status = nfs4_proc_renew(clp, cred); + switch (status) { + case 0: + case -NFS4ERR_CB_PATH_DOWN: + put_rpccred(cred); + goto out; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_LEASE_MOVED: + ops = &nfs4_reboot_recovery_ops; + } + } else { + /* "reboot" to ensure we clear all state on the server */ + clp->cl_boot_time = CURRENT_TIME; + cred = nfs4_get_setclientid_cred(clp); + } + /* We're going to have to re-establish a clientid */ nfs4_state_mark_reclaim(clp); - status = __nfs4_init_client(clp); + status = -ENOENT; + if (cred != NULL) { + status = nfs4_init_client(clp, cred); + put_rpccred(cred); + } if (status) goto out_error; /* Mark all delegations for reclaim */ -- cgit v1.2.3 From ce1a8e6796150233f5098100f70217521dc7c08f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Nov 2005 18:08:17 -0500 Subject: NFS: use generic_write_checks() to sanity check direct writes Replace ad hoc write parameter sanity checking in nfs_file_direct_write() with a call to generic_write_checks(). This should make the proper checks modulo the O_LARGEFILE flag, and should catch NFSv2-specific limitations by virtue of i_sb->s_maxbytes. Test plan: Posix compliance testing with both NFSv2 and NFSv3. 
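In this era generic_write_checks() takes the file, a pointer to the position, a pointer to the byte count, and an isblk flag; it validates the position against the file's limits (RLIMIT_FSIZE and i_sb->s_maxbytes) and may shrink *count in place rather than reject the write. A sketch of the resulting call sequence, with a hypothetical wrapper name (direct_write_checks() is illustrative; the individual checks mirror the ones in the patch):

static ssize_t direct_write_checks(struct file *file, loff_t pos, size_t count)
{
        int err;

        /* Clamp count and validate pos against rlimits and s_maxbytes. */
        err = generic_write_checks(file, &pos, &count, 0);
        if (err)
                return err;
        if ((ssize_t)count < 0)
                return -EINVAL;
        if (count == 0)
                return 0;               /* nothing left to write */
        /* ... go on to issue the direct write of count bytes at pos ... */
        return count;
}

Because *count may be reduced rather than rejected, the caller must use the value generic_write_checks() hands back, not the count the user originally passed in.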
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ae2be0744191..f69d95aa78b2 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -660,10 +660,10 @@ nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t .iov_len = count, }; - dprintk("nfs: direct read(%s/%s, %lu@%lu)\n", + dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n", file->f_dentry->d_parent->d_name.name, file->f_dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) count, (long long) pos); if (!is_sync_kiocb(iocb)) goto out; @@ -716,9 +716,7 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) { - ssize_t retval = -EINVAL; - loff_t *ppos = &iocb->ki_pos; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + ssize_t retval; struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = (struct nfs_open_context *) file->private_data; @@ -726,35 +724,32 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, struct inode *inode = mapping->host; struct iovec iov = { .iov_base = (char __user *)buf, - .iov_len = count, }; - dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n", + dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n", file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, inode->i_ino, - (unsigned long) count, (unsigned long) pos); + file->f_dentry->d_name.name, + (unsigned long) count, (long long) pos); + retval = -EINVAL; if (!is_sync_kiocb(iocb)) goto out; - if (count < 0) - goto out; - if (pos < 0) + + retval = generic_write_checks(file, &pos, &count, 0); + if (retval) goto out; - retval = -EFAULT; - if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) + + retval = -EINVAL; + if ((ssize_t) count < 0) goto out; - retval = -EFBIG; - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - goto out; - } - if (count > limit - (unsigned long) pos) - count = limit - (unsigned long) pos; - } retval = 0; if (!count) goto out; + iov.iov_len = count, + + retval = -EFAULT; + if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) + goto out; retval = nfs_sync_mapping(mapping); if (retval) @@ -764,7 +759,7 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, if (mapping->nrpages) invalidate_inode_pages2(mapping); if (retval > 0) - *ppos = pos + retval; + iocb->ki_pos = pos + retval; out: return retval; -- cgit v1.2.3 From 6b59a75460eb9527145d7bd55068e5d32bee8a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Nov 2005 18:08:19 -0500 Subject: NFS: Fix error recovery code in fs/nfs/inode.c:__init_nfs() Red Hat found a problem in the error recovery logic in __init_nfs. 
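The underlying rule is the usual one for module init paths: each failure label must undo exactly the setup steps that succeeded before the failure, in reverse order of setup, and nothing else. A sketch of the idiom with illustrative names (setup_a()/teardown_a() and friends are placeholders, not the NFS cache constructors):

static int __init my_init(void)
{
        int err;

        err = setup_a();
        if (err)
                goto out2;
        err = setup_b();
        if (err)
                goto out1;
        err = setup_c();
        if (err)
                goto out0;
        return 0;
out0:
        teardown_b();
out1:
        teardown_a();
out2:
        return err;
}

The bug class this guards against is exactly what the patch fixes: an error label that tears down a cache which was never created, or one that skips a cache which was.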
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 740d3cd40246..da58207b366b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2152,11 +2152,11 @@ out: #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif - nfs_destroy_writepagecache(); #ifdef CONFIG_NFS_DIRECTIO -out0: nfs_destroy_directcache(); +out0: #endif + nfs_destroy_writepagecache(); out1: nfs_destroy_readpagecache(); out2: -- cgit v1.2.3 From dc20f803904dbf30f834dcc43c14701dfce32491 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Nov 2005 18:08:57 -0500 Subject: NFS: get rid of useless kernel log message nfs_statfs() generates a log message when GETATTR returns an error. This is usually a useless message. Make it a dprintk. Test plan: None Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index da58207b366b..cc753928e411 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -575,11 +575,10 @@ nfs_statfs(struct super_block *sb, struct kstatfs *buf) buf->f_namelen = server->namelen; out: unlock_kernel(); - return 0; out_err: - printk(KERN_WARNING "nfs_statfs: statfs error = %d\n", -error); + dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; goto out; -- cgit v1.2.3 From 325cfed9ae901320e9234b18c21434b783dbe342 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Nov 2005 18:08:55 -0500 Subject: NFS: make "inode number mismatch" message more useful To help NFS users and server developers, make the "inode number mismatch" message display more useful information. Test-plan: None. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index cc753928e411..4e6558df54b8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1409,14 +1409,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; - if (nfsi->fileid != fattr->fileid) { - printk(KERN_ERR "%s: inode number mismatch\n" - "expected (%s/0x%Lx), got (%s/0x%Lx)\n", - __FUNCTION__, - inode->i_sb->s_id, (long long)nfsi->fileid, - inode->i_sb->s_id, (long long)fattr->fileid); - goto out_err; - } + if (nfsi->fileid != fattr->fileid) + goto out_fileid; /* * Make sure the inode's type hasn't changed. @@ -1538,6 +1532,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfs_invalidate_inode(inode); return -ESTALE; + + out_fileid: + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->hostname, inode->i_sb->s_id, + (long long)nfsi->fileid, (long long)fattr->fileid); + goto out_err; } /* -- cgit v1.2.3 From 40859d7ee64ed6bfad8a4e93f9bb5c1074afadff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Nov 2005 18:09:02 -0500 Subject: NFS: support large reads and writes on the wire Most NFS server implementations allow up to 64KB reads and writes on the wire. The Solaris NFS server allows up to a megabyte, for instance. Now the Linux NFS client supports transfer sizes up to 1MB, too. 
This will help reduce protocol and context switch overhead on read/write intensive NFS workloads, and support larger atomic read and write operations on servers that support them. Test-plan: Connectathon and iozone on mount point with wsize=rsize>32768 over TCP. Tests with NFS over UDP to verify the maximum RPC payload size cap. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 5 +++-- fs/nfs/inode.c | 25 ++++++++++--------------- fs/nfs/nfsroot.c | 4 ++-- fs/nfs/read.c | 6 +++--- fs/nfs/write.c | 29 ++++++++++++++++++++++------- include/linux/nfs_fs.h | 41 +++++++++++++++++++++++++++++++++++------ include/linux/nfs_xdr.h | 29 ++++++++++++++++------------- include/linux/sunrpc/xdr.h | 5 ----- 8 files changed, 91 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f69d95aa78b2..fd7ac5e841c1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -154,6 +154,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int struct list_head *list; struct nfs_direct_req *dreq; unsigned int reads = 0; + unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); if (!dreq) @@ -167,7 +168,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int list = &dreq->list; for(;;) { - struct nfs_read_data *data = nfs_readdata_alloc(); + struct nfs_read_data *data = nfs_readdata_alloc(rpages); if (unlikely(!data)) { while (!list_empty(list)) { @@ -431,7 +432,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode, struct nfs_writeverf first_verf; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(); + wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); if (!wdata) return -ENOMEM; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4e6558df54b8..acde2c5725bf 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -221,10 +221,10 @@ nfs_calc_block_size(u64 tsize) static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { - if (bsize < 1024) - bsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE) - bsize = NFS_MAX_FILE_IO_BUFFER_SIZE; + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } @@ -307,20 +307,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; - if (server->wsize > max_rpc_payload) - server->wsize = max_rpc_payload; - + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->rpages > NFS_READ_MAXIOV) { - server->rpages = NFS_READ_MAXIOV; - server->rsize = server->rpages << PAGE_CACHE_SHIFT; - } + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->wpages > NFS_WRITE_MAXIOV) { - server->wpages = NFS_WRITE_MAXIOV; - server->wsize = server->wpages << PAGE_CACHE_SHIFT; - } if (sb->s_blocksize == 0) sb->s_blocksize = nfs_block_bits(server->wsize, diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 1b272a135a31..985cc53b8dd5 100644 --- 
a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -296,8 +296,8 @@ static int __init root_nfs_name(char *name) nfs_port = -1; nfs_data.version = NFS_MOUNT_VERSION; nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ - nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE; + nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; + nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; nfs_data.acregmin = 3; nfs_data.acregmax = 60; nfs_data.acdirmin = 30; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 21486242c3d3..05eb43fadf8e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -83,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, int result; struct nfs_read_data *rdata; - rdata = nfs_readdata_alloc(); + rdata = nfs_readdata_alloc(1); if (!rdata) return -ENOMEM; @@ -283,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode) nbytes = req->wb_bytes; for(;;) { - data = nfs_readdata_alloc(); + data = nfs_readdata_alloc(1); if (!data) goto out_bad; INIT_LIST_HEAD(&data->pages); @@ -339,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode) if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) return nfs_pagein_multi(head, inode); - data = nfs_readdata_alloc(); + data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages); if (!data) goto out_bad; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80bc4ea1b824..1ce0c200df16 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -89,18 +89,33 @@ static mempool_t *nfs_commit_mempool; static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); -static inline struct nfs_write_data *nfs_commit_alloc(void) +static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); + if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_commit_mempool); + p = NULL; + } + } } return p; } static inline void nfs_commit_free(struct nfs_write_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } @@ -167,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, int result, written = 0; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(); + wdata = nfs_writedata_alloc(1); if (!wdata) return -ENOMEM; @@ -909,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how) nbytes = req->wb_bytes; for (;;) { - data = nfs_writedata_alloc(); + data = nfs_writedata_alloc(1); if (!data) goto out_bad; list_add(&data->pages, &list); @@ -973,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how) if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE) return nfs_flush_multi(head, inode, how); - data = nfs_writedata_alloc(); + data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); if (!data) goto out_bad; @@ -1241,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head, * Commit dirty pages */ static int -nfs_commit_list(struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how) { struct nfs_write_data *data; struct nfs_page *req; - data = nfs_commit_alloc(); + data = nfs_commit_alloc(NFS_SERVER(inode)->wpages); if (!data) goto 
out_bad; @@ -1351,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&nfsi->req_lock); if (res) { - error = nfs_commit_list(&head, how); + error = nfs_commit_list(inode, &head, how); if (error < 0) return error; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4dff705d2ff2..d38010ba6477 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -38,9 +38,6 @@ # define NFS_DEBUG #endif -#define NFS_MAX_FILE_IO_BUFFER_SIZE 32768 -#define NFS_DEF_FILE_IO_BUFFER_SIZE 4096 - /* Default timeout values */ #define NFS_MAX_UDP_TIMEOUT (60*HZ) #define NFS_MAX_TCP_TIMEOUT (600*HZ) @@ -462,18 +459,33 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page) */ extern mempool_t *nfs_wdata_mempool; -static inline struct nfs_write_data *nfs_writedata_alloc(void) +static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); + if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_wdata_mempool); + p = NULL; + } + } } return p; } static inline void nfs_writedata_free(struct nfs_write_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } @@ -492,16 +504,33 @@ extern void nfs_readdata_release(void *data); */ extern mempool_t *nfs_rdata_mempool; -static inline struct nfs_read_data *nfs_readdata_alloc(void) +static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); - if (p) + + if (p) { memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_rdata_mempool); + p = NULL; + } + } + } return p; } static inline void nfs_readdata_free(struct nfs_read_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b8b0eed98ec9..9f422fd87673 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -4,6 +4,16 @@ #include #include +/* + * To change the maximum rsize and wsize supported by the NFS client, adjust + * NFS_MAX_FILE_IO_SIZE. 64KB is a typical maximum, but some servers can + * support a megabyte or more. The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +#define NFS_MAX_FILE_IO_SIZE (1048576U) +#define NFS_DEF_FILE_IO_SIZE (4096U) +#define NFS_MIN_FILE_IO_SIZE (1024U) + struct nfs4_fsid { __u64 major; __u64 minor; @@ -215,12 +225,6 @@ struct nfs4_delegreturnargs { /* * Arguments to the read call. */ - -#define NFS_READ_MAXIOV (9U) -#if (NFS_READ_MAXIOV > (MAX_IOVEC -2)) -#error "NFS_READ_MAXIOV is too large" -#endif - struct nfs_readargs { struct nfs_fh * fh; struct nfs_open_context *context; @@ -239,11 +243,6 @@ struct nfs_readres { /* * Arguments to the write call. 
*/ -#define NFS_WRITE_MAXIOV (9U) -#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2)) -#error "NFS_WRITE_MAXIOV is too large" -#endif - struct nfs_writeargs { struct nfs_fh * fh; struct nfs_open_context *context; @@ -674,6 +673,8 @@ struct nfs4_server_caps_res { struct nfs_page; +#define NFS_PAGEVEC_SIZE (8U) + struct nfs_read_data { int flags; struct rpc_task task; @@ -682,13 +683,14 @@ struct nfs_read_data { struct nfs_fattr fattr; /* fattr storage */ struct list_head pages; /* Coalesced read requests */ struct nfs_page *req; /* multi ops per nfs_page */ - struct page *pagevec[NFS_READ_MAXIOV]; + struct page **pagevec; struct nfs_readargs args; struct nfs_readres res; #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif void (*complete) (struct nfs_read_data *, int); + struct page *page_array[NFS_PAGEVEC_SIZE + 1]; }; struct nfs_write_data { @@ -700,13 +702,14 @@ struct nfs_write_data { struct nfs_writeverf verf; struct list_head pages; /* Coalesced requests we wish to flush */ struct nfs_page *req; /* multi ops per nfs_page */ - struct page *pagevec[NFS_WRITE_MAXIOV]; + struct page **pagevec; struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif void (*complete) (struct nfs_write_data *, int); + struct page *page_array[NFS_PAGEVEC_SIZE + 1]; }; struct nfs_access_entry; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5da968729cf8..5676794ee34f 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -134,11 +134,6 @@ xdr_adjust_iovec(struct kvec *iov, u32 *p) return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base); } -/* - * Maximum number of iov's we use. - */ -#define MAX_IOVEC (12) - /* * XDR buffer helper functions */ -- cgit v1.2.3 From 24174119c73983d5217da8f56a12c79a9b57e056 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:33 +0100 Subject: NFSv4: Ensure that we return the delegation on the target of a rename too. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c0d1a214572c..e9255198f767 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1550,8 +1550,10 @@ go_ahead: } nfs_inode_return_delegation(old_inode); - if (new_inode) + if (new_inode != NULL) { + nfs_inode_return_delegation(new_inode); d_delete(new_dentry); + } nfs_begin_data_update(old_dir); nfs_begin_data_update(new_dir); -- cgit v1.2.3 From 70b9ecbdb9c5fdc731f8780bffd45d9519020c4a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:34 +0100 Subject: NFS: Make stat() return updated mtimes after a write() The SuS states that a call to write() will cause mtime to be updated on the file. In order to satisfy that requirement, we need to flush out any cached writes in nfs_getattr(). Speed things up slightly by not committing the writes. 
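Seen from user space, the requirement is simply this (a sketch; error handling omitted):

	struct stat st;

	write(fd, buf, len);	/* data may still sit dirty in the client's
				 * page cache, mtime unchanged on the server */
	fstat(fd, &st);		/* st_mtime must already reflect the write, so
				 * getattr flushes (but need not commit) first */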
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 ++ fs/nfs/write.c | 23 ++++++++++++----------- include/linux/nfs_fs.h | 1 + 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index acde2c5725bf..2c7f8aac1dec 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -952,6 +952,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; + /* Flush out writes to the server in order to update c/mtime */ + nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT); if (__IS_FLG(inode, MS_NOATIME)) need_atime = 0; else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode)) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1ce0c200df16..9449b6835509 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1377,22 +1377,23 @@ int nfs_commit_inode(struct inode *inode, int how) int nfs_sync_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { - int error, - wait; + int nocommit = how & FLUSH_NOCOMMIT; + int wait = how & FLUSH_WAIT; + int error; - wait = how & FLUSH_WAIT; - how &= ~FLUSH_WAIT; + how &= ~(FLUSH_WAIT|FLUSH_NOCOMMIT); do { - error = 0; - if (wait) + if (wait) { error = nfs_wait_on_requests(inode, idx_start, npages); - if (error == 0) - error = nfs_flush_inode(inode, idx_start, npages, how); -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (error == 0) + if (error != 0) + continue; + } + error = nfs_flush_inode(inode, idx_start, npages, how); + if (error != 0) + continue; + if (!nocommit) error = nfs_commit_inode(inode, how); -#endif } while (error > 0); return error; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d38010ba6477..408d82d3d97c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -62,6 +62,7 @@ #define FLUSH_STABLE 4 /* commit to stable storage */ #define FLUSH_LOWPRI 8 /* low priority background flush */ #define FLUSH_HIGHPRI 16 /* high priority memory reclaim flush */ +#define FLUSH_NOCOMMIT 32 /* Don't send the NFSv3/v4 COMMIT */ #ifdef __KERNEL__ -- cgit v1.2.3 From 566dd6064e89b15ff2dec666a421bebf0f98f26c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:35 +0100 Subject: NFS: Make directIO aware of compound pages... ...and avoid calling set_page_dirty on them Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index fd7ac5e841c1..10ae377e68ff 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -122,9 +122,10 @@ nfs_free_user_pages(struct page **pages, int npages, int do_dirty) { int i; for (i = 0; i < npages; i++) { - if (do_dirty) - set_page_dirty_lock(pages[i]); - page_cache_release(pages[i]); + struct page *page = pages[i]; + if (do_dirty && !PageCompound(page)) + set_page_dirty_lock(page); + page_cache_release(page); } kfree(pages); } -- cgit v1.2.3 From beb2a5ec386e5ce6891ebd1c06b913da04354b40 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:37 +0100 Subject: NFSv4: Ensure change attribute returned by GETATTR callback conforms to spec According to RFC3530 we're supposed to cache the change attribute at the time the client receives a write delegation. If the inode is clean, a CB_GETATTR callback by the server to the client is supposed to return the cached change attribute. 
If, OTOH, the inode is dirty, the client should bump the cached change attribute by 1. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 4 +++- fs/nfs/delegation.c | 1 + fs/nfs/delegation.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 65f1e19e4d19..462cfceb50c5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -35,7 +35,9 @@ unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); - res->change_attr = NFS_CHANGE_ATTR(inode); + res->change_attr = delegation->change_attr; + if (nfsi->npages != 0) + res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 75dfb1c717a0..d2ee09b38cee 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -131,6 +131,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; delegation->cred = get_rpccred(cred); delegation->inode = inode; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index fbc50ec271c5..7a0b2bfce771 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -21,6 +21,7 @@ struct nfs_delegation { #define NFS_DELEGATION_NEED_RECLAIM 1 long flags; loff_t maxsize; + __u64 change_attr; }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -- cgit v1.2.3 From fa178f29c0f8a0dce748181a5351f4a92fd4f455 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:38 +0100 Subject: NFSv4: Ensure DELEGRETURN returns attributes Upon return of a write delegation, the server will almost always bump the change attribute. Ensure that we pick up that change so that we don't invalidate our data cache unnecessarily. 
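The resulting compound is encoded as follows (error handling trimmed; compare nfs4_xdr_enc_delegreturn() in the patch):

	encode_compound_hdr(&xdr, &hdr);	/* hdr.nops is now 3 */
	encode_putfh(&xdr, args->fhandle);
	encode_delegreturn(&xdr, args->stateid);
	encode_getfattr(&xdr, args->bitmask);	/* piggy-backed GETATTR */

On the decode side the returned attributes are handed to nfs_post_op_update_inode(), so the server's change-attribute bump lands in the attribute cache instead of forcing a revalidation.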
Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 -- fs/nfs/nfs4proc.c | 17 +++++++++++++---- fs/nfs/nfs4xdr.c | 33 ++++++++++++++++++++++----------- include/linux/nfs_xdr.h | 6 ++++++ 4 files changed, 41 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d2ee09b38cee..66cc720e3927 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -159,8 +159,6 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * { int res = 0; - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid); nfs_free_delegation(delegation); return res; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b3349154994b..984ca3454d04 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2920,11 +2920,12 @@ nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) struct nfs4_delegreturndata { struct nfs4_delegreturnargs args; + struct nfs4_delegreturnres res; struct nfs_fh fh; nfs4_stateid stateid; struct rpc_cred *cred; unsigned long timestamp; - const struct nfs_server *server; + struct nfs_fattr fattr; int rpc_status; }; @@ -2934,8 +2935,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata) struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], .rpc_argp = &data->args, + .rpc_resp = &data->res, .rpc_cred = data->cred, }; + nfs_fattr_init(data->res.fattr); rpc_call_setup(task, &msg, 0); } @@ -2944,7 +2947,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) struct nfs4_delegreturndata *data = calldata; data->rpc_status = task->tk_status; if (data->rpc_status == 0) - renew_lease(data->server, data->timestamp); + renew_lease(data->res.server, data->timestamp); } static void nfs4_delegreturn_release(void *calldata) @@ -2964,6 +2967,7 @@ const static struct rpc_call_ops nfs4_delegreturn_ops = { static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) { struct nfs4_delegreturndata *data; + struct nfs_server *server = NFS_SERVER(inode); struct rpc_task *task; int status; @@ -2972,11 +2976,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co return -ENOMEM; data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; + data->args.bitmask = server->attr_bitmask; nfs_copy_fh(&data->fh, NFS_FH(inode)); memcpy(&data->stateid, stateid, sizeof(data->stateid)); + data->res.fattr = &data->fattr; + data->res.server = server; data->cred = get_rpccred(cred); data->timestamp = jiffies; - data->server = NFS_SERVER(inode); data->rpc_status = 0; task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data); @@ -2985,8 +2991,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co return PTR_ERR(task); } status = nfs4_wait_for_completion_rpc_task(task); - if (status == 0) + if (status == 0) { status = data->rpc_status; + if (status == 0) + nfs_post_op_update_inode(inode, &data->fattr); + } rpc_release_task(task); return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5d6bda43dfaa..12be1d682164 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -392,9 +392,11 @@ static int nfs_stat_to_errno(int); decode_getattr_maxsz) #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_delegreturn_maxsz) + encode_delegreturn_maxsz + \ + encode_getattr_maxsz) #define 
NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ - decode_delegreturn_maxsz) + decode_delegreturn_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz) @@ -1983,14 +1985,20 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const str { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fhandle)) == 0) - status = encode_delegreturn(&xdr, args->stateid); + status = encode_putfh(&xdr, args->fhandle); + if (status != 0) + goto out; + status = encode_delegreturn(&xdr, args->stateid); + if (status != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); +out: return status; } @@ -4184,7 +4192,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s /* * DELEGRETURN request */ -static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4192,11 +4200,14 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *d xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); status = decode_compound_hdr(&xdr, &hdr); - if (status == 0) { - status = decode_putfh(&xdr); - if (status == 0) - status = decode_delegreturn(&xdr); - } + if (status != 0) + goto out; + status = decode_putfh(&xdr); + if (status != 0) + goto out; + status = decode_delegreturn(&xdr); + decode_getfattr(&xdr, res->fattr, res->server); +out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9f422fd87673..6d6f69ec5675 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -220,6 +220,12 @@ struct nfs_lockt_res { struct nfs4_delegreturnargs { const struct nfs_fh *fhandle; const nfs4_stateid *stateid; + const u32 * bitmask; +}; + +struct nfs4_delegreturnres { + struct nfs_fattr * fattr; + const struct nfs_server *server; }; /* -- cgit v1.2.3 From a895b4a198dd06f8353328867e4f6cfd28b63081 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:40 +0100 Subject: NFS: Clean up weak cache consistency code ...and ensure that nfs_update_inode() respects wcc Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 60 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2c7f8aac1dec..7270b1d73d30 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1248,6 +1248,33 @@ void nfs_end_data_update(struct inode *inode) atomic_dec(&nfsi->data_updates); } +static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 + && nfsi->change_attr == fattr->pre_change_attr) { + nfsi->change_attr = fattr->change_attr; + nfsi->cache_change_attribute = jiffies; + } + + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_WCC) != 0) { + if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + nfsi->cache_change_attribute = jiffies; + } + if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + 
nfsi->cache_change_attribute = jiffies; + } + if (inode->i_size == fattr->pre_size && nfsi->npages == 0) { + inode->i_size = fattr->size; + nfsi->cache_change_attribute = jiffies; + } + } +} + /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache * @inode - pointer to inode @@ -1264,22 +1291,20 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat int data_unstable; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + /* Are we in the process of updating data on the server? */ data_unstable = nfs_caches_unstable(inode); - if (fattr->valid & NFS_ATTR_FATTR_V4) { - if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 - && nfsi->change_attr == fattr->pre_change_attr) - nfsi->change_attr = fattr->change_attr; - if (nfsi->change_attr != fattr->change_attr) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; - if (!data_unstable) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; - } - } + /* Do atomic weak cache consistency updates */ + nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR) == 0) { - return 0; + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + nfsi->change_attr != fattr->change_attr) { + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (!data_unstable) + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; } /* Has the inode gone and changed behind our back? */ @@ -1291,14 +1316,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } - /* Verify a few of the more important attributes */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { nfsi->cache_validity |= NFS_INO_INVALID_ATTR; @@ -1426,6 +1443,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (data_stable) nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); + /* Do atomic weak cache consistency updates */ + nfs_wcc_update_inode(inode, fattr); + /* Check if our cached file size is stale */ new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); -- cgit v1.2.3 From a72b44222d222749d54b3e370d825094352e389f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:41 +0100 Subject: NFSv4: Allow user to set the port used by the NFSv4 callback channel Signed-off-by: Trond Myklebust --- Documentation/kernel-parameters.txt | 4 ++ fs/nfs/Makefile | 1 + fs/nfs/callback.c | 3 +- fs/nfs/callback.h | 1 + fs/nfs/inode.c | 37 ++++++++++++++++++- fs/nfs/sysctl.c | 74 +++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 11 ++++++ 7 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 fs/nfs/sysctl.c (limited to 'fs') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 61a56b100c62..309c9cec6e7c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -910,6 +910,10 @@ running once the system is up. nfsroot= [NFS] nfs root filesystem for disk-less boxes. See Documentation/nfsroot.txt. + nfs.callback_tcpport= + [NFS] set the TCP port on which the NFSv4 callback + channel should listen. 
+ nmi_watchdog= [KNL,BUGS=IA-32] Debugging features for SMP kernels no387 [BUGS=IA-32] Tells the kernel to use the 387 maths diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8b3bb715d177..ec61fd56a1a9 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -13,4 +13,5 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o nfs-$(CONFIG_NFS_DIRECTIO) += direct.o +nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-objs := $(nfs-y) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 30cae3602867..fcd97406a778 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -34,6 +34,7 @@ static struct nfs_callback_data nfs_callback_info; static DECLARE_MUTEX(nfs_callback_sema); static struct svc_program nfs4_callback_program; +unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; /* @@ -98,7 +99,7 @@ int nfs_callback_up(void) if (!serv) goto out_err; /* FIXME: We don't want to register this socket with the portmapper */ - ret = svc_makesock(serv, IPPROTO_TCP, 0); + ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport); if (ret < 0) goto out_destroy; if (!list_empty(&serv->sv_permsocks)) { diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index a0db2d4f9415..b252e7fe53a5 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -65,6 +65,7 @@ extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); extern int nfs_callback_up(void); extern int nfs_callback_down(void); +extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7270b1d73d30..648cb1aef3b1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -40,6 +40,7 @@ #include #include "nfs4_fs.h" +#include "callback.h" #include "delegation.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -2036,6 +2037,21 @@ static struct file_system_type nfs4_fs_type = { .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static int param_set_port(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + return -EINVAL; + *((int *)kp->arg) = num; + return 0; +} + +module_param_call(callback_tcpport, param_set_port, param_get_int, + &nfs_callback_set_tcpport, 0644); + #define nfs4_init_once(nfsi) \ do { \ INIT_LIST_HEAD(&(nfsi)->open_states); \ @@ -2043,8 +2059,25 @@ static struct file_system_type nfs4_fs_type = { nfsi->delegation_state = 0; \ init_rwsem(&nfsi->rwsem); \ } while(0) -#define register_nfs4fs() register_filesystem(&nfs4_fs_type) -#define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type) + +static inline int register_nfs4fs(void) +{ + int ret; + + ret = nfs_register_sysctl(); + if (ret != 0) + return ret; + ret = register_filesystem(&nfs4_fs_type); + if (ret != 0) + nfs_unregister_sysctl(); + return ret; +} + +static inline void unregister_nfs4fs(void) +{ + unregister_filesystem(&nfs4_fs_type); + nfs_unregister_sysctl(); +} #else #define nfs4_init_once(nfsi) \ do { } while (0) diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c new file mode 100644 index 000000000000..fdc64b59a4ee --- /dev/null +++ b/fs/nfs/sysctl.c @@ -0,0 +1,74 @@ +/* + * linux/fs/nfs/sysctl.c + * + * Sysctl interface to NFS parameters + */ +#include +#include +#include +#include +#include +#include +#include +#include + 
+#include "callback.h" + +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static struct ctl_table_header *nfs_callback_sysctl_table; +/* + * Something that isn't CTL_ANY, CTL_NONE or a value that may clash. + * Use the same values as fs/lockd/svc.c + */ +#define CTL_UNNUMBERED -2 + +static ctl_table nfs_cb_sysctls[] = { +#ifdef CONFIG_NFS_V4 + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, +#endif + { .ctl_name = 0 } +}; + +static ctl_table nfs_cb_sysctl_dir[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs", + .mode = 0555, + .child = nfs_cb_sysctls, + }, + { .ctl_name = 0 } +}; + +static ctl_table nfs_cb_sysctl_root[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = nfs_cb_sysctl_dir, + }, + { .ctl_name = 0 } +}; + +int nfs_register_sysctl(void) +{ + nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root, 0); + if (nfs_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs_callback_sysctl_table); + nfs_callback_sysctl_table = NULL; +} diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 408d82d3d97c..547d649b274e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -391,6 +391,17 @@ extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_ */ extern struct inode_operations nfs_symlink_inode_operations; +/* + * linux/fs/nfs/sysctl.c + */ +#ifdef CONFIG_SYSCTL +extern int nfs_register_sysctl(void); +extern void nfs_unregister_sysctl(void); +#else +#define nfs_register_sysctl() do { } while(0) +#define nfs_unregister_sysctl() do { } while(0) +#endif + /* * linux/fs/nfs/unlink.c */ -- cgit v1.2.3 From f232142cc21127c829559923eb405d1bcb2e2278 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jan 2006 09:55:42 +0100 Subject: NLM: Clean up nlmsvc_grant_reply locking Slightly simpler logic here makes it more trivial to verify that the up's and down's are balanced here. Break out an assignment from a conditional while we're at it. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 87d09a0d8f64..e42f0cc6c450 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -637,11 +637,12 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status file->f_count++; down(&file->f_sema); - if ((block = nlmsvc_find_block(cookie,&rqstp->rq_addr)) != NULL) { + block = nlmsvc_find_block(cookie, &rqstp->rq_addr); + if (block) { if (status == NLM_LCK_DENIED_GRACE_PERIOD) { /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); - block = NULL; + up(&file->f_sema); } else { /* Lock is now held by client, or has been rejected. * In both cases, the block should be removed. */ @@ -652,8 +653,6 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status nlmsvc_delete_block(block, 1); } } - if (!block) - up(&file->f_sema); nlm_release_file(file); } -- cgit v1.2.3 From 5996a298da43a03081e9ba2116983d173001c862 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Tue, 3 Jan 2006 09:55:44 +0100 Subject: NLM: don't unlock on cancel requests Currently when lockd gets an NLM_CANCEL request, it also does an unlock for the same range. This is incorrect. The Open Group documentation says that "This procedure cancels an *outstanding* blocked lock request." (Emphasis mine.) Also, consider a client that holds a lock on the first byte of a file, and requests a lock on the entire file. If the client cancels that request (perhaps because the requesting process is signalled), the server shouldn't apply perform an unlock on the entire file, since that will also remove the previous lock that the client was already granted. Or consider a lock request that actually *downgraded* an exclusive lock to a shared lock. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 5 ----- fs/locks.c | 13 ++----------- 2 files changed, 2 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e42f0cc6c450..5fb48b4390bd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -240,11 +240,6 @@ nlmsvc_delete_block(struct nlm_block *block, int unlock) nlmsvc_remove_block(block); if (fl->fl_next) posix_unblock_lock(file->f_file, fl); - if (unlock) { - fl->fl_type = F_UNLCK; - posix_lock_file(file->f_file, fl); - block->b_granted = 0; - } /* If the block is in the middle of a GRANT callback, * don't kill it yet. */ diff --git a/fs/locks.c b/fs/locks.c index 250ef53d25ef..75650d52fe60 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1961,19 +1961,10 @@ EXPORT_SYMBOL(posix_block_lock); void posix_unblock_lock(struct file *filp, struct file_lock *waiter) { - /* - * A remote machine may cancel the lock request after it's been - * granted locally. If that happens, we need to delete the lock. - */ lock_kernel(); - if (waiter->fl_next) { + if (waiter->fl_next) __locks_delete_block(waiter); - unlock_kernel(); - } else { - unlock_kernel(); - waiter->fl_type = F_UNLCK; - posix_lock_file(filp, waiter); - } + unlock_kernel(); } EXPORT_SYMBOL(posix_unblock_lock); -- cgit v1.2.3 From 2c5acd2e1a73cad59203a1bace21e6b03f2920a9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jan 2006 09:55:45 +0100 Subject: NLM: clean up nlmsvc_delete_block The fl_next check here is superfluous (and possibly a layering violation). Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5fb48b4390bd..b56d439bad82 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -238,8 +238,7 @@ nlmsvc_delete_block(struct nlm_block *block, int unlock) /* Remove block from list */ nlmsvc_remove_block(block); - if (fl->fl_next) - posix_unblock_lock(file->f_file, fl); + posix_unblock_lock(file->f_file, fl); /* If the block is in the middle of a GRANT callback, * don't kill it yet. */ -- cgit v1.2.3 From 64a318ee2af9000df482d7a125c3b3e1f1007404 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jan 2006 09:55:46 +0100 Subject: NLM: Further cancel fixes If the server receives an NLM cancel call and finds no waiting lock to cancel, then chances are the lock has already been applied, and the client just hadn't yet processed the NLM granted callback before it sent the cancel. The Open Group text, for example, perimts a server to return either success (LCK_GRANTED) or failure (LCK_DENIED) in this case. 
But returning an error seems more helpful; the client may be able to use it to recognize that a race has occurred and to recover from the race. So, modify the relevant functions to return an error in this case. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 15 ++++++++++----- fs/locks.c | 7 ++++++- include/linux/fs.h | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index b56d439bad82..9cfced65d4a2 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -227,25 +227,27 @@ failed: * It is the caller's responsibility to check whether the file * can be closed hereafter. */ -static void +static int nlmsvc_delete_block(struct nlm_block *block, int unlock) { struct file_lock *fl = &block->b_call.a_args.lock.fl; struct nlm_file *file = block->b_file; struct nlm_block **bp; + int status = 0; dprintk("lockd: deleting block %p...\n", block); /* Remove block from list */ nlmsvc_remove_block(block); - posix_unblock_lock(file->f_file, fl); + if (unlock) + status = posix_unblock_lock(file->f_file, fl); /* If the block is in the middle of a GRANT callback, * don't kill it yet. */ if (block->b_incall) { nlmsvc_insert_block(block, NLM_NEVER); block->b_done = 1; - return; + return status; } /* Remove block from file's list of blocks */ @@ -260,6 +262,7 @@ nlmsvc_delete_block(struct nlm_block *block, int unlock) nlm_release_host(block->b_host); nlmclnt_freegrantargs(&block->b_call); kfree(block); + return status; } /* @@ -270,6 +273,7 @@ int nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action) { struct nlm_block *block, *next; + /* XXX: Will everything get cleaned up if we don't unlock here? */ down(&file->f_sema); for (block = file->f_blocks; block; block = next) { @@ -439,6 +443,7 @@ u32 nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) { struct nlm_block *block; + int status = 0; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", file->f_file->f_dentry->d_inode->i_sb->s_id, @@ -449,9 +454,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) down(&file->f_sema); if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) - nlmsvc_delete_block(block, 1); + status = nlmsvc_delete_block(block, 1); up(&file->f_sema); - return nlm_granted; + return status ? nlm_lck_denied : nlm_granted; } /* diff --git a/fs/locks.c b/fs/locks.c index 75650d52fe60..fb32d6218e21 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1958,13 +1958,18 @@ EXPORT_SYMBOL(posix_block_lock); * * lockd needs to block waiting for locks. 
*/ -void +int posix_unblock_lock(struct file *filp, struct file_lock *waiter) { + int status = 0; + lock_kernel(); if (waiter->fl_next) __locks_delete_block(waiter); + else + status = -ENOENT; unlock_kernel(); + return status; } EXPORT_SYMBOL(posix_unblock_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 115e72be25d0..2c9c48d65630 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -760,7 +760,7 @@ extern struct file_lock *posix_test_lock(struct file *, struct file_lock *); extern int posix_lock_file(struct file *, struct file_lock *); extern int posix_lock_file_wait(struct file *, struct file_lock *); extern void posix_block_lock(struct file_lock *, struct file_lock *); -extern void posix_unblock_lock(struct file *, struct file_lock *); +extern int posix_unblock_lock(struct file *, struct file_lock *); extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags); -- cgit v1.2.3 From a659753ecc66945e9c69823fcbbe222b446c66d7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jan 2006 09:55:46 +0100 Subject: NLM: fix parsing of sm notify procedure The procedure that decodes the statd sm_notify call seems to be skipping a few arguments. How did this ever work? From folks at Polyserve. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/lockd/xdr4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index ae4d6b426c62..fdcf105a5303 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -354,7 +354,9 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp) return 0; argp->state = ntohl(*p++); /* Preserve the address in network byte order */ - argp->addr = *p++; + argp->addr = *p++; + argp->vers = *p++; + argp->proto = *p++; return xdr_argsize_check(rqstp, p); } -- cgit v1.2.3 From 03c21733938aad0758f5f88e1cc7ede69fc3c910 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 3 Jan 2006 09:55:48 +0100 Subject: NFSv3: try get_root user-supplied security_flavor Thanks to Ed Keizer for the bug report and root cause. He says: "... we could only mount the top-level Solaris share. We could not mount deeper into the tree. Investigation showed that Solaris allows UNIX authenticated FSINFO only on the top level of the share. This is a problem because we share/export our home directories one level higher than we mount them. I.e. we share the partition and not the individual home directories. This prevented access to home directories." We still may need to try auth_sys for the case where the client doesn't have appropriate credentials. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c172a7584646..ed67567f0556 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -68,26 +68,38 @@ nfs3_async_handle_jukebox(struct rpc_task *task) return 1; } -/* - * Bare-bones access to getattr: this is for nfs_read_super.
- */ static int -nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) +do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) { int status; dprintk("%s: call fsinfo\n", __FUNCTION__); nfs_fattr_init(info->fattr); - status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); + status = rpc_call(client, NFS3PROC_FSINFO, fhandle, info, 0); dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); if (!(info->fattr->valid & NFS_ATTR_FATTR)) { - status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); + status = rpc_call(client, NFS3PROC_GETATTR, fhandle, info->fattr, 0); dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); } return status; } +/* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +static int +nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_get_root(server->client, fhandle, info); + if (status && server->client_sys != server->client) + status = do_proc_get_root(server->client_sys, fhandle, info); + return status; +} + /* * One function for each procedure in the NFS protocol. */ -- cgit v1.2.3 From 35f5a422ce1af836007f811b613c440d0e348e06 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Jan 2006 09:55:50 +0100 Subject: SUNRPC: new interface to force an RPC rebind We'd like to hide fields in rpc_xprt and rpc_clnt from upper layer protocols. Start by creating an API to force RPC rebind, replacing logic that simply sets cl_port to zero. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. NFSv2/3 and NFSv4 mounting should be carefully checked. Probably need to rig a server where certain services aren't running, or that returns an error for some typical operation. 
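The conversion itself is mechanical; callers that used to reach into the client's transport state now use the helper (compare the lockd hunks below; the helper is deliberately a no-op unless the client set cl_autobind):

	/* before: poke at RPC client internals */
	clnt->cl_port = 0;

	/* after: opaque interface */
	rpc_force_rebind(clnt);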
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 4 ++-- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 21 +++++++++++++++------ 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index c4c8601096e0..82f7a0b1d8ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -177,7 +177,7 @@ nlm_bind_host(struct nlm_host *host) if ((clnt = host->h_rpcclnt) != NULL) { xprt = clnt->cl_xprt; if (time_after_eq(jiffies, host->h_nextrebind)) { - clnt->cl_port = 0; + rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; dprintk("lockd: next rebind in %ld jiffies\n", host->h_nextrebind - jiffies); @@ -217,7 +217,7 @@ nlm_rebind_host(struct nlm_host *host) { dprintk("lockd: rebind host %s\n", host->h_name); if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { - host->h_rpcclnt->cl_port = 0; + rpc_force_rebind(host->h_rpcclnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; } } diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b0ab959eca65..3d605765f84b 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -135,6 +135,7 @@ void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset); void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); size_t rpc_max_payload(struct rpc_clnt *); +void rpc_force_rebind(struct rpc_clnt *); int rpc_ping(struct rpc_clnt *clnt, int flags); static __inline__ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 25cba94c5683..2789d3083fe7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -538,6 +538,18 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) } EXPORT_SYMBOL(rpc_max_payload); +/** + * rpc_force_rebind - force transport to check that remote port is unchanged + * @clnt: client to rebind + * + */ +void rpc_force_rebind(struct rpc_clnt *clnt) +{ + if (clnt->cl_autobind) + clnt->cl_port = 0; +} +EXPORT_SYMBOL(rpc_force_rebind); + /* * Restart an (async) RPC call. Usually called from within the * exit handler. @@ -853,8 +865,7 @@ call_connect_status(struct rpc_task *task) } /* Something failed: remote service port may have changed */ - if (clnt->cl_autobind) - clnt->cl_port = 0; + rpc_force_rebind(clnt); switch (status) { case -ENOTCONN: @@ -935,8 +946,7 @@ call_status(struct rpc_task *task) break; case -ECONNREFUSED: case -ENOTCONN: - if (clnt->cl_autobind) - clnt->cl_port = 0; + rpc_force_rebind(clnt); task->tk_action = call_bind; break; case -EAGAIN: @@ -995,8 +1005,7 @@ call_timeout(struct rpc_task *task) printk(KERN_NOTICE "%s: server %s not responding, still trying\n", clnt->cl_protname, clnt->cl_server); } - if (clnt->cl_autobind) - clnt->cl_port = 0; + rpc_force_rebind(clnt); retry: clnt->cl_stats->rpcretrans++; -- cgit v1.2.3 From f518e35aec984036903c1003e867f833747a9d79 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Jan 2006 09:55:52 +0100 Subject: SUNRPC: get rid of cl_chatty Clean up: Every ULP that uses the in-kernel RPC client, except the NLM client, sets cl_chatty. There's no reason why NLM shouldn't set it, so just get rid of cl_chatty and always be verbose. Test-plan: Compile with CONFIG_NFS enabled. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 3 +-- fs/lockd/mon.c | 1 - fs/nfs/inode.c | 2 -- fs/nfs/mount_clnt.c | 1 - fs/nfsd/nfs4callback.c | 1 - include/linux/sunrpc/clnt.h | 1 - net/sunrpc/clnt.c | 10 ++++------ net/sunrpc/pmap_clnt.c | 1 - 8 files changed, 5 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 816333cd377b..145524039577 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -222,8 +222,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) goto done; } clnt->cl_softrtry = nfssrv->client->cl_softrtry; - clnt->cl_intr = nfssrv->client->cl_intr; - clnt->cl_chatty = nfssrv->client->cl_chatty; + clnt->cl_intr = nfssrv->client->cl_intr; } /* Keep the old signal mask */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 2d144abe84ad..0edc03e67966 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -123,7 +123,6 @@ nsm_create(void) if (IS_ERR(clnt)) goto out_err; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clnt->cl_oneshot = 1; return clnt; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 648cb1aef3b1..4625479a6b62 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -413,7 +413,6 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) clnt->cl_intr = 1; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; return clnt; @@ -1838,7 +1837,6 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, } clnt->cl_intr = 1; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clp->cl_rpcclient = clnt; memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); nfs_idmap_new(clp); diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 0e82617f2de0..db99b8f678f8 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -82,7 +82,6 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, RPC_AUTH_UNIX); if (!IS_ERR(clnt)) { clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clnt->cl_oneshot = 1; clnt->cl_intr = 1; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index cf92008f219a..d828662d737d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -431,7 +431,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) } clnt->cl_intr = 0; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; /* Kick rpciod, put the call on the wire. 
*/ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 3d605765f84b..f147e6b84332 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -49,7 +49,6 @@ struct rpc_clnt { unsigned int cl_softrtry : 1,/* soft timeouts */ cl_intr : 1,/* interruptible */ - cl_chatty : 1,/* be verbose */ cl_autobind : 1,/* use getport() */ cl_oneshot : 1,/* dispose after use */ cl_dead : 1;/* abandoned */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2789d3083fe7..5530ac8c6df9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -957,8 +957,7 @@ call_status(struct rpc_task *task) rpc_exit(task, status); break; default: - if (clnt->cl_chatty) - printk("%s: RPC call returned error %d\n", + printk("%s: RPC call returned error %d\n", clnt->cl_protname, -status); rpc_exit(task, status); break; @@ -993,14 +992,13 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); if (RPC_IS_SOFT(task)) { - if (clnt->cl_chatty) - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); return; } - if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { + if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; printk(KERN_NOTICE "%s: server %s not responding, still trying\n", clnt->cl_protname, clnt->cl_server); @@ -1027,7 +1025,7 @@ call_decode(struct rpc_task *task) dprintk("RPC: %4d call_decode (status %d)\n", task->tk_pid, task->tk_status); - if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { + if (task->tk_flags & RPC_CALL_MAJORSEEN) { printk(KERN_NOTICE "%s: server %s OK\n", clnt->cl_protname, clnt->cl_server); task->tk_flags &= ~RPC_CALL_MAJORSEEN; diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index 0935adb91b3c..8139ce68e915 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -217,7 +217,6 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg RPC_AUTH_UNIX); if (!IS_ERR(clnt)) { clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clnt->cl_oneshot = 1; } return clnt; -- cgit v1.2.3 From cf3fff54a46e1f8fa4cc1deb783172a392077eb0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:53 +0100 Subject: NFS: Send valid mode bits to the server inode->i_mode contains a lot more than just the mode bits. Make sure that we mask away this extra stuff in SETATTR calls to the server. 
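For example, for a setuid regular file (values in octal; S_IALLUGO is the existing kernel mask covering the setuid, setgid, sticky and rwx permission bits):

	umode_t mode = S_IFREG | S_ISUID | 0755;	/* i_mode == 0104755 */

	mode &= S_IALLUGO;	/* 04755: the S_IFREG file-type bits are gone */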
Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 2 +- fs/nfs/nfs4xdr.c | 2 +- fs/nfs/proc.c | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 0498bd36602c..b6c0b5012bce 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -182,7 +182,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr) { if (attr->ia_valid & ATTR_MODE) { *p++ = xdr_one; - *p++ = htonl(attr->ia_mode); + *p++ = htonl(attr->ia_mode & S_IALLUGO); } else { *p++ = xdr_zero; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 12be1d682164..4bbf5ef57785 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -566,7 +566,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode); + WRITE32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 6145e82b45e8..f5150d71c03d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -111,6 +111,9 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, }; int status; + /* Mask out the non-modebit related stuff from attr->ia_mode */ + sattr->ia_mode &= S_IALLUGO; + dprintk("NFS call setattr\n"); nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); -- cgit v1.2.3 From eadb8c147154bff982f02accf31b847a1f142ace Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:54 +0100 Subject: NFS: get rid of some needless code obfuscation in xdr_encode_sattr(). Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 59049e864ca7..7fc0560c89c9 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -146,23 +146,23 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) return p; } -#define SATTR(p, attr, flag, field) \ - *p++ = (attr->ia_valid & flag) ? htonl(attr->field) : ~(u32) 0 static inline u32 * xdr_encode_sattr(u32 *p, struct iattr *attr) { - SATTR(p, attr, ATTR_MODE, ia_mode); - SATTR(p, attr, ATTR_UID, ia_uid); - SATTR(p, attr, ATTR_GID, ia_gid); - SATTR(p, attr, ATTR_SIZE, ia_size); + const u32 not_set = __constant_htonl(0xFFFFFFFF); + + *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; + *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; + *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; + *p++ = (attr->ia_valid & ATTR_SIZE) ? 
htonl(attr->ia_size) : not_set; if (attr->ia_valid & ATTR_ATIME_SET) { p = xdr_encode_time(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { p = xdr_encode_current_server_time(p, &attr->ia_atime); } else { - *p++ = ~(u32) 0; - *p++ = ~(u32) 0; + *p++ = not_set; + *p++ = not_set; } if (attr->ia_valid & ATTR_MTIME_SET) { @@ -170,12 +170,11 @@ xdr_encode_sattr(u32 *p, struct iattr *attr) } else if (attr->ia_valid & ATTR_MTIME) { p = xdr_encode_current_server_time(p, &attr->ia_mtime); } else { - *p++ = ~(u32) 0; - *p++ = ~(u32) 0; + *p++ = not_set; + *p++ = not_set; } return p; } -#undef SATTR /* * NFS encode functions -- cgit v1.2.3 From 58df095b732529ade8f4051b41d7c29731afecd6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:57 +0100 Subject: NFSv4: Allow entries in the idmap cache to expire If someone changes the uid/gid mapping in userland, then we do eventually want those changes to be propagated to the kernel. Currently the kernel assumes that it may cache entries forever. Add an expiration time + garbage collector for idmap entries. Signed-off-by: Trond Myklebust --- Documentation/kernel-parameters.txt | 4 ++++ fs/nfs/idmap.c | 9 +++++++++ fs/nfs/inode.c | 14 ++++++++++++++ fs/nfs/sysctl.c | 10 ++++++++++ include/linux/nfs_idmap.h | 2 ++ 5 files changed, 39 insertions(+) (limited to 'fs') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 309c9cec6e7c..a482fde09bbb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -914,6 +914,10 @@ running once the system is up. [NFS] set the TCP port on which the NFSv4 callback channel should listen. + nfs.idmap_cache_timeout= + [NFS] set the maximum lifetime for idmapper cache + entries. 
+ nmi_watchdog= [KNL,BUGS=IA-32] Debugging features for SMP kernels no387 [BUGS=IA-32] Tells the kernel to use the 387 maths diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index ffb8df91dc34..821edd30333b 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -54,7 +54,11 @@ #define IDMAP_HASH_SZ 128 +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600 * HZ; + struct idmap_hashent { + unsigned long ih_expires; __u32 ih_id; int ih_namelen; char ih_name[IDMAP_NAMESZ]; @@ -149,6 +153,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; return he; } @@ -164,6 +170,8 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id) struct idmap_hashent *he = idmap_id_hash(h, id); if (he->ih_id != id || he->ih_namelen == 0) return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; return he; } @@ -192,6 +200,7 @@ idmap_update_entry(struct idmap_hashent *he, const char *name, memcpy(he->ih_name, name, namelen); he->ih_name[namelen] = '\0'; he->ih_namelen = namelen; + he->ih_expires = jiffies + nfs_idmap_cache_timeout; } /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4625479a6b62..e7bd0d92600f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2050,6 +2050,20 @@ static int param_set_port(const char *val, struct kernel_param *kp) module_param_call(callback_tcpport, param_set_port, param_get_int, &nfs_callback_set_tcpport, 0644); +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); + #define nfs4_init_once(nfsi) \ do { \ INIT_LIST_HEAD(&(nfsi)->open_states); \ diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index fdc64b59a4ee..4c486eb867ca 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "callback.h" @@ -35,6 +36,15 @@ static ctl_table nfs_cb_sysctls[] = { .extra1 = (int *)&nfs_set_port_min, .extra2 = (int *)&nfs_set_port_max, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, #endif { .ctl_name = 0 } }; diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index a0f1f25e0ead..102e56094296 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -71,6 +71,8 @@ int nfs_map_name_to_uid(struct nfs4_client *, const char *, size_t, __u32 *); int nfs_map_group_to_gid(struct nfs4_client *, const char *, size_t, __u32 *); int nfs_map_uid_to_name(struct nfs4_client *, __u32, char *); int nfs_map_gid_to_group(struct nfs4_client *, __u32, char *); + +extern unsigned int nfs_idmap_cache_timeout; #endif /* __KERNEL__ */ #endif /* NFS_IDMAP_H */ -- cgit v1.2.3 From 26c78e156b1d1b2387ec33b5f2fb62d6e0a186a3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Jan 2006 09:55:58 +0100 Subject: NFSv4: Fix an Oops in nfs_do_expire_all_delegations If the loop errors, we need to exit. 
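The idiom being repaired, sketched below in a simplified, hypothetical helper (signal handling and error paths elided): once cl_lock is dropped to return one delegation, the list cursor can no longer be trusted, so the scan must restart from the head unconditionally, not just on success:

	/* sketch of the lock-drop/restart scan pattern */
	static void expire_all(struct nfs4_client *clp)
	{
		struct nfs_delegation *delegation;
		struct inode *inode;
	restart:
		spin_lock(&clp->cl_lock);
		list_for_each_entry(delegation, &clp->cl_delegations, super_list) {
			inode = igrab(delegation->inode);
			if (inode == NULL)
				continue;
			spin_unlock(&clp->cl_lock);
			nfs_inode_return_delegation(inode);
			iput(inode);
			goto restart;	/* cursor is stale once the lock was dropped */
		}
		spin_unlock(&clp->cl_lock);
	}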
Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 66cc720e3927..c6f07c1c71e6 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -236,7 +236,6 @@ int nfs_do_expire_all_delegations(void *ptr) struct nfs4_client *clp = ptr; struct nfs_delegation *delegation; struct inode *inode; - int err = 0; allow_signal(SIGKILL); restart: @@ -250,10 +249,9 @@ restart: if (inode == NULL) continue; spin_unlock(&clp->cl_lock); - err = nfs_inode_return_delegation(inode); + nfs_inode_return_delegation(inode); iput(inode); - if (!err) - goto restart; + goto restart; } out: spin_unlock(&clp->cl_lock); -- cgit v1.2.3 From 221fc10ec89834329e5613e3cab4569ba22da410 Mon Sep 17 00:00:00 2001 From: Evgeniy Date: Fri, 6 Jan 2006 21:18:01 +0300 Subject: [PATCH] fs/ufs: debug mode compilation failure This patch fixes a compilation failure in fs/ufs/dir.c when UFS_DIR_DEBUG is defined. Signed-off-by: Evgeniy Dushistov Signed-off-by: Linus Torvalds --- fs/ufs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index d0915fba155a..7c10c68902ae 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -491,7 +491,7 @@ int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir, UFSD(("ino %u, reclen %u, namlen %u, name %s\n", fs32_to_cpu(sb, de->d_ino), - fs16to_cpu(sb, de->d_reclen), + fs16_to_cpu(sb, de->d_reclen), ufs_get_de_namlen(sb, de), de->d_name)) while (i < bh->b_size) { -- cgit v1.2.3 From 9d0243bca345d5ce25d3f4b74b7facb3a6df1232 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:39 -0800 Subject: [PATCH] drop-pagecache Add /proc/sys/vm/drop_caches. When written to, this will cause the kernel to discard as much pagecache and/or reclaimable slab objects as it can. This operation requires root permissions. It won't drop dirty data, so the user should run `sync' first. Caveats: a) Holds inode_lock for exorbitant amounts of time. b) Needs to be taught about NUMA nodes: propagate these all the way through so the discarding can be controlled on a per-node basis. This is a debugging feature: useful for getting consistent results between filesystem benchmarks. We could possibly put it under a config option, but it's less than 300 bytes. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 17 ++++++++++ Documentation/sysctl/vm.txt | 3 +- fs/Makefile | 2 +- fs/drop_caches.c | 68 ++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 7 ++++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 10 ++++++ mm/truncate.c | 1 - mm/vmscan.c | 3 +- 9 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 fs/drop_caches.c (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index d4773565ea2f..a4dcf42c2fd9 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1302,6 +1302,23 @@ VM has token based thrashing control mechanism and uses the token to prevent unnecessary page faults in thrashing situation. The unit of the value is second. The value would be useful to tune thrashing behavior. +drop_caches +----------- + +Writing to this will cause the kernel to drop clean caches, dentries and +inodes from memory, causing that memory to become free.
+ +To free pagecache: + echo 1 > /proc/sys/vm/drop_caches +To free dentries and inodes: + echo 2 > /proc/sys/vm/drop_caches +To free pagecache, dentries and inodes: + echo 3 > /proc/sys/vm/drop_caches + +As this is a non-destructive operation and dirty objects are not freeable, the +user should run `sync' first. + + 2.5 /proc/sys/dev - Device specific parameters ---------------------------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 2f1aae32a5d9..89ba1a42a17d 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -26,12 +26,13 @@ Currently, these files are in /proc/sys/vm: - min_free_kbytes - laptop_mode - block_dump +- drop-caches ============================================================== dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, -block_dump, swap_token_timeout: +block_dump, swap_token_timeout, drop-caches: See Documentation/filesystems/proc.txt diff --git a/fs/Makefile b/fs/Makefile index 73676111ebbe..35e9aec608e4 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ - ioprio.o pnode.o + ioprio.o pnode.o drop_caches.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/drop_caches.c b/fs/drop_caches.c new file mode 100644 index 000000000000..4e4762389bdc --- /dev/null +++ b/fs/drop_caches.c @@ -0,0 +1,68 @@ +/* + * Implement the manual drop-all-pagecache function + */ + +#include +#include +#include +#include +#include +#include + +/* A global variable is a bit ugly, but it keeps the code simple */ +int sysctl_drop_caches; + +static void drop_pagecache_sb(struct super_block *sb) +{ + struct inode *inode; + + spin_lock(&inode_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + invalidate_inode_pages(inode->i_mapping); + } + spin_unlock(&inode_lock); +} + +void drop_pagecache(void) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + drop_pagecache_sb(sb); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); +} + +void drop_slab(void) +{ + int nr_objects; + + do { + nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + } while (nr_objects > 10); +} + +int drop_caches_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (write) { + if (sysctl_drop_caches & 1) + drop_pagecache(); + if (sysctl_drop_caches & 2) + drop_slab(); + } + return 0; +} diff --git a/include/linux/mm.h b/include/linux/mm.h index bc01fff3aa01..83c651f25188 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1036,5 +1036,12 @@ int in_gate_area_no_task(unsigned long addr); /* /proc//oom_adj set to -17 protects from the oom-killer */ #define OOM_DISABLE -17 +int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +int shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long 
lru_pages); +void drop_pagecache(void); +void drop_slab(void); + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a9b80fc7f0f3..4cd267fe87ec 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -180,6 +180,7 @@ enum VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ + VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a85047bb5739..8dcf6fd5b0f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,7 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int sysctl_drop_caches; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -774,6 +775,15 @@ static ctl_table vm_table[] = { .proc_handler = &lowmem_reserve_ratio_sysctl_handler, .strategy = &sysctl_intvec, }, + { + .ctl_name = VM_DROP_PAGECACHE, + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .strategy = &sysctl_intvec, + }, { .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", diff --git a/mm/truncate.c b/mm/truncate.c index 7dee32745901..b1a463d0fe71 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -249,7 +249,6 @@ unlock: break; } pagevec_release(&pvec); - cond_resched(); } return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index be8235fb1939..428c5801d4b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) { struct shrinker *shrinker; int ret = 0; -- cgit v1.2.3 From 1a75a6c825c17249ca49f050a872a04ce0997ce3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:01:02 -0800 Subject: [PATCH] Fold numa_maps into mempolicies.c First discussed at http://marc.theaimsgroup.com/?t=113149255100001&r=1&w=2 - Use the check_range() in mempolicy.c to gather statistics. - Improve the numa_maps code in general and fix some comments. 
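For reference, a record emitted by the reworked show_numa_map() has this shape, one line per VMA (the values here are invented for illustration): the VMA start address, the policy rendered by mpol_to_str(), the gathered counters, and one N<node>=<pages> pair per node holding pages:

	00400000 default pages=210 mapped=210 maxref=43 anon=3 N0=208 N1=2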
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 127 ++---------------------------------------------- mm/mempolicy.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 50bd5a8f0446..0eaad41f4658 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -390,129 +390,12 @@ struct seq_operations proc_pid_smaps_op = { }; #ifdef CONFIG_NUMA - -struct numa_maps { - unsigned long pages; - unsigned long anon; - unsigned long mapped; - unsigned long mapcount_max; - unsigned long node[MAX_NUMNODES]; -}; - -/* - * Calculate numa node maps for a vma - */ -static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) -{ - int i; - struct page *page; - unsigned long vaddr; - struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); - - if (!md) - return NULL; - md->pages = 0; - md->anon = 0; - md->mapped = 0; - md->mapcount_max = 0; - for_each_node(i) - md->node[i] =0; - - for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(vma, vaddr, 0); - if (page) { - int count = page_mapcount(page); - - if (count) - md->mapped++; - if (count > md->mapcount_max) - md->mapcount_max = count; - md->pages++; - if (PageAnon(page)) - md->anon++; - md->node[page_to_nid(page)]++; - } - cond_resched(); - } - return md; -} - -static int show_numa_map(struct seq_file *m, void *v) -{ - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - struct mempolicy *pol; - struct numa_maps *md; - struct zone **z; - int n; - int first; - - if (!vma->vm_mm) - return 0; - - md = get_numa_maps(vma); - if (!md) - return 0; - - seq_printf(m, "%08lx", vma->vm_start); - pol = get_vma_policy(task, vma, vma->vm_start); - /* Print policy */ - switch (pol->policy) { - case MPOL_PREFERRED: - seq_printf(m, " prefer=%d", pol->v.preferred_node); - break; - case MPOL_BIND: - seq_printf(m, " bind={"); - first = 1; - for (z = pol->v.zonelist->zones; *z; z++) { - - if (!first) - seq_putc(m, ','); - else - first = 0; - seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id, - (*z)->name); - } - seq_putc(m, '}'); - break; - case MPOL_INTERLEAVE: - seq_printf(m, " interleave={"); - first = 1; - for_each_node(n) { - if (node_isset(n, pol->v.nodes)) { - if (!first) - seq_putc(m,','); - else - first = 0; - seq_printf(m, "%d",n); - } - } - seq_putc(m, '}'); - break; - default: - seq_printf(m," default"); - break; - } - seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu", - md->mapcount_max, md->pages, md->mapped); - if (md->anon) - seq_printf(m," Anon=%lu",md->anon); - - for_each_online_node(n) { - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); - } - seq_putc(m, '\n'); - kfree(md); - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? 
vma->vm_start : 0; - return 0; -} +extern int show_numa_map(struct seq_file *m, void *v); struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map }; #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 270e9a39ec15..44b9d69900bc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -84,6 +84,8 @@ #include #include #include +#include +#include #include #include @@ -91,6 +93,7 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -228,6 +231,8 @@ static void migrate_page_add(struct vm_area_struct *vma, } } +static void gather_stats(struct page *, void *); + /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -252,7 +257,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_STATS) + gather_stats(page, private); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(vma, page, private, flags); else break; @@ -1460,3 +1467,132 @@ void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) { rebind_policy(current->mempolicy, old, new); } + +/* + * Display pages allocated per node and memory policy via /proc. + */ + +static const char *policy_types[] = { "default", "prefer", "bind", + "interleave" }; + +/* + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + int l; + nodemask_t nodes; + int mode = pol ? 
pol->policy : MPOL_DEFAULT; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + get_zonemask(pol, &nodes); + break; + + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + + default: + BUG(); + return -EFAULT; + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long mapped; + unsigned long mapcount_max; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + if (count) + md->mapped++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->pages++; + + if (PageAnon(page)) + md->anon++; + + md->node[page_to_nid(page)]++; + cond_resched(); +} + +int show_numa_map(struct seq_file *m, void *v) +{ + struct task_struct *task = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + int n; + char buffer[50]; + + if (!vma->vm_mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + + if (md->pages) { + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", + vma->vm_start, buffer, md->pages, + md->mapped, md->mapcount_max); + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); + + seq_putc(m, '\n'); + } + kfree(md); + + if (m->count < m->size) + m->version = (vma != get_gate_vma(task)) ? 
vma->vm_start : 0; + return 0; +} + -- cgit v1.2.3 From 2919b51075b3906c2f476e5a932244af1947bf80 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 8 Jan 2006 01:01:16 -0800 Subject: [PATCH] frv: suppress configuration of certain features for FRV Suppress configuration of certain features for the FRV arch as they can't be built for FRV at the moment: (*) RTC (*) HISAX_* (*) PARPORT_PC (*) VGA_CONSOLE (*) BINFMT_ELF Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/Kconfig | 4 ++-- drivers/isdn/hisax/Kconfig | 10 +++++----- drivers/parport/Kconfig | 2 +- drivers/video/console/Kconfig | 2 +- fs/Kconfig.binfmt | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 5ebd06b1b4ca..b1b34edcd70c 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -687,7 +687,7 @@ config NVRAM config RTC tristate "Enhanced Real Time Clock Support" - depends on !PPC32 && !PARISC && !IA64 && !M68K && (!SPARC || PCI) + depends on !PPC32 && !PARISC && !IA64 && !M68K && (!SPARC || PCI) && !FRV ---help--- If you say Y here and create a character special file /dev/rtc with major number 10 and minor number 135 using mknod ("man mknod"), you @@ -735,7 +735,7 @@ config SGI_IP27_RTC config GEN_RTC tristate "Generic /dev/rtc emulation" - depends on RTC!=y && !IA64 && !ARM && !M32R && !SPARC + depends on RTC!=y && !IA64 && !ARM && !M32R && !SPARC && !FRV ---help--- If you say Y here and create a character special file /dev/rtc with major number 10 and minor number 135 using mknod ("man mknod"), you diff --git a/drivers/isdn/hisax/Kconfig b/drivers/isdn/hisax/Kconfig index c82105920d71..0ef560144be3 100644 --- a/drivers/isdn/hisax/Kconfig +++ b/drivers/isdn/hisax/Kconfig @@ -110,7 +110,7 @@ config HISAX_16_3 config HISAX_TELESPCI bool "Teles PCI" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the Teles PCI. See on how to configure it. @@ -238,7 +238,7 @@ config HISAX_MIC config HISAX_NETJET bool "NETjet card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the NetJet from Traverse Technologies. @@ -249,7 +249,7 @@ config HISAX_NETJET config HISAX_NETJET_U bool "NETspider U card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the Netspider U interface ISDN card from Traverse Technologies. @@ -317,7 +317,7 @@ config HISAX_GAZEL config HISAX_HFC_PCI bool "HFC PCI-Bus cards" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the HFC-S PCI 2BDS0 based cards. @@ -344,7 +344,7 @@ config HISAX_HFC_SX config HISAX_ENTERNOW_PCI bool "Formula-n enter:now PCI card" - depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K)) + depends on PCI && (BROKEN || !(SPARC || PPC || PARISC || M68K || FRV)) help This enables HiSax support for the Formula-n enter:now PCI ISDN card. 
diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig index b8241561da45..a665951b1586 100644 --- a/drivers/parport/Kconfig +++ b/drivers/parport/Kconfig @@ -34,7 +34,7 @@ config PARPORT config PARPORT_PC tristate "PC-style hardware" - depends on PARPORT && (!SPARC64 || PCI) && !SPARC32 && !M32R + depends on PARPORT && (!SPARC64 || PCI) && !SPARC32 && !M32R && !FRV ---help--- You should say Y here if you have a PC-style parallel port. All IBM PC compatible computers and some Alphas have PC-style diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index a5d09e159cd1..6ee449858a5c 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -6,7 +6,7 @@ menu "Console display driver support" config VGA_CONSOLE bool "VGA text console" if EMBEDDED || !X86 - depends on !ARCH_ACORN && !ARCH_EBSA110 && !4xx && !8xx && !SPARC && !M68K && !PARISC && !ARCH_VERSATILE + depends on !ARCH_ACORN && !ARCH_EBSA110 && !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && !ARCH_VERSATILE default y help Saying Y here will allow you to use Linux in text mode through a diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 175b2e8177c1..f3d3d81eb7e9 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU + depends on MMU && (BROKEN || !FRV) default y ---help--- ELF (Executable and Linkable Format) is a format for libraries and -- cgit v1.2.3 From e56d090310d7625ecb43a1eeebd479f04affb48b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 8 Jan 2006 01:01:37 -0800 Subject: [PATCH] RCU signal handling RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked instead of tasklist_lock read-locked. This is a scalability improvement on SMP and a preemption-latency improvement under PREEMPT_RCU. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Acked-by: William Irwin Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +-- include/linux/sched.h | 32 +++++++++++++++-- kernel/exit.c | 1 - kernel/fork.c | 10 +++++- kernel/pid.c | 22 ++++++------ kernel/rcupdate.c | 1 + kernel/sched.c | 7 ++++ kernel/signal.c | 97 +++++++++++++++++++++++++++++++++++++++++++-------- 8 files changed, 143 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e75a9548da8e..e9650cd22a3b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -760,7 +760,7 @@ no_thread_group: spin_lock(&oldsighand->siglock); spin_lock(&newsighand->siglock); - current->sighand = newsighand; + rcu_assign_pointer(current->sighand, newsighand); recalc_sigpending(); spin_unlock(&newsighand->siglock); @@ -768,7 +768,7 @@ no_thread_group: write_unlock_irq(&tasklist_lock); if (atomic_dec_and_test(&oldsighand->count)) - kmem_cache_free(sighand_cachep, oldsighand); + sighand_free(oldsighand); } BUG_ON(!thread_group_leader(current)); diff --git a/include/linux/sched.h b/include/linux/sched.h index a74662077d60..a6af77e9b4cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include #include /* For AT_VECTOR_SIZE */ @@ -350,8 +351,16 @@ struct sighand_struct { atomic_t count; struct k_sigaction action[_NSIG]; spinlock_t siglock; + struct rcu_head rcu; }; +extern void sighand_free_cb(struct rcu_head *rhp); + +static inline void sighand_free(struct sighand_struct *sp) +{ + call_rcu(&sp->rcu, sighand_free_cb); +} + /* * NOTE! 
"signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -844,6 +853,7 @@ struct task_struct { int cpuset_mems_generation; #endif atomic_t fs_excl; /* holding fs exclusive resources */ + struct rcu_head rcu; }; static inline pid_t process_group(struct task_struct *tsk) @@ -867,8 +877,26 @@ static inline int pid_alive(struct task_struct *p) extern void free_task(struct task_struct *tsk); extern void __put_task_struct(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) -#define put_task_struct(tsk) \ -do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) + +static inline int get_task_struct_rcu(struct task_struct *t) +{ + int oldusage; + + do { + oldusage = atomic_read(&t->usage); + if (oldusage == 0) + return 0; + } while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage); + return 1; +} + +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} /* * Per process flags diff --git a/kernel/exit.c b/kernel/exit.c index ee515683b92d..c73a7eb26de3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,7 +72,6 @@ repeat: __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - __exit_sighand(p); /* * Note that the fastpath in sys_times depends on __exit_signal having * updated the counters before a task is removed from the tasklist of diff --git a/kernel/fork.c b/kernel/fork.c index fb8572a42297..7fe3adfa65cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -743,6 +743,14 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); +void sighand_free_cb(struct rcu_head *rhp) +{ + struct sighand_struct *sp; + + sp = container_of(rhp, struct sighand_struct, rcu); + kmem_cache_free(sighand_cachep, sp); +} + static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - tsk->sighand = sig; + rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; spin_lock_init(&sig->siglock); diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681ac..1acc07246991 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) struct hlist_node *elem; struct pid *pid; - hlist_for_each_entry(pid, elem, + hlist_for_each_entry_rcu(pid, elem, &pid_hash[type][pid_hashfn(nr)], pid_chain) { if (pid->nr == nr) return pid; @@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) task_pid = &task->pids[type]; pid = find_pid(type, nr); + task_pid->nr = nr; if (pid == NULL) { - hlist_add_head(&task_pid->pid_chain, - &pid_hash[type][pid_hashfn(nr)]); INIT_LIST_HEAD(&task_pid->pid_list); + hlist_add_head_rcu(&task_pid->pid_chain, + &pid_hash[type][pid_hashfn(nr)]); } else { INIT_HLIST_NODE(&task_pid->pid_chain); - list_add_tail(&task_pid->pid_list, &pid->pid_list); + list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); } - task_pid->nr = nr; return 0; } @@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) pid = &task->pids[type]; if (!hlist_unhashed(&pid->pid_chain)) { - hlist_del(&pid->pid_chain); - if (list_empty(&pid->pid_list)) + if (list_empty(&pid->pid_list)) 
{ nr = pid->nr; - else { + hlist_del_rcu(&pid->pid_chain); + } else { pid_next = list_entry(pid->pid_list.next, struct pid, pid_list); /* insert next pid from pid_list to hash */ - hlist_add_head(&pid_next->pid_chain, - &pid_hash[type][pid_hashfn(pid_next->nr)]); + hlist_replace_rcu(&pid->pid_chain, + &pid_next->pid_chain); } } - list_del(&pid->pid_list); + list_del_rcu(&pid->pid_list); pid->nr = 0; return nr; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c9afc61240e4..0a669bd2f6d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched.c b/kernel/sched.c index 6f46c94cc29e..92733091154c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) +void __put_task_struct_cb(struct rcu_head *rhp) +{ + __put_task_struct(container_of(rhp, struct task_struct, rcu)); +} + +EXPORT_SYMBOL_GPL(__put_task_struct_cb); + /* * These are the runqueue data structures: */ diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189ef7..64737c72dadd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk) /* Ok, we're done with the signal handlers */ tsk->sighand = NULL; if (atomic_dec_and_test(&sighand->count)) - kmem_cache_free(sighand_cachep, sighand); + sighand_free(sighand); } void exit_sighand(struct task_struct *tsk) { write_lock_irq(&tasklist_lock); - __exit_sighand(tsk); + rcu_read_lock(); + if (tsk->sighand != NULL) { + struct sighand_struct *sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + __exit_sighand(tsk); + spin_unlock(&sighand->siglock); + } + rcu_read_unlock(); write_unlock_irq(&tasklist_lock); } @@ -345,12 +352,14 @@ void exit_sighand(struct task_struct *tsk) void __exit_signal(struct task_struct *tsk) { struct signal_struct * sig = tsk->signal; - struct sighand_struct * sighand = tsk->sighand; + struct sighand_struct * sighand; if (!sig) BUG(); if (!atomic_read(&sig->count)) BUG(); + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { @@ -358,6 +367,7 @@ void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); tsk->signal = NULL; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); flush_sigqueue(&sig->shared_pending); } else { @@ -389,9 +399,11 @@ void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); sig = NULL; /* Marker for below. */ } + rcu_read_unlock(); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); flush_sigqueue(&tsk->pending); if (sig) { @@ -1080,18 +1092,28 @@ void zap_other_threads(struct task_struct *p) } /* - * Must be called with the tasklist_lock held for reading! + * Must be called under rcu_read_lock() or with tasklist_lock read-held. 
*/ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; + struct sighand_struct *sp; int ret; +retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && p->sighand) { - spin_lock_irqsave(&p->sighand->siglock, flags); + if (!ret && sig && (sp = p->sighand)) { + if (!get_task_struct_rcu(p)) + return -ESRCH; + spin_lock_irqsave(&sp->siglock, flags); + if (p->sighand != sp) { + spin_unlock_irqrestore(&sp->siglock, flags); + put_task_struct(p); + goto retry; + } ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sp->siglock, flags); + put_task_struct(p); } return ret; @@ -1136,14 +1158,21 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; + int acquired_tasklist_lock = 0; struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); + if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { + read_lock(&tasklist_lock); + acquired_tasklist_lock = 1; + } p = find_task_by_pid(pid); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); + if (unlikely(acquired_tasklist_lock)) + read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -1355,16 +1384,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) { unsigned long flags; int ret = 0; + struct sighand_struct *sh; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + + /* + * The rcu based delayed sighand destroy makes it possible to + * run this without tasklist lock held. The task struct itself + * cannot go away as create_timer did get_task_struct(). + * + * We return -1, when the task is marked exiting, so + * posix_timer_event can redirect it to the group leader + */ + rcu_read_lock(); if (unlikely(p->flags & PF_EXITING)) { ret = -1; goto out_err; } - spin_lock_irqsave(&p->sighand->siglock, flags); +retry: + sh = rcu_dereference(p->sighand); + + spin_lock_irqsave(&sh->siglock, flags); + if (p->sighand != sh) { + /* We raced with exec() in a multithreaded process... */ + spin_unlock_irqrestore(&sh->siglock, flags); + goto retry; + } + + /* + * We do the check here again to handle the following scenario: + * + * CPU 0 CPU 1 + * send_sigqueue + * check PF_EXITING + * interrupt exit code running + * __exit_signal + * lock sighand->siglock + * unlock sighand->siglock + * lock sh->siglock + * add(tsk->pending) flush_sigqueue(tsk->pending) + * + */ + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out; + } if (unlikely(!list_empty(&q->list))) { /* @@ -1388,9 +1455,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) signal_wake_up(p, sig == SIGKILL); out: - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sh->siglock, flags); out_err: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -1402,7 +1469,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) int ret = 0; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + read_lock(&tasklist_lock); + /* Since it_lock is held, p->sighand cannot be NULL. 
*/ spin_lock_irqsave(&p->sighand->siglock, flags); handle_stop_signal(sig, p); @@ -1436,7 +1505,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); read_unlock(&tasklist_lock); - return(ret); + return ret; } /* -- cgit v1.2.3 From 10cef6029502915bdb3cf0821d425cf9dc30c817 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:01:45 -0800 Subject: [PATCH] slob: introduce the SLOB allocator configurable replacement for slab allocator This adds a CONFIG_SLAB option under CONFIG_EMBEDDED. When CONFIG_SLAB is disabled, the kernel falls back to using the 'SLOB' allocator. SLOB is a traditional K&R/UNIX allocator with a SLAB emulation layer, similar to the original Linux kmalloc allocator that SLAB replaced. It's significantly smaller code and is more memory efficient. But like all similar allocators, it scales poorly and suffers from fragmentation more than SLAB, so it's only appropriate for small systems. It's been tested extensively in the Linux-tiny tree. I've also stress-tested it with make -j 8 compiles on a 3G SMP+PREEMPT box (not recommended). Here's a comparison for otherwise identical builds, showing SLOB saving nearly half a megabyte of RAM:

$ size vmlinux*
   text    data     bss     dec     hex filename
3336372  529360  190812 4056544  3de5e0 vmlinux-slab
3323208  527948  190684 4041840  3dac70 vmlinux-slob

$ size mm/{slab,slob}.o
   text    data     bss     dec     hex filename
  13221     752      48   14021    36c5 mm/slab.o
   1896      52       8    1956     7a4 mm/slob.o

/proc/meminfo:
                  SLAB        SLOB      delta
MemTotal:        27964 kB    27980 kB    +16 kB
MemFree:         24596 kB    25092 kB   +496 kB
Buffers:            36 kB       36 kB      0 kB
Cached:           1188 kB     1188 kB      0 kB
SwapCached:          0 kB        0 kB      0 kB
Active:            608 kB      600 kB     -8 kB
Inactive:          808 kB      812 kB     +4 kB
HighTotal:           0 kB        0 kB      0 kB
HighFree:            0 kB        0 kB      0 kB
LowTotal:        27964 kB    27980 kB    +16 kB
LowFree:         24596 kB    25092 kB   +496 kB
SwapTotal:           0 kB        0 kB      0 kB
SwapFree:            0 kB        0 kB      0 kB
Dirty:               4 kB       12 kB     +8 kB
Writeback:           0 kB        0 kB      0 kB
Mapped:            560 kB      556 kB     -4 kB
Slab:             1756 kB        0 kB  -1756 kB
CommitLimit:     13980 kB    13988 kB     +8 kB
Committed_AS:     4208 kB     4208 kB      0 kB
PageTables:         28 kB       28 kB      0 kB
VmallocTotal:  1007312 kB  1007312 kB      0 kB
VmallocUsed:        48 kB       48 kB      0 kB
VmallocChunk:  1007264 kB  1007264 kB      0 kB

(this work has been sponsored in part by CELF) From: Ingo Molnar Fix 32-bitness bugs in mm/slob.c.
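To actually opt into SLOB, the relevant .config fragment would look like this (a sketch based on the Kconfig defaults added below: SLAB defaults to y and is only deselectable under EMBEDDED, and SLOB is defined as !SLAB):

	CONFIG_EMBEDDED=y
	# CONFIG_SLAB is not set
	CONFIG_SLOB=y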
Signed-off-by: Matt Mackall Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 4 + include/linux/slab.h | 35 +++++ init/Kconfig | 13 ++ mm/Makefile | 4 +- mm/slob.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 440 insertions(+), 1 deletion(-) create mode 100644 mm/slob.c (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 5b6b0b6038a7..63bf6c00fa0c 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -323,6 +323,7 @@ static struct file_operations proc_modules_operations = { }; #endif +#ifdef CONFIG_SLAB extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -336,6 +337,7 @@ static struct file_operations proc_slabinfo_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif static int show_stat(struct seq_file *p, void *v) { @@ -600,7 +602,9 @@ void __init proc_misc_init(void) create_seq_entry("partitions", 0, &proc_partitions_operations); create_seq_entry("stat", 0, &proc_stat_operations); create_seq_entry("interrupts", 0, &proc_interrupts_operations); +#ifdef CONFIG_SLAB create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); +#endif create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); diff --git a/include/linux/slab.h b/include/linux/slab.h index d1ea4051b996..1fb77a9cc148 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -53,6 +53,8 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ #define SLAB_CTOR_VERIFY 0x004UL /* tell constructor it's a verify call */ +#ifndef CONFIG_SLOB + /* prototypes */ extern void __init kmem_cache_init(void); @@ -134,6 +136,39 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) extern int FASTCALL(kmem_cache_reap(int)); extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)); +#else /* CONFIG_SLOB */ + +/* SLOB allocator routines */ + +void kmem_cache_init(void); +struct kmem_cache *kmem_find_general_cachep(size_t, gfp_t gfpflags); +struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t, + unsigned long, + void (*)(void *, struct kmem_cache *, unsigned long), + void (*)(void *, struct kmem_cache *, unsigned long)); +int kmem_cache_destroy(struct kmem_cache *c); +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags); +void kmem_cache_free(struct kmem_cache *c, void *b); +const char *kmem_cache_name(struct kmem_cache *); +void *kmalloc(size_t size, gfp_t flags); +void *kzalloc(size_t size, gfp_t flags); +void kfree(const void *m); +unsigned int ksize(const void *m); +unsigned int kmem_cache_size(struct kmem_cache *c); + +static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +{ + return kzalloc(n * size, flags); +} + +#define kmem_cache_shrink(d) (0) +#define kmem_cache_reap(a) +#define kmem_ptr_validate(a, b) (0) +#define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f) +#define kmalloc_node(s, f, n) kmalloc(s, f) + +#endif /* CONFIG_SLOB */ + /* System wide caches */ extern kmem_cache_t *vm_area_cachep; extern kmem_cache_t *names_cachep; diff --git a/init/Kconfig b/init/Kconfig index ba42f3793a84..0c9932f9f06b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -380,6 +380,15 @@ 
config CC_ALIGN_JUMPS no dummy operations need be executed. Zero means use compiler's default. +config SLAB + default y + bool "Use full SLAB allocator" if EMBEDDED + help + Disabling this replaces the advanced SLAB allocator and + kmalloc support with the drastically simpler SLOB allocator. + SLOB is more space efficient but does not scale well and is + more susceptible to fragmentation. + endmenu # General setup config TINY_SHMEM @@ -391,6 +400,10 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config SLOB + default !SLAB + bool + menu "Loadable module support" config MODULES diff --git a/mm/Makefile b/mm/Makefile index 74c85ddc9176..9aa03fa1dcc3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ - readahead.o slab.o swap.o truncate.o vmscan.o \ + readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o @@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 000000000000..1c240c4b71d9 --- /dev/null +++ b/mm/slob.c @@ -0,0 +1,385 @@ +/* + * SLOB Allocator: Simple List Of Blocks + * + * Matt Mackall 12/30/03 + * + * How SLOB works: + * + * The core of SLOB is a traditional K&R style heap allocator, with + * support for returning aligned objects. The granularity of this + * allocator is 8 bytes on x86, though it's perhaps possible to reduce + * this to 4 if it's deemed worth the effort. The slob heap is a + * singly-linked list of pages from __get_free_page, grown on demand + * and allocation from the heap is currently first-fit. + * + * Above this is an implementation of kmalloc/kfree. Blocks returned + * from kmalloc are 8-byte aligned and prepended with a 8-byte header. + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * __get_free_pages directly so that it can return page-aligned blocks + * and keeps a linked list of such pages and their orders. These + * objects are detected in kfree() by their page alignment. + * + * SLAB is emulated on top of SLOB by simply calling constructors and + * destructors for every SLAB allocation. Objects are returned with + * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is + * set, in which case the low-level allocator will fragment blocks to + * create the proper alignment. Again, objects of page-size or greater + * are allocated by calling __get_free_pages. As SLAB objects know + * their size, no separate size bookkeeping is necessary and there is + * essentially no allocation space overhead. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +struct slob_block { + int units; + struct slob_block *next; +}; +typedef struct slob_block slob_t; + +#define SLOB_UNIT sizeof(slob_t) +#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) +#define SLOB_ALIGN L1_CACHE_BYTES + +struct bigblock { + int order; + void *pages; + struct bigblock *next; +}; +typedef struct bigblock bigblock_t; + +static slob_t arena = { .next = &arena, .units = 1 }; +static slob_t *slobfree = &arena; +static bigblock_t *bigblocks; +static DEFINE_SPINLOCK(slob_lock); +static DEFINE_SPINLOCK(block_lock); + +static void slob_free(void *b, int size); + +static void *slob_alloc(size_t size, gfp_t gfp, int align) +{ + slob_t *prev, *cur, *aligned = 0; + int delta = 0, units = SLOB_UNITS(size); + unsigned long flags; + + spin_lock_irqsave(&slob_lock, flags); + prev = slobfree; + for (cur = prev->next; ; prev = cur, cur = cur->next) { + if (align) { + aligned = (slob_t *)ALIGN((unsigned long)cur, align); + delta = aligned - cur; + } + if (cur->units >= units + delta) { /* room enough? */ + if (delta) { /* need to fragment head to align? */ + aligned->units = cur->units - delta; + aligned->next = cur->next; + cur->next = aligned; + cur->units = delta; + prev = cur; + cur = aligned; + } + + if (cur->units == units) /* exact fit? */ + prev->next = cur->next; /* unlink */ + else { /* fragment */ + prev->next = cur + units; + prev->next->units = cur->units - units; + prev->next->next = cur->next; + cur->units = units; + } + + slobfree = prev; + spin_unlock_irqrestore(&slob_lock, flags); + return cur; + } + if (cur == slobfree) { + spin_unlock_irqrestore(&slob_lock, flags); + + if (size == PAGE_SIZE) /* trying to shrink arena? */ + return 0; + + cur = (slob_t *)__get_free_page(gfp); + if (!cur) + return 0; + + slob_free(cur, PAGE_SIZE); + spin_lock_irqsave(&slob_lock, flags); + cur = slobfree; + } + } +} + +static void slob_free(void *block, int size) +{ + slob_t *cur, *b = (slob_t *)block; + unsigned long flags; + + if (!block) + return; + + if (size) + b->units = SLOB_UNITS(size); + + /* Find reinsertion point */ + spin_lock_irqsave(&slob_lock, flags); + for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) + if (cur >= cur->next && (b > cur || b < cur->next)) + break; + + if (b + b->units == cur->next) { + b->units += cur->next->units; + b->next = cur->next->next; + } else + b->next = cur->next; + + if (cur + cur->units == b) { + cur->units += b->units; + cur->next = b->next; + } else + cur->next = b; + + slobfree = cur; + + spin_unlock_irqrestore(&slob_lock, flags); +} + +static int FASTCALL(find_order(int size)); +static int fastcall find_order(int size) +{ + int order = 0; + for ( ; size > 4096 ; size >>=1) + order++; + return order; +} + +void *kmalloc(size_t size, gfp_t gfp) +{ + slob_t *m; + bigblock_t *bb; + unsigned long flags; + + if (size < PAGE_SIZE - SLOB_UNIT) { + m = slob_alloc(size + SLOB_UNIT, gfp, 0); + return m ? 
(void *)(m + 1) : 0; + } + + bb = slob_alloc(sizeof(bigblock_t), gfp, 0); + if (!bb) + return 0; + + bb->order = find_order(size); + bb->pages = (void *)__get_free_pages(gfp, bb->order); + + if (bb->pages) { + spin_lock_irqsave(&block_lock, flags); + bb->next = bigblocks; + bigblocks = bb; + spin_unlock_irqrestore(&block_lock, flags); + return bb->pages; + } + + slob_free(bb, sizeof(bigblock_t)); + return 0; +} + +EXPORT_SYMBOL(kmalloc); + +void kfree(const void *block) +{ + bigblock_t *bb, **last = &bigblocks; + unsigned long flags; + + if (!block) + return; + + if (!((unsigned long)block & (PAGE_SIZE-1))) { + /* might be on the big block list */ + spin_lock_irqsave(&block_lock, flags); + for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { + if (bb->pages == block) { + *last = bb->next; + spin_unlock_irqrestore(&block_lock, flags); + free_pages((unsigned long)block, bb->order); + slob_free(bb, sizeof(bigblock_t)); + return; + } + } + spin_unlock_irqrestore(&block_lock, flags); + } + + slob_free((slob_t *)block - 1, 0); + return; +} + +EXPORT_SYMBOL(kfree); + +unsigned int ksize(const void *block) +{ + bigblock_t *bb; + unsigned long flags; + + if (!block) + return 0; + + if (!((unsigned long)block & (PAGE_SIZE-1))) { + spin_lock_irqsave(&block_lock, flags); + for (bb = bigblocks; bb; bb = bb->next) + if (bb->pages == block) { + spin_unlock_irqrestore(&slob_lock, flags); + return PAGE_SIZE << bb->order; + } + spin_unlock_irqrestore(&block_lock, flags); + } + + return ((slob_t *)block - 1)->units * SLOB_UNIT; +} + +struct kmem_cache { + unsigned int size, align; + const char *name; + void (*ctor)(void *, struct kmem_cache *, unsigned long); + void (*dtor)(void *, struct kmem_cache *, unsigned long); +}; + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), + void (*dtor)(void*, struct kmem_cache *, unsigned long)) +{ + struct kmem_cache *c; + + c = slob_alloc(sizeof(struct kmem_cache), flags, 0); + + if (c) { + c->name = name; + c->size = size; + c->ctor = ctor; + c->dtor = dtor; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; + if (c->align < align) + c->align = align; + } + + return c; +} +EXPORT_SYMBOL(kmem_cache_create); + +int kmem_cache_destroy(struct kmem_cache *c) +{ + slob_free(c, sizeof(struct kmem_cache)); + return 0; +} +EXPORT_SYMBOL(kmem_cache_destroy); + +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) +{ + void *b; + + if (c->size < PAGE_SIZE) + b = slob_alloc(c->size, flags, c->align); + else + b = (void *)__get_free_pages(flags, find_order(c->size)); + + if (c->ctor) + c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); + + return b; +} +EXPORT_SYMBOL(kmem_cache_alloc); + +void kmem_cache_free(struct kmem_cache *c, void *b) +{ + if (c->dtor) + c->dtor(b, c, 0); + + if (c->size < PAGE_SIZE) + slob_free(b, c->size); + else + free_pages((unsigned long)b, find_order(c->size)); +} +EXPORT_SYMBOL(kmem_cache_free); + +unsigned int kmem_cache_size(struct kmem_cache *c) +{ + return c->size; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *c) +{ + return c->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +static struct timer_list slob_timer = TIMER_INITIALIZER( + (void (*)(unsigned long))kmem_cache_init, 0, 0); + +void kmem_cache_init(void) +{ + void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); + + if (p) + free_page((unsigned long)p); + + mod_timer(&slob_timer, jiffies + HZ); +} + +atomic_t slab_reclaim_pages = ATOMIC_INIT(0); +EXPORT_SYMBOL(slab_reclaim_pages); + +#ifdef CONFIG_SMP + +void *__alloc_percpu(size_t size, size_t align) +{ + int i; + struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); + + if (!pdata) + return NULL; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); + if (!pdata->ptrs[i]) + goto unwind_oom; + memset(pdata->ptrs[i], 0, size); + } + + /* Catch derefs w/o wrappers */ + return (void *) (~(unsigned long) pdata); + +unwind_oom: + while (--i >= 0) { + if (!cpu_possible(i)) + continue; + kfree(pdata->ptrs[i]); + } + kfree(pdata); + return NULL; +} +EXPORT_SYMBOL(__alloc_percpu); + +void +free_percpu(const void *objp) +{ + int i; + struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + kfree(p->ptrs[i]); + } + kfree(p); +} +EXPORT_SYMBOL(free_percpu); + +#endif -- cgit v1.2.3 From a6bf6b211cdb92c315c24719a522d8b6f3998210 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:08 -0800 Subject: [PATCH] fat: move fat_clusters_flush() to write_super() It is overkill to update the FS_INFO whenever modifying prev_free/free_clusters, because those are just a hint. So, this patch uses ->write_super() for updating FS_INFO instead. 
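The idiom being adopted, in sketch form (simplified from the diff below): writers that touch the free-cluster hint only mark the superblock dirty, and the periodic VFS writeback of dirty superblocks flushes the hint once via ->write_super():

	/* at each hint update, instead of flushing immediately: */
	sbi->free_clusters--;
	sb->s_dirt = 1;

	/* later, called by the VFS for superblocks with s_dirt set: */
	static void fat_write_super(struct super_block *sb)
	{
		sb->s_dirt = 0;
		if (!(sb->s_flags & MS_RDONLY))
			fat_clusters_flush(sb);
	}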
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fatent.c | 8 ++++++-- fs/fat/inode.c | 10 ++++++++-- fs/fat/misc.c | 2 -- 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 4164cd54c4d1..20a22875f0a1 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -476,6 +476,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) sbi->prev_free = entry; if (sbi->free_clusters != -1) sbi->free_clusters--; + sb->s_dirt = 1; cluster[idx_clus] = entry; idx_clus++; @@ -496,6 +497,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) /* Couldn't allocate the free entries */ sbi->free_clusters = 0; + sb->s_dirt = 1; err = -ENOSPC; out: @@ -509,7 +511,6 @@ out: } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); - fat_clusters_flush(sb); if (err && idx_clus) fat_free_clusters(inode, cluster[0]); @@ -542,8 +543,10 @@ int fat_free_clusters(struct inode *inode, int cluster) } ops->ent_put(&fatent, FAT_ENT_FREE); - if (sbi->free_clusters != -1) + if (sbi->free_clusters != -1) { sbi->free_clusters++; + sb->s_dirt = 1; + } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { if (sb->s_flags & MS_SYNCHRONOUS) { @@ -605,6 +608,7 @@ int fat_count_free_clusters(struct super_block *sb) } while (fat_ent_next(sbi, &fatent)); } sbi->free_clusters = free; + sb->s_dirt = 1; fatent_brelse(&fatent); out: unlock_fat(sbi); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a0f9b9fe1307..897312616b0a 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -374,12 +374,17 @@ static void fat_clear_inode(struct inode *inode) unlock_kernel(); } -static void fat_put_super(struct super_block *sb) +static void fat_write_super(struct super_block *sb) { - struct msdos_sb_info *sbi = MSDOS_SB(sb); + sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) fat_clusters_flush(sb); +} + +static void fat_put_super(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sbi->nls_disk) { unload_nls(sbi->nls_disk); @@ -546,6 +551,7 @@ static struct super_operations fat_sops = { .write_inode = fat_write_inode, .delete_inode = fat_delete_inode, .put_super = fat_put_super, + .write_super = fat_write_super, .statfs = fat_statfs, .clear_inode = fat_clear_inode, .remount_fs = fat_remount, diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 2a0df2122f5d..9b592e37e259 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -67,8 +67,6 @@ void fat_clusters_flush(struct super_block *sb) if (sbi->prev_free != -1) fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); mark_buffer_dirty(bh); - if (sb->s_flags & MS_SYNCHRONOUS) - sync_dirty_buffer(bh); } brelse(bh); } -- cgit v1.2.3 From 83b7c996dc859c7b53f94d46ee5c5929cc0399e2 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:09 -0800 Subject: [PATCH] fat: use sb_find_get_block() instead of sb_getblk() We don't need to allocate a buffer just to check whether the block is uptodate. This uses sb_find_get_block() instead: if it returns NULL, the block is not cached and therefore not uptodate.
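The distinction being exploited, sketched (not the literal diff): sb_getblk() creates a buffer_head for the block if none exists, costing an allocation on every cache miss, while sb_find_get_block() merely probes the cache, so a NULL result is itself the "not cached, not uptodate" signal the readahead check wants:

	struct buffer_head *bh;

	bh = sb_find_get_block(sb, phys);	/* probe only, no allocation */
	if (bh == NULL || !buffer_uptodate(bh))
		sb_breadahead(sb, phys);	/* miss or stale: read ahead */
	if (bh)
		brelse(bh);	/* drop the reference the lookup took */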
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index ba824964b9bb..b2a26cd226b5 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -45,8 +45,8 @@ static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) return; - bh = sb_getblk(sb, phys); - if (bh && !buffer_uptodate(bh)) { + bh = sb_find_get_block(sb, phys); + if (bh == NULL || !buffer_uptodate(bh)) { for (sec = 0; sec < sbi->sec_per_clus; sec++) sb_breadahead(sb, phys + sec); } -- cgit v1.2.3 From a5425d2927a6a771f9ae8767b6bfb3c09225bcdd Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:10 -0800 Subject: [PATCH] fat: add the read/writepages() Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 897312616b0a..f502c6b8cb49 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -90,9 +91,21 @@ static int fat_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, fat_get_block, wbc); } +static int fat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, fat_get_block); +} + static int fat_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, fat_get_block); + return mpage_readpage(page, fat_get_block); +} + +static int fat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, fat_get_block); } static int fat_prepare_write(struct file *file, struct page *page, @@ -122,7 +135,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) static struct address_space_operations fat_aops = { .readpage = fat_readpage, + .readpages = fat_readpages, .writepage = fat_writepage, + .writepages = fat_writepages, .sync_page = block_sync_page, .prepare_write = fat_prepare_write, .commit_write = fat_commit_write, -- cgit v1.2.3 From 7c709d00d614d0f2b6a80895b2a1aedbe04e8478 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:10 -0800 Subject: [PATCH] fat: s/EXPORT_SYMBOL/EXPORT_SYMBOL_GPL/ All EXPORT_SYMBOL of fatfs is only for vfat/msdos. _GPL would be proper. 
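The practical effect of the stricter export is on the consumer side: the module loader resolves _GPL symbols only for modules declaring a GPL-compatible license. A hypothetical out-of-tree consumer of these symbols would therefore need (sketch, invented module):

        #include <linux/module.h>

        MODULE_LICENSE("GPL"); /* without a GPL-compatible license string,
                                * fat_scan() etc. now fail to resolve at
                                * module load time */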
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 14 +++++++------- fs/fat/fatent.c | 2 +- fs/fat/file.c | 2 +- fs/fat/inode.c | 10 +++++----- fs/fat/misc.c | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index b2a26cd226b5..4ce77475aed3 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -418,7 +418,7 @@ EODir: return err; } -EXPORT_SYMBOL(fat_search_long); +EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { struct dirent __user *dirent; @@ -780,7 +780,7 @@ int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, return -ENOENT; } -EXPORT_SYMBOL(fat_get_dotdot_entry); +EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); /* See if directory is empty */ int fat_dir_empty(struct inode *dir) @@ -803,7 +803,7 @@ int fat_dir_empty(struct inode *dir) return result; } -EXPORT_SYMBOL(fat_dir_empty); +EXPORT_SYMBOL_GPL(fat_dir_empty); /* * fat_subdirs counts the number of sub-directories of dir. It can be run @@ -849,7 +849,7 @@ int fat_scan(struct inode *dir, const unsigned char *name, return -ENOENT; } -EXPORT_SYMBOL(fat_scan); +EXPORT_SYMBOL_GPL(fat_scan); static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { @@ -936,7 +936,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) return 0; } -EXPORT_SYMBOL(fat_remove_entries); +EXPORT_SYMBOL_GPL(fat_remove_entries); static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, struct buffer_head **bhs, int nr_bhs) @@ -1048,7 +1048,7 @@ error: return err; } -EXPORT_SYMBOL(fat_alloc_new_dir); +EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, @@ -1264,4 +1264,4 @@ error_remove: return err; } -EXPORT_SYMBOL(fat_add_entries); +EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 20a22875f0a1..a1a9e0451217 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -581,7 +581,7 @@ error: return err; } -EXPORT_SYMBOL(fat_free_clusters); +EXPORT_SYMBOL_GPL(fat_free_clusters); int fat_count_free_clusters(struct super_block *sb) { diff --git a/fs/fat/file.c b/fs/fat/file.c index 7134403d5be2..15229fe569c3 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -173,7 +173,7 @@ out: return error; } -EXPORT_SYMBOL(fat_notify_change); +EXPORT_SYMBOL_GPL(fat_notify_change); /* Free all clusters after the skip'th cluster. 
*/ static int fat_free(struct inode *inode, int skip) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index f502c6b8cb49..932c8d6d1f54 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -197,7 +197,7 @@ void fat_attach(struct inode *inode, loff_t i_pos) spin_unlock(&sbi->inode_hash_lock); } -EXPORT_SYMBOL(fat_attach); +EXPORT_SYMBOL_GPL(fat_attach); void fat_detach(struct inode *inode) { @@ -208,7 +208,7 @@ void fat_detach(struct inode *inode) spin_unlock(&sbi->inode_hash_lock); } -EXPORT_SYMBOL(fat_detach); +EXPORT_SYMBOL_GPL(fat_detach); struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { @@ -362,7 +362,7 @@ out: return inode; } -EXPORT_SYMBOL(fat_build_inode); +EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_delete_inode(struct inode *inode) { @@ -557,7 +557,7 @@ int fat_sync_inode(struct inode *inode) return fat_write_inode(inode, 1); } -EXPORT_SYMBOL(fat_sync_inode); +EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); static struct super_operations fat_sops = { @@ -1368,7 +1368,7 @@ out_fail: return error; } -EXPORT_SYMBOL(fat_fill_super); +EXPORT_SYMBOL_GPL(fat_fill_super); int __init fat_cache_init(void); void fat_cache_destroy(void); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 9b592e37e259..32fb0a3f1da4 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...) } } -EXPORT_SYMBOL(fat_fs_panic); +EXPORT_SYMBOL_GPL(fat_fs_panic); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ @@ -192,7 +192,7 @@ void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) *date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9)); } -EXPORT_SYMBOL(fat_date_unix2dos); +EXPORT_SYMBOL_GPL(fat_date_unix2dos); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { @@ -220,4 +220,4 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) return err; } -EXPORT_SYMBOL(fat_sync_bhs); +EXPORT_SYMBOL_GPL(fat_sync_bhs); -- cgit v1.2.3 From e5174baaea7585760f02eef23b225847d209a8db Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:11 -0800 Subject: [PATCH] fat: support ->direct_IO() This patch add to support of ->direct_IO() for mostly read. The user of this seems to want to use for streaming read. So, current direct I/O has limitation, it can only overwrite. (For write operation, mainly we need to handle the hole etc..) 
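Direct I/O maps file blocks through a get_blocks()-style callback that may cover several contiguous blocks per call; blockdev_direct_IO() reads the mapped extent length back out of bh_result->b_size. A hedged sketch of that contract, written in terms of the fat_bmap() extended by this patch (illustrative, not the patch's exact code; see fat_get_blocks() in the diff below, which also handles allocation):

        #include <linux/fs.h>
        #include <linux/buffer_head.h>
        #include <linux/msdos_fs.h>

        static int example_get_blocks(struct inode *inode, sector_t iblock,
                                      unsigned long max_blocks,
                                      struct buffer_head *bh_result, int create)
        {
                struct super_block *sb = inode->i_sb;
                sector_t phys;
                unsigned long mapped;
                int err;

                err = fat_bmap(inode, iblock, &phys, &mapped);
                if (err)
                        return err;
                if (phys) {
                        map_bh(bh_result, sb, phys);
                        /* report the contiguous extent, capped at the request */
                        bh_result->b_size = min(mapped, max_blocks)
                                                << sb->s_blocksize_bits;
                }
                return 0;       /* !phys: hole/EOF; allocation (create) omitted */
        }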
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/cache.c | 14 +++++++-- fs/fat/dir.c | 6 ++-- fs/fat/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++------ include/linux/msdos_fs.h | 3 +- 4 files changed, 89 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 77c24fcf712a..1acc941245fb 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -295,7 +295,8 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -303,9 +304,12 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) int cluster, offset; *phys = 0; + *mapped_blocks = 0; if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { - if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) + if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) { *phys = sector + sbi->dir_start; + *mapped_blocks = 1; + } return 0; } last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1)) @@ -318,7 +322,11 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) cluster = fat_bmap_cluster(inode, cluster); if (cluster < 0) return cluster; - else if (cluster) + else if (cluster) { *phys = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } return 0; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4ce77475aed3..eef1b81aa294 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -68,8 +68,8 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, { struct super_block *sb = dir->i_sb; sector_t phys, iblock; - int offset; - int err; + unsigned long mapped_blocks; + int err, offset; next: if (*bh) @@ -77,7 +77,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks); if (err || !phys) return -1; /* beyond EOF or error */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 932c8d6d1f54..e7f4aa7fc686 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -49,43 +50,77 @@ static int fat_add_cluster(struct inode *inode) return err; } -static int fat_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __fat_get_blocks(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); sector_t phys; - int err; + unsigned long mapped_blocks; + int err, offset; - err = fat_bmap(inode, iblock, &phys); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; if (phys) { map_bh(bh_result, sb, phys); + *max_blocks = min(mapped_blocks, *max_blocks); return 0; } if (!create) return 0; + if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } - if (!((unsigned long)iblock & (MSDOS_SB(sb)->sec_per_clus - 1))) { + + offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); + if (!offset) { + /* TODO: 
multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) return err; } - MSDOS_I(inode)->mmu_private += sb->s_blocksize; - err = fat_bmap(inode, iblock, &phys); + /* available blocks on this cluster */ + mapped_blocks = sbi->sec_per_clus - offset; + + *max_blocks = min(mapped_blocks, *max_blocks); + MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; - if (!phys) - BUG(); + BUG_ON(!phys); + BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); return 0; } +static int fat_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + int err; + + err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); + if (err) + return err; + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + unsigned long max_blocks = 1; + return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); +} + static int fat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, fat_get_block, wbc); @@ -128,6 +163,34 @@ static int fat_commit_write(struct file *file, struct page *page, return err; } +static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(), + * so we need to update the ->mmu_private to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->mmu_private. + */ + loff_t size = offset + iov_length(iov, nr_segs); + if (MSDOS_I(inode)->mmu_private < size) + return -EINVAL; + } + + /* + * FAT need to use the DIO_LOCKING for avoiding the race + * condition of fat_get_block() and ->truncate(). 
+ */ + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, fat_get_blocks, NULL); +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, fat_get_block); @@ -141,6 +204,7 @@ static struct address_space_operations fat_aops = { .sync_page = block_sync_page, .prepare_write = fat_prepare_write, .commit_write = fat_commit_write, + .direct_IO = fat_direct_IO, .bmap = _fat_bmap }; diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index 941da5c016a0..e933e2a355ad 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -329,7 +329,8 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); -extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys); +extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks); /* fat/dir.c */ extern struct file_operations fat_dir_operations; -- cgit v1.2.3 From 05eb0b51fb46430050d5873458612f53e0234f2e Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:13 -0800 Subject: [PATCH] fat: support a truncate() for expanding size (generic_cont_expand) This patch changes generic_cont_expand(), in order to share the code with fatfs. - Use vmtruncate() if ->prepare_write() returns a error. Even if ->prepare_write() returns an error, it may already have added some blocks. So, this truncates blocks outside of ->i_size by vmtruncate(). - Add generic_cont_expand_simple(). The generic_cont_expand_simple() assumes that ->prepare_write() can handle the block boundary. With this, we don't need to care the extra byte. And for expanding a file size by truncate(), fatfs uses the added generic_cont_expand_simple(). Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 60 ++++++++++++++++++++++++++++++++++----------- fs/fat/file.c | 31 ++++++++++++++++++++--- include/linux/buffer_head.h | 3 ++- 3 files changed, 76 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 5287be18633b..55023231e460 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2160,11 +2160,12 @@ int block_read_full_page(struct page *page, get_block_t *get_block) * truncates. Uses prepare/commit_write to allow the filesystem to * deal with the hole. */ -int generic_cont_expand(struct inode *inode, loff_t size) +static int __generic_cont_expand(struct inode *inode, loff_t size, + pgoff_t index, unsigned int offset) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index, offset, limit; + unsigned long limit; int err; err = -EFBIG; @@ -2176,24 +2177,24 @@ int generic_cont_expand(struct inode *inode, loff_t size) if (size > inode->i_sb->s_maxbytes) goto out; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. 
make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; err = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) goto out; err = mapping->a_ops->prepare_write(NULL, page, offset, offset); - if (!err) { - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + if (err) { + /* + * ->prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + unlock_page(page); + page_cache_release(page); + vmtruncate(inode, inode->i_size); + goto out; } + + err = mapping->a_ops->commit_write(NULL, page, offset, offset); + unlock_page(page); page_cache_release(page); if (err > 0) @@ -2202,6 +2203,36 @@ out: return err; } +int generic_cont_expand(struct inode *inode, loff_t size) +{ + pgoff_t index; + unsigned int offset; + + offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ + + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + /* caller must handle this extra byte. */ + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + + return __generic_cont_expand(inode, size, index, offset); +} + +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + loff_t pos = size - 1; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; + + /* prepare/commit_write can handle even if from==to==start of block. */ + return __generic_cont_expand(inode, size, index, offset); +} + /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. @@ -3145,6 +3176,7 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); diff --git a/fs/fat/file.c b/fs/fat/file.c index 15229fe569c3..9b07c328a6fc 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,6 +11,7 @@ #include #include #include +#include int fat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -124,6 +125,24 @@ struct file_operations fat_file_operations = { .sendfile = generic_file_sendfile, }; +static int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand_simple(inode, size); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) + err = sync_page_range_nolock(inode, mapping, start, count); +out: + return err; +} + int fat_notify_change(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); @@ -132,11 +151,17 @@ int fat_notify_change(struct dentry *dentry, struct iattr *attr) lock_kernel(); - /* FAT cannot truncate to a longer file */ + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. 
+ */ if (attr->ia_valid & ATTR_SIZE) { if (attr->ia_size > inode->i_size) { - error = -EPERM; - goto out; + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; } } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 1db061bb6b08..9f159baf153f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -197,7 +197,8 @@ int block_read_full_page(struct page*, get_block_t*); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); -int generic_cont_expand(struct inode *inode, loff_t size) ; +int generic_cont_expand(struct inode *inode, loff_t size); +int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); int block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- cgit v1.2.3 From 28fd129827b00e12829d48a5290f46277600619b Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 8 Jan 2006 01:02:14 -0800 Subject: [PATCH] Fix and add EXPORT_SYMBOL(filemap_write_and_wait) This patch add EXPORT_SYMBOL(filemap_write_and_wait) and use it. See mm/filemap.c: And changes the filemap_write_and_wait() and filemap_write_and_wait_range(). Current filemap_write_and_wait() doesn't wait if filemap_fdatawrite() returns error. However, even if filemap_fdatawrite() returned an error, it may have submitted the partially data pages to the device. (e.g. in the case of -ENOSPC) Andrew Morton writes, If filemap_fdatawrite() returns an error, this might be due to some I/O problem: dead disk, unplugged cable, etc. Given the generally crappy quality of the kernel's handling of such exceptions, there's a good chance that the filemap_fdatawait() will get stuck in D state forever. So, this patch doesn't wait if filemap_fdatawrite() returns the -EIO. Trond, could you please review the nfs part? Especially I'm not sure, nfs must use the "filemap_fdatawrite(inode->i_mapping) == 0", or not. 
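In caller terms the helper folds the two steps into one and encodes the error policy described above. The conversion pattern applied throughout this patch, as a sketch:

        /* before -- the wait step was skipped whenever the write step failed: */
        if (filemap_fdatawrite(inode->i_mapping) == 0)
                filemap_fdatawait(inode->i_mapping);

        /* after -- one call; still waits after partial-write errors such as
         * -ENOSPC, but not after -EIO, where waiting might hang forever: */
        filemap_write_and_wait(inode->i_mapping);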
Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_dir.c | 3 +-- fs/9p/vfs_file.c | 3 +-- fs/buffer.c | 10 ++-------- fs/cifs/file.c | 6 ++---- fs/cifs/inode.c | 3 +-- fs/jfs/jfs_dmap.c | 3 +-- fs/jfs/jfs_imap.c | 6 ++---- fs/jfs/jfs_txnmgr.c | 6 ++---- fs/jfs/jfs_umount.c | 6 ++---- fs/jfs/resize.c | 3 +-- fs/jfs/super.c | 3 +-- fs/nfs/inode.c | 8 ++------ fs/smbfs/file.c | 3 +-- fs/smbfs/inode.c | 3 +-- fs/xfs/linux-2.6/xfs_fs_subr.c | 3 +-- mm/filemap.c | 40 +++++++++++++++++++++++++++------------- 16 files changed, 48 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 57a43b8feef5..17089d1905ff 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -193,8 +193,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) fid->fid); fidnum = fid->fid; - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); if (fidnum >= 0) { dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 89c849da8504..e13577da9130 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -165,8 +165,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) return -ENOLCK; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); invalidate_inode_pages(&inode->i_data); } diff --git a/fs/buffer.c b/fs/buffer.c index 55023231e460..263df0f192aa 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -153,14 +153,8 @@ int sync_blockdev(struct block_device *bdev) { int ret = 0; - if (bdev) { - int err; - - ret = filemap_fdatawrite(bdev->bd_inode->i_mapping); - err = filemap_fdatawait(bdev->bd_inode->i_mapping); - if (!ret) - ret = err; - } + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); return ret; } EXPORT_SYMBOL(sync_blockdev); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 14a1c72ced92..5ade53d7bca8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -127,8 +127,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, if (file->f_dentry->d_inode->i_mapping) { /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ - filemap_fdatawrite(file->f_dentry->d_inode->i_mapping); - filemap_fdatawait(file->f_dentry->d_inode->i_mapping); + filemap_write_and_wait(file->f_dentry->d_inode->i_mapping); } cFYI(1, ("invalidating remote inode since open detected it " "changed")); @@ -419,8 +418,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, pCifsInode = CIFS_I(inode); if (pCifsInode) { if (can_flush) { - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); /* temporarily disable caching while we go to server to get inode info */ pCifsInode->clientCanCacheAll = FALSE; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 411c1f7f84da..9558f51bca55 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1148,8 +1148,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* BB check if we need to refresh inode from server now ? 
BB */ /* need to flush data before changing file size on server */ - filemap_fdatawrite(direntry->d_inode->i_mapping); - filemap_fdatawait(direntry->d_inode->i_mapping); + filemap_write_and_wait(direntry->d_inode->i_mapping); if (attrs->ia_valid & ATTR_SIZE) { /* To avoid spurious oplock breaks from server, in the case of diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 68000a50ceb6..2967b7393415 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -302,8 +302,7 @@ int dbSync(struct inode *ipbmap) /* * write out dirty pages of bmap */ - filemap_fdatawrite(ipbmap->i_mapping); - filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); diWriteSpecial(ipbmap, 0); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 28201b194f53..31b4aa13dd4b 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -265,8 +265,7 @@ int diSync(struct inode *ipimap) /* * write out dirty pages of imap */ - filemap_fdatawrite(ipimap->i_mapping); - filemap_fdatawait(ipimap->i_mapping); + filemap_write_and_wait(ipimap->i_mapping); diWriteSpecial(ipimap, 0); @@ -565,8 +564,7 @@ void diFreeSpecial(struct inode *ip) jfs_err("diFreeSpecial called with NULL ip!"); return; } - filemap_fdatawrite(ip->i_mapping); - filemap_fdatawait(ip->i_mapping); + filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, 0); iput(ip); } diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index b660c93c92de..2ddb6b892bcf 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1231,10 +1231,8 @@ int txCommit(tid_t tid, /* transaction identifier */ * when we don't need to worry about it at all. * * if ((!S_ISDIR(ip->i_mode)) - * && (tblk->flag & COMMIT_DELETE) == 0) { - * filemap_fdatawrite(ip->i_mapping); - * filemap_fdatawait(ip->i_mapping); - * } + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); */ /* diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 5cf91785b541..21eaf7ac0fcb 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -108,8 +108,7 @@ int jfs_umount(struct super_block *sb) * Make sure all metadata makes it to disk before we mark * the superblock as clean */ - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); /* * ensure all file system file pages are propagated to their @@ -161,8 +160,7 @@ int jfs_umount_rw(struct super_block *sb) * mark the superblock clean before everything is flushed to * disk. 
*/ - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); updateSuper(sb, FM_CLEAN); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index c6dc254d3253..45180361871c 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -376,8 +376,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * by txCommit(); */ filemap_fdatawait(ipbmap->i_mapping); - filemap_fdatawrite(ipbmap->i_mapping); - filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4226af3ea91b..8d31f1336431 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -502,8 +502,7 @@ out_no_rw: jfs_err("jfs_umount failed with return code %d", rc); } out_mount_failed: - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e7bd0d92600f..3e4ba9cb7f80 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -644,10 +644,7 @@ int nfs_sync_mapping(struct address_space *mapping) if (mapping->nrpages == 0) return 0; unmap_mapping_range(mapping, 0, 0, 0); - ret = filemap_fdatawrite(mapping); - if (ret != 0) - goto out; - ret = filemap_fdatawait(mapping); + ret = filemap_write_and_wait(mapping); if (ret != 0) goto out; ret = nfs_wb_all(mapping->host); @@ -864,8 +861,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_begin_data_update(inode); /* Write all dirty data if we're changing file permissions or size */ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE)) != 0) { - if (filemap_fdatawrite(inode->i_mapping) == 0) - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); nfs_wb_all(inode); } /* diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index b4fcfa8b55a1..3c6eb3ba718e 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -374,8 +374,7 @@ smb_file_release(struct inode *inode, struct file * file) /* We must flush any dirty pages now as we won't be able to write anything after close. mmap can trigger this. "openers" should perhaps include mmap'ers ... 
*/ - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); smb_close(inode); } unlock_kernel(); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 10b994428fef..6ec88bf59b2d 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -697,8 +697,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr) DENTRY_PATH(dentry), (long) inode->i_size, (long) attr->ia_size); - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); error = smb_open(dentry, O_WRONLY); if (error) diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index f89340c61bf2..4fa4b1a5187e 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -79,8 +79,7 @@ fs_flushinval_pages( struct inode *ip = LINVFS_GET_IP(vp); if (VN_CACHED(vp)) { - filemap_fdatawrite(ip->i_mapping); - filemap_fdatawait(ip->i_mapping); + filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, first); } diff --git a/mm/filemap.c b/mm/filemap.c index 8fdf36508023..478f4c74cc31 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -343,30 +343,44 @@ EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } } - return retval; + return err; } +EXPORT_SYMBOL(filemap_write_and_wait); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); - if (retval == 0) - retval = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } } - return retval; + return err; } /* -- cgit v1.2.3 From 095975da26dba21698582e91e96be10f7417333f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:02:19 -0800 Subject: [PATCH] rcu file: use atomic primitives Use atomic_inc_not_zero for rcu files instead of special case rcuref. Signed-off-by: Nick Piggin Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/rcuref.txt | 87 ++++++++--------- fs/aio.c | 3 +- fs/file_table.c | 8 +- include/linux/fs.h | 3 +- include/linux/radix-tree.h | 1 + include/linux/rcuref.h | 220 ------------------------------------------- kernel/rcupdate.c | 14 --- kernel/rcutorture.c | 1 - security/selinux/hooks.c | 2 +- 9 files changed, 48 insertions(+), 291 deletions(-) delete mode 100644 include/linux/rcuref.h (limited to 'fs') diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index a23fee66064d..3f60db41b2f0 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -1,74 +1,67 @@ -Refcounter framework for elements of lists/arrays protected by -RCU. +Refcounter design for elements of lists/arrays protected by RCU. Refcounting on elements of lists which are protected by traditional reader/writer spinlocks or semaphores are straight forward as in: -1. 2. -add() search_and_reference() -{ { - alloc_object read_lock(&list_lock); - ... search_for_element - atomic_set(&el->rc, 1); atomic_inc(&el->rc); - write_lock(&list_lock); ... - add_element read_unlock(&list_lock); - ... ... - write_unlock(&list_lock); } +1. 2. +add() search_and_reference() +{ { + alloc_object read_lock(&list_lock); + ... search_for_element + atomic_set(&el->rc, 1); atomic_inc(&el->rc); + write_lock(&list_lock); ... + add_element read_unlock(&list_lock); + ... ... + write_unlock(&list_lock); } } 3. 4. release_referenced() delete() { { - ... write_lock(&list_lock); - atomic_dec(&el->rc, relfunc) ... - ... delete_element -} write_unlock(&list_lock); - ... - if (atomic_dec_and_test(&el->rc)) - kfree(el); - ... + ... write_lock(&list_lock); + atomic_dec(&el->rc, relfunc) ... + ... delete_element +} write_unlock(&list_lock); + ... + if (atomic_dec_and_test(&el->rc)) + kfree(el); + ... } If this list/array is made lock free using rcu as in changing the write_lock in add() and delete() to spin_lock and changing read_lock -in search_and_reference to rcu_read_lock(), the rcuref_get in +in search_and_reference to rcu_read_lock(), the atomic_get in search_and_reference could potentially hold reference to an element which -has already been deleted from the list/array. rcuref_lf_get_rcu takes +has already been deleted from the list/array. atomic_inc_not_zero takes care of this scenario. search_and_reference should look as; 1. 2. add() search_and_reference() { { - alloc_object rcu_read_lock(); - ... search_for_element - atomic_set(&el->rc, 1); if (rcuref_inc_lf(&el->rc)) { - write_lock(&list_lock); rcu_read_unlock(); - return FAIL; - add_element } - ... ... - write_unlock(&list_lock); rcu_read_unlock(); + alloc_object rcu_read_lock(); + ... search_for_element + atomic_set(&el->rc, 1); if (atomic_inc_not_zero(&el->rc)) { + write_lock(&list_lock); rcu_read_unlock(); + return FAIL; + add_element } + ... ... + write_unlock(&list_lock); rcu_read_unlock(); } } 3. 4. release_referenced() delete() { { - ... write_lock(&list_lock); - rcuref_dec(&el->rc, relfunc) ... - ... delete_element -} write_unlock(&list_lock); - ... - if (rcuref_dec_and_test(&el->rc)) - call_rcu(&el->head, el_free); - ... + ... write_lock(&list_lock); + atomic_dec(&el->rc, relfunc) ... + ... delete_element +} write_unlock(&list_lock); + ... + if (atomic_dec_and_test(&el->rc)) + call_rcu(&el->head, el_free); + ... } Sometimes, reference to the element need to be obtained in the -update (write) stream. 
In such cases, rcuref_inc_lf might be an overkill -since the spinlock serialising list updates are held. rcuref_inc +update (write) stream. In such cases, atomic_inc_not_zero might be an +overkill since the spinlock serialising list updates are held. atomic_inc is to be used in such cases. -For arches which do not have cmpxchg rcuref_inc_lf -api uses a hashed spinlock implementation and the same hashed spinlock -is acquired in all rcuref_xxx primitives to preserve atomicity. -Note: Use rcuref_inc api only if you need to use rcuref_inc_lf on the -refcounter atleast at one place. Mixing rcuref_inc and atomic_xxx api -might lead to races. rcuref_inc_lf() must be used in lockfree -RCU critical sections only. + diff --git a/fs/aio.c b/fs/aio.c index 5a28b69ad223..aec2b1916d1b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -514,7 +513,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* Must be done under the lock to serialise against cancellation. * Call this aio_fput as it duplicates fput via the fput_work. */ - if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); diff --git a/fs/file_table.c b/fs/file_table.c index c3a5e2fd663b..6142250104a6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp); void fastcall fput(struct file *file) { - if (rcuref_dec_and_test(&file->f_count)) + if (atomic_dec_and_test(&file->f_count)) __fput(file); } @@ -166,7 +166,7 @@ struct file fastcall *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!rcuref_inc_lf(&file->f_count)) { + if (!atomic_inc_not_zero(&file->f_count)) { /* File object ref couldn't be taken */ rcu_read_unlock(); return NULL; @@ -198,7 +198,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (rcuref_inc_lf(&file->f_count)) + if (atomic_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -213,7 +213,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed) void put_filp(struct file *file) { - if (rcuref_dec_and_test(&file->f_count)) { + if (atomic_dec_and_test(&file->f_count)) { security_file_free(file); file_kill(file); file_free(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c9c48d65630..ef29500b5df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -9,7 +9,6 @@ #include #include #include -#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -653,7 +652,7 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); -#define get_file(x) rcuref_inc(&(x)->f_count) +#define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 36e5d269612f..c57ff2fcb30a 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -19,6 +19,7 @@ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H +#include #include #include diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h deleted file mode 100644 index e1adbba14b67..000000000000 --- a/include/linux/rcuref.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * 
rcuref.h - * - * Reference counting for elements of lists/arrays protected by - * RCU. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005 - * - * Author: Dipankar Sarma - * Ravikiran Thirumalai - * - * See Documentation/RCU/rcuref.txt for detailed user guide. - * - */ - -#ifndef _RCUREF_H_ -#define _RCUREF_H_ - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -/* - * These APIs work on traditional atomic_t counters used in the - * kernel for reference counting. Under special circumstances - * where a lock-free get() operation races with a put() operation - * these APIs can be used. See Documentation/RCU/rcuref.txt. - */ - -#ifdef __HAVE_ARCH_CMPXCHG - -/** - * rcuref_inc - increment refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference - * in a lock-free reader-side critical section. - */ -static inline void rcuref_inc(atomic_t *rcuref) -{ - atomic_inc(rcuref); -} - -/** - * rcuref_dec - decrement refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference - * in a lock-free reader-side critical section. - */ -static inline void rcuref_dec(atomic_t *rcuref) -{ - atomic_dec(rcuref); -} - -/** - * rcuref_dec_and_test - decrement refcount for object and test - * @rcuref: reference counter in the object. - * @release: pointer to the function that will clean up the object - * when the last reference to the object is released. - * This pointer is required. - * - * Decrement the refcount, and if 0, return 1. Else return 0. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference - * in a lock-free reader-side critical section. - */ -static inline int rcuref_dec_and_test(atomic_t *rcuref) -{ - return atomic_dec_and_test(rcuref); -} - -/* - * cmpxchg is needed on UP too, if deletions to the list/array can happen - * in interrupt context. - */ - -/** - * rcuref_inc_lf - Take reference to an object in a read-side - * critical section protected by RCU. - * @rcuref: reference counter in the object in question. - * - * Try and increment the refcount by 1. The increment might fail if - * the reference counter has been through a 1 to 0 transition and - * is no longer part of the lock-free list. - * Returns non-zero on successful increment and zero otherwise. 
- */ -static inline int rcuref_inc_lf(atomic_t *rcuref) -{ - int c, old; - c = atomic_read(rcuref); - while (c && (old = cmpxchg(&rcuref->counter, c, c + 1)) != c) - c = old; - return c; -} - -#else /* !__HAVE_ARCH_CMPXCHG */ - -extern spinlock_t __rcuref_hash[]; - -/* - * Use a hash table of locks to protect the reference count - * since cmpxchg is not available in this arch. - */ -#ifdef CONFIG_SMP -#define RCUREF_HASH_SIZE 4 -#define RCUREF_HASH(k) \ - (&__rcuref_hash[(((unsigned long)k)>>8) & (RCUREF_HASH_SIZE-1)]) -#else -#define RCUREF_HASH_SIZE 1 -#define RCUREF_HASH(k) &__rcuref_hash[0] -#endif /* CONFIG_SMP */ - -/** - * rcuref_inc - increment refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference in a lock-free - * reader-side critical section. - */ -static inline void rcuref_inc(atomic_t *rcuref) -{ - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - rcuref->counter += 1; - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); -} - -/** - * rcuref_dec - decrement refcount for object. - * @rcuref: reference counter in the object in question. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference in a lock-free - * reader-side critical section. - */ -static inline void rcuref_dec(atomic_t *rcuref) -{ - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - rcuref->counter -= 1; - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); -} - -/** - * rcuref_dec_and_test - decrement refcount for object and test - * @rcuref: reference counter in the object. - * @release: pointer to the function that will clean up the object - * when the last reference to the object is released. - * This pointer is required. - * - * Decrement the refcount, and if 0, return 1. Else return 0. - * - * This should be used only for objects where we use RCU and - * use the rcuref_inc_lf() api to acquire a reference in a lock-free - * reader-side critical section. - */ -static inline int rcuref_dec_and_test(atomic_t *rcuref) -{ - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - rcuref->counter--; - if (!rcuref->counter) { - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); - return 1; - } else { - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); - return 0; - } -} - -/** - * rcuref_inc_lf - Take reference to an object of a lock-free collection - * by traversing a lock-free list/array. - * @rcuref: reference counter in the object in question. - * - * Try and increment the refcount by 1. The increment might fail if - * the reference counter has been through a 1 to 0 transition and - * object is no longer part of the lock-free list. - * Returns non-zero on successful increment and zero otherwise. - */ -static inline int rcuref_inc_lf(atomic_t *rcuref) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(RCUREF_HASH(rcuref), flags); - if (rcuref->counter) - ret = rcuref->counter++; - else - ret = 0; - spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags); - return ret; -} - - -#endif /* !__HAVE_ARCH_CMPXCHG */ - -#endif /* __KERNEL__ */ -#endif /* _RCUREF_H_ */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0a669bd2f6d1..30b0bba03859 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,7 +46,6 @@ #include #include #include -#include #include /* Definition for rcupdate control block. 
*/ @@ -74,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10000; -#ifndef __HAVE_ARCH_CMPXCHG -/* - * We use an array of spinlocks for the rcurefs -- similar to ones in sparc - * 32 bit atomic_t implementations, and a hash function similar to that - * for our refcounting needs. - * Can't help multiprocessors which donot have cmpxchg :( - */ - -spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { - [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED -}; -#endif - /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 36efe088ad81..75174c81529a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3d496eae1b47..6647204e4636 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1663,7 +1663,7 @@ static inline void flush_unauthorized_files(struct files_struct * files) continue; } if (devnull) { - rcuref_inc(&devnull->f_count); + get_file(devnull); } else { devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR); if (!devnull) { -- cgit v1.2.3 From 6625b861f8f0e429902b8671b3e70792cd99074e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:23 -0800 Subject: [PATCH] relayfs: decouple buffer creation from inode creation The patch series implementa or fixes 3 things that were specifically requested or suggested by relayfs users: - support for non-relay files (patches 1-6) Currently, the relayfs API only supports the creation of directories (relayfs_create_dir()) and relay files (relay_open()). These patches adds support for non-relay files (relayfs_create_file()). This is so relayfs applications can create 'control files' in relayfs itself rather than in /proc or via a netlink channel, as is currently done in the relay-app examples. Basically what this amounts to is exporting relayfs_create_file() with an additional file_ops param that clients can use to supply file operations for their own special-purpose files in relayfs. - make exported relay file ops useful (patches 7-8) The relayfs relay_file_operations have always been exported, the intent being to make it possible to create relay files in other filesystems such as debugfs. The problem, though, is that currently the file operations are too tightly coupled to relayfs to actually be used for this purpose. This patch fixes that by adding a couple of callback functions that allow a client to hook into relay_open()/close() and supply the files that will be used to represent the channel buffers; the default implementation if no callbacks are defined is to create the files in relayfs. - add an option to create global relay buffer (patches 9-10) The file creation callback also supplies an optional param, is_global, that can be used by clients to create a single global relayfs buffer instead of the default per-cpu buffers. This was suggested as being useful for certain debugging applications where it's more convenient to be able to get all the data from a single channel without having to go to the bother of dealing with per-cpu files. 
- cleanup, some renaming and Documentation updates (patches 11-12) There were several comments that the use of netlink in the example code was non-intuitive and in fact the whole relay-app business was needlessly confusing. Based on that feedback, the example code has been completely converted over to relayfs control files as supported by this patch, and have also been made completely self-contained. The converted examples along with a couple of new examples that demonstrate using exported relay files can be found in relay-apps tarball: http://prdownloads.sourceforge.net/relayfs/relay-apps-0.9.tar.gz?download This patch: Separate buffer create/destroy from inode create/destroy. We want to be able to associate other data and not just relay buffers with inodes. Buffer create/destroy is moved out of inode.c and into relayfs core code. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/buffers.c | 1 + fs/relayfs/inode.c | 31 +++++++++---------------------- fs/relayfs/relay.c | 11 ++++++++--- fs/relayfs/relay.h | 2 +- 4 files changed, 19 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c index 84e21ffa5ca8..667b529944c5 100644 --- a/fs/relayfs/buffers.c +++ b/fs/relayfs/buffers.c @@ -186,4 +186,5 @@ void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); relayfs_remove(buf->dentry); + relay_destroy_buf(buf); } diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 0f7f88d067ad..379e07cd2b34 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -34,23 +34,13 @@ static struct backing_dev_info relayfs_backing_dev_info = { }; static struct inode *relayfs_get_inode(struct super_block *sb, int mode, - struct rchan *chan) + void *data) { - struct rchan_buf *buf = NULL; struct inode *inode; - if (S_ISREG(mode)) { - BUG_ON(!chan); - buf = relay_create_buf(chan); - if (!buf) - return NULL; - } - inode = new_inode(sb); - if (!inode) { - relay_destroy_buf(buf); + if (!inode) return NULL; - } inode->i_mode = mode; inode->i_uid = 0; @@ -62,7 +52,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, switch (mode & S_IFMT) { case S_IFREG: inode->i_fop = &relayfs_file_operations; - RELAYFS_I(inode)->buf = buf; + RELAYFS_I(inode)->buf = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ -83,7 +73,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, * @name: the name of the file to create * @parent: parent directory * @mode: mode - * @chan: relay channel associated with the file + * @data: user-associated data for this file * * Returns the new dentry, NULL on failure * @@ -92,7 +82,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, static struct dentry *relayfs_create_entry(const char *name, struct dentry *parent, int mode, - struct rchan *chan) + void *data) { struct dentry *d; struct inode *inode; @@ -127,7 +117,7 @@ static struct dentry *relayfs_create_entry(const char *name, goto release_mount; } - inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan); + inode = relayfs_get_inode(parent->d_inode->i_sb, mode, data); if (!inode) { d = NULL; goto release_mount; @@ -155,20 +145,20 @@ exit: * @name: the name of the file to create * @parent: parent directory * @mode: mode, if not specied the default perms are used - * @chan: channel associated with the file + * @data: user-associated data for this file * * Returns file dentry if successful, 
NULL otherwise. * * The file will be created user r on behalf of current user. */ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, - int mode, struct rchan *chan) + int mode, void *data) { if (!mode) mode = S_IRUSR; mode = (mode & S_IALLUGO) | S_IFREG; - return relayfs_create_entry(name, parent, mode, chan); + return relayfs_create_entry(name, parent, mode, data); } /** @@ -505,9 +495,6 @@ static struct inode *relayfs_alloc_inode(struct super_block *sb) */ static void relayfs_destroy_inode(struct inode *inode) { - if (RELAYFS_I(inode)->buf) - relay_destroy_buf(RELAYFS_I(inode)->buf); - kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode)); } diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 2a6f7f12b7f9..7fbda177ad8f 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -171,12 +171,17 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, struct rchan_buf *buf; struct dentry *dentry; + buf = relay_create_buf(chan); + if (!buf) + return NULL; + /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, chan); - if (!dentry) + dentry = relayfs_create_file(filename, parent, S_IRUSR, buf); + if (!dentry) { + relay_destroy_buf(buf); return NULL; + } - buf = RELAYFS_I(dentry->d_inode)->buf; buf->dentry = dentry; __relay_reset(buf, 1); diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h index 703503fa22b6..c325bb243549 100644 --- a/fs/relayfs/relay.h +++ b/fs/relayfs/relay.h @@ -4,7 +4,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, int mode, - struct rchan *chan); + void *data); extern int relayfs_remove(struct dentry *dentry); extern int relay_buf_empty(struct rchan_buf *buf); extern void relay_destroy_channel(struct kref *kref); -- cgit v1.2.3 From 907f2c77d1653ce235e8e1fd6ce5c46005814e78 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:24 -0800 Subject: [PATCH] relayfs: export relayfs_create_file() with fileops param This patch adds a mandatory fileops param to relayfs_create_file() and exports that function so that clients can use it to create files defined by their own set of file operations, in relayfs. The purpose is to allow relayfs applications to create their own set of 'control' files alongside their relay files in relayfs rather than having to create them in /proc or debugfs for instance. relayfs_create_file() is also used by relay_open_buf() to create the relay files for a channel. In this case, a pointer to relayfs_file_operations is passed in, along with a pointer to the buffer associated with the file. 
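A hypothetical client using the new parameter to create a control file alongside its relay files might look like the sketch below (my_read, my_write, my_ctrl_fops and app_data are invented names, not from the patch):

        #include <linux/relayfs_fs.h>

        /* client-defined handlers, declared elsewhere */
        static ssize_t my_read(struct file *f, char __user *b, size_t n,
                               loff_t *p);
        static ssize_t my_write(struct file *f, const char __user *b, size_t n,
                                loff_t *p);

        static struct file_operations my_ctrl_fops = {
                .owner = THIS_MODULE,
                .read  = my_read,
                .write = my_write,
        };

        /* create "enable" in relayfs, served by the client's own fops */
        static struct dentry *create_ctrl_file(struct dentry *parent,
                                               void *app_data)
        {
                return relayfs_create_file("enable", parent,
                                           S_IRUSR | S_IWUSR,
                                           &my_ctrl_fops, app_data);
        }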
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 41 ++++++++++++++++++++++++++--------------- fs/relayfs/relay.c | 3 ++- fs/relayfs/relay.h | 4 ---- include/linux/relayfs_fs.h | 7 ++++++- 4 files changed, 34 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 379e07cd2b34..a5e6d4f2efb9 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -33,7 +33,9 @@ static struct backing_dev_info relayfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -static struct inode *relayfs_get_inode(struct super_block *sb, int mode, +static struct inode *relayfs_get_inode(struct super_block *sb, + int mode, + struct file_operations *fops, void *data) { struct inode *inode; @@ -51,8 +53,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { case S_IFREG: - inode->i_fop = &relayfs_file_operations; - RELAYFS_I(inode)->buf = data; + inode->i_fop = fops; + RELAYFS_I(inode)->data = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ -73,6 +75,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, * @name: the name of the file to create * @parent: parent directory * @mode: mode + * @fops: file operations to use for the file * @data: user-associated data for this file * * Returns the new dentry, NULL on failure @@ -82,6 +85,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, static struct dentry *relayfs_create_entry(const char *name, struct dentry *parent, int mode, + struct file_operations *fops, void *data) { struct dentry *d; @@ -117,7 +121,7 @@ static struct dentry *relayfs_create_entry(const char *name, goto release_mount; } - inode = relayfs_get_inode(parent->d_inode->i_sb, mode, data); + inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data); if (!inode) { d = NULL; goto release_mount; @@ -145,20 +149,26 @@ exit: * @name: the name of the file to create * @parent: parent directory * @mode: mode, if not specied the default perms are used + * @fops: file operations to use for the file * @data: user-associated data for this file * * Returns file dentry if successful, NULL otherwise. * * The file will be created user r on behalf of current user. 
*/ -struct dentry *relayfs_create_file(const char *name, struct dentry *parent, - int mode, void *data) +struct dentry *relayfs_create_file(const char *name, + struct dentry *parent, + int mode, + struct file_operations *fops, + void *data) { + BUG_ON(!fops); + if (!mode) mode = S_IRUSR; mode = (mode & S_IALLUGO) | S_IFREG; - return relayfs_create_entry(name, parent, mode, data); + return relayfs_create_entry(name, parent, mode, fops, data); } /** @@ -173,7 +183,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, struct dentry *relayfs_create_dir(const char *name, struct dentry *parent) { int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - return relayfs_create_entry(name, parent, mode, NULL); + return relayfs_create_entry(name, parent, mode, NULL, NULL); } /** @@ -234,7 +244,7 @@ int relayfs_remove_dir(struct dentry *dentry) */ static int relayfs_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; kref_get(&buf->kref); return 0; @@ -250,7 +260,7 @@ static int relayfs_open(struct inode *inode, struct file *filp) static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *inode = filp->f_dentry->d_inode; - return relay_mmap_buf(RELAYFS_I(inode)->buf, vma); + return relay_mmap_buf(RELAYFS_I(inode)->data, vma); } /** @@ -264,7 +274,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; if (buf->finalized) return POLLERR; @@ -288,7 +298,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) */ static int relayfs_release(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; kref_put(&buf->kref, relay_remove_buf); return 0; @@ -450,7 +460,7 @@ static ssize_t relayfs_read(struct file *filp, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = RELAYFS_I(inode)->data; size_t read_start, avail; ssize_t ret = 0; void *from; @@ -485,7 +495,7 @@ static struct inode *relayfs_alloc_inode(struct super_block *sb) struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL); if (!p) return NULL; - p->buf = NULL; + p->data = NULL; return &p->vfs_inode; } @@ -531,7 +541,7 @@ static int relayfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = RELAYFS_MAGIC; sb->s_op = &relayfs_ops; - inode = relayfs_get_inode(sb, mode, NULL); + inode = relayfs_get_inode(sb, mode, NULL, NULL); if (!inode) return -ENOMEM; @@ -589,6 +599,7 @@ module_exit(exit_relayfs_fs) EXPORT_SYMBOL_GPL(relayfs_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); +EXPORT_SYMBOL_GPL(relayfs_create_file); MODULE_AUTHOR("Tom Zanussi and Karim Yaghmour "); MODULE_DESCRIPTION("Relay Filesystem"); diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 7fbda177ad8f..a9cd5585c45c 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -176,7 +176,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, return NULL; /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, buf); + dentry = relayfs_create_file(filename, parent, S_IRUSR, + 
&relayfs_file_operations, buf); if (!dentry) { relay_destroy_buf(buf); return NULL; diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h index c325bb243549..0993d3e5753b 100644 --- a/fs/relayfs/relay.h +++ b/fs/relayfs/relay.h @@ -1,10 +1,6 @@ #ifndef _RELAY_H #define _RELAY_H -struct dentry *relayfs_create_file(const char *name, - struct dentry *parent, - int mode, - void *data); extern int relayfs_remove(struct dentry *dentry); extern int relay_buf_empty(struct rchan_buf *buf); extern void relay_destroy_channel(struct kref *kref); diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index fb7e80737325..a122df2d9880 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -70,7 +70,7 @@ struct rchan struct relayfs_inode_info { struct inode vfs_inode; - struct rchan_buf *buf; + void *data; }; static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode) @@ -148,6 +148,11 @@ extern size_t relay_switch_subbuf(struct rchan_buf *buf, extern struct dentry *relayfs_create_dir(const char *name, struct dentry *parent); extern int relayfs_remove_dir(struct dentry *dentry); +extern struct dentry *relayfs_create_file(const char *name, + struct dentry *parent, + int mode, + struct file_operations *fops, + void *data); /** * relay_write - write data into the channel -- cgit v1.2.3 From 7431733791feb0b19453d8047b0723c744667040 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:25 -0800 Subject: [PATCH] relayfs: add relayfs_remove_file() This patch adds and exports relayfs_remove_file(), for API symmetry (with relayfs_create_file()). Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 12 ++++++++++++ include/linux/relayfs_fs.h | 1 + 2 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index a5e6d4f2efb9..b2f50655736b 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -224,6 +224,17 @@ int relayfs_remove(struct dentry *dentry) return error; } +/** + * relayfs_remove_file - remove a file from relay filesystem + * @dentry: directory dentry + * + * Returns 0 if successful, negative otherwise. + */ +int relayfs_remove_file(struct dentry *dentry) +{ + return relayfs_remove(dentry); +} + /** * relayfs_remove_dir - remove a directory in the relay filesystem * @dentry: directory dentry @@ -600,6 +611,7 @@ EXPORT_SYMBOL_GPL(relayfs_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); EXPORT_SYMBOL_GPL(relayfs_create_file); +EXPORT_SYMBOL_GPL(relayfs_remove_file); MODULE_AUTHOR("Tom Zanussi and Karim Yaghmour "); MODULE_DESCRIPTION("Relay Filesystem"); diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index a122df2d9880..921540b1cdf8 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -153,6 +153,7 @@ extern struct dentry *relayfs_create_file(const char *name, int mode, struct file_operations *fops, void *data); +extern int relayfs_remove_file(struct dentry *dentry); /** * relay_write - write data into the channel -- cgit v1.2.3 From 51008f9f95a4c3158151a75f88fb03fb0f646aba Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:26 -0800 Subject: [PATCH] relayfs: use generic_ip for private data Use inode->u.generic_ip instead of relayfs_inode_info to store pointer to user data. 
Clients using relayfs_create_file() to create their own files would probably expect their data to be stored in generic_ip; we also intend in the next set of patches to get rid of relayfs-specific stuff in the file operations, so we might as well do it here. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index b2f50655736b..7f6d2c8e91c2 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -54,7 +54,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFREG: inode->i_fop = fops; - RELAYFS_I(inode)->data = data; + if (data) + inode->u.generic_ip = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ -255,8 +256,9 @@ int relayfs_remove_dir(struct dentry *dentry) */ static int relayfs_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->data; + struct rchan_buf *buf = inode->u.generic_ip; kref_get(&buf->kref); + filp->private_data = buf; return 0; } @@ -270,8 +272,8 @@ static int relayfs_open(struct inode *inode, struct file *filp) */ static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) { - struct inode *inode = filp->f_dentry->d_inode; - return relay_mmap_buf(RELAYFS_I(inode)->data, vma); + struct rchan_buf *buf = filp->private_data; + return relay_mmap_buf(buf, vma); } /** @@ -284,8 +286,7 @@ static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) static unsigned int relayfs_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; - struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->data; + struct rchan_buf *buf = filp->private_data; if (buf->finalized) return POLLERR; @@ -309,7 +310,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) */ static int relayfs_release(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->data; + struct rchan_buf *buf = filp->private_data; kref_put(&buf->kref, relay_remove_buf); return 0; @@ -470,8 +471,8 @@ static ssize_t relayfs_read(struct file *filp, size_t count, loff_t *ppos) { + struct rchan_buf *buf = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->data; size_t read_start, avail; ssize_t ret = 0; void *from; -- cgit v1.2.3 From aaea25d7a68a7f72e167dc1698b66a15edc71883 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:26 -0800 Subject: [PATCH] relayfs: remove unused alloc/destroy_inode() Since we're no longer using relayfs_inode_info, remove relayfs_alloc_inode() and relayfs_destroy_inode() along with the relayfs inode cache.
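Taken together with the previous patch, the resulting idiom looks like this minimal sketch (not from the patches; the example_* names are invented): the creator stores its pointer in the inode at file-creation time, open() copies it into the struct file, and every later file operation reads it from there without touching the inode again:

    /* sketch: 2.6-era VFS, when inodes still carried u.generic_ip */
    static int example_open(struct inode *inode, struct file *filp)
    {
            struct rchan_buf *buf = inode->u.generic_ip; /* set at create time */

            kref_get(&buf->kref);
            filp->private_data = buf;       /* later fops use this instead */
            return 0;
    }

    static int example_release(struct inode *inode, struct file *filp)
    {
            struct rchan_buf *buf = filp->private_data;

            kref_put(&buf->kref, relay_remove_buf);
            return 0;
    }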
Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 46 +--------------------------------------------- include/linux/relayfs_fs.h | 14 -------------- 2 files changed, 1 insertion(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 7f6d2c8e91c2..b4c3e0466e98 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -26,7 +26,6 @@ static struct vfsmount * relayfs_mount; static int relayfs_mount_count; -static kmem_cache_t * relayfs_inode_cachep; static struct backing_dev_info relayfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -499,34 +498,6 @@ out: return ret; } -/** - * relayfs alloc_inode() implementation - */ -static struct inode *relayfs_alloc_inode(struct super_block *sb) -{ - struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL); - if (!p) - return NULL; - p->data = NULL; - - return &p->vfs_inode; -} - -/** - * relayfs destroy_inode() implementation - */ -static void relayfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode)); -} - -static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct relayfs_inode_info *i = p; - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&i->vfs_inode); -} - struct file_operations relayfs_file_operations = { .open = relayfs_open, .poll = relayfs_poll, @@ -539,8 +510,6 @@ struct file_operations relayfs_file_operations = { static struct super_operations relayfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .alloc_inode = relayfs_alloc_inode, - .destroy_inode = relayfs_destroy_inode, }; static int relayfs_fill_super(struct super_block * sb, void * data, int silent) @@ -584,25 +553,12 @@ static struct file_system_type relayfs_fs_type = { static int __init init_relayfs_fs(void) { - int err; - - relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache", - sizeof(struct relayfs_inode_info), 0, - 0, init_once, NULL); - if (!relayfs_inode_cachep) - return -ENOMEM; - - err = register_filesystem(&relayfs_fs_type); - if (err) - kmem_cache_destroy(relayfs_inode_cachep); - - return err; + return register_filesystem(&relayfs_fs_type); } static void __exit exit_relayfs_fs(void) { unregister_filesystem(&relayfs_fs_type); - kmem_cache_destroy(relayfs_inode_cachep); } module_init(init_relayfs_fs) diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 921540b1cdf8..8200ecbe6e0f 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -64,20 +64,6 @@ struct rchan struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */ }; -/* - * Relayfs inode - */ -struct relayfs_inode_info -{ - struct inode vfs_inode; - void *data; -}; - -static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode) -{ - return container_of(inode, struct relayfs_inode_info, vfs_inode); -} - /* * Relay channel client callbacks */ -- cgit v1.2.3 From 08c541a7ade230883c48225f4ea406a0117e7c2f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:28 -0800 Subject: [PATCH] relayfs: add support for relay files in other filesystems This patch adds a couple of callback functions that allow a client to hook into relay_open()/close() and supply the files that will be used to represent the channel buffers; the default implementation if no callbacks are defined is to create the files in relayfs. 
This is to support the creation and use of relay files in other filesystems such as debugfs, as implied by the fact that relayfs_file_operations are exported. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/buffers.c | 2 +- fs/relayfs/relay.c | 30 ++++++++++++++++++++++++++++-- include/linux/relayfs_fs.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c index 667b529944c5..10187812771e 100644 --- a/fs/relayfs/buffers.c +++ b/fs/relayfs/buffers.c @@ -185,6 +185,6 @@ void relay_destroy_buf(struct rchan_buf *buf) void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - relayfs_remove(buf->dentry); + buf->chan->cb->remove_buf_file(buf->dentry); relay_destroy_buf(buf); } diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index a9cd5585c45c..b9bb56903272 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -80,11 +80,33 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, { } +/* + * create_buf_file_create() default callback. Creates file to represent buf. + */ +static struct dentry *create_buf_file_default_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf) +{ + return relayfs_create_file(filename, parent, mode, + &relayfs_file_operations, buf); +} + +/* + * remove_buf_file() default callback. Removes file representing relay buffer. + */ +static int remove_buf_file_default_callback(struct dentry *dentry) +{ + return relayfs_remove(dentry); +} + /* relay channel default callbacks */ static struct rchan_callbacks default_channel_callbacks = { .subbuf_start = subbuf_start_default_callback, .buf_mapped = buf_mapped_default_callback, .buf_unmapped = buf_unmapped_default_callback, + .create_buf_file = create_buf_file_default_callback, + .remove_buf_file = remove_buf_file_default_callback, }; /** @@ -176,8 +198,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, return NULL; /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, - &relayfs_file_operations, buf); + dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, + buf); if (!dentry) { relay_destroy_buf(buf); return NULL; @@ -220,6 +242,10 @@ static inline void setup_callbacks(struct rchan *chan, cb->buf_mapped = buf_mapped_default_callback; if (!cb->buf_unmapped) cb->buf_unmapped = buf_unmapped_default_callback; + if (!cb->create_buf_file) + cb->create_buf_file = create_buf_file_default_callback; + if (!cb->remove_buf_file) + cb->remove_buf_file = remove_buf_file_default_callback; chan->cb = cb; } diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 8200ecbe6e0f..8c2177105857 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -110,6 +110,40 @@ struct rchan_callbacks */ void (*buf_unmapped)(struct rchan_buf *buf, struct file *filp); + /* + * create_buf_file - create file to represent a relayfs channel buffer + * @filename: the name of the file to create + * @parent: the parent of the file to create + * @mode: the mode of the file to create + * @buf: the channel buffer + * + * Called during relay_open(), once for each per-cpu buffer, + * to allow the client to create a file to be used to + * represent the corresponding channel buffer. If the file is + * created outside of relayfs, the parent must also exist in + * that filesystem. 
+ * + * The callback should return the dentry of the file created + * to represent the relay buffer. + * + * See Documentation/filesystems/relayfs.txt for more info. + */ + struct dentry *(*create_buf_file)(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf); + + /* + * remove_buf_file - remove file representing a relayfs channel buffer + * @dentry: the dentry of the file to remove + * + * Called during relay_close(), once for each per-cpu buffer, + * to allow the client to remove a file used to represent a + * channel buffer. + * + * The callback should return 0 if successful, negative if not. + */ + int (*remove_buf_file)(struct dentry *dentry); }; /* -- cgit v1.2.3 From e6c08367b8fc6dce6dfd1106f53f6ef28215b313 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:29 -0800 Subject: [PATCH] relayfs: add support for global relay buffers This patch adds the optional is_global outparam to the create_buf_file() callback. This can be used by clients to create a single global relayfs buffer instead of the default per-cpu buffers. This was suggested as being useful for certain debugging applications where it's more convenient to be able to get all the data from a single channel without having to go to the bother of dealing with per-cpu files. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/relay.c | 35 +++++++++++++++++++++++++---------- include/linux/relayfs_fs.h | 8 +++++++- 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index b9bb56903272..2935a6ab8ffa 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -86,7 +86,8 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, static struct dentry *create_buf_file_default_callback(const char *filename, struct dentry *parent, int mode, - struct rchan_buf *buf) + struct rchan_buf *buf, + int *is_global) { return relayfs_create_file(filename, parent, mode, &relayfs_file_operations, buf); @@ -170,14 +171,16 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) void relay_reset(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; __relay_reset(chan->buf[i], 0); + prev = chan->buf[i]; } } @@ -188,18 +191,22 @@ void relay_reset(struct rchan *chan) */ static struct rchan_buf *relay_open_buf(struct rchan *chan, const char *filename, - struct dentry *parent) + struct dentry *parent, + int *is_global) { struct rchan_buf *buf; struct dentry *dentry; + if (*is_global) + return chan->buf[0]; + buf = relay_create_buf(chan); if (!buf) return NULL; /* Create file in fs */ dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, - buf); + buf, is_global); if (!dentry) { relay_destroy_buf(buf); return NULL; @@ -273,6 +280,7 @@ struct rchan *relay_open(const char *base_filename, unsigned int i; struct rchan *chan; char *tmpname; + int is_global = 0; if (!base_filename) return NULL; @@ -297,7 +305,8 @@ struct rchan *relay_open(const char *base_filename, for_each_online_cpu(i) { sprintf(tmpname, "%s%d", base_filename, i); - chan->buf[i] = relay_open_buf(chan, tmpname, parent); + chan->buf[i] = relay_open_buf(chan, tmpname, parent, + &is_global); chan->buf[i]->cpu = i; if (!chan->buf[i]) goto free_bufs; @@ -311,6 +320,8 @@ free_bufs: if (!chan->buf[i]) break; relay_close_buf(chan->buf[i]); + if 
(is_global) + break; } kfree(tmpname); @@ -420,14 +431,16 @@ void relay_destroy_channel(struct kref *kref) void relay_close(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; relay_close_buf(chan->buf[i]); + prev = chan->buf[i]; } if (chan->last_toobig) @@ -447,14 +460,16 @@ void relay_close(struct rchan *chan) void relay_flush(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; relay_switch_subbuf(chan->buf[i], 0); + prev = chan->buf[i]; } } diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 8c2177105857..30f45511b40d 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -116,6 +116,7 @@ struct rchan_callbacks * @parent: the parent of the file to create * @mode: the mode of the file to create * @buf: the channel buffer + * @is_global: outparam - set non-zero if the buffer should be global * * Called during relay_open(), once for each per-cpu buffer, * to allow the client to create a file to be used to @@ -126,12 +127,17 @@ struct rchan_callbacks * The callback should return the dentry of the file created * to represent the relay buffer. * + * Setting the is_global outparam to a non-zero value will + * cause relay_open() to create a single global buffer rather + * than the default set of per-cpu buffers. + * * See Documentation/filesystems/relayfs.txt for more info. */ struct dentry *(*create_buf_file)(const char *filename, struct dentry *parent, int mode, - struct rchan_buf *buf); + struct rchan_buf *buf, + int *is_global); /* * remove_buf_file - remove file representing a relayfs channel buffer -- cgit v1.2.3 From 761da5c88aca34586e5b7295bd8b9be2438906f2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 8 Jan 2006 01:02:31 -0800 Subject: [PATCH] relayfs: cleanup, change relayfs_file_* to relay_file_* This patch renames relayfs_file_operations to relay_file_operations, and the file operations themselves from relayfs_XXX to relay_file_XXX, to make it more clear that they refer to relay files. Signed-off-by: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/relayfs/inode.c | 89 ++++++++++++++++++++++++---------------------- fs/relayfs/relay.c | 2 +- include/linux/relayfs_fs.h | 5 ++- 3 files changed, 50 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index b4c3e0466e98..7b7f2cb5f0e1 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -247,13 +247,13 @@ int relayfs_remove_dir(struct dentry *dentry) } /** - * relayfs_open - open file op for relayfs files + * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file * * Increments the channel buffer refcount. */ -static int relayfs_open(struct inode *inode, struct file *filp) +static int relay_file_open(struct inode *inode, struct file *filp) { struct rchan_buf *buf = inode->u.generic_ip; kref_get(&buf->kref); @@ -263,26 +263,26 @@ static int relayfs_open(struct inode *inode, struct file *filp) } /** - * relayfs_mmap - mmap file op for relayfs files + * relay_file_mmap - mmap file op for relay files * @filp: the file * @vma: the vma describing what to map * * Calls upon relay_mmap_buf to map the file into user space. 
*/ -static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) +static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct rchan_buf *buf = filp->private_data; return relay_mmap_buf(buf, vma); } /** - * relayfs_poll - poll file op for relayfs files + * relay_file_poll - poll file op for relay files * @filp: the file * @wait: poll table * * Poll implemention. */ -static unsigned int relayfs_poll(struct file *filp, poll_table *wait) +static unsigned int relay_file_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; struct rchan_buf *buf = filp->private_data; @@ -300,14 +300,14 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) } /** - * relayfs_release - release file op for relayfs files + * relay_file_release - release file op for relay files * @inode: the inode * @filp: the file * * Decrements the channel refcount, as the filesystem is * no longer using it. */ -static int relayfs_release(struct inode *inode, struct file *filp) +static int relay_file_release(struct inode *inode, struct file *filp) { struct rchan_buf *buf = filp->private_data; kref_put(&buf->kref, relay_remove_buf); @@ -316,11 +316,11 @@ static int relayfs_release(struct inode *inode, struct file *filp) } /** - * relayfs_read_consume - update the consumed count for the buffer + * relay_file_read_consume - update the consumed count for the buffer */ -static void relayfs_read_consume(struct rchan_buf *buf, - size_t read_pos, - size_t bytes_consumed) +static void relay_file_read_consume(struct rchan_buf *buf, + size_t read_pos, + size_t bytes_consumed) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; @@ -343,9 +343,9 @@ static void relayfs_read_consume(struct rchan_buf *buf, } /** - * relayfs_read_avail - boolean, are there unconsumed bytes available? + * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) { size_t bytes_produced, bytes_consumed, write_offset; size_t subbuf_size = buf->chan->subbuf_size; @@ -376,16 +376,16 @@ static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) if (bytes_produced == bytes_consumed) return 0; - relayfs_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, read_pos, 0); return 1; } /** - * relayfs_read_subbuf_avail - return bytes available in sub-buffer + * relay_file_read_subbuf_avail - return bytes available in sub-buffer */ -static size_t relayfs_read_subbuf_avail(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_subbuf_avail(size_t read_pos, + struct rchan_buf *buf) { size_t padding, avail = 0; size_t read_subbuf, read_offset, write_subbuf, write_offset; @@ -407,14 +407,14 @@ static size_t relayfs_read_subbuf_avail(size_t read_pos, } /** - * relayfs_read_start_pos - find the first available byte to read + * relay_file_read_start_pos - find the first available byte to read * * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. 
*/ -static size_t relayfs_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(size_t read_pos, + struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; @@ -433,11 +433,11 @@ static size_t relayfs_read_start_pos(size_t read_pos, } /** - * relayfs_read_end_pos - return the new read position + * relay_file_read_end_pos - return the new read position */ -static size_t relayfs_read_end_pos(struct rchan_buf *buf, - size_t read_pos, - size_t count) +static size_t relay_file_read_end_pos(struct rchan_buf *buf, + size_t read_pos, + size_t count) { size_t read_subbuf, padding, end_pos; size_t subbuf_size = buf->chan->subbuf_size; @@ -456,7 +456,7 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf, } /** - * relayfs_read - read file op for relayfs files + * relay_file_read - read file op for relay files * @filp: the file * @buffer: the userspace buffer * @count: number of bytes to read @@ -465,10 +465,10 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf, * Reads count bytes or the number of bytes available in the * current sub-buffer being read, whichever is smaller. */ -static ssize_t relayfs_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { struct rchan_buf *buf = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; @@ -477,11 +477,11 @@ static ssize_t relayfs_read(struct file *filp, void *from; down(&inode->i_sem); - if(!relayfs_read_avail(buf, *ppos)) + if(!relay_file_read_avail(buf, *ppos)) goto out; - read_start = relayfs_read_start_pos(*ppos, buf); - avail = relayfs_read_subbuf_avail(read_start, buf); + read_start = relay_file_read_start_pos(*ppos, buf); + avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) goto out; @@ -491,20 +491,20 @@ static ssize_t relayfs_read(struct file *filp, ret = -EFAULT; goto out; } - relayfs_read_consume(buf, read_start, count); - *ppos = relayfs_read_end_pos(buf, read_start, count); + relay_file_read_consume(buf, read_start, count); + *ppos = relay_file_read_end_pos(buf, read_start, count); out: up(&inode->i_sem); return ret; } -struct file_operations relayfs_file_operations = { - .open = relayfs_open, - .poll = relayfs_poll, - .mmap = relayfs_mmap, - .read = relayfs_read, +struct file_operations relay_file_operations = { + .open = relay_file_open, + .poll = relay_file_poll, + .mmap = relay_file_mmap, + .read = relay_file_read, .llseek = no_llseek, - .release = relayfs_release, + .release = relay_file_release, }; static struct super_operations relayfs_ops = { @@ -558,13 +558,18 @@ static int __init init_relayfs_fs(void) static void __exit exit_relayfs_fs(void) { + + + + + unregister_filesystem(&relayfs_fs_type); } module_init(init_relayfs_fs) module_exit(exit_relayfs_fs) -EXPORT_SYMBOL_GPL(relayfs_file_operations); +EXPORT_SYMBOL_GPL(relay_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); EXPORT_SYMBOL_GPL(relayfs_create_file); diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 2935a6ab8ffa..abf3ceaace49 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -90,7 +90,7 @@ static struct dentry *create_buf_file_default_callback(const char *filename, int *is_global) { return relayfs_create_file(filename, parent, mode, - &relayfs_file_operations, buf); + &relay_file_operations, buf); } /* diff --git 
a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h index 30f45511b40d..7342e66247fb 100644 --- a/include/linux/relayfs_fs.h +++ b/include/linux/relayfs_fs.h @@ -279,10 +279,9 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf, } /* - * exported relayfs file operations, fs/relayfs/inode.c + * exported relay file operations, fs/relayfs/inode.c */ - -extern struct file_operations relayfs_file_operations; +extern struct file_operations relay_file_operations; #endif /* _LINUX_RELAYFS_FS_H */ -- cgit v1.2.3 From b8b0af24353eafadf58a0889999700e43f135aad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Jan 2006 01:02:33 -0800 Subject: [PATCH] udf: remove bogus inode == NULL check in inode_bmap inode can never be NULL when calling this function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4014f17d382e..395e582ee542 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1957,11 +1957,6 @@ int8_t inode_bmap(struct inode *inode, int block, kernel_lb_addr *bloc, uint32_t printk(KERN_ERR "udf: inode_bmap: block < 0\n"); return -1; } - if (!inode) - { - printk(KERN_ERR "udf: inode_bmap: NULL inode\n"); - return -1; - } *extoffset = 0; *elen = 0; -- cgit v1.2.3 From 4a30131e7dbb17e5fec6958bfac9da9aff1fa29b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 8 Jan 2006 01:02:39 -0800 Subject: [PATCH] Fix some problems with truncate and mtime semantics. SUS requires that when truncating a file to the size that it currently is: truncate and ftruncate should NOT modify ctime or mtime; O_TRUNC SHOULD modify ctime and mtime. Currently mtime and ctime are always modified on most local filesystems (side effect of ->truncate) or never modified (on NFS). With this patch: ATTR_CTIME|ATTR_MTIME are sent with ATTR_SIZE precisely when an update of these times is required whether size changes or not (via a new argument to do_truncate). This allows NFS to do the right thing for O_TRUNC. inode_setattr no longer forces ATTR_MTIME|ATTR_CTIME when the ATTR_SIZE sets the size to its current value. This allows local filesystems to do the right thing for f?truncate. Also, the logic in inode_setattr is changed a bit so there are two return points. One returns the error from vmtruncate if it failed, the other returns 0 (there can be no other failure). Finally, if vmtruncate succeeds, and ATTR_SIZE is the only change requested, we now fall through and mark_inode_dirty. If a filesystem did not have a ->truncate function, then vmtruncate will have changed i_size, without marking the inode as 'dirty', and I think this is wrong.
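The intended semantics are easy to check from userspace. A small sketch, not part of the patch (note that st_mtime has one-second granularity on these filesystems, hence the sleep):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
            struct stat before, after;

            close(open("testfile", O_CREAT | O_WRONLY, 0644));
            stat("testfile", &before);
            sleep(1);

            /* truncate to the current size: mtime/ctime must NOT change */
            truncate("testfile", before.st_size);
            stat("testfile", &after);
            printf("truncate: mtime %s\n",
                   after.st_mtime == before.st_mtime ? "unchanged" : "changed");

            /* O_TRUNC: mtime/ctime MUST change, even at the same size */
            close(open("testfile", O_WRONLY | O_TRUNC));
            stat("testfile", &after);
            printf("O_TRUNC:  mtime %s\n",
                   after.st_mtime == before.st_mtime ? "unchanged" : "changed");
            return 0;
    }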
Signed-off-by: Neil Brown Cc: Christoph Hellwig Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/attr.c | 24 ++++++++---------------- fs/exec.c | 2 +- fs/namei.c | 2 +- fs/open.c | 9 +++++---- include/linux/fs.h | 3 ++- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 67bcd9b14ea5..b34732506f1d 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -67,20 +67,12 @@ EXPORT_SYMBOL(inode_change_ok); int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; - int error = 0; - - if (ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error || (ia_valid == ATTR_SIZE)) - goto out; - } else { - /* - * We skipped the truncate but must still update - * timestamps - */ - ia_valid |= ATTR_MTIME|ATTR_CTIME; - } + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error = vmtruncate(inode, attr->ia_size); + if (error) + return error; } if (ia_valid & ATTR_UID) @@ -104,8 +96,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr) inode->i_mode = mode; } mark_inode_dirty(inode); -out: - return error; + + return 0; } EXPORT_SYMBOL(inode_setattr); diff --git a/fs/exec.c b/fs/exec.c index e9650cd22a3b..2075b674d85e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1505,7 +1505,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) goto close_fail; if (!file->f_op->write) goto close_fail; - if (do_truncate(file->f_dentry, 0, file) != 0) + if (do_truncate(file->f_dentry, 0, 0, file) != 0) goto close_fail; retval = binfmt->core_dump(signr, regs, file); diff --git a/fs/namei.c b/fs/namei.c index 6dbbd42d8b95..300eae088d5f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1491,7 +1491,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (!error) { DQUOT_INIT(inode); - error = do_truncate(dentry, 0, NULL); + error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL); } put_write_access(inode); if (error) diff --git a/fs/open.c b/fs/open.c index f53a5b9ffb7d..94968cb3afca 100644 --- a/fs/open.c +++ b/fs/open.c @@ -194,7 +194,8 @@ out: return error; } -int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) { int err; struct iattr newattrs; @@ -204,7 +205,7 @@ int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) return -EINVAL; newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; @@ -266,7 +267,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); - error = do_truncate(nd.dentry, length, NULL); + error = do_truncate(nd.dentry, length, 0, NULL); } put_write_access(inode); @@ -318,7 +319,7 @@ static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) - error = do_truncate(dentry, length, file); + error = do_truncate(dentry, length, 0, file); out_putf: fput(file); out: diff --git a/include/linux/fs.h b/include/linux/fs.h index ef29500b5df8..74c01aabd4ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1344,7 +1344,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode) /* fs/open.c 
 */ -extern int do_truncate(struct dentry *, loff_t start, struct file *filp); +extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, + struct file *filp); extern long do_sys_open(const char __user *filename, int flags, int mode); extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -- cgit v1.2.3 From 2520f14ca85e38f575eed6acc6e586df246abea6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 8 Jan 2006 01:02:40 -0800 Subject: [PATCH] Fix overflow tests for compat_sys_fcntl64 locking When making an fcntl locking call through compat_sys_fcntl64 (i.e. a 32bit app on a 64bit kernel), the syscall can return a locking range that is in conflict with the queried lock. If some aspect of this range does not fit in the 32bit structure, something needs to be done. The current code is wrong in several respects: - It returns data to userspace even if no conflict was found, i.e. it should check l_type for F_UNLCK - It returns -EOVERFLOW too aggressively. A lock range covering the last possible byte of the file (start = COMPAT_OFF_T_MAX, len = 1) should be possible, but is rejected with the current test. - An extra-long 'len' should not be a problem; only that part of the conflicting lock that would be visible to the 32bit app needs to be reported to the 32bit app anyway. This patch addresses those three issues and adds a comment to (hopefully) record it for posterity. Note: this patch mainly affects test-cases. Real applications rarely if ever see the problems. This patch has been tested (LSB test suite), and works. Signed-off-by: Neil Brown Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 55ac0324aaf1..271b75d1597f 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -494,9 +494,21 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, ret = sys_fcntl(fd, cmd, (unsigned long)&f); set_fs(old_fs); if (cmd == F_GETLK && ret == 0) { - if ((f.l_start >= COMPAT_OFF_T_MAX) || - ((f.l_start + f.l_len) > COMPAT_OFF_T_MAX)) + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case.
+ * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway */ + + if (f.l_start > COMPAT_OFF_T_MAX) ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; if (ret == 0) ret = put_compat_flock(&f, compat_ptr(arg)); } @@ -515,9 +527,11 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, (unsigned long)&f); set_fs(old_fs); if (cmd == F_GETLK64 && ret == 0) { - if ((f.l_start >= COMPAT_LOFF_T_MAX) || - ((f.l_start + f.l_len) > COMPAT_LOFF_T_MAX)) + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; if (ret == 0) ret = put_compat_flock64(&f, compat_ptr(arg)); } -- cgit v1.2.3 From 54b21a7992a31d30c9a91f7e0a00ffdb4bd0caee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:03:05 -0800 Subject: [PATCH] fix possible PAGE_CACHE_SHIFT overflows We've had two instances recently of overflows when doing 64_bit_value = (32_bit_value << PAGE_CACHE_SHIFT) I did a tree-wide grep of `<<.*PAGE_CACHE_SHIFT' and this is the result. - afs_rxfs_fetch_descriptor.offset is of type off_t, which seems broken. - jfs and jffs are limited to 4GB anyway. - reiserfs map_block_for_writepage() takes an unsigned long for the block - it should take sector_t. (It'll fail for huge filesystems with blocksize < PAGE_CACHE_SIZE) Cc: Oleg Drokin Cc: David Howells Cc: David Woodhouse Cc: Cc: Christoph Hellwig Cc: Anton Altaparmakov Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Roman Zippel Cc: Cc: Miklos Szeredi Cc: Russell King Cc: Trond Myklebust Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/dir.c | 2 +- fs/buffer.c | 6 +++--- fs/freevxfs/vxfs_immed.c | 4 ++-- fs/jffs/inode-v23.c | 4 ++-- fs/mpage.c | 4 ++-- fs/romfs/inode.c | 6 +++--- fs/smbfs/file.c | 4 ++-- fs/sysv/dir.c | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 6682d6d7f294..5c61c24dab2a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -137,7 +137,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) #endif /* determine how many magic numbers there should be in this page */ - latter = dir->i_size - (page->index << PAGE_CACHE_SHIFT); + latter = dir->i_size - page_offset(page); if (latter >= PAGE_SIZE) qty = PAGE_SIZE; else diff --git a/fs/buffer.c b/fs/buffer.c index 263df0f192aa..55f0975a9b15 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1762,7 +1762,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them.
 */ - block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); head = page_buffers(page); bh = head; @@ -2635,7 +2635,7 @@ int block_truncate_page(struct address_space *mapping, pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize; - pgoff_t iblock; + sector_t iblock; unsigned length, pos; struct inode *inode = mapping->host; struct page *page; @@ -2651,7 +2651,7 @@ int block_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); err = -ENOMEM; diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index d0401dc68d41..6f5df1700e95 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -99,8 +99,8 @@ static int vxfs_immed_readpage(struct file *fp, struct page *pp) { struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; - caddr_t kaddr; + u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + caddr_t kaddr; kaddr = kmap(pp); memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 3dcc6d2162cb..2559ee10beda 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -757,7 +757,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page) read_len = 0; result = 0; - offset = page->index << PAGE_CACHE_SHIFT; + offset = page_offset(page); kmap(page); buf = page_address(page); @@ -1545,7 +1545,7 @@ jffs_commit_write(struct file *filp, struct page *page, { void *addr = page_address(page) + from; /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */ - loff_t pos = (page->index<<PAGE_CACHE_SHIFT) + from; + loff_t pos = page_offset(page) + from; diff --git a/fs/mpage.c b/fs/mpage.c --- a/fs/mpage.c +++ b/fs/mpage.c @@ ... @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, - block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size_read(inode) + blocksize - 1) >> blkbits; bh.b_page = page; @@ -466,7 +466,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, * The page has no buffers: map it to disk */ BUG_ON(!PageUptodate(page)); - block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index c74f382dabba..0a13859fd57b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -418,7 +418,7 @@ static int romfs_readpage(struct file *file, struct page * page) { struct inode *inode = page->mapping->host; - unsigned long offset, avail, readlen; + loff_t offset, avail, readlen; void *buf; int result = -EIO; @@ -429,8 +429,8 @@ romfs_readpage(struct file *file, struct page * page) goto err_out; /* 32 bit warning -- but not for us :) */ - offset = page->index << PAGE_CACHE_SHIFT; - if (offset < inode->i_size) { + offset = page_offset(page); + if (offset < i_size_read(inode)) { avail = inode->i_size-offset; readlen = min_t(unsigned long, avail, PAGE_SIZE); if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) { diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 3c6eb3ba718e..7042e62726a4 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -209,8 +209,8 @@ smb_updatepage(struct file *file, struct page *page, unsigned long offset, { struct dentry *dentry
= file->f_dentry; - DEBUG1("(%s/%s %d@%ld)\n", DENTRY_PATH(dentry), - count, (page->index << PAGE_CACHE_SHIFT)+offset); + DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, + ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); return smb_writepage_sync(dentry->d_inode, page, offset, count); } diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 69a085abad6f..cce8b05cba5a 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -103,7 +103,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) offset = (char *)de - kaddr; over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), - (n<<PAGE_CACHE_SHIFT) | offset, + ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN); if (over) { @@ -115,7 +115,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) } done: - filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; + filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; unlock_kernel(); return 0; } -- cgit v1.2.3 From bb6f6dbaa48c53525a7a4f9d4df719c3b0b582af Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:13 -0800 Subject: [PATCH] do_coredump() should reset group_stop_count earlier __group_complete_signal() sets ->group_stop_count in sig_kernel_coredump() path and marks the target thread as ->group_exit_task. So any thread except group_exit_task will go to handle_group_stop()->finish_stop(). However, when group_exit_task actually starts do_coredump(), it sets SIGNAL_GROUP_EXIT, but does not reset ->group_stop_count while killing other threads. If we have not yet stopped threads in the same thread group, they all will spin in kernel mode until group_exit_task sends them SIGKILL, because ->group_stop_count > 0 means: recalc_sigpending_tsk() never clears TIF_SIGPENDING get_signal_to_deliver() goes to handle_group_stop() handle_group_stop() returns when SIGNAL_GROUP_EXIT set syscall_exit/resume_userspace notice TIF_SIGPENDING, call get_signal_to_deliver() again. So we are wasting cpu cycles, and if one of these threads is rt_task() this may be a serious problem. NOTE: do_coredump() holds ->mmap_sem, so not-yet-stopped threads can't escape coredumping after clearing ->group_stop_count. See also this thread: http://marc.theaimsgroup.com/?t=112739139900002 Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 2075b674d85e..fd02ea4a81e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1462,6 +1462,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { current->signal->flags = SIGNAL_GROUP_EXIT; current->signal->group_exit_code = exit_code; + current->signal->group_stop_count = 0; retval = 0; } spin_unlock_irq(&current->sighand->siglock); @@ -1477,7 +1478,6 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * Clear any false indication of pending signals that might * be seen by the filesystem code called to write the core file.
*/ - current->signal->group_stop_count = 0; clear_thread_flag(TIF_SIGPENDING); if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) -- cgit v1.2.3 From 4a0d11fae57989e24fe2ee3eff6d62d72db9716c Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Sun, 8 Jan 2006 01:03:18 -0800 Subject: [PATCH] pivot_root: add comment Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 2019899f2ab8..e5aa1eeb5748 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1526,6 +1526,10 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * in this situation. + * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) -- cgit v1.2.3 From bf066c7db775a04bd761f8ea206f5522d0cf40ff Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sun, 8 Jan 2006 01:03:19 -0800 Subject: [PATCH] shared mounts: cleanup Small cleanups in shared mounts code. Signed-off-by: Miklos Szeredi Cc: Ram Pai Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- fs/pnode.c | 2 +- include/linux/fs.h | 2 +- include/linux/mount.h | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index e5aa1eeb5748..3e8fb61ad597 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -451,7 +451,7 @@ EXPORT_SYMBOL(may_umount); void release_mounts(struct list_head *head) { struct vfsmount *mnt; - while(!list_empty(head)) { + while (!list_empty(head)) { mnt = list_entry(head->next, struct vfsmount, mnt_hash); list_del_init(&mnt->mnt_hash); if (mnt->mnt_parent != mnt) { diff --git a/fs/pnode.c b/fs/pnode.c index aeeec8ba8dd2..f1871f773f64 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -103,7 +103,7 @@ static struct vfsmount *propagation_next(struct vfsmount *m, struct vfsmount *next; struct vfsmount *master = m->mnt_master; - if ( master == origin->mnt_master ) { + if (master == origin->mnt_master) { next = next_peer(m); return ((next == origin) ? 
NULL : next); } else if (m->mnt_slave.next != &master->mnt_slave_list) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7f140480c6a8..a1e28f0895c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -103,11 +103,11 @@ extern int dir_notify_enable; #define MS_MOVE 8192 #define MS_REC 16384 #define MS_VERBOSE 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ #define MS_UNBINDABLE (1<<17) /* change to unbindable */ #define MS_PRIVATE (1<<18) /* change to private */ #define MS_SLAVE (1<<19) /* change to slave */ #define MS_SHARED (1<<20) /* change to shared */ -#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) diff --git a/include/linux/mount.h b/include/linux/mount.h index dd4e83eba933..b98a709f1794 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -22,7 +22,8 @@ #define MNT_NOEXEC 0x04 #define MNT_SHARED 0x10 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x20 /* if the vfsmount is a unbindable mount */ -#define MNT_PNODE_MASK 0x30 /* propogation flag mask */ + +#define MNT_PNODE_MASK (MNT_SHARED | MNT_UNBINDABLE) struct vfsmount { struct list_head mnt_hash; -- cgit v1.2.3 From 71b9625744b7d4a6a2416389a5ba464bdf11f07f Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Sun, 8 Jan 2006 01:03:20 -0800 Subject: [PATCH] ext3: external journal device as a mount option The patch below adds a new mount option to allow the external journal device to be specified. The syntax is as follows: # mount -t ext3 -o journal_dev=0x0820 ... where 0x0820 means major=8 and minor=32. Signed-off-by: Johann Lombardi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/ext3.txt | 5 ++++ fs/ext3/super.c | 54 +++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 9840d5b8d5b9..22e4040564d5 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -22,6 +22,11 @@ journal=inum When a journal already exists, this option is the inode which will represent the ext3 file system's journal file. +journal_dev=devnum When the external journal device's major/minor numbers + have changed, this option allows to specify the new + journal location. The journal device is identified + through its new major/minor numbers encoded in devnum. + noload Don't load the journal on mounting. 
data=journal All data are committed into the journal prior diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 4e6730622d90..7c45acf94589 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -43,7 +43,8 @@ #include "acl.h" #include "namei.h" -static int ext3_load_journal(struct super_block *, struct ext3_super_block *); +static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, int); static void ext3_commit_super (struct super_block * sb, @@ -628,7 +629,7 @@ enum { Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, - Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, @@ -666,6 +667,7 @@ static match_table_t tokens = { {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -705,8 +707,9 @@ static unsigned long get_sb_block(void **data) return sb_block; } -static int parse_options (char * options, struct super_block *sb, - unsigned long * inum, unsigned long *n_blocks_count, int is_remount) +static int parse_options (char *options, struct super_block *sb, + unsigned long *inum, unsigned long *journal_devnum, + unsigned long *n_blocks_count, int is_remount) { struct ext3_sb_info *sbi = EXT3_SB(sb); char * p; @@ -839,6 +842,16 @@ static int parse_options (char * options, struct super_block *sb, return 0; *inum = option; break; + case Opt_journal_dev: + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; @@ -1331,6 +1344,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) unsigned long logic_sb_block; unsigned long offset = 0; unsigned long journal_inum = 0; + unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; int blocksize; @@ -1411,7 +1425,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); - if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1622,7 +1637,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) */ if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { - if (ext3_load_journal(sb, es)) + if (ext3_load_journal(sb, es, journal_devnum)) goto failed_mount2; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) @@ -1902,15 +1917,24 @@ out_bdev: return NULL; } -static int ext3_load_journal(struct super_block * sb, - struct ext3_super_block * es) +static int ext3_load_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned long journal_devnum) { journal_t *journal; int journal_inum = 
le32_to_cpu(es->s_journal_inum); - dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + dev_t journal_dev; int err = 0; int really_read_only; + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + printk(KERN_INFO "EXT3-fs: external journal device major/minor " + "numbers have changed\n"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + really_read_only = bdev_read_only(sb->s_bdev); /* @@ -1969,6 +1993,16 @@ static int ext3_load_journal(struct super_block * sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + } + return 0; } @@ -2197,7 +2231,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } -- cgit v1.2.3 From 9f40668d7d14d4d16cedc2104bfb63a43584dacf Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Sun, 8 Jan 2006 01:03:22 -0800 Subject: [PATCH] ext3: remove trailing newlines from ext3_warning() calls Remove the trailing newlines in calls to ext3_warning(). This function already adds a trailing newline to the end of messages. Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 6 +++--- fs/ext3/namei.c | 2 +- fs/ext3/resize.c | 22 +++++++++++----------- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 9e4a24376210..69078079b19c 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -651,7 +651,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { ext3_warning(sb, __FUNCTION__, - "bad orphan ino %lu! e2fsck was run?\n", ino); + "bad orphan ino %lu! e2fsck was run?", ino); goto out; } @@ -660,7 +660,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { ext3_warning(sb, __FUNCTION__, - "inode bitmap error for orphan %lu\n", ino); + "inode bitmap error for orphan %lu", ino); goto out; } @@ -672,7 +672,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) !(inode = iget(sb, ino)) || is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { ext3_warning(sb, __FUNCTION__, - "bad orphan inode %lu! e2fsck was run?\n", ino); + "bad orphan inode %lu! 
e2fsck was run?", ino); printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext3_test_bit(bit, bitmap_bh->b_data)); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b3c690a3b54a..af193a304ee5 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1476,7 +1476,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { ext3_warning(sb, __FUNCTION__, - "Directory index full!\n"); + "Directory index full!"); err = -ENOSPC; goto cleanup; } diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 6104ad310507..675aa24b0059 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -340,7 +340,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ ext3_warning(sb, __FUNCTION__, - "reserved GDT %ld missing grp %d (%ld)\n", + "reserved GDT %ld missing grp %d (%ld)", blk, grp, grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); return -EINVAL; @@ -393,7 +393,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (EXT3_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { ext3_warning(sb, __FUNCTION__, - "won't resize using backup superblock at %llu\n", + "won't resize using backup superblock at %llu", (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); return -EPERM; } @@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__u32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { ext3_warning(sb, __FUNCTION__, - "new group %u GDT block %lu not reserved\n", + "new group %u GDT block %lu not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -540,7 +540,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { ext3_warning(sb, __FUNCTION__, - "reserved block %lu not at offset %ld\n", + "reserved block %lu not at offset %ld", blk, (long)(data - (__u32 *)dind->b_data)); err = -EINVAL; goto exit_bh; @@ -683,7 +683,7 @@ exit_err: if (err) { ext3_warning(sb, __FUNCTION__, "can't update backup for group %d (err %d), " - "forcing fsck on next reboot\n", group, err); + "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT3_VALID_FS; sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); mark_buffer_dirty(sbi->s_sbh); @@ -722,7 +722,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { ext3_warning(sb, __FUNCTION__, - "Can't resize non-sparse filesystem further\n"); + "Can't resize non-sparse filesystem further"); return -EPERM; } @@ -730,13 +730,13 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_RESIZE_INODE)){ ext3_warning(sb, __FUNCTION__, - "No reserved GDT blocks, can't resize\n"); + "No reserved GDT blocks, can't resize"); return -EPERM; } inode = iget(sb, EXT3_RESIZE_INO); if (!inode || is_bad_inode(inode)) { ext3_warning(sb, __FUNCTION__, - "Error opening resize inode\n"); + "Error opening resize inode"); iput(inode); return -ENOENT; } @@ -766,7 +766,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) lock_super(sb); if (input->group != 
EXT3_SB(sb)->s_groups_count) { ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!\n" + "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; } @@ -937,7 +937,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (last == 0) { ext3_warning(sb, __FUNCTION__, - "need to use ext2online to resize further\n"); + "need to use ext2online to resize further"); return -EPERM; } @@ -973,7 +973,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, lock_super(sb); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!\n" + "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_put; } -- cgit v1.2.3 From 29ba17231222c42ca3df5424f43949e2a6fddec2 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Sun, 8 Jan 2006 01:03:23 -0800 Subject: [PATCH] ext3: use sbi instead of EXT3_SB() in resize code. There are places in the resize code in which the EXT3_SB() macro is used after a statement like sbi = EXT3_SB(sb) is done. Inside the same function, both sbi and EXT3_SB() are used to reference the super block. Although it is not wrong, keeping it coherent increases legibility, IMHO. Signed-off-by: Glauber de Oliveira Costa Cc: "Stephen C. Tweedie" Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 675aa24b0059..1041dab6de2f 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -31,7 +31,7 @@ static int verify_group_input(struct super_block *sb, unsigned start = le32_to_cpu(es->s_blocks_count); unsigned end = start + input->blocks_count; unsigned group = input->group; - unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group; + unsigned itend = input->inode_table + sbi->s_itb_per_group; unsigned overhead = ext3_bg_has_super(sb, group) ? (1 + ext3_bg_num_gdb(sb, group) + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; @@ -764,7 +764,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) } lock_super(sb); - if (input->group != EXT3_SB(sb)->s_groups_count) { + if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); err = -EBUSY; @@ -799,7 +799,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. * - * The key field here is EXT3_SB(sb)->s_groups_count: as long as + * The key field here is sbi->s_groups_count: as long as * that retains its old value, nobody is going to access the new * group.
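[Editor's note: the comment above describes a publish-after-init discipline, and the next hunk shows the matching smp_wmb() before s_groups_count is bumped. Below is a minimal user-space sketch of the same pattern, with hypothetical names and a C11 release store standing in for the kernel's write barrier; it assumes a single writer, just as ext3 holds the superblock lock here.]

#include <stdatomic.h>

#define MAX_SLOTS 16

struct item;

struct table {
	struct item *slots[MAX_SLOTS];
	atomic_int count;	/* readers trust slots[0..count-1] only */
};

/* Fully initialise the new slot first, then publish it by bumping the
 * count; the release store keeps the two writes ordered for readers. */
static void publish(struct table *t, struct item *it)
{
	int n = atomic_load_explicit(&t->count, memory_order_relaxed);

	t->slots[n] = it;
	atomic_store_explicit(&t->count, n + 1, memory_order_release);
}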
* @@ -859,7 +859,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) smp_wmb(); /* Update the global fs size fields */ - EXT3_SB(sb)->s_groups_count++; + sbi->s_groups_count++; ext3_journal_dirty_metadata(handle, primary); @@ -874,7 +874,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_mod(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + ext3_journal_dirty_metadata(handle, sbi->s_sbh); sb->s_dirt = 1; exit_journal: -- cgit v1.2.3 From 850d6fbe70c62a9792eac3e8ef34f2f09f209895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:29 -0800 Subject: [PATCH] sigio: cleanup, don't take tasklist twice The only user of send_sigio_to_task() already holds tasklist_lock, so it is better not to send the signal via send_group_sig_info() (which takes tasklist recursively) but use group_send_sig_info(). The same change in send_sigurg()->send_sigurg_to_task(). Signed-off-by: Oleg Nesterov Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 863b46e0d78a..9903bde475f2 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -457,11 +457,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!send_group_sig_info(fown->signum, &si, p)) + if (!group_send_sig_info(fown->signum, &si, p)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -495,7 +495,7 @@ static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) -- cgit v1.2.3 From 21b6bf143d05d77c350d9c6764ae090a877b66ea Mon Sep 17 00:00:00 2001 From: Jorn Dreyer Date: Sun, 8 Jan 2006 01:03:30 -0800 Subject: [PATCH] nfsroot: do not silently stop parsing on an unknown option It would be helpful if the kernel did not silently stop parsing nfs options, but instead warned about any it does not recognize. The attached patch adds one printk to do just that. It took me a couple of hours to find my configuration mistake. Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/nfsroot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 985cc53b8dd5..e897e00c2c9d 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -275,7 +275,9 @@ static int __init root_nfs_parse(char *name, char *buf) case Opt_noacl: nfs_data.flags |= NFS_MOUNT_NOACL; break; - default : + default: + printk(KERN_WARNING "Root-NFS: unknown " + "option: %s\n", p); return 0; } } -- cgit v1.2.3 From 5160ee6fc891a9ca114be0e90fa6655647bb64b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Jan 2006 01:03:32 -0800 Subject: [PATCH] shrink dentry struct A long time ago, the dentry struct was carefully tuned so that on 32-bit UP, sizeof(struct dentry) was exactly 128, i.e. a power of 2 and a multiple of memory cache lines.
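[Editor's note: to make the size arithmetic concrete before the description that follows, here is a minimal stand-alone sketch of the space-sharing union this patch introduces. The struct definitions are simplified stand-ins for the kernel's, and the union name is hypothetical.]

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
struct rcu_head { struct rcu_head *next; void (*func)(struct rcu_head *); };

/* The two lifetimes never overlap: d_child matters only while the
 * dentry is alive, d_rcu only once it is being freed, so the two
 * fields can safely share storage. */
union d_u_sketch {
	struct list_head d_child;
	struct rcu_head d_rcu;
};

int main(void)
{
	/* On 32-bit, separate fields cost 8 + 8 = 16 bytes while the
	 * union costs 8, recovering exactly the 8 bytes RCU added. */
	printf("separate: %zu, shared: %zu\n",
	       sizeof(struct list_head) + sizeof(struct rcu_head),
	       sizeof(union d_u_sketch));
	return 0;
}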
Then RCU was added and the dentry struct was enlarged by two pointers, with nice results for SMP, but not so good on UP, because it broke the above tuning (128 + 8 = 136 bytes). This patch reverts this unwanted side effect, by using a union (d_u), where d_rcu and d_child are placed so that these two fields can share their memory needs. At the time d_free() is called (and d_rcu is really used), d_child is known to be empty and not touched by the dentry freeing. Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so the previous content of d_child is not needed if said dentry was unhashed but still accessed by a CPU because of RCU constraints). As the dentry cache easily contains millions of entries, a size reduction is worth the extra complexity of the ugly C union. Signed-off-by: Eric Dumazet Cc: Dipankar Sarma Cc: Maneesh Soni Cc: Miklos Szeredi Cc: "Paul E. McKenney" Cc: Ian Kent Cc: Paul Jackson Cc: Al Viro Cc: Christoph Hellwig Cc: Trond Myklebust Cc: Neil Brown Cc: James Morris Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/core/inode.c | 6 +++--- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/expire.c | 12 ++++++------ fs/autofs4/inode.c | 4 ++-- fs/autofs4/root.c | 3 ++- fs/coda/cache.c | 2 +- fs/dcache.c | 34 +++++++++++++++++----------------- fs/libfs.c | 12 ++++++------ fs/ncpfs/dir.c | 2 +- fs/ncpfs/ncplib_kernel.h | 4 ++-- fs/smbfs/cache.c | 4 ++-- include/linux/dcache.h | 9 +++++++-- kernel/cpuset.c | 4 ++-- net/sunrpc/rpc_pipe.c | 2 +- security/selinux/selinuxfs.c | 2 +- 15 files changed, 54 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index c44bbedec817..4ddc453023a2 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -186,7 +186,7 @@ static void update_bus(struct dentry *bus) down(&bus->d_inode->i_sem); - list_for_each_entry(dev, &bus->d_subdirs, d_child) + list_for_each_entry(dev, &bus->d_subdirs, d_u.d_child) if (dev->d_inode) update_dev(dev); @@ -203,7 +203,7 @@ static void update_sb(struct super_block *sb) down(&root->d_inode->i_sem); - list_for_each_entry(bus, &root->d_subdirs, d_child) { + list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) { if (bus->d_inode) { switch (S_IFMT & bus->d_inode->i_mode) { case S_IFDIR: @@ -319,7 +319,7 @@ static int usbfs_empty (struct dentry *dentry) spin_lock(&dcache_lock); list_for_each(list, &dentry->d_subdirs) { - struct dentry *de = list_entry(list, struct dentry, d_child); + struct dentry *de = list_entry(list, struct dentry, d_u.d_child); if (usbfs_positive(de)) { spin_unlock(&dcache_lock); return 0; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index fca83e28edcf..385bed09b0d8 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -209,7 +209,7 @@ static inline int simple_empty_nolock(struct dentry *dentry) struct dentry *child; int ret = 0; - list_for_each_entry(child, &dentry->d_subdirs, d_child) + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) if (simple_positive(child)) goto out; ret = 1; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index feb6ac427d05..dc39589df165 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -105,7 +105,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if (!simple_positive(dentry)) { @@
-138,7 +138,7 @@ resume: } if (this_parent != top) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -163,7 +163,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if (!simple_positive(dentry)) { @@ -199,7 +199,7 @@ cont: } if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -238,7 +238,7 @@ static struct dentry *autofs4_expire(struct super_block *sb, /* On exit from the loop expire is set to a dgot dentry * to expire or it's NULL */ while ( next != &root->d_subdirs ) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if ( !simple_positive(dentry) ) { @@ -302,7 +302,7 @@ next: expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_child); + list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 818b37be5153..2d3082854a29 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -91,7 +91,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - don`t care */ if (!simple_positive(dentry)) { @@ -117,7 +117,7 @@ resume: if (this_parent != sbi->root) { struct dentry *dentry = this_parent; - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; spin_unlock(&dcache_lock); DPRINTK("parent dentry %p %.*s", diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2a771ec66956..2241405ffc41 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -143,7 +143,8 @@ static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t f } while(1) { - struct dentry *de = list_entry(list, struct dentry, d_child); + struct dentry *de = list_entry(list, + struct dentry, d_u.d_child); if (!d_unhashed(de) && de->d_inode) { spin_unlock(&dcache_lock); diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 80072fd9b7fa..c607d923350a 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag) spin_lock(&dcache_lock); list_for_each(child, &parent->d_subdirs) { - de = list_entry(child, struct dentry, d_child); + de = list_entry(child, struct dentry, d_u.d_child); /* don't know what to do with negative dentries */ if ( ! 
de->d_inode ) continue; diff --git a/fs/dcache.c b/fs/dcache.c index 17e439138681..1536f15c4d4c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -71,7 +71,7 @@ struct dentry_stat_t dentry_stat = { static void d_callback(struct rcu_head *head) { - struct dentry * dentry = container_of(head, struct dentry, d_rcu); + struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); if (dname_external(dentry)) kfree(dentry->d_name.name); @@ -86,7 +86,7 @@ static void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - call_rcu(&dentry->d_rcu, d_callback); + call_rcu(&dentry->d_u.d_rcu, d_callback); } /* @@ -193,7 +193,7 @@ kill_it: { list_del(&dentry->d_lru); dentry_stat.nr_unused--; } - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -367,7 +367,7 @@ static inline void prune_one_dentry(struct dentry * dentry) struct dentry * parent; __d_drop(dentry); - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -518,7 +518,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; /* Have we found a mount point ? */ if (d_mountpoint(dentry)) @@ -532,7 +532,7 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -569,7 +569,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (!list_empty(&dentry->d_lru)) { @@ -610,7 +610,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found); * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; #ifdef DCACHE_DEBUG printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n", @@ -753,12 +753,12 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; } else { - INIT_LIST_HEAD(&dentry->d_child); + INIT_LIST_HEAD(&dentry->d_u.d_child); } spin_lock(&dcache_lock); if (parent) - list_add(&dentry->d_child, &parent->d_subdirs); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); @@ -1310,8 +1310,8 @@ already_unhashed: /* Unhash the target: dput() will then get rid of it */ __d_drop(target); - list_del(&dentry->d_child); - list_del(&target->d_child); + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); /* Switch the names.. 
*/ switch_names(dentry, target); @@ -1322,15 +1322,15 @@ already_unhashed: if (IS_ROOT(dentry)) { dentry->d_parent = target->d_parent; target->d_parent = target; - INIT_LIST_HEAD(&target->d_child); + INIT_LIST_HEAD(&target->d_u.d_child); } else { do_switch(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ - list_add(&target->d_child, &target->d_parent->d_subdirs); + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); } - list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); @@ -1568,7 +1568,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (d_unhashed(dentry)||!dentry->d_inode) continue; @@ -1579,7 +1579,7 @@ resume: atomic_dec(&dentry->d_count); } if (this_parent != root) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); this_parent = this_parent->d_parent; goto resume; diff --git a/fs/libfs.c b/fs/libfs.c index 58101dff2c66..9c50523382e7 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -93,16 +93,16 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) loff_t n = file->f_pos - 2; spin_lock(&dcache_lock); - list_del(&cursor->d_child); + list_del(&cursor->d_u.d_child); p = file->f_dentry->d_subdirs.next; while (n && p != &file->f_dentry->d_subdirs) { struct dentry *next; - next = list_entry(p, struct dentry, d_child); + next = list_entry(p, struct dentry, d_u.d_child); if (!d_unhashed(next) && next->d_inode) n--; p = p->next; } - list_add_tail(&cursor->d_child, p); + list_add_tail(&cursor->d_u.d_child, p); spin_unlock(&dcache_lock); } } @@ -126,7 +126,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dentry *dentry = filp->f_dentry; struct dentry *cursor = filp->private_data; - struct list_head *p, *q = &cursor->d_child; + struct list_head *p, *q = &cursor->d_u.d_child; ino_t ino; int i = filp->f_pos; @@ -153,7 +153,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) } for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; - next = list_entry(p, struct dentry, d_child); + next = list_entry(p, struct dentry, d_u.d_child); if (d_unhashed(next) || !next->d_inode) continue; @@ -261,7 +261,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dcache_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) if (simple_positive(child)) goto out; ret = 1; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index a9f7a8ab1d59..cfd76f431dc0 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -365,7 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_child); + dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget_locked(dent); diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 9e4dc30c2435..799e5c2bec55 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -196,7 +196,7 @@ 
ncp_renew_dentries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); if (dentry->d_fsdata == NULL) ncp_age_dentry(server, dentry); @@ -218,7 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); dentry->d_fsdata = NULL; ncp_age_dentry(server, dentry); next = next->next; diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index f3e6b81288ab..74b86d9725a6 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -66,7 +66,7 @@ smb_invalidate_dircache_entries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); dentry->d_fsdata = NULL; smb_age_dentry(server, dentry); next = next->next; @@ -100,7 +100,7 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_child); + dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget_locked(dent); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 46a2ba617595..a3ed5e059d47 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -95,14 +95,19 @@ struct dentry { struct qstr d_name; struct list_head d_lru; /* LRU list */ - struct list_head d_child; /* child of parent list */ + /* + * d_child and d_rcu can share memory + */ + union { + struct list_head d_child; /* child of parent list */ + struct rcu_head d_rcu; + } d_u; struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ void *d_fsdata; /* fs-specific data */ - struct rcu_head d_rcu; struct dcookie_struct *d_cookie; /* cookie, if any */ int d_mounted; unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e04c2da9dadb..eab64e23bcae 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -331,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -343,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 24cc23af9b95..e14c1cae7460 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -495,7 +495,7 @@ rpc_depopulate(struct dentry *parent) repeat: spin_lock(&dcache_lock); list_for_each_safe(pos, next, &parent->d_subdirs) { - dentry = list_entry(pos, struct dentry, d_child); + dentry = list_entry(pos, struct dentry, d_u.d_child); 
spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { dget_locked(dentry); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index e59da6398d44..b5fa02d17b1e 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -889,7 +889,7 @@ static void sel_remove_bools(struct dentry *de) spin_lock(&dcache_lock); node = de->d_subdirs.next; while (node != &de->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { -- cgit v1.2.3 From dda6ebde96044e9b5f1b14588659b39b4e6c08e7 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Sun, 8 Jan 2006 01:03:35 -0800 Subject: [PATCH] Fix handling of ELF segments with zero filesize mmap() returns -EINVAL if given a zero length, and thus elf_map() in binfmt_elf.c does likewise if it attempts to map a (page-aligned) ELF segment with zero filesize. Such a situation never arises with the default linker scripts, but there's nothing inherently wrong with zero-filesize (but non-zero memsize) ELF segments. Custom linker scripts can generate them, and the kernel should be able to map them; this patch makes it so. Signed-off-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f36f2210204f..288386b1deff 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -288,11 +288,17 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) { unsigned long map_addr; + unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); down_write(¤t->mm->mmap_sem); - map_addr = do_mmap(filep, ELF_PAGESTART(addr), - eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, type, - eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); + /* mmap() will return -EINVAL if given a zero size, but a + * segment with zero filesize is perfectly valid */ + if (eppnt->p_filesz + pageoffset) + map_addr = do_mmap(filep, ELF_PAGESTART(addr), + eppnt->p_filesz + pageoffset, prot, type, + eppnt->p_offset - pageoffset); + else + map_addr = ELF_PAGESTART(addr); up_write(¤t->mm->mmap_sem); return(map_addr); } -- cgit v1.2.3 From e78c9a004aadebe22306c81d1a7f1d1278dc37f9 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 8 Jan 2006 01:03:39 -0800 Subject: [PATCH] fs: remove s_old_blocksize from struct super_block This patch inlines the single user of struct super_block field s_old_blocksize and removes the field. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 5a347a4f673a..0a30e51692cf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -700,8 +700,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, s->s_flags = flags; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - s->s_old_blocksize = block_size(bdev); - sb_set_blocksize(s, s->s_old_blocksize); + sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_VERBOSE ? 
1 : 0); if (error) { up_write(&s->s_umount); diff --git a/include/linux/fs.h b/include/linux/fs.h index a1e28f0895c0..4c82219b0fae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -808,7 +808,6 @@ struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned long s_blocksize; - unsigned long s_old_blocksize; unsigned char s_blocksize_bits; unsigned char s_dirt; unsigned long long s_maxbytes; /* Max file size */ -- cgit v1.2.3 From ddc0f846aa7621940b74cee0c91cd26405058a4d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:04:01 -0800 Subject: [PATCH] fs/udf/balloc.c: "extern inline" -> "static inline" "extern inline" doesn't make much sense. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 6598a5037ac8..4fae57d9d115 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -41,7 +41,7 @@ #define uint(x) xuint(x) #define xuint(x) __le ## x -extern inline int find_next_one_bit (void * addr, int size, int offset) +static inline int find_next_one_bit (void * addr, int size, int offset) { uintBPL_t * p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG); int result = offset & ~(BITS_PER_LONG-1); -- cgit v1.2.3 From a1365647022eb05a5993f270a78e9bef3bf554eb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:04:09 -0800 Subject: [PATCH] remove gcc-2 checks Remove various things which were checking for gcc-1.x and gcc-2.x compilers. From: Adrian Bunk Some documentation updates and removes some code paths for gcc < 3.2. Acked-by: Russell King Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/Changes | 31 +++++-------------------------- README | 7 ++----- arch/arm/kernel/asm-offsets.c | 9 ++------- arch/arm26/kernel/asm-offsets.c | 7 ------- arch/i386/Kconfig | 4 ---- arch/i386/Makefile | 5 +---- arch/i386/Makefile.cpu | 10 +++++----- arch/ia64/Makefile | 4 ---- arch/ia64/kernel/head.S | 2 +- arch/ia64/kernel/ia64_ksyms.c | 2 +- arch/ia64/oprofile/backtrace.c | 2 +- drivers/md/raid0.c | 6 ------ drivers/media/video/v4l2-common.c | 2 -- fs/ocfs2/cluster/masklog.h | 7 +++---- fs/xfs/xfs_log.h | 8 +------- include/asm-alpha/compiler.h | 2 -- include/asm-alpha/processor.h | 21 --------------------- include/asm-ia64/bug.h | 6 +----- include/asm-ia64/spinlock.h | 2 +- include/asm-sparc64/system.h | 4 ---- include/asm-um/rwsem.h | 4 ---- include/asm-v850/unistd.h | 18 ------------------ include/linux/byteorder/generic.h | 2 +- include/linux/byteorder/swab.h | 2 +- include/linux/byteorder/swabb.h | 2 +- include/linux/compiler-gcc.h | 9 +++++++++ include/linux/compiler-gcc3.h | 17 ----------------- include/linux/compiler-gcc4.h | 7 ------- include/linux/kernel.h | 2 -- include/linux/seccomp.h | 6 +----- include/linux/spinlock_types_up.h | 14 -------------- sound/isa/wavefront/wavefront_synth.c | 7 ------- 32 files changed, 37 insertions(+), 194 deletions(-) (limited to 'fs') diff --git a/Documentation/Changes b/Documentation/Changes index 86b86399d61d..fe5ae0f55020 100644 --- a/Documentation/Changes +++ b/Documentation/Changes @@ -31,8 +31,6 @@ al espa Eine deutsche Version dieser Datei finden Sie unter . -Last updated: October 29th, 2002 - Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu). 
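[Editor's note on the udf/balloc.c change above: in GNU C89, "extern inline" tells the compiler it may inline calls but must never emit an out-of-line definition, so any call that is not inlined (at -O0, for instance) becomes an unresolved symbol at link time; "static inline" always has a local fallback. An illustration with hypothetical helpers:]

/* No out-of-line copy is ever emitted for this one under GNU C89; a
 * call the compiler declines to inline becomes an undefined symbol. */
extern inline int bad_helper(int x) { return x + 1; }

/* A file-local fallback copy is emitted whenever it is needed. */
static inline int good_helper(int x) { return x + 1; }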
Current Minimal Requirements @@ -48,7 +46,7 @@ necessary on all systems; obviously, if you don't have any ISDN hardware, for example, you probably needn't concern yourself with isdn4k-utils. -o Gnu C 2.95.3 # gcc --version +o Gnu C 3.2 # gcc --version o Gnu make 3.79.1 # make --version o binutils 2.12 # ld -v o util-linux 2.10o # fdformat --version @@ -74,26 +72,7 @@ GCC --- The gcc version requirements may vary depending on the type of CPU in your -computer. The next paragraph applies to users of x86 CPUs, but not -necessarily to users of other CPUs. Users of other CPUs should obtain -information about their gcc version requirements from another source. - -The recommended compiler for the kernel is gcc 2.95.x (x >= 3), and it -should be used when you need absolute stability. You may use gcc 3.0.x -instead if you wish, although it may cause problems. Later versions of gcc -have not received much testing for Linux kernel compilation, and there are -almost certainly bugs (mainly, but not exclusively, in the kernel) that -will need to be fixed in order to use these compilers. In any case, using -pgcc instead of plain gcc is just asking for trouble. - -The Red Hat gcc 2.96 compiler subtree can also be used to build this tree. -You should ensure you use gcc-2.96-74 or later. gcc-2.96-54 will not build -the kernel correctly. - -In addition, please pay attention to compiler optimization. Anything -greater than -O2 may not be wise. Similarly, if you choose to use gcc-2.95.x -or derivatives, be sure not to use -fstrict-aliasing (which, depending on -your version of gcc 2.95.x, may necessitate using -fno-strict-aliasing). +computer. Make ---- @@ -322,9 +301,9 @@ Getting updated software Kernel compilation ****************** -gcc 2.95.3 ----------- -o +gcc +--- +o Make ---- diff --git a/README b/README index 61c4f7429233..cd5e2eb6213b 100644 --- a/README +++ b/README @@ -183,11 +183,8 @@ CONFIGURING the kernel: COMPILING the kernel: - - Make sure you have gcc 2.95.3 available. - gcc 2.91.66 (egcs-1.1.2), and gcc 2.7.2.3 are known to miscompile - some parts of the kernel, and are *no longer supported*. - Also remember to upgrade your binutils package (for as/ld/nm and company) - if necessary. For more information, refer to Documentation/Changes. + - Make sure you have at least gcc 3.2 available. + For more information, refer to Documentation/Changes. Please note that you can still run a.out user programs with this kernel. diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 04d3082a7b94..0abbce8c70bc 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -23,20 +23,15 @@ #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32 #endif /* - * GCC 2.95.1, 2.95.2: ignores register clobber list in asm(). * GCC 3.0, 3.1: general bad code generation. * GCC 3.2.0: incorrect function argument offset calculation. * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c * (http://gcc.gnu.org/PR8896) and incorrect structure * initialisation in fs/jffs2/erase.c */ -#if __GNUC__ < 2 || \ - (__GNUC__ == 2 && __GNUC_MINOR__ < 95) || \ - (__GNUC__ == 2 && __GNUC_MINOR__ == 95 && __GNUC_PATCHLEVEL__ != 0 && \ - __GNUC_PATCHLEVEL__ < 3) || \ - (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) #error Your compiler is too buggy; it is known to miscompile kernels. 
-#error Known good compilers: 2.95.3, 2.95.4, 2.96, 3.3 +#error Known good compilers: 3.3 #endif /* Use marker if you need to separate the values later */ diff --git a/arch/arm26/kernel/asm-offsets.c b/arch/arm26/kernel/asm-offsets.c index 4ccacaef94df..ac682d5fd039 100644 --- a/arch/arm26/kernel/asm-offsets.c +++ b/arch/arm26/kernel/asm-offsets.c @@ -25,13 +25,6 @@ #if defined(__APCS_32__) && defined(CONFIG_CPU_26) #error Sorry, your compiler targets APCS-32 but this kernel requires APCS-26 #endif -#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95) -#error Sorry, your compiler is known to miscompile kernels. Only use gcc 2.95.3 and later. -#endif -#if __GNUC__ == 2 && __GNUC_MINOR__ == 95 -/* shame we can't detect the .1 or .2 releases */ -#warning GCC 2.95.2 and earlier miscompiles kernels. -#endif /* Use marker if you need to separate the values later */ diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 968fabd8723f..486449e9e710 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -630,10 +630,6 @@ config REGPARM and passes the first three arguments of a function call in registers. This will probably break binary only modules. - This feature is only enabled for gcc-3.0 and later - earlier compilers - generate incorrect output with certain kernel constructs when - -mregparm=3 is used. - config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/i386/Makefile b/arch/i386/Makefile index d121ea18460f..b84119f9cc63 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -37,10 +37,7 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/i386/Makefile.cpu -# -mregparm=3 works ok on gcc-3.0 and later -# -GCC_VERSION := $(call cc-version) -cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;) +cflags-$(CONFIG_REGPARM) += -mregparm=3 # Disable unit-at-a-time mode, it makes gcc use a lot more stack # due to the lack of sharing of stacklots. diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index 8e51456df23d..dcd936ef45db 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -1,7 +1,7 @@ # CPU tuning section - shared with UML. # Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML. -#-mtune exists since gcc 3.4, and some -mcpu flavors didn't exist in gcc 2.95. +#-mtune exists since gcc 3.4 HAS_MTUNE := $(call cc-option-yn, -mtune=i386) ifeq ($(HAS_MTUNE),y) tune = $(call cc-option,-mtune=$(1),) @@ -14,7 +14,7 @@ cflags-$(CONFIG_M386) += -march=i386 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 -cflags-$(CONFIG_M586MMX) += $(call cc-option,-march=pentium-mmx,-march=i586) +cflags-$(CONFIG_M586MMX) += -march=pentium-mmx cflags-$(CONFIG_M686) += -march=i686 cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2) cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3) @@ -23,8 +23,8 @@ cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call tune,pentium4) cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. 
-cflags-$(CONFIG_MK7) += $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4) -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)) +cflags-$(CONFIG_MK7) += -march=athlon +cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -37,5 +37,5 @@ cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_X86_ELAN) += -march=i486 # Geode GX1 support -cflags-$(CONFIG_MGEODEGX1) += $(call cc-option,-march=pentium-mmx,-march=i486) +cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 67932ad53082..57b047c27e46 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -37,10 +37,6 @@ $(error Sorry, you need a newer version of the assember, one that is built from ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz) endif -ifneq ($(shell if [ $(GCC_VERSION) -lt 0300 ] ; then echo "bad"; fi ;),) -$(error Sorry, your compiler is too old. GCC v2.96 is known to generate bad code.) -endif - ifeq ($(GCC_VERSION),0304) cflags-$(CONFIG_ITANIUM) += -mtune=merced cflags-$(CONFIG_MCKINLEY) += -mtune=mckinley diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index bfe65b2e8621..fbc7ea35dd57 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -1060,7 +1060,7 @@ SET_REG(b5); * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h. */ -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4) .prologue diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 5db9d3bcbbcb..e72de580ebbf 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL(unw_init_running); #ifdef ASM_SUPPORTED # ifdef CONFIG_SMP -# if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +# if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) /* * This is not a normal routine and we don't want a function descriptor for it, so we use * a fake declaration here. diff --git a/arch/ia64/oprofile/backtrace.c b/arch/ia64/oprofile/backtrace.c index b7dabbfb0d61..adb01566bd57 100644 --- a/arch/ia64/oprofile/backtrace.c +++ b/arch/ia64/oprofile/backtrace.c @@ -32,7 +32,7 @@ typedef struct u64 *prev_pfs_loc; /* state for WAR for old spinlock ool code */ } ia64_backtrace_t; -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) /* * Returns non-zero if the PC is in the spinlock contention out-of-line code * with non-standard calling sequence (on older compilers). 
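[Editor's aside: the version guards being trimmed in these hunks all follow one idiom, and collapsing the two macros into a single comparable number can make the bounds easier to read. The helper below is hypothetical, an illustration only; a check such as (__GNUC__ == 3 && __GNUC_MINOR__ < 3) accepts gcc 3.0 through 3.2.]

/* Encode the compiler version as one comparable integer. */
#define GCC_VERSION_SKETCH (__GNUC__ * 10000 + __GNUC_MINOR__ * 100)

#if GCC_VERSION_SKETCH >= 30000 && GCC_VERSION_SKETCH < 30300
/* gcc 3.0 - 3.2: keep the old out-of-line spinlock contention path */
#endif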
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index abbca150202b..d03f99cf4b7d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -306,9 +306,6 @@ static int raid0_run (mddev_t *mddev) printk("raid0 : conf->hash_spacing is %llu blocks.\n", (unsigned long long)conf->hash_spacing); { -#if __GNUC__ < 3 - volatile -#endif sector_t s = mddev->array_size; sector_t space = conf->hash_spacing; int round; @@ -439,9 +436,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) { -#if __GNUC__ < 3 - volatile -#endif sector_t x = block >> conf->preshift; sector_div(x, (u32)conf->hash_spacing); zone = conf->hash_table[x]; diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c index 597b8db35a13..62a7d636ef11 100644 --- a/drivers/media/video/v4l2-common.c +++ b/drivers/media/video/v4l2-common.c @@ -191,9 +191,7 @@ char *v4l2_type_names[] = { }; char *v4l2_ioctl_names[256] = { -#if __GNUC__ >= 3 [0 ... 255] = "UNKNOWN", -#endif [_IOC_NR(VIDIOC_QUERYCAP)] = "VIDIOC_QUERYCAP", [_IOC_NR(VIDIOC_RESERVED)] = "VIDIOC_RESERVED", [_IOC_NR(VIDIOC_ENUM_FMT)] = "VIDIOC_ENUM_FMT", diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index f5ef5ea61a05..e8c56a3d9c64 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -212,11 +212,10 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; mlog(ML_ENTRY, "ENTRY:\n"); \ } while (0) -/* We disable this for old compilers since they don't have support for - * __builtin_types_compatible_p. +/* + * We disable this for sparse. */ -#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \ - !defined(__CHECKER__) +#if !defined(__CHECKER__) #define mlog_exit(st) do { \ if (__builtin_types_compatible_p(typeof(st), unsigned long)) \ mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 158829ca56f6..f40d4391fcfc 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -30,13 +30,7 @@ * By comparing each compnent, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number */ -static -#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96)) -__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */ -#else -__inline__ -#endif -xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) { if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) return (CYCLE_LSN(lsn1)= 1 || __GNUC__ > 3 #undef __always_inline #define __always_inline inline __attribute__((always_inline)) -#endif #endif /* __ALPHA_COMPILER_H */ diff --git a/include/asm-alpha/processor.h b/include/asm-alpha/processor.h index 059780a7d3d7..bb1a7a3abb8b 100644 --- a/include/asm-alpha/processor.h +++ b/include/asm-alpha/processor.h @@ -77,7 +77,6 @@ unsigned long get_wchan(struct task_struct *p); #define spin_lock_prefetch(lock) do { } while (0) #endif -#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) extern inline void prefetch(const void *ptr) { __builtin_prefetch(ptr, 0, 3); @@ -95,24 +94,4 @@ extern inline void spin_lock_prefetch(const void *ptr) } #endif -#else -extern inline void prefetch(const void *ptr) -{ - __asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); -} - -extern inline void prefetchw(const void *ptr) -{ - __asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); -} - -#ifdef CONFIG_SMP -extern inline void spin_lock_prefetch(const void *ptr) -{ - __asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); -} -#endif - -#endif /* GCC 
3.1 */ - #endif /* __ASM_ALPHA_PROCESSOR_H */ diff --git a/include/asm-ia64/bug.h b/include/asm-ia64/bug.h index 3aa0a0a5474b..823616b5020b 100644 --- a/include/asm-ia64/bug.h +++ b/include/asm-ia64/bug.h @@ -2,11 +2,7 @@ #define _ASM_IA64_BUG_H #ifdef CONFIG_BUG -#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) -# define ia64_abort() __builtin_trap() -#else -# define ia64_abort() (*(volatile int *) 0 = 0) -#endif +#define ia64_abort() __builtin_trap() #define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); ia64_abort(); } while (0) /* should this BUG be made generic? */ diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h index 0c91a76c5ea3..9e83210dc312 100644 --- a/include/asm-ia64/spinlock.h +++ b/include/asm-ia64/spinlock.h @@ -34,7 +34,7 @@ __raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags) { register volatile unsigned int *ptr asm ("r31") = &lock->lock; -#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) # ifdef CONFIG_ITANIUM /* don't use brl on Itanium... */ asm volatile ("{\n\t" diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index b5417529f6f1..309f1466b6fa 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -193,11 +193,7 @@ do { \ * not preserve it's value. Hairy, but it lets us remove 2 loads * and 2 stores in this critical code path. -DaveM */ -#if __GNUC__ >= 3 #define EXTRA_CLOBBER ,"%l1" -#else -#define EXTRA_CLOBBER -#endif #define switch_to(prev, next, last) \ do { if (test_thread_flag(TIF_PERFCTR)) { \ unsigned long __tmp; \ diff --git a/include/asm-um/rwsem.h b/include/asm-um/rwsem.h index 661c0e54702b..b5fc449dc86b 100644 --- a/include/asm-um/rwsem.h +++ b/include/asm-um/rwsem.h @@ -1,10 +1,6 @@ #ifndef __UM_RWSEM_H__ #define __UM_RWSEM_H__ -#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) -#define __builtin_expect(exp,c) (exp) -#endif - #include "asm/arch/rwsem.h" #endif diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h index 5a86f8e976ec..82460a7bb233 100644 --- a/include/asm-v850/unistd.h +++ b/include/asm-v850/unistd.h @@ -241,9 +241,6 @@ /* User programs sometimes end up including this header file (indirectly, via uClibc header files), so I'm a bit nervous just including . */ -#if !defined(__builtin_expect) && __GNUC__ == 2 && __GNUC_MINOR__ < 96 -#define __builtin_expect(x, expected_value) (x) -#endif #define __syscall_return(type, res) \ do { \ @@ -346,20 +343,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e) \ __syscall_return (type, __ret); \ } -#if __GNUC__ < 3 -/* In older versions of gcc, `asm' statements with more than 10 - input/output arguments produce a fatal error. To work around this - problem, we use two versions, one for gcc-3.x and one for earlier - versions of gcc (the `earlier gcc' version doesn't work with gcc-3.x - because gcc-3.x doesn't allow clobbers to also be input arguments). 
*/ -#define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f) \ - __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP \ - : "=r" (ret), "=r" (syscall) \ - : "1" (syscall), \ - "r" (a), "r" (b), "r" (c), "r" (d), \ - "r" (e), "r" (f) \ - : SYSCALL_CLOBBERS, SYSCALL_ARG4, SYSCALL_ARG5); -#else /* __GNUC__ >= 3 */ #define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f) \ __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP \ : "=r" (ret), "=r" (syscall), \ @@ -368,7 +351,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e) \ "r" (a), "r" (b), "r" (c), "r" (d), \ "2" (e), "3" (f) \ : SYSCALL_CLOBBERS); -#endif #define _syscall6(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e, ftype, f) \ type name (atype a, btype b, ctype c, dtype d, etype e, ftype f) \ diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 04bd756efc67..e86e4a938373 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -156,7 +156,7 @@ extern __be32 htonl(__u32); extern __u16 ntohs(__be16); extern __be16 htons(__u16); -#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +#if defined(__GNUC__) && defined(__OPTIMIZE__) #define ___htonl(x) __cpu_to_be32(x) #define ___htons(x) __cpu_to_be16(x) diff --git a/include/linux/byteorder/swab.h b/include/linux/byteorder/swab.h index 2f1cb775125a..25f7f32883ec 100644 --- a/include/linux/byteorder/swab.h +++ b/include/linux/byteorder/swab.h @@ -110,7 +110,7 @@ /* * Allow constant folding */ -#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +#if defined(__GNUC__) && defined(__OPTIMIZE__) # define __swab16(x) \ (__builtin_constant_p((__u16)(x)) ? \ ___swab16((x)) : \ diff --git a/include/linux/byteorder/swabb.h b/include/linux/byteorder/swabb.h index d5f2a3205109..ae5e5f914bf4 100644 --- a/include/linux/byteorder/swabb.h +++ b/include/linux/byteorder/swabb.h @@ -77,7 +77,7 @@ /* * Allow constant folding */ -#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +#if defined(__GNUC__) && defined(__OPTIMIZE__) # define __swahw32(x) \ (__builtin_constant_p((__u32)(x)) ? \ ___swahw32((x)) : \ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 152734055403..2e05e1e6b0e6 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,3 +15,12 @@ ({ unsigned long __ptr; \ __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) + + +#define inline inline __attribute__((always_inline)) +#define __inline__ __inline__ __attribute__((always_inline)) +#define __inline __inline __attribute__((always_inline)) +#define __deprecated __attribute__((deprecated)) +#define noinline __attribute__((noinline)) +#define __attribute_pure__ __attribute__((pure)) +#define __attribute_const__ __attribute__((__const__)) diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h index a6fa615afab5..4209082ee934 100644 --- a/include/linux/compiler-gcc3.h +++ b/include/linux/compiler-gcc3.h @@ -3,29 +3,12 @@ /* These definitions are for GCC v3.x. 
*/ #include -#if __GNUC_MINOR__ >= 1 -# define inline inline __attribute__((always_inline)) -# define __inline__ __inline__ __attribute__((always_inline)) -# define __inline __inline __attribute__((always_inline)) -#endif - -#if __GNUC_MINOR__ > 0 -# define __deprecated __attribute__((deprecated)) -#endif - #if __GNUC_MINOR__ >= 3 # define __attribute_used__ __attribute__((__used__)) #else # define __attribute_used__ __attribute__((__unused__)) #endif -#define __attribute_pure__ __attribute__((pure)) -#define __attribute_const__ __attribute__((__const__)) - -#if __GNUC_MINOR__ >= 1 -#define noinline __attribute__((noinline)) -#endif - #if __GNUC_MINOR__ >= 4 #define __must_check __attribute__((warn_unused_result)) #endif diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 53686c037a06..e913e9beaf69 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -3,14 +3,7 @@ /* These definitions are for GCC v4.x. */ #include -#define inline inline __attribute__((always_inline)) -#define __inline__ __inline__ __attribute__((always_inline)) -#define __inline __inline __attribute__((always_inline)) -#define __deprecated __attribute__((deprecated)) #define __attribute_used__ __attribute__((__used__)) -#define __attribute_pure__ __attribute__((pure)) -#define __attribute_const__ __attribute__((__const__)) -#define noinline __attribute__((noinline)) #define __must_check __attribute__((warn_unused_result)) #define __compiler_offsetof(a,b) __builtin_offsetof(a,b) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b1e407a4fbda..ca7ff8fdd090 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -316,8 +316,6 @@ extern int randomize_va_space; #endif /* Trap pasters of __FUNCTION__ at compile-time */ -#if __GNUC__ > 2 || __GNUC_MINOR__ >= 95 #define __FUNCTION__ (__func__) -#endif #endif diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index dc89116bb1ca..cd2773b29a64 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -26,11 +26,7 @@ static inline int has_secure_computing(struct thread_info *ti) #else /* CONFIG_SECCOMP */ -#if (__GNUC__ > 2) - typedef struct { } seccomp_t; -#else - typedef struct { int gcc_is_buggy; } seccomp_t; -#endif +typedef struct { } seccomp_t; #define secure_computing(x) do { } while (0) /* static inline to preserve typechecking */ diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index def2d173a8db..04135b0e198e 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -22,30 +22,16 @@ typedef struct { #else -/* - * All gcc 2.95 versions and early versions of 2.96 have a nasty bug - * with empty initializers. 
- */ -#if (__GNUC__ > 2) typedef struct { } raw_spinlock_t; #define __RAW_SPIN_LOCK_UNLOCKED { } -#else -typedef struct { int gcc_is_buggy; } raw_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 0 } -#endif #endif -#if (__GNUC__ > 2) typedef struct { /* no debug version on UP */ } raw_rwlock_t; #define __RAW_RW_LOCK_UNLOCKED { } -#else -typedef struct { int gcc_is_buggy; } raw_rwlock_t; -#define __RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) { 0 } -#endif #endif /* __LINUX_SPINLOCK_TYPES_UP_H */ diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 679d0ae97e4f..ed81eec6e732 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -115,18 +115,11 @@ MODULE_PARM_DESC(osrun_time, "how many seconds to wait for the ICS2115 OS"); #ifdef WF_DEBUG -#if defined(NEW_MACRO_VARARGS) || __GNUC__ >= 3 #define DPRINT(cond, ...) \ if ((dev->debug & (cond)) == (cond)) { \ snd_printk (__VA_ARGS__); \ } #else -#define DPRINT(cond, args...) \ - if ((dev->debug & (cond)) == (cond)) { \ - snd_printk (args); \ - } -#endif -#else #define DPRINT(cond, args...) #endif /* WF_DEBUG */ -- cgit v1.2.3 From fee781e6c25772db862d3322b4745a896022a4f1 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:04:16 -0800 Subject: [PATCH] fs/proc/: function prototypes belong in header files Function prototypes belong in header files. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 ++ fs/proc/inode.c | 2 +- fs/proc/internal.h | 4 ++++ fs/proc/root.c | 3 ++- 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 72b431d0a0a4..20e5c4509a43 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -21,6 +21,8 @@ #include #include +#include "internal.h" + static ssize_t proc_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); static ssize_t proc_file_write(struct file *file, const char __user *buffer, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e6a818a93f3d..6573f31f1fd9 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -19,7 +19,7 @@ #include #include -extern void free_proc_entry(struct proc_dir_entry *); +#include "internal.h" static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e55198f9806..95a1cf32b838 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,6 +37,10 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +void free_proc_entry(struct proc_dir_entry *de); + +int proc_init_inodecache(void); + static inline struct task_struct *proc_task(struct inode *inode) { return PROC_I(inode)->task; diff --git a/fs/proc/root.c b/fs/proc/root.c index aef148f099a2..68896283c8ae 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,8 @@ #include #include +#include "internal.h" + struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; #ifdef CONFIG_SYSCTL @@ -36,7 +38,6 @@ static struct file_system_type proc_fs_type = { .kill_sb = kill_anon_super, }; -extern int __init proc_init_inodecache(void); void __init proc_root_init(void) { int err = proc_init_inodecache(); -- cgit v1.2.3 From ac34dd052400b1e63aa8e711a13c0670943296fd Mon Sep 17 00:00:00 2001 From: "Maciej W.
Rozycki" Date: Sun, 8 Jan 2006 01:04:50 -0800 Subject: [PATCH] fs/smbfs/proc.c: fix data corruption in smb_proc_setattr_unix() This patch fixes a data corruption in smb_proc_setattr_unix() (smb_filetype_from_mode() returns an u32, and there are only four bytes reserved for it in data. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/smbfs/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 38ab558835c4..d6baec0f24ad 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -3113,7 +3113,7 @@ smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, LSET(data, 32, SMB_TIME_NO_CHANGE); LSET(data, 40, SMB_UID_NO_CHANGE); LSET(data, 48, SMB_GID_NO_CHANGE); - LSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); + DSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); LSET(data, 60, major); LSET(data, 68, minor); LSET(data, 76, 0); -- cgit v1.2.3 From 15b2fe3931831891a62bad9cafd403c169ae087c Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Sun, 8 Jan 2006 01:04:51 -0800 Subject: [PATCH] UFS: inode->i_sem is not released in error path Signed-off-by: Evgeniy Polyakov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 54828ebcf1ba..2ba11a9aa995 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1296,8 +1296,10 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + up(&inode->i_sem); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; -- cgit v1.2.3 From f5ef3c105bee3a52486d7b55cef3330fcde9bca6 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Sun, 8 Jan 2006 01:04:56 -0800 Subject: [PATCH] v9fs: fix fd_close If a 9pfs server crashes, v9fs_fd_close() is called. Subsequently, in cleaning up by performing a umount() on the FS that was provided by this server v9fs_fd_close() is called again, and uses the old, freed valus of trans->priv. This patch ensures that trans->priv can be freed only once, otherwise this function bails early. Signed-off-by: Michal Ostrowski Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/trans_fd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index 63b58ce98ff4..b7ffb9859588 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -148,12 +148,12 @@ static void v9fs_fd_close(struct v9fs_transport *trans) if (!trans) return; - trans->status = Disconnected; - ts = trans->priv; + ts = xchg(&trans->priv, NULL); if (!ts) return; + trans->status = Disconnected; if (ts->in_file) fput(ts->in_file); -- cgit v1.2.3 From 3cf6429a26da5c4d7b795e6d0f8f56ed2e4fdfc0 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:04:58 -0800 Subject: [PATCH] v9fs: new multiplexer implementation New multiplexer implementation. Decreases the number of kernel threads required. Better handling when the user process receives a signal. 
Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 68 +++- fs/9p/9p.h | 9 +- fs/9p/conv.c | 86 ++-- fs/9p/conv.h | 13 +- fs/9p/fid.c | 2 +- fs/9p/mux.c | 1122 +++++++++++++++++++++++++++++++++++++--------------- fs/9p/mux.h | 40 +- fs/9p/trans_fd.c | 49 ++- fs/9p/trans_sock.c | 161 +++++--- fs/9p/transport.h | 4 +- fs/9p/v9fs.c | 41 +- fs/9p/v9fs.h | 17 +- fs/9p/vfs_dentry.c | 13 +- fs/9p/vfs_dir.c | 17 +- fs/9p/vfs_inode.c | 89 ++--- fs/9p/vfs_super.c | 3 +- 16 files changed, 1172 insertions(+), 562 deletions(-) (limited to 'fs') diff --git a/fs/9p/9p.c b/fs/9p/9p.c index e847f504a47c..a3a1ac610723 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -52,10 +52,11 @@ v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version); msg.id = TVERSION; + msg.tag = ~0; msg.params.tversion.msize = msize; msg.params.tversion.version = version; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -83,7 +84,30 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, msg.params.tattach.uname = uname; msg.params.tattach.aname = aname; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); +} + +static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, + struct v9fs_fcall *rc, int err) +{ + int fid; + struct v9fs_session_info *v9ses; + + if (err) + return; + + fid = tc->params.tclunk.fid; + kfree(tc); + + if (!rc) + return; + + dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id); + v9ses = a; + if (rc->id == RCLUNK) + v9fs_put_idpool(fid, &v9ses->fidpool); + + kfree(rc); } /** @@ -93,18 +117,24 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, * @fcall: pointer to response fcall pointer * */ - int -v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **fcall) +v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) { - struct v9fs_fcall msg; + int err; + struct v9fs_fcall *tc, *rc; + + tc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); dprintk(DEBUG_9P, "fid %d\n", fid); - msg.id = TCLUNK; - msg.params.tclunk.fid = fid; + tc->id = TCLUNK; + tc->params.tclunk.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + err = v9fs_mux_rpc(v9ses->mux, tc, &rc); + if (err >= 0) { + v9fs_t_clunk_cb(v9ses, tc, rc, 0); + } + + return err; } /** @@ -121,7 +151,7 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) dprintk(DEBUG_9P, "oldtag %d\n", tag); msg.id = TFLUSH; msg.params.tflush.oldtag = tag; - return v9fs_mux_rpc(v9ses, &msg, NULL); + return v9fs_mux_rpc(v9ses->mux, &msg, NULL); } /** @@ -143,7 +173,7 @@ v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) msg.id = TSTAT; msg.params.tstat.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -166,7 +196,7 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, msg.params.twstat.fid = fid; msg.params.twstat.stat = stat; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -199,7 +229,7 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, msg.params.twalk.nwname = 0; } - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -217,14 +247,14 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, struct v9fs_fcall **fcall) { struct v9fs_fcall msg; - long errorno 
= -1; + int errorno = -1; dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); msg.id = TOPEN; msg.params.topen.fid = fid; msg.params.topen.mode = mode; - errorno = v9fs_mux_rpc(v9ses, &msg, fcall); + errorno = v9fs_mux_rpc(v9ses->mux, &msg, fcall); return errorno; } @@ -246,7 +276,7 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, dprintk(DEBUG_9P, "fid %d\n", fid); msg.id = TREMOVE; msg.params.tremove.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -275,7 +305,7 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, msg.params.tcreate.perm = perm; msg.params.tcreate.mode = mode; - return v9fs_mux_rpc(v9ses, &msg, fcall); + return v9fs_mux_rpc(v9ses->mux, &msg, fcall); } /** @@ -302,7 +332,7 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, msg.params.tread.fid = fid; msg.params.tread.offset = offset; msg.params.tread.count = count; - errorno = v9fs_mux_rpc(v9ses, &msg, &rc); + errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); if (!errorno) { errorno = rc->params.rread.count; @@ -345,7 +375,7 @@ v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, msg.params.twrite.count = count; msg.params.twrite.data = data; - errorno = v9fs_mux_rpc(v9ses, &msg, &rc); + errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); if (!errorno) errorno = rc->params.rwrite.count; diff --git a/fs/9p/9p.h b/fs/9p/9p.h index f55424216be2..6355392786e2 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -100,6 +100,9 @@ enum { V9FS_QTFILE = 0x00, }; +#define V9FS_NOTAG (u16)(~0) +#define V9FS_NOFID (u32)(~0) + /* ample room for Twrite/Rread header (iounit) */ #define V9FS_IOHDRSZ 24 @@ -303,6 +306,9 @@ struct v9fs_fcall { } params; }; +#define V9FS_FCALLHDRSZ (sizeof(struct v9fs_fcall) + \ + sizeof(struct v9fs_stat) + 16*sizeof(struct v9fs_qid) + 16) + #define FCALL_ERROR(fcall) (fcall ? 
fcall->params.rerror.error : "") int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, @@ -311,8 +317,7 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, u32 fid, u32 afid, struct v9fs_fcall **rcall); -int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **rcall); +int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid); int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag); diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 18121af99d3e..1b9b15dfeaf0 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -208,7 +208,7 @@ static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf) len = buf_get_int16(buf); if (!buf_check_overflow(buf) && buf_check_size(buf, len) && - buf_check_size(sbuf, len+1)) { + buf_check_size(sbuf, len + 1)) { memcpy(sbuf->p, buf->p, len); sbuf->p[len] = 0; @@ -252,13 +252,12 @@ static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf, /** * v9fs_size_stat - calculate the size of a variable length stat struct - * @v9ses: session information * @stat: metadata (stat) structure + * @extended: non-zero if 9P2000.u * */ -static int v9fs_size_stat(struct v9fs_session_info *v9ses, - struct v9fs_stat *stat) +static int v9fs_size_stat(struct v9fs_stat *stat, int extended) { int size = 0; @@ -288,7 +287,7 @@ static int v9fs_size_stat(struct v9fs_session_info *v9ses, if (stat->muid) size += strlen(stat->muid); - if (v9ses->extended) { + if (extended) { size += 4 + /* n_uid[4] */ 4 + /* n_gid[4] */ 4 + /* n_muid[4] */ @@ -302,15 +301,14 @@ static int v9fs_size_stat(struct v9fs_session_info *v9ses, /** * serialize_stat - safely format a stat structure for transmission - * @v9ses: session info * @stat: metadata (stat) structure * @bufp: buffer to serialize structure into + * @extended: non-zero if 9P2000.u * */ static int -serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, - struct cbuf *bufp) +serialize_stat(struct v9fs_stat *stat, struct cbuf *bufp, int extended) { buf_put_int16(bufp, stat->size); buf_put_int16(bufp, stat->type); @@ -328,7 +326,7 @@ serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, buf_put_string(bufp, stat->gid); buf_put_string(bufp, stat->muid); - if (v9ses->extended) { + if (extended) { buf_put_string(bufp, stat->extension); buf_put_int32(bufp, stat->n_uid); buf_put_int32(bufp, stat->n_gid); @@ -343,16 +341,16 @@ serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, /** * deserialize_stat - safely decode a recieved metadata (stat) structure - * @v9ses: session info * @bufp: buffer to deserialize * @stat: metadata (stat) structure * @dbufp: buffer to deserialize variable strings into + * @extended: non-zero if 9P2000.u * */ static inline int -deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, - struct v9fs_stat *stat, struct cbuf *dbufp) +deserialize_stat(struct cbuf *bufp, struct v9fs_stat *stat, + struct cbuf *dbufp, int extended) { stat->size = buf_get_int16(bufp); @@ -370,7 +368,7 @@ deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, stat->gid = buf_get_stringb(bufp, dbufp); stat->muid = buf_get_stringb(bufp, dbufp); - if (v9ses->extended) { + if (extended) { stat->extension = buf_get_stringb(bufp, dbufp); stat->n_uid = buf_get_int32(bufp); stat->n_gid = buf_get_int32(bufp); @@ -385,20 +383,20 @@ deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, /** * deserialize_statb - wrapper for decoding 
a received metadata structure - * @v9ses: session info * @bufp: buffer to deserialize * @dbufp: buffer to deserialize variable strings into + * @extended: non-zero if 9P2000.u * */ -static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info - *v9ses, struct cbuf *bufp, - struct cbuf *dbufp) +static inline struct v9fs_stat *deserialize_statb(struct cbuf *bufp, + struct cbuf *dbufp, + int extended) { struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat)); if (ret) { - int n = deserialize_stat(v9ses, bufp, ret, dbufp); + int n = deserialize_stat(bufp, ret, dbufp, extended); if (n <= 0) return NULL; } @@ -408,17 +406,16 @@ static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info /** * v9fs_deserialize_stat - decode a received metadata structure - * @v9ses: session info * @buf: buffer to deserialize * @buflen: length of received buffer * @stat: metadata structure to decode into * @statlen: length of destination metadata structure + * @extended: non-zero if 9P2000.u * */ -int -v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf, - u32 buflen, struct v9fs_stat *stat, u32 statlen) +int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + u32 statlen, int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; @@ -429,11 +426,10 @@ v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf, buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat), statlen - sizeof(struct v9fs_stat)); - return deserialize_stat(v9ses, bufp, stat, dbufp); + return deserialize_stat(bufp, stat, dbufp, extended); } -static inline int -v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) +static inline int v9fs_size_fcall(struct v9fs_fcall *fcall, int extended) { int size = 4 + 1 + 2; /* size[4] msg[1] tag[2] */ int i = 0; @@ -485,7 +481,7 @@ v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) break; case TWSTAT: /* fid[4] stat[n] */ fcall->params.twstat.stat->size = - v9fs_size_stat(v9ses, fcall->params.twstat.stat); + v9fs_size_stat(fcall->params.twstat.stat, extended); size += 4 + 2 + 2 + fcall->params.twstat.stat->size; } return size; @@ -493,16 +489,16 @@ v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) /* * v9fs_serialize_fcall - marshall fcall struct into a packet - * @v9ses: session information * @fcall: structure to convert * @data: buffer to serialize fcall into * @datalen: length of buffer to serialize fcall into + * @extended: non-zero if 9P2000.u * */ int -v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, - void *data, u32 datalen) +v9fs_serialize_fcall(struct v9fs_fcall *fcall, void *data, u32 datalen, + int extended) { int i = 0; struct v9fs_stat *stat = NULL; @@ -516,7 +512,7 @@ v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, return -EINVAL; } - fcall->size = v9fs_size_fcall(v9ses, fcall); + fcall->size = v9fs_size_fcall(fcall, extended); buf_put_int32(bufp, fcall->size); buf_put_int8(bufp, fcall->id); @@ -591,31 +587,31 @@ v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, stat = fcall->params.twstat.stat; buf_put_int16(bufp, stat->size + 2); - serialize_stat(v9ses, stat, bufp); + serialize_stat(stat, bufp, extended); break; } - if (buf_check_overflow(bufp)) + if (buf_check_overflow(bufp)) { + dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; + } return fcall->size; } /** * deserialize_fcall - unmarshal a response - * @v9ses: session information - * 
@msgsize: size of rcall message * @buf: recieved buffer * @buflen: length of received buffer * @rcall: fcall structure to populate * @rcalllen: length of fcall structure to populate + * @extended: non-zero if 9P2000.u * */ int -v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, - void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen) +v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, + int rcalllen, int extended) { struct cbuf buffer; @@ -628,7 +624,7 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall), rcalllen - sizeof(struct v9fs_fcall)); - rcall->size = msgsize; + rcall->size = buf_get_int32(bufp); rcall->id = buf_get_int8(bufp); rcall->tag = buf_get_int16(bufp); @@ -651,6 +647,12 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, break; case RWALK: rcall->params.rwalk.nwqid = buf_get_int16(bufp); + if (rcall->params.rwalk.nwqid > 16) { + eprintk(KERN_ERR, "Rwalk with more than 16 qids: %d\n", + rcall->params.rwalk.nwqid); + return -EPROTO; + } + rcall->params.rwalk.wqids = buf_alloc(dbufp, rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid)); if (rcall->params.rwalk.wqids) @@ -690,19 +692,21 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, case RSTAT: buf_get_int16(bufp); rcall->params.rstat.stat = - deserialize_statb(v9ses, bufp, dbufp); + deserialize_statb(bufp, dbufp, extended); break; case RWSTAT: break; case RERROR: rcall->params.rerror.error = buf_get_stringb(bufp, dbufp); - if (v9ses->extended) + if (extended) rcall->params.rerror.errno = buf_get_int16(bufp); break; } - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) + if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) { + dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; + } return rcall->size; } diff --git a/fs/9p/conv.h b/fs/9p/conv.h index ee849613c61a..d5e33e17a685 100644 --- a/fs/9p/conv.h +++ b/fs/9p/conv.h @@ -24,13 +24,12 @@ * */ -int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf, - u32 buflen, struct v9fs_stat *stat, u32 statlen); -int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall, - void *buf, u32 buflen); -int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen, - void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen); +int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + u32 statlen, int extended); +int v9fs_serialize_fcall(struct v9fs_fcall *tcall, void *buf, u32 buflen, + int extended); +int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, + int rcalllen, int extended); /* this one is actually in error.c right now */ int v9fs_errstr2errno(char *errstr); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index d95f8626d170..60ef8aba757a 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -164,7 +164,7 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) return v9fs_fid_create(dentry, v9ses, fidnum, 0); clunk_fid: - v9fs_t_clunk(v9ses, fidnum, NULL); + v9fs_t_clunk(v9ses, fidnum); return ERR_PTR(err); } diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 8835b576f744..62b6ad0767e1 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -4,7 +4,7 @@ * Protocol Multiplexer * * Copyright (C) 2004 by Eric Van Hensbergen - * Copyright (C) 2004 by Latchesar Ionkov + * Copyright (C) 2004-2005 by Latchesar Ionkov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as 
published by @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -38,438 +39,903 @@ #include "conv.h" #include "mux.h" -/** - * dprintcond - print condition of session info - * @v9ses: session info structure - * @req: RPC request structure - * - */ +#define ERREQFLUSH 1 +#define SCHED_TIMEOUT 10 +#define MAXPOLLWADDR 2 + +enum { + Rworksched = 1, /* read work scheduled or running */ + Rpending = 2, /* can read */ + Wworksched = 4, /* write work scheduled or running */ + Wpending = 8, /* can write */ +}; + +struct v9fs_mux_poll_task; + +struct v9fs_req { + int tag; + struct v9fs_fcall *tcall; + struct v9fs_fcall *rcall; + int err; + v9fs_mux_req_callback cb; + void *cba; + struct list_head req_list; +}; + +struct v9fs_mux_data { + spinlock_t lock; + struct list_head mux_list; + struct v9fs_mux_poll_task *poll_task; + int msize; + unsigned char *extended; + struct v9fs_transport *trans; + struct v9fs_idpool tidpool; + int err; + wait_queue_head_t equeue; + struct list_head req_list; + struct list_head unsent_req_list; + int rpos; + char *rbuf; + int wpos; + int wsize; + char *wbuf; + wait_queue_t poll_wait[MAXPOLLWADDR]; + wait_queue_head_t *poll_waddr[MAXPOLLWADDR]; + poll_table pt; + struct work_struct rq; + struct work_struct wq; + unsigned long wsched; +}; + +struct v9fs_mux_poll_task { + struct task_struct *task; + struct list_head mux_list; + int muxnum; +}; + +struct v9fs_mux_rpc { + struct v9fs_mux_data *m; + struct v9fs_req *req; + int err; + struct v9fs_fcall *rcall; + wait_queue_head_t wqueue; +}; + +static int v9fs_poll_proc(void *); +static void v9fs_read_work(void *); +static void v9fs_write_work(void *); +static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, + poll_table * p); + +static DECLARE_MUTEX(v9fs_mux_task_lock); +static struct workqueue_struct *v9fs_mux_wq; + +static int v9fs_mux_num; +static int v9fs_mux_poll_task_num; +static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100]; + +void v9fs_mux_global_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) + v9fs_mux_poll_tasks[i].task = NULL; + + v9fs_mux_wq = create_workqueue("v9fs"); +} -static inline int -dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +void v9fs_mux_global_exit(void) { - dprintk(DEBUG_MUX, "condition: %d, %p\n", v9ses->transport->status, - req->rcall); - return 0; + destroy_workqueue(v9fs_mux_wq); } /** - * xread - force read of a certain number of bytes - * @v9ses: session info structure - * @ptr: pointer to buffer - * @sz: number of bytes to read + * v9fs_mux_calc_poll_procs - calculates the number of polling procs + * based on the number of mounted v9fs filesystems. * - * Chuck Cranor CS-533 project1 + * The current implementation returns sqrt of the number of mounts. */ +inline int v9fs_mux_calc_poll_procs(int muxnum) +{ + int n; + + if (v9fs_mux_poll_task_num) + n = muxnum / v9fs_mux_poll_task_num + + (muxnum % v9fs_mux_poll_task_num ? 
1 : 0); + else + n = 1; + + if (n > ARRAY_SIZE(v9fs_mux_poll_tasks)) + n = ARRAY_SIZE(v9fs_mux_poll_tasks); -static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz) + return n; +} + +static void v9fs_mux_poll_start(struct v9fs_mux_data *m) { - int rd = 0; - int ret = 0; - while (rd < sz) { - ret = v9ses->transport->read(v9ses->transport, ptr, sz - rd); - if (ret <= 0) { - dprintk(DEBUG_ERROR, "xread errno %d\n", ret); - return ret; + int i, n; + struct v9fs_mux_poll_task *vpt, *vptlast; + + dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num, + v9fs_mux_poll_task_num); + up(&v9fs_mux_task_lock); + + n = v9fs_mux_calc_poll_procs(v9fs_mux_num + 1); + if (n > v9fs_mux_poll_task_num) { + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { + if (v9fs_mux_poll_tasks[i].task == NULL) { + vpt = &v9fs_mux_poll_tasks[i]; + dprintk(DEBUG_MUX, "create proc %p\n", vpt); + vpt->task = kthread_create(v9fs_poll_proc, + vpt, "v9fs-poll"); + INIT_LIST_HEAD(&vpt->mux_list); + vpt->muxnum = 0; + v9fs_mux_poll_task_num++; + wake_up_process(vpt->task); + break; + } } - rd += ret; - ptr += ret; - } - return (rd); -} -/** - * read_message - read a full 9P2000 fcall packet - * @v9ses: session info structure - * @rcall: fcall structure to read into - * @rcalllen: size of fcall buffer - * - */ + if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) + dprintk(DEBUG_ERROR, "warning: no free poll slots\n"); + } -static int -read_message(struct v9fs_session_info *v9ses, - struct v9fs_fcall *rcall, int rcalllen) -{ - unsigned char buf[4]; - void *data; - int size = 0; - int res = 0; - - res = xread(v9ses, buf, sizeof(buf)); - if (res < 0) { - dprintk(DEBUG_ERROR, - "Reading of count field failed returned: %d\n", res); - return res; + n = (v9fs_mux_num + 1) / v9fs_mux_poll_task_num + + ((v9fs_mux_num + 1) % v9fs_mux_poll_task_num ? 
1 : 0); + + vptlast = NULL; + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { + vpt = &v9fs_mux_poll_tasks[i]; + if (vpt->task != NULL) { + vptlast = vpt; + if (vpt->muxnum < n) { + dprintk(DEBUG_MUX, "put in proc %d\n", i); + list_add(&m->mux_list, &vpt->mux_list); + vpt->muxnum++; + m->poll_task = vpt; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + init_poll_funcptr(&m->pt, v9fs_pollwait); + break; + } + } } - if (res < 4) { - dprintk(DEBUG_ERROR, - "Reading of count field failed returned: %d\n", res); - return -EIO; + if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) { + dprintk(DEBUG_MUX, "put in proc %d\n", i); + list_add(&m->mux_list, &vptlast->mux_list); + vptlast->muxnum++; + m->poll_task = vpt; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + init_poll_funcptr(&m->pt, v9fs_pollwait); } - size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); - dprintk(DEBUG_MUX, "got a packet count: %d\n", size); + v9fs_mux_num++; + down(&v9fs_mux_task_lock); +} - /* adjust for the four bytes of size */ - size -= 4; +static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) +{ + int i; + struct v9fs_mux_poll_task *vpt; + + up(&v9fs_mux_task_lock); + vpt = m->poll_task; + list_del(&m->mux_list); + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { + if (m->poll_waddr[i] != NULL) { + remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]); + m->poll_waddr[i] = NULL; + } + } + vpt->muxnum--; + if (!vpt->muxnum) { + dprintk(DEBUG_MUX, "destroy proc %p\n", vpt); + send_sig(SIGKILL, vpt->task, 1); + vpt->task = NULL; + v9fs_mux_poll_task_num--; + } + v9fs_mux_num--; + down(&v9fs_mux_task_lock); +} - if (size > v9ses->maxdata) { - dprintk(DEBUG_ERROR, "packet too big: %d\n", size); - return -E2BIG; +/** + * v9fs_mux_init - allocate and initialize the per-session mux data + * Creates the polling task if this is the first session. 
+ * + * @trans - transport structure + * @msize - maximum message size + * @extended - pointer to the extended flag + */ +struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, + unsigned char *extended) +{ + int i, n; + struct v9fs_mux_data *m, *mtmp; + + dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize); + m = kmalloc(sizeof(struct v9fs_mux_data) + 2 * msize, GFP_KERNEL); + if (!m) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&m->lock); + INIT_LIST_HEAD(&m->mux_list); + m->msize = msize; + m->extended = extended; + m->trans = trans; + idr_init(&m->tidpool.pool); + init_MUTEX(&m->tidpool.lock); + m->err = 0; + init_waitqueue_head(&m->equeue); + INIT_LIST_HEAD(&m->req_list); + INIT_LIST_HEAD(&m->unsent_req_list); + m->rpos = 0; + m->rbuf = (char *)m + sizeof(struct v9fs_mux_data); + m->wpos = m->wsize = 0; + m->wbuf = m->rbuf + msize; + INIT_WORK(&m->rq, v9fs_read_work, m); + INIT_WORK(&m->wq, v9fs_write_work, m); + m->wsched = 0; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + v9fs_mux_poll_start(m); + + n = trans->poll(trans, &m->pt); + if (n & POLLIN) { + dprintk(DEBUG_MUX, "mux %p can read\n", m); + set_bit(Rpending, &m->wsched); } - data = kmalloc(size, GFP_KERNEL); - if (!data) { - eprintk(KERN_WARNING, "out of memory\n"); - return -ENOMEM; + if (n & POLLOUT) { + dprintk(DEBUG_MUX, "mux %p can write\n", m); + set_bit(Wpending, &m->wsched); } - res = xread(v9ses, data, size); - if (res < size) { - dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n", - res); - kfree(data); - return res; + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { + if (IS_ERR(m->poll_waddr[i])) { + v9fs_mux_poll_stop(m); + mtmp = (void *)m->poll_waddr; /* the error code */ + kfree(m); + m = mtmp; + break; + } } - /* we now have an in-memory string that is the reply. - * deserialize it. There is very little to go wrong at this point - * save for v9fs_alloc errors. 
- */ - res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata, - rcall, rcalllen); + return m; +} - kfree(data); +/** + * v9fs_mux_destroy - cancels all pending requests and frees mux resources + */ +void v9fs_mux_destroy(struct v9fs_mux_data *m) +{ + dprintk(DEBUG_MUX, "mux %p prev %p next %p\n", m, + m->mux_list.prev, m->mux_list.next); + v9fs_mux_cancel(m, -ECONNRESET); + + if (!list_empty(&m->req_list)) { + /* wait until all processes waiting on this session exit */ + dprintk(DEBUG_MUX, "mux %p waiting for empty request queue\n", + m); + wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000); + dprintk(DEBUG_MUX, "mux %p request queue empty: %d\n", m, + list_empty(&m->req_list)); + } - if (res < 0) - return res; + v9fs_mux_poll_stop(m); + m->trans = NULL; - return 0; + kfree(m); } /** - * v9fs_recv - receive an RPC response for a particular tag - * @v9ses: session info structure - * @req: RPC request structure - * + * v9fs_pollwait - called by files poll operation to add v9fs-poll task + * to files wait queue */ - -static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +static void +v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, + poll_table * p) { - int ret = 0; - - dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag); - ret = wait_event_interruptible(v9ses->read_wait, - ((v9ses->transport->status != Connected) || - (req->rcall != 0) || (req->err < 0) || - dprintcond(v9ses, req))); + int i; + struct v9fs_mux_data *m; - dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall); + m = container_of(p, struct v9fs_mux_data, pt); + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) + if (m->poll_waddr[i] == NULL) + break; - spin_lock(&v9ses->muxlock); - list_del(&req->next); - spin_unlock(&v9ses->muxlock); + if (i >= ARRAY_SIZE(m->poll_waddr)) { + dprintk(DEBUG_ERROR, "not enough wait_address slots\n"); + return; + } - if (req->err < 0) - return req->err; + m->poll_waddr[i] = wait_address; - if (v9ses->transport->status == Disconnected) - return -ECONNRESET; + if (!wait_address) { + dprintk(DEBUG_ERROR, "no wait_address\n"); + m->poll_waddr[i] = ERR_PTR(-EIO); + return; + } - return ret; + init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task); + add_wait_queue(wait_address, &m->poll_wait[i]); } /** - * v9fs_send - send a 9P request - * @v9ses: session info structure - * @req: RPC request to send - * + * v9fs_poll_mux - polls a mux and schedules read or write works if necessary */ - -static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +static inline void v9fs_poll_mux(struct v9fs_mux_data *m) { - int ret = -1; - void *data = NULL; - struct v9fs_fcall *tcall = req->tcall; - - data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); - if (!data) - return -ENOMEM; - - tcall->size = 0; /* enforce size recalculation */ - ret = - v9fs_serialize_fcall(v9ses, tcall, data, - v9ses->maxdata + V9FS_IOHDRSZ); - if (ret < 0) - goto free_data; - - spin_lock(&v9ses->muxlock); - list_add(&req->next, &v9ses->mux_fcalls); - spin_unlock(&v9ses->muxlock); - - dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag, - tcall->size); - ret = v9ses->transport->write(v9ses->transport, data, tcall->size); - - if (ret != tcall->size) { - spin_lock(&v9ses->muxlock); - list_del(&req->next); - kfree(req->rcall); + int n; - spin_unlock(&v9ses->muxlock); - if (ret >= 0) - ret = -EREMOTEIO; - } else - ret = 0; + if (m->err < 0) + return; + + n = m->trans->poll(m->trans, NULL); + if (n < 0 || n & (POLLERR | POLLHUP 
| POLLNVAL)) { + dprintk(DEBUG_MUX, "error mux %p err %d\n", m, n); + if (n >= 0) + n = -ECONNRESET; + v9fs_mux_cancel(m, n); + } + + if (n & POLLIN) { + set_bit(Rpending, &m->wsched); + dprintk(DEBUG_MUX, "mux %p can read\n", m); + if (!test_and_set_bit(Rworksched, &m->wsched)) { + dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->rq); + } + } - free_data: - kfree(data); - return ret; + if (n & POLLOUT) { + set_bit(Wpending, &m->wsched); + dprintk(DEBUG_MUX, "mux %p can write\n", m); + if ((m->wsize || !list_empty(&m->unsent_req_list)) + && !test_and_set_bit(Wworksched, &m->wsched)) { + dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->wq); + } + } } /** - * v9fs_mux_rpc - send a request, receive a response - * @v9ses: session info structure - * @tcall: fcall to send - * @rcall: buffer to place response into - * + * v9fs_poll_proc - polls all v9fs transports for new events and queues + * the appropriate work to the work queue */ - -long -v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall, - struct v9fs_fcall **rcall) +static int v9fs_poll_proc(void *a) { - int tid = -1; - struct v9fs_fcall *fcall = NULL; - struct v9fs_rpcreq req; - int ret = -1; + struct v9fs_mux_data *m, *mtmp; + struct v9fs_mux_poll_task *vpt; - if (!v9ses) - return -EINVAL; + vpt = a; + dprintk(DEBUG_MUX, "start %p %p\n", current, vpt); + allow_signal(SIGKILL); + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; - if (!v9ses->transport || v9ses->transport->status != Connected) - return -EIO; + list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) { + v9fs_poll_mux(m); + } + + dprintk(DEBUG_MUX, "sleeping...\n"); + schedule_timeout(SCHED_TIMEOUT * HZ); + } - if (rcall) - *rcall = NULL; + __set_current_state(TASK_RUNNING); + dprintk(DEBUG_MUX, "finish\n"); + return 0; +} - if (tcall->id != TVERSION) { - tid = v9fs_get_idpool(&v9ses->tidpool); - if (tid < 0) - return -ENOMEM; +static inline int v9fs_write_req(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + int n; + + list_move_tail(&req->req_list, &m->req_list); + n = v9fs_serialize_fcall(req->tcall, m->wbuf, m->msize, *m->extended); + if (n < 0) { + req->err = n; + list_del(&req->req_list); + if (req->cb) { + spin_unlock(&m->lock); + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + req->cb = NULL; + spin_lock(&m->lock); + } else + kfree(req->rcall); + + kfree(req); } - tcall->tag = tid; + return n; +} - req.tcall = tcall; - req.err = 0; - req.rcall = NULL; +/** + * v9fs_write_work - called when a transport can send some data + */ +static void v9fs_write_work(void *a) +{ + int n, err; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rtmp; - ret = v9fs_send(v9ses, &req); + m = a; - if (ret < 0) { - if (tcall->id != TVERSION) - v9fs_put_idpool(tid, &v9ses->tidpool); - dprintk(DEBUG_MUX, "error %d\n", ret); - return ret; + if (m->err < 0) { + clear_bit(Wworksched, &m->wsched); + return; } - ret = v9fs_recv(v9ses, &req); - - fcall = req.rcall; - - dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, ret); - if (ret == -ERESTARTSYS) { - if (v9ses->transport->status != Disconnected - && tcall->id != TFLUSH) { - unsigned long flags; - - dprintk(DEBUG_MUX, "flushing the tag: %d\n", - tcall->tag); - clear_thread_flag(TIF_SIGPENDING); - v9fs_t_flush(v9ses, tcall->tag); - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, - flags); - 
dprintk(DEBUG_MUX, "flushing done\n"); + if (!m->wsize) { + if (list_empty(&m->unsent_req_list)) { + clear_bit(Wworksched, &m->wsched); + return; } - goto release_req; - } else if (ret < 0) - goto release_req; - - if (!fcall) - ret = -EIO; - else { - if (fcall->id == RERROR) { - ret = v9fs_errstr2errno(fcall->params.rerror.error); - if (ret == 0) { /* string match failed */ - if (fcall->params.rerror.errno) - ret = -(fcall->params.rerror.errno); - else - ret = -ESERVERFAULT; - } - } else if (fcall->id != tcall->id + 1) { - dprintk(DEBUG_ERROR, - "fcall mismatch: expected %d, got %d\n", - tcall->id + 1, fcall->id); - ret = -EIO; + err = 0; + spin_lock(&m->lock); + list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, + req_list) { + err = v9fs_write_req(m, req); + if (err > 0) + break; } + + m->wsize = err; + m->wpos = 0; + spin_unlock(&m->lock); } - release_req: - if (tcall->id != TVERSION) - v9fs_put_idpool(tid, &v9ses->tidpool); - if (rcall) - *rcall = fcall; - else - kfree(fcall); + dprintk(DEBUG_MUX, "mux %p pos %d size %d\n", m, m->wpos, m->wsize); + clear_bit(Wpending, &m->wsched); + err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos); + dprintk(DEBUG_MUX, "mux %p sent %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Wworksched, &m->wsched); + return; + } + + if (err <= 0) + goto error; + + m->wpos += err; + if (m->wpos == m->wsize) + m->wpos = m->wsize = 0; + + if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLOUT) { + dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->wq); + } else + clear_bit(Wworksched, &m->wsched); + } else + clear_bit(Wworksched, &m->wsched); - return ret; + return; + + error: + v9fs_mux_cancel(m, err); + clear_bit(Wworksched, &m->wsched); } -/** - * v9fs_mux_cancel_requests - cancels all pending requests - * - * @v9ses: session info structure - * @err: error code to return to the requests - */ -void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err) +static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { - struct v9fs_rpcreq *rptr; - struct v9fs_rpcreq *rreq; + int ecode, tag; + char *ename; + + tag = req->tag; + if (req->rcall->id == RERROR && !req->err) { + ecode = req->rcall->params.rerror.errno; + ename = req->rcall->params.rerror.error; - dprintk(DEBUG_MUX, " %d\n", err); - spin_lock(&v9ses->muxlock); - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - rreq->err = err; + dprintk(DEBUG_MUX, "Rerror %s\n", ename); + + if (*m->extended) + req->err = -ecode; + + if (!req->err) { + req->err = v9fs_errstr2errno(ename); + + if (!req->err) { /* string match failed */ + dprintk(DEBUG_ERROR, "unknown error: %s\n", + ename); + } + + if (!req->err) + req->err = -ESERVERFAULT; + } + } else if (req->tcall && req->rcall->id != req->tcall->id + 1) { + dprintk(DEBUG_ERROR, "fcall mismatch: expected %d, got %d\n", + req->tcall->id + 1, req->rcall->id); + if (!req->err) + req->err = -EIO; } - spin_unlock(&v9ses->muxlock); - wake_up_all(&v9ses->read_wait); + + if (req->cb && req->err != ERREQFLUSH) { + dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", + req->tcall, req->rcall); + + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + req->cb = NULL; + } else + kfree(req->rcall); + + if (tag != V9FS_NOTAG) + v9fs_put_idpool(tag, &m->tidpool); + + wake_up(&m->equeue); + kfree(req); } /** - * v9fs_recvproc - kproc to 
handle demultiplexing responses - * @data: session info structure - * + * v9fs_read_work - called when there is some data to be read from a transport */ - -static int v9fs_recvproc(void *data) +static void v9fs_read_work(void *a) { - struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data; - struct v9fs_fcall *rcall = NULL; - struct v9fs_rpcreq *rptr; - struct v9fs_rpcreq *req; - struct v9fs_rpcreq *rreq; - int err = 0; + int n, err, rcallen; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rptr, *rreq; + struct v9fs_fcall *rcall; + + m = a; + + if (m->err < 0) + return; + + rcall = NULL; + dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos); + clear_bit(Rpending, &m->wsched); + err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos); + dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Rworksched, &m->wsched); + return; + } - allow_signal(SIGKILL); - set_current_state(TASK_INTERRUPTIBLE); - complete(&v9ses->proccmpl); - while (!kthread_should_stop() && err >= 0) { - req = rptr = rreq = NULL; + if (err <= 0) + goto error; - rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); - if (!rcall) { - eprintk(KERN_ERR, "no memory for buffers\n"); + m->rpos += err; + while (m->rpos > 4) { + n = le32_to_cpu(*(__le32 *) m->rbuf); + if (n >= m->msize) { + dprintk(DEBUG_ERROR, + "requested packet size too big: %d\n", n); + err = -EIO; + goto error; + } + + if (m->rpos < n) break; + + rcallen = n + V9FS_FCALLHDRSZ; + rcall = kmalloc(rcallen, GFP_KERNEL); + if (!rcall) { + err = -ENOMEM; + goto error; } - err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ); - spin_lock(&v9ses->muxlock); + dump_data(m->rbuf, n); + err = v9fs_deserialize_fcall(m->rbuf, n, rcall, rcallen, + *m->extended); if (err < 0) { - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - rreq->err = err; - } - if(err != -ERESTARTSYS) - eprintk(KERN_ERR, - "Transport error while reading message %d\n", err); - } else { - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - if (rreq->tcall->tag == rcall->tag) { - req = rreq; - req->rcall = rcall; - break; - } - } + kfree(rcall); + goto error; } - if (req && (req->tcall->id == TFLUSH)) { - struct v9fs_rpcreq *treq = NULL; - list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) { - if (treq->tcall->tag == - req->tcall->params.tflush.oldtag) { - list_del(&rptr->next); - kfree(treq->rcall); - break; - } + dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id, + rcall->tag); + + req = NULL; + spin_lock(&m->lock); + list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { + if (rreq->tag == rcall->tag) { + req = rreq; + req->rcall = rcall; + list_del(&req->req_list); + spin_unlock(&m->lock); + process_request(m, req); + break; } } - spin_unlock(&v9ses->muxlock); - if (!req) { - if (err >= 0) + spin_unlock(&m->lock); + if (err >= 0 && rcall->id != RFLUSH) dprintk(DEBUG_ERROR, - "unexpected response: id %d tag %d\n", - rcall->id, rcall->tag); - + "unexpected response mux %p id %d tag %d\n", + m, rcall->id, rcall->tag); kfree(rcall); } - wake_up_all(&v9ses->read_wait); - set_current_state(TASK_INTERRUPTIBLE); + if (m->rpos > n) + memmove(m->rbuf, m->rbuf + n, m->rpos - n); + m->rpos -= n; } - v9ses->transport->close(v9ses->transport); - - /* Inform all pending processes about the failure */ - wake_up_all(&v9ses->read_wait); - - if (signal_pending(current)) - complete(&v9ses->proccmpl); + if (!list_empty(&m->req_list)) { + if (test_and_clear_bit(Rpending, 
&m->wsched)) + n = POLLIN; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLIN) { + dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->rq); + } else + clear_bit(Rworksched, &m->wsched); + } else + clear_bit(Rworksched, &m->wsched); - dprintk(DEBUG_MUX, "recvproc: end\n"); - v9ses->recvproc = NULL; + return; - return err >= 0; + error: + v9fs_mux_cancel(m, err); + clear_bit(Rworksched, &m->wsched); } /** - * v9fs_mux_init - initialize multiplexer (spawn kproc) - * @v9ses: session info structure - * @dev_name: mount device information (to create unique kproc) + * v9fs_send_request - send 9P request + * The function can sleep until the request is scheduled for sending. + * The function can be interrupted. Return from the function is not + * a guarantee that the request is sent succesfully. Can return errors + * that can be retrieved by PTR_ERR macros. * + * @m: mux data + * @tc: request to be sent + * @cb: callback function to call when response is received + * @cba: parameter to pass to the callback function */ +static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, + struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *cba) +{ + int n; + struct v9fs_req *req; + + dprintk(DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current, + tc, tc->id); + if (m->err < 0) + return ERR_PTR(m->err); + + req = kmalloc(sizeof(struct v9fs_req), GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); -int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name) + if (tc->id == TVERSION) + n = V9FS_NOTAG; + else + n = v9fs_get_idpool(&m->tidpool); + + if (n < 0) + return ERR_PTR(-ENOMEM); + + tc->tag = n; + req->tag = n; + req->tcall = tc; + req->rcall = NULL; + req->err = 0; + req->cb = cb; + req->cba = cba; + + spin_lock(&m->lock); + list_add_tail(&req->req_list, &m->unsent_req_list); + spin_unlock(&m->lock); + + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) + queue_work(v9fs_mux_wq, &m->wq); + + return req; +} + +static inline void +v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, + int err) { - char procname[60]; - - strncpy(procname, dev_name, sizeof(procname)); - procname[sizeof(procname) - 1] = 0; - - init_waitqueue_head(&v9ses->read_wait); - init_completion(&v9ses->fcread); - init_completion(&v9ses->proccmpl); - spin_lock_init(&v9ses->muxlock); - INIT_LIST_HEAD(&v9ses->mux_fcalls); - v9ses->recvproc = NULL; - v9ses->curfcall = NULL; - - v9ses->recvproc = kthread_create(v9fs_recvproc, v9ses, - "v9fs_recvproc %s", procname); - - if (IS_ERR(v9ses->recvproc)) { - eprintk(KERN_ERR, "cannot create receiving thread\n"); - v9fs_session_close(v9ses); - return -ECONNABORTED; + v9fs_mux_req_callback cb; + int tag; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rptr; + + m = a; + dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc, + rc, err, tc->params.tflush.oldtag); + + spin_lock(&m->lock); + cb = NULL; + tag = tc->params.tflush.oldtag; + list_for_each_entry_safe(req, rptr, &m->req_list, req_list) { + if (req->tag == tag) { + list_del(&req->req_list); + if (req->cb) { + cb = req->cb; + req->cb = NULL; + spin_unlock(&m->lock); + (*cb) (req->cba, req->tcall, req->rcall, + req->err); + } + kfree(req); + wake_up(&m->equeue); + break; + } + } + + if (!cb) + spin_unlock(&m->lock); + + if (v9fs_check_idpool(tag, &m->tidpool)) + v9fs_put_idpool(tag, &m->tidpool); + + kfree(tc); + 
kfree(rc); +} + +static void +v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + struct v9fs_fcall *fc; + + dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); + + fc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); + fc->id = TFLUSH; + fc->params.tflush.oldtag = req->tag; + + v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); +} + +static void +v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) +{ + struct v9fs_mux_rpc *r; + + if (err == ERREQFLUSH) { + dprintk(DEBUG_MUX, "err req flush\n"); + return; + } + + r = a; + dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req, + tc, rc, err); + r->rcall = rc; + r->err = err; + wake_up(&r->wqueue); +} + +/** + * v9fs_mux_rpc - sends 9P request and waits until a response is available. + * The function can be interrupted. + * @m: mux data + * @tc: request to be sent + * @rc: pointer where a pointer to the response is stored + */ +int +v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + struct v9fs_fcall **rc) +{ + int err; + unsigned long flags; + struct v9fs_req *req; + struct v9fs_mux_rpc r; + + r.err = 0; + r.rcall = NULL; + r.m = m; + init_waitqueue_head(&r.wqueue); + + if (rc) + *rc = NULL; + + req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r); + if (IS_ERR(req)) { + err = PTR_ERR(req); + dprintk(DEBUG_MUX, "error %d\n", err); + return PTR_ERR(req); + } + + r.req = req; + dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc, + req->tag, &r, req); + err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0); + if (r.err < 0) + err = r.err; + + if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) { + spin_lock(&m->lock); + req->tcall = NULL; + req->err = ERREQFLUSH; + spin_unlock(&m->lock); + + clear_thread_flag(TIF_SIGPENDING); + v9fs_mux_flush_request(m, req); + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - wake_up_process(v9ses->recvproc); - wait_for_completion(&v9ses->proccmpl); + if (!err) { + if (r.rcall) + dprintk(DEBUG_MUX, "got response id %d tag %d\n", + r.rcall->id, r.rcall->tag); + + if (rc) + *rc = r.rcall; + else + kfree(r.rcall); + } else { + kfree(r.rcall); + dprintk(DEBUG_MUX, "got error %d\n", err); + if (err > 0) + err = -EIO; + } + + return err; +} + +/** + * v9fs_mux_rpcnb - sends 9P request without waiting for response. 
+ * @m: mux data + * @tc: request to be sent + * @cb: callback function to be called when response arrives + * @cba: value to pass to the callback function + */ +int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *a) +{ + int err; + struct v9fs_req *req; + + req = v9fs_send_request(m, tc, cb, a); + if (IS_ERR(req)) { + err = PTR_ERR(req); + dprintk(DEBUG_MUX, "error %d\n", err); + return PTR_ERR(req); + } dprintk(DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag); return 0; } + +/** + * v9fs_mux_cancel - cancel all pending requests with error + * @m: mux data + * @err: error code + */ +void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) +{ + struct v9fs_req *req, *rtmp; + LIST_HEAD(cancel_list); + + dprintk(DEBUG_MUX, "mux %p err %d\n", m, err); + m->err = err; + spin_lock(&m->lock); + list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { + list_move(&req->req_list, &cancel_list); + } + spin_unlock(&m->lock); + + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { + list_del(&req->req_list); + if (!req->err) + req->err = err; + + if (req->cb) + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + else + kfree(req->rcall); + + kfree(req); + } + + wake_up(&m->equeue); +} diff --git a/fs/9p/mux.h b/fs/9p/mux.h index 4994cb10badf..02b13b14b05b 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -3,6 +3,7 @@ * * Multiplexer Definitions * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify @@ -23,19 +24,34 @@ * */ -/* structure to manage each RPC transaction */ +struct v9fs_mux_data; -struct v9fs_rpcreq { - struct v9fs_fcall *tcall; - struct v9fs_fcall *rcall; - int err; /* error code if response failed */ +/** + * v9fs_mux_req_callback - callback function that is called when the + * response of a request is received. The callback is called from + * a workqueue and shouldn't block. + * + * @a - the pointer that was specified when the request was sent to be + * passed to the callback + * @tc - request call + * @rc - response call + * @err - error code (non-zero if error occurred) + */ +typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, + struct v9fs_fcall *rc, int err); + +void v9fs_mux_global_init(void); +void v9fs_mux_global_exit(void); - /* XXX - could we put scatter/gather buffers here?
*/ +struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, + unsigned char *extended); +void v9fs_mux_destroy(struct v9fs_mux_data *); - struct list_head next; -}; +int v9fs_mux_send(struct v9fs_mux_data *m, struct v9fs_fcall *tc); +struct v9fs_fcall *v9fs_mux_recv(struct v9fs_mux_data *m); +int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc); +int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *a); -int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name); -long v9fs_mux_rpc(struct v9fs_session_info *v9ses, - struct v9fs_fcall *tcall, struct v9fs_fcall **rcall); -void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err); +void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush); +void v9fs_mux_cancel(struct v9fs_mux_data *m, int err); diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index b7ffb9859588..1a28ef97a3d1 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -3,6 +3,7 @@ * * File Descriptor Transport Layer * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2005 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify @@ -106,9 +107,6 @@ v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data) return -ENOPROTOOPT; } - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); - ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL); if (!ts) @@ -163,10 +161,55 @@ static void v9fs_fd_close(struct v9fs_transport *trans) kfree(ts); } +static unsigned int +v9fs_fd_poll(struct v9fs_transport *trans, struct poll_table_struct *pt) +{ + int ret, n; + struct v9fs_trans_fd *ts; + mm_segment_t oldfs; + + if (!trans) + return -EIO; + + ts = trans->priv; + if (trans->status != Connected || !ts) + return -EIO; + + oldfs = get_fs(); + set_fs(get_ds()); + + if (!ts->in_file->f_op || !ts->in_file->f_op->poll) { + ret = -EIO; + goto end; + } + + ret = ts->in_file->f_op->poll(ts->in_file, pt); + + if (ts->out_file != ts->in_file) { + if (!ts->out_file->f_op || !ts->out_file->f_op->poll) { + ret = -EIO; + goto end; + } + + n = ts->out_file->f_op->poll(ts->out_file, pt); + + ret &= ~POLLOUT; + n &= ~POLLIN; + + ret |= n; + } + +end: + set_fs(oldfs); + return ret; +} + + struct v9fs_transport v9fs_trans_fd = { .init = v9fs_fd_init, .write = v9fs_fd_send, .read = v9fs_fd_recv, .close = v9fs_fd_close, + .poll = v9fs_fd_poll, }; diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index 6a9a75d40f73..9ef404c75c8f 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -3,6 +3,7 @@ * * Socket Transport Layer * + * Copyright (C) 2004-2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 1997-2002 by Ron Minnich * Copyright (C) 1995, 1996 by Olaf Kirch @@ -36,6 +37,7 @@ #include #include #include +#include #include "debug.h" #include "v9fs.h" @@ -45,6 +47,7 @@ struct v9fs_trans_sock { struct socket *s; + struct file *filp; }; /** @@ -57,41 +60,26 @@ struct v9fs_trans_sock { static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) { - struct msghdr msg; - struct kvec iov; - int result; - mm_segment_t oldfs; - struct v9fs_trans_sock *ts = trans ? 
trans->priv : NULL; + int ret; + struct v9fs_trans_sock *ts; - if (trans->status == Disconnected) + if (!trans || trans->status == Disconnected) { + dprintk(DEBUG_ERROR, "disconnected ...\n"); return -EREMOTEIO; + } - result = -EINVAL; - - oldfs = get_fs(); - set_fs(get_ds()); - - iov.iov_base = v; - iov.iov_len = len; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_flags = MSG_NOSIGNAL; - - result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0); + ts = trans->priv; - dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state); - set_fs(oldfs); + if (!(ts->filp->f_flags & O_NONBLOCK)) + dprintk(DEBUG_ERROR, "blocking read ...\n"); - if (result <= 0) { - if (result != -ERESTARTSYS) + ret = kernel_read(ts->filp, ts->filp->f_pos, v, len); + if (ret <= 0) { + if (ret != -ERESTARTSYS && ret != -EAGAIN) trans->status = Disconnected; } - return result; + return ret; } /** @@ -104,40 +92,73 @@ static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len) { - struct kvec iov; - struct msghdr msg; - int result = -1; + int ret; mm_segment_t oldfs; - struct v9fs_trans_sock *ts = trans ? trans->priv : NULL; + struct v9fs_trans_sock *ts; - dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len); - dump_data(v, len); + if (!trans || trans->status == Disconnected) { + dprintk(DEBUG_ERROR, "disconnected ...\n"); + return -EREMOTEIO; + } + + ts = trans->priv; + if (!ts) { + dprintk(DEBUG_ERROR, "no transport ...\n"); + return -EREMOTEIO; + } - down(&trans->writelock); + if (!(ts->filp->f_flags & O_NONBLOCK)) + dprintk(DEBUG_ERROR, "blocking write ...\n"); + dump_data(v, len); oldfs = get_fs(); set_fs(get_ds()); - iov.iov_base = v; - iov.iov_len = len; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_flags = MSG_NOSIGNAL; - result = kernel_sendmsg(ts->s, &msg, &iov, 1, len); + ret = vfs_write(ts->filp, (void __user *)v, len, &ts->filp->f_pos); set_fs(oldfs); - if (result < 0) { - if (result != -ERESTARTSYS) + if (ret < 0) { + if (ret != -ERESTARTSYS) trans->status = Disconnected; } - up(&trans->writelock); - return result; + return ret; +} + +static unsigned int v9fs_sock_poll(struct v9fs_transport *trans, + struct poll_table_struct *pt) { + + int ret; + struct v9fs_trans_sock *ts; + mm_segment_t oldfs; + + if (!trans) { + dprintk(DEBUG_ERROR, "no transport\n"); + return -EIO; + } + + ts = trans->priv; + if (trans->status != Connected || !ts) { + dprintk(DEBUG_ERROR, "transport disconnected: %d\n", trans->status); + return -EIO; + } + + oldfs = get_fs(); + set_fs(get_ds()); + + if (!ts->filp->f_op || !ts->filp->f_op->poll) { + dprintk(DEBUG_ERROR, "no poll operation\n"); + ret = -EIO; + goto end; + } + + ret = ts->filp->f_op->poll(ts->filp, pt); + +end: + set_fs(oldfs); + return ret; } + /** * v9fs_tcp_init - initialize TCP socket * @v9ses: session information @@ -154,9 +175,9 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) int rc = 0; struct v9fs_trans_sock *ts = NULL; struct v9fs_transport *trans = v9ses->transport; + int fd; - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); + trans->status = Disconnected; ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL); @@ -165,6 +186,7 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) 
trans->priv = ts; ts->s = NULL; + ts->filp = NULL; if (!addr) return -EINVAL; @@ -185,7 +207,18 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) return rc; } csocket->sk->sk_allocation = GFP_NOIO; + + fd = sock_map_fd(csocket); + if (fd < 0) { + sock_release(csocket); + kfree(ts); + trans->priv = NULL; + return fd; + } + ts->s = csocket; + ts->filp = fget(fd); + ts->filp->f_flags |= O_NONBLOCK; trans->status = Connected; return 0; @@ -203,7 +236,7 @@ static int v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - int rc; + int rc, fd; struct socket *csocket; struct sockaddr_un sun_server; struct v9fs_transport *trans; @@ -213,6 +246,8 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, csocket = NULL; trans = v9ses->transport; + trans->status = Disconnected; + if (strlen(dev_name) > UNIX_PATH_MAX) { eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n", dev_name); @@ -225,9 +260,7 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, trans->priv = ts; ts->s = NULL; - - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); + ts->filp = NULL; sun_server.sun_family = PF_UNIX; strcpy(sun_server.sun_path, dev_name); @@ -241,7 +274,18 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, return rc; } csocket->sk->sk_allocation = GFP_NOIO; + + fd = sock_map_fd(csocket); + if (fd < 0) { + sock_release(csocket); + kfree(ts); + trans->priv = NULL; + return fd; + } + ts->s = csocket; + ts->filp = fget(fd); + ts->filp->f_flags |= O_NONBLOCK; trans->status = Connected; return 0; @@ -262,12 +306,11 @@ static void v9fs_sock_close(struct v9fs_transport *trans) ts = trans->priv; - if ((ts) && (ts->s)) { - dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s); - sock_release(ts->s); + if ((ts) && (ts->filp)) { + fput(ts->filp); + ts->filp = NULL; ts->s = NULL; trans->status = Disconnected; - dprintk(DEBUG_TRANS, "socket closed\n"); } kfree(ts); @@ -280,6 +323,7 @@ struct v9fs_transport v9fs_trans_tcp = { .write = v9fs_sock_send, .read = v9fs_sock_recv, .close = v9fs_sock_close, + .poll = v9fs_sock_poll, }; struct v9fs_transport v9fs_trans_unix = { @@ -287,4 +331,5 @@ struct v9fs_transport v9fs_trans_unix = { .write = v9fs_sock_send, .read = v9fs_sock_recv, .close = v9fs_sock_close, + .poll = v9fs_sock_poll, }; diff --git a/fs/9p/transport.h b/fs/9p/transport.h index 9e9cd418efd5..91fcdb94b361 100644 --- a/fs/9p/transport.h +++ b/fs/9p/transport.h @@ -3,6 +3,7 @@ * * Transport Definition * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * * This program is free software; you can redistribute it and/or modify @@ -31,14 +32,13 @@ enum v9fs_transport_status { struct v9fs_transport { enum v9fs_transport_status status; - struct semaphore writelock; - struct semaphore readlock; void *priv; int (*init) (struct v9fs_session_info *, const char *, char *); int (*write) (struct v9fs_transport *, void *, int); int (*read) (struct v9fs_transport *, void *, int); void (*close) (struct v9fs_transport *); + unsigned int (*poll)(struct v9fs_transport *, struct poll_table_struct *); }; extern struct v9fs_transport v9fs_trans_tcp; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 418c3743fdee..5e0f79355fdf 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -213,7 +213,8 @@ retry: return -1; } - error = idr_get_new(&p->pool, NULL, &i); + /* no need to store exactly p, we just need something non-null */ + error = idr_get_new(&p->pool, p, &i); up(&p->lock); 
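/* Descriptive note on the idr protocol of this era (inferred from the retry
 * label visible in the hunk above): idr_get_new() returns -EAGAIN when the
 * nodes preallocated by idr_pre_get() have been consumed, for instance by a
 * concurrent allocator, so the check below loops back to the retry label to
 * preallocate and try the allocation again. */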
if (error == -EAGAIN) @@ -242,6 +243,16 @@ void v9fs_put_idpool(int id, struct v9fs_idpool *p) up(&p->lock); } +/** + * v9fs_check_idpool - check whether the specified id is currently in use + * @id - id to check + * @p - pool + */ +int v9fs_check_idpool(int id, struct v9fs_idpool *p) +{ + return idr_find(&p->pool, id) != NULL; +} + /** * v9fs_session_init - initialize session * @v9ses: session information structure @@ -281,9 +292,6 @@ v9fs_session_init(struct v9fs_session_info *v9ses, /* id pools that are session-dependent: FIDs and TIDs */ idr_init(&v9ses->fidpool.pool); init_MUTEX(&v9ses->fidpool.lock); - idr_init(&v9ses->tidpool.pool); - init_MUTEX(&v9ses->tidpool.lock); - switch (v9ses->proto) { case PROTO_TCP: @@ -320,7 +328,12 @@ v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->shutdown = 0; v9ses->session_hung = 0; - if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) { + v9ses->mux = v9fs_mux_init(v9ses->transport, v9ses->maxdata + V9FS_IOHDRSZ, + &v9ses->extended); + + if (IS_ERR(v9ses->mux)) { + retval = PTR_ERR(v9ses->mux); + v9ses->mux = NULL; dprintk(DEBUG_ERROR, "problem initializing mux\n"); goto SessCleanUp; } @@ -381,7 +394,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } if (v9ses->afid != ~0) { - if (v9fs_t_clunk(v9ses, v9ses->afid, NULL)) + if (v9fs_t_clunk(v9ses, v9ses->afid)) dprintk(DEBUG_ERROR, "clunk failed\n"); } @@ -403,13 +416,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses, void v9fs_session_close(struct v9fs_session_info *v9ses) { - if (v9ses->recvproc) { - send_sig(SIGKILL, v9ses->recvproc, 1); - wait_for_completion(&v9ses->proccmpl); + if (v9ses->mux) { + v9fs_mux_destroy(v9ses->mux); + v9ses->mux = NULL; } - if (v9ses->transport) + if (v9ses->transport) { v9ses->transport->close(v9ses->transport); + kfree(v9ses->transport); + v9ses->transport = NULL; + } __putname(v9ses->name); __putname(v9ses->remotename); @@ -420,8 +436,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) * and cancel all pending requests. */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { + dprintk(DEBUG_ERROR, "cancel session %p\n", v9ses); v9ses->transport->status = Disconnected; - v9fs_mux_cancel_requests(v9ses, -EIO); + v9fs_mux_cancel(v9ses->mux, -EIO); } extern int v9fs_error_init(void); @@ -437,6 +454,7 @@ static int __init init_v9fs(void) printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); + v9fs_mux_global_init(); return register_filesystem(&v9fs_fs_type); } @@ -447,6 +465,7 @@ static int __init init_v9fs(void) static void __exit exit_v9fs(void) { + v9fs_mux_global_exit(); unregister_filesystem(&v9fs_fs_type); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 45dcef42bdd6..f337da7a0eec 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -57,24 +57,14 @@ struct v9fs_session_info { /* book keeping */ struct v9fs_idpool fidpool; /* The FID pool for file descriptors */ - struct v9fs_idpool tidpool; /* The TID pool for transactions ids */ - /* transport information */ struct v9fs_transport *transport; + struct v9fs_mux_data *mux; int inprogress; /* session in progress => true */ int shutdown; /* session shutting down. no more attaches. 
*/ unsigned char session_hung; - - /* mux private data */ - struct v9fs_fcall *curfcall; - wait_queue_head_t read_wait; - struct completion fcread; - struct completion proccmpl; - struct task_struct *recvproc; - - spinlock_t muxlock; - struct list_head mux_fcalls; + struct dentry *debugfs_dir; }; /* possible values of ->proto */ @@ -84,11 +74,14 @@ enum { PROTO_FD, }; +extern struct dentry *v9fs_debugfs_root; + int v9fs_session_init(struct v9fs_session_info *, const char *, char *); struct v9fs_session_info *v9fs_inode2v9ses(struct inode *); void v9fs_session_close(struct v9fs_session_info *v9ses); int v9fs_get_idpool(struct v9fs_idpool *p); void v9fs_put_idpool(int id, struct v9fs_idpool *p); +int v9fs_check_idpool(int id, struct v9fs_idpool *p); void v9fs_session_cancel(struct v9fs_session_info *v9ses); #define V9FS_MAGIC 0x01021997 diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index a6aa947de0f9..4887df767394 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -95,24 +95,21 @@ static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd) void v9fs_dentry_release(struct dentry *dentry) { + int err; + dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); if (dentry->d_fsdata != NULL) { struct list_head *fid_list = dentry->d_fsdata; struct v9fs_fid *temp = NULL; struct v9fs_fid *current_fid = NULL; - struct v9fs_fcall *fcall = NULL; list_for_each_entry_safe(current_fid, temp, fid_list, list) { - if (v9fs_t_clunk - (current_fid->v9ses, current_fid->fid, &fcall)) - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid); - v9fs_put_idpool(current_fid->fid, - ¤t_fid->v9ses->fidpool); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - kfree(fcall); v9fs_fid_destroy(current_fid); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 17089d1905ff..3893dd307ddb 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -74,7 +74,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = filp->f_dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); struct v9fs_fid *file = filp->private_data; - unsigned int i, n; + unsigned int i, n, s; int fid = -1; int ret = 0; struct v9fs_stat *mi = NULL; @@ -97,9 +97,9 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) n = file->rdir_fcall->params.rread.count; i = file->rdir_fpos; while (i < n) { - int s = v9fs_deserialize_stat(v9ses, - file->rdir_fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata); + s = v9fs_deserialize_stat( + file->rdir_fcall->params.rread.data + i, + n - i, mi, v9ses->maxdata, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, @@ -141,9 +141,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) n = ret; i = 0; while (i < n) { - int s = v9fs_deserialize_stat(v9ses, - fcall->params.rread.data + i, n - i, mi, - v9ses->maxdata); + s = v9fs_deserialize_stat(fcall->params.rread.data + i, + n - i, mi, v9ses->maxdata, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, @@ -199,11 +198,9 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, fid->fid); - if (v9fs_t_clunk(v9ses, fidnum, NULL)) + if (v9fs_t_clunk(v9ses, fidnum)) dprintk(DEBUG_ERROR, "clunk failed\n"); - v9fs_put_idpool(fid->fid, &v9ses->fidpool); - kfree(fid->rdir_fcall); kfree(fid); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 
0ea965c3bb7d..466002a1fe32 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -318,6 +318,7 @@ v9fs_create(struct inode *dir, int result = 0; unsigned int iounit = 0; int wfidno = -1; + int err; perm = unixmode2p9mode(v9ses, perm); @@ -356,6 +357,7 @@ v9fs_create(struct inode *dir, } kfree(fcall); + fcall = NULL; result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, perm, open_mode, &fcall); @@ -369,16 +371,23 @@ v9fs_create(struct inode *dir, iounit = fcall->params.rcreate.iounit; qid = fcall->params.rcreate.qid; kfree(fcall); + fcall = NULL; - fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); - dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); - if (!fid) { - result = -ENOMEM; - goto CleanUpFid; - } + if (!(perm&V9FS_DMDIR)) { + fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); + dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); + if (!fid) { + result = -ENOMEM; + goto CleanUpFid; + } - fid->qid = qid; - fid->iounit = iounit; + fid->qid = qid; + fid->iounit = iounit; + } else { + err = v9fs_t_clunk(v9ses, newfid); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); + } /* walk to the newly created file and put the fid in the dentry */ wfidno = v9fs_get_idpool(&v9ses->fidpool); @@ -388,18 +397,19 @@ v9fs_create(struct inode *dir, } result = v9fs_t_walk(v9ses, dirfidnum, wfidno, - (char *) file_dentry->d_name.name, NULL); + (char *) file_dentry->d_name.name, &fcall); if (result < 0) { dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); v9fs_put_idpool(wfidno, &v9ses->fidpool); wfidno = -1; goto CleanUpFid; } + kfree(fcall); + fcall = NULL; if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) { - v9fs_put_idpool(wfidno, &v9ses->fidpool); - } + v9fs_t_clunk(v9ses, newfid); + v9fs_put_idpool(wfidno, &v9ses->fidpool); goto CleanUpFid; } @@ -431,40 +441,21 @@ v9fs_create(struct inode *dir, file_dentry->d_op = &v9fs_dentry_operations; d_instantiate(file_dentry, file_inode); - if (perm & V9FS_DMDIR) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) - v9fs_put_idpool(newfid, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n", - FCALL_ERROR(fcall)); - kfree(fcall); - fid->fidopen = 0; - fid->fidcreate = 0; - d_drop(file_dentry); - } - return 0; CleanUpFid: kfree(fcall); + fcall = NULL; if (newfid >= 0) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) - v9fs_put_idpool(newfid, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); - - kfree(fcall); + err = v9fs_t_clunk(v9ses, newfid); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); } if (wfidno >= 0) { - if (!v9fs_t_clunk(v9ses, wfidno, &fcall)) - v9fs_put_idpool(wfidno, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); - - kfree(fcall); + err = v9fs_t_clunk(v9ses, wfidno); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); } return result; } @@ -972,6 +963,7 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); struct v9fs_fcall *fcall = NULL; struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + int err; dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, symname); @@ -1004,9 +996,9 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) kfree(fcall); - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: 
%s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(v9ses, newfid->fid); + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); goto FreeFcall; } @@ -1180,6 +1172,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); struct v9fs_fid *newfid = NULL; char *symname = __getname(); + int err; dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); @@ -1216,9 +1209,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, kfree(fcall); - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(v9ses, newfid->fid); + + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); goto FreeMem; } @@ -1252,6 +1246,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) struct v9fs_fcall *fcall = NULL; struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); char *symname = __getname(); + int err; dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); @@ -1310,9 +1305,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) /* need to update dcache so we show up */ kfree(fcall); - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(v9ses, newfid->fid); + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); goto FreeMem; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 82c5b0084079..83b6edd0988a 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -129,6 +129,7 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); + kfree(v9ses); return ERR_PTR(newfid); } @@ -157,7 +158,7 @@ static struct super_block *v9fs_get_sb(struct file_system_type stat_result = v9fs_t_stat(v9ses, newfid, &fcall); if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); - v9fs_t_clunk(v9ses, newfid, NULL); + v9fs_t_clunk(v9ses, newfid); v9fs_put_idpool(newfid, &v9ses->fidpool); } else { /* Setup the Root Inode */ -- cgit v1.2.3 From d8da097afb765654c866062148fd98b11db9003e Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:04:59 -0800 Subject: [PATCH] v9fs: fix fid management in v9fs_create v9fs_create doesn't manage correctly the fids when it is called to create a directory.. The fid created by the create 9P call (newfid) and the one created by walking to already created file (wfidno) are not used consistently. This patch cleans up the usage of newfid and wfidno. 
Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 466002a1fe32..f11edde6432e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -385,13 +385,14 @@ v9fs_create(struct inode *dir, fid->iounit = iounit; } else { err = v9fs_t_clunk(v9ses, newfid); + newfid = -1; if (err < 0) dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); } /* walk to the newly created file and put the fid in the dentry */ wfidno = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + if (wfidno < 0) { eprintk(KERN_WARNING, "no free fids available\n"); return -ENOSPC; } @@ -408,7 +409,6 @@ v9fs_create(struct inode *dir, fcall = NULL; if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - v9fs_t_clunk(v9ses, newfid); v9fs_put_idpool(wfidno, &v9ses->fidpool); goto CleanUpFid; @@ -419,7 +419,7 @@ v9fs_create(struct inode *dir, (perm & V9FS_DMDEVICE)) return 0; - result = v9fs_t_stat(v9ses, newfid, &fcall); + result = v9fs_t_stat(v9ses, wfidno, &fcall); if (result < 0) { dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall), result); -- cgit v1.2.3 From 531b1094b74365dcc55fa464d28a9a2497ae825d Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:05:00 -0800 Subject: [PATCH] v9fs: zero copy implementation Performance enhancement reducing the number of copies in the data and stat paths. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 302 ++++++++++-------- fs/9p/9p.h | 75 +++-- fs/9p/Makefile | 10 +- fs/9p/conv.c | 895 +++++++++++++++++++++++++++++++---------------------- fs/9p/conv.h | 28 +- fs/9p/debug.h | 23 +- fs/9p/error.c | 10 +- fs/9p/error.h | 3 +- fs/9p/fid.c | 3 - fs/9p/mux.c | 157 +++++----- fs/9p/trans_sock.c | 1 - fs/9p/v9fs.c | 3 +- fs/9p/v9fs_vfs.h | 5 +- fs/9p/vfs_dentry.c | 4 +- fs/9p/vfs_dir.c | 31 +- fs/9p/vfs_file.c | 25 +- fs/9p/vfs_inode.c | 545 +++++++++++--------------------- fs/9p/vfs_super.c | 10 +- 18 files changed, 1083 insertions(+), 1047 deletions(-) (limited to 'fs') diff --git a/fs/9p/9p.c b/fs/9p/9p.c index a3a1ac610723..dc3ce44ec836 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -1,8 +1,9 @@ /* * linux/fs/9p/9p.c * - * This file contains functions 9P2000 functions + * This file contains functions to perform synchronous 9P calls * + * Copyright (C) 2004 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 2002 by Ron Minnich * @@ -33,6 +34,7 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" +#include "conv.h" #include "mux.h" /** @@ -46,17 +48,21 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, - char *version, struct v9fs_fcall **fcall) + char *version, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version); - msg.id = TVERSION; - msg.tag = ~0; - msg.params.tversion.msize = msize; - msg.params.tversion.version = version; + tc = v9fs_create_tversion(msize, version); - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -72,19 +78,23 @@ v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, - u32 fid, u32 
afid, struct v9fs_fcall **fcall) + u32 fid, u32 afid, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall* tc; dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname, aname, fid, afid); - msg.id = TATTACH; - msg.params.tattach.fid = fid; - msg.params.tattach.afid = afid; - msg.params.tattach.uname = uname; - msg.params.tattach.aname = aname; - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_tattach(fid, afid, uname, aname); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, @@ -117,24 +127,28 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, * @fcall: pointer to response fcall pointer * */ + int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) { - int err; + int ret; struct v9fs_fcall *tc, *rc; - tc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); - dprintk(DEBUG_9P, "fid %d\n", fid); - tc->id = TCLUNK; - tc->params.tclunk.fid = fid; - err = v9fs_mux_rpc(v9ses->mux, tc, &rc); - if (err >= 0) { - v9fs_t_clunk_cb(v9ses, tc, rc, 0); - } + ret = -ENOMEM; + rc = NULL; + tc = v9fs_create_tclunk(fid); + if (!IS_ERR(tc)) + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); + else + ret = PTR_ERR(tc); + + if (ret) + dprintk(DEBUG_ERROR, "failed fid %d err %d\n", fid, ret); - return err; + v9fs_t_clunk_cb(v9ses, tc, rc, ret); + return ret; } /** @@ -144,14 +158,22 @@ v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) * */ -int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) +int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; - dprintk(DEBUG_9P, "oldtag %d\n", tag); - msg.id = TFLUSH; - msg.params.tflush.oldtag = tag; - return v9fs_mux_rpc(v9ses->mux, &msg, NULL); + dprintk(DEBUG_9P, "oldtag %d\n", oldtag); + + ret = -ENOMEM; + tc = v9fs_create_tflush(oldtag); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, NULL); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -163,17 +185,22 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) */ int -v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) +v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d\n", fid); - if (fcall) - *fcall = NULL; - msg.id = TSTAT; - msg.params.tstat.fid = fid; - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_tstat(fid); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -187,16 +214,22 @@ v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_stat *stat, struct v9fs_fcall **fcall) + struct v9fs_wstat *wstat, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; - dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length); - msg.id = TWSTAT; - msg.params.twstat.fid = fid; - msg.params.twstat.stat = stat; + dprintk(DEBUG_9P, "fid %d\n", fid); + + ret = -ENOMEM; + tc = v9fs_create_twstat(fid, wstat, v9ses->extended); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + return ret; } /** @@ -213,23 +246,28 
@@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, - char *name, struct v9fs_fcall **fcall) + char *name, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; + int nwname; dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name); - msg.id = TWALK; - msg.params.twalk.fid = fid; - msg.params.twalk.newfid = newfid; - - if (name) { - msg.params.twalk.nwname = 1; - msg.params.twalk.wnames = &name; - } else { - msg.params.twalk.nwname = 0; - } - - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + + if (name) + nwname = 1; + else + nwname = 0; + + ret = -ENOMEM; + tc = v9fs_create_twalk(fid, newfid, nwname, &name); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -244,19 +282,22 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, - struct v9fs_fcall **fcall) + struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - int errorno = -1; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); - msg.id = TOPEN; - msg.params.topen.fid = fid; - msg.params.topen.mode = mode; - errorno = v9fs_mux_rpc(v9ses->mux, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_topen(fid, mode); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return errorno; + return ret; } /** @@ -269,14 +310,22 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **fcall) + struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d\n", fid); - msg.id = TREMOVE; - msg.params.tremove.fid = fid; - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + + ret = -ENOMEM; + tc = v9fs_create_tremove(fid); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -292,20 +341,23 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, - u32 perm, u8 mode, struct v9fs_fcall **fcall) + u32 perm, u8 mode, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n", fid, name, perm, mode); - msg.id = TCREATE; - msg.params.tcreate.fid = fid; - msg.params.tcreate.name = name; - msg.params.tcreate.perm = perm; - msg.params.tcreate.mode = mode; + ret = -ENOMEM; + tc = v9fs_create_tcreate(fid, name, perm, mode); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return v9fs_mux_rpc(v9ses->mux, &msg, fcall); + return ret; } /** @@ -320,31 +372,30 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, struct v9fs_fcall **fcall) + u32 count, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - struct v9fs_fcall *rc = NULL; - long errorno = -1; - - dprintk(DEBUG_9P, "fid %d offset 0x%lx count 0x%x\n", fid, - (long unsigned int)offset, count); - msg.id = TREAD; - msg.params.tread.fid = fid; - msg.params.tread.offset = offset; - msg.params.tread.count = count; - errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); - - if (!errorno) { - errorno = rc->params.rread.count; - 
dump_data(rc->params.rread.data, rc->params.rread.count); - } - - if (fcall) - *fcall = rc; - else - kfree(rc); + int ret; + struct v9fs_fcall *tc, *rc; - return errorno; + dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid, + (long long unsigned) offset, count); + + ret = -ENOMEM; + tc = v9fs_create_tread(fid, offset, count); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); + if (!ret) + ret = rc->params.rread.count; + if (rcp) + *rcp = rc; + else + kfree(rc); + + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -358,32 +409,31 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, */ int -v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, - u64 offset, u32 count, void *data, struct v9fs_fcall **fcall) +v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, + const char __user *data, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - struct v9fs_fcall *rc = NULL; - long errorno = -1; + int ret; + struct v9fs_fcall *tc, *rc; - dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid, - (unsigned long long)offset, count); - dump_data(data, count); + dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid, + (long long unsigned) offset, count); - msg.id = TWRITE; - msg.params.twrite.fid = fid; - msg.params.twrite.offset = offset; - msg.params.twrite.count = count; - msg.params.twrite.data = data; + ret = -ENOMEM; + tc = v9fs_create_twrite(fid, offset, count, data); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); - errorno = v9fs_mux_rpc(v9ses->mux, &msg, &rc); + if (!ret) + ret = rc->params.rwrite.count; + if (rcp) + *rcp = rc; + else + kfree(rc); - if (!errorno) - errorno = rc->params.rwrite.count; + kfree(tc); + } else + ret = PTR_ERR(tc); - if (fcall) - *fcall = rc; - else - kfree(rc); - - return errorno; + return ret; } + diff --git a/fs/9p/9p.h b/fs/9p/9p.h index 6355392786e2..007ff639777d 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -3,6 +3,7 @@ * * 9P protocol definitions. 
* + * Copyright (C) 2005 by Latchesar Ionkov + * Copyright (C) 2004 by Eric Van Hensbergen + * Copyright (C) 2002 by Ron Minnich + * @@ -102,10 +103,16 @@ enum { #define V9FS_NOTAG (u16)(~0) #define V9FS_NOFID (u32)(~0) +#define V9FS_MAXWELEM 16 /* ample room for Twrite/Rread header (iounit) */ #define V9FS_IOHDRSZ 24 +struct v9fs_str { + u16 len; + char *str; +}; + /* qids are the unique ID for a file (like an inode) */ struct v9fs_qid { u8 type; @@ -115,6 +122,29 @@ struct v9fs_qid { /* Plan 9 file metadata (stat) structure */ struct v9fs_stat { + u16 size; + u16 type; + u32 dev; + struct v9fs_qid qid; + u32 mode; + u32 atime; + u32 mtime; + u64 length; + struct v9fs_str name; + struct v9fs_str uid; + struct v9fs_str gid; + struct v9fs_str muid; + struct v9fs_str extension; /* 9p2000.u extensions */ + u32 n_uid; /* 9p2000.u extensions */ + u32 n_gid; /* 9p2000.u extensions */ + u32 n_muid; /* 9p2000.u extensions */ +}; + +/* file metadata (stat) structure used to create Twstat message This is similar to v9fs_stat, but the strings don't point to + the same memory block and should be freed separately +*/ +struct v9fs_wstat { u16 size; u16 type; u32 dev; @@ -131,25 +161,24 @@ struct v9fs_stat { u32 n_uid; /* 9p2000.u extensions */ u32 n_gid; /* 9p2000.u extensions */ u32 n_muid; /* 9p2000.u extensions */ - char data[0]; }; /* Structures for Protocol Operations */ struct Tversion { u32 msize; - char *version; + struct v9fs_str version; }; struct Rversion { u32 msize; - char *version; + struct v9fs_str version; }; struct Tauth { u32 afid; - char *uname; - char *aname; + struct v9fs_str uname; + struct v9fs_str aname; }; struct Rauth { @@ -157,12 +186,12 @@ struct Rauth { }; struct Rerror { - char *error; + struct v9fs_str error; u32 errno; /* 9p2000.u extension */ }; struct Tflush { - u32 oldtag; + u16 oldtag; }; struct Rflush { @@ -171,8 +200,8 @@ struct Rflush { struct Tattach { u32 fid; u32 afid; - char *uname; - char *aname; + struct v9fs_str uname; + struct v9fs_str aname; }; struct Rattach { @@ -182,13 +211,13 @@ struct Rattach { struct Twalk { u32 fid; u32 newfid; - u32 nwname; - char **wnames; + u16 nwname; + struct v9fs_str wnames[16]; }; struct Rwalk { - u32 nwqid; - struct v9fs_qid *wqids; + u16 nwqid; + struct v9fs_qid wqids[16]; }; struct Topen { @@ -203,7 +232,7 @@ struct Ropen { struct Tcreate { u32 fid; - char *name; + struct v9fs_str name; u32 perm; u8 mode; }; @@ -254,12 +283,12 @@ struct Tstat { }; struct Rstat { - struct v9fs_stat *stat; + struct v9fs_stat stat; }; struct Twstat { u32 fid; - struct v9fs_stat *stat; + struct v9fs_stat stat; }; struct Rwstat { @@ -274,6 +303,7 @@ struct v9fs_fcall { u32 size; u8 id; u16 tag; + void *sdata; union { struct Tversion tversion; @@ -306,10 +336,12 @@ struct v9fs_fcall { } params; }; -#define V9FS_FCALLHDRSZ (sizeof(struct v9fs_fcall) + \ - sizeof(struct v9fs_stat) + 16*sizeof(struct v9fs_qid) + 16) +#define PRINT_FCALL_ERROR(s, fcall) dprintk(DEBUG_ERROR, "%s: %.*s\n", s, \ + fcall?fcall->params.rerror.error.len:0, \ + fcall?fcall->params.rerror.error.str:""); -#define FCALL_ERROR(fcall) (fcall ? 
fcall->params.rerror.error : "") +char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str); +int v9fs_str_compare(char *buf, struct v9fs_str *str); int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, char *version, struct v9fs_fcall **rcall); @@ -325,7 +357,7 @@ int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcall); int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_stat *stat, struct v9fs_fcall **rcall); + struct v9fs_wstat *wstat, struct v9fs_fcall **rcall); int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, char *name, struct v9fs_fcall **rcall); @@ -343,4 +375,5 @@ int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, struct v9fs_fcall **rcall); int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, void *data, struct v9fs_fcall **rcall); + u32 count, const char __user * data, + struct v9fs_fcall **rcall); diff --git a/fs/9p/Makefile b/fs/9p/Makefile index e4e4ffe5a7dc..3d023089707e 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -1,17 +1,17 @@ obj-$(CONFIG_9P_FS) := 9p2000.o 9p2000-objs := \ + trans_fd.o \ + trans_sock.o \ + mux.o \ + 9p.o \ + conv.o \ vfs_super.o \ vfs_inode.o \ vfs_file.o \ vfs_dir.o \ vfs_dentry.o \ error.o \ - mux.o \ - trans_fd.o \ - trans_sock.o \ - 9p.o \ - conv.o \ v9fs.o \ fid.o diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 1b9b15dfeaf0..f62434d435b3 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -30,7 +30,7 @@ #include #include #include - +#include #include "debug.h" #include "v9fs.h" #include "9p.h" @@ -45,6 +45,37 @@ struct cbuf { unsigned char *ep; }; +char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str) +{ + int n; + + if (buflen < str->len) + n = buflen; + else + n = str->len; + + memmove(buf, str->str, n - 1); + + return buf; +} + +int v9fs_str_compare(char *buf, struct v9fs_str *str) +{ + int n, ret; + + ret = strncmp(buf, str->str, str->len); + + if (!ret) { + n = strlen(buf); + if (n < str->len) + ret = -1; + else if (n > str->len) + ret = 1; + } + + return ret; +} + static inline void buf_init(struct cbuf *buf, void *data, int datalen) { buf->sp = buf->p = data; @@ -58,12 +89,12 @@ static inline int buf_check_overflow(struct cbuf *buf) static inline int buf_check_size(struct cbuf *buf, int len) { - if (buf->p+len > buf->ep) { - if (buf->p < buf->ep) { - eprintk(KERN_ERR, "buffer overflow\n"); - buf->p = buf->ep + 1; - return 0; - } + if (buf->p + len > buf->ep && buf->p < buf->ep) { + eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", + len, (int)(buf->ep - buf->p)); + dump_stack(); + buf->p = buf->ep + 1; + return 0; } return 1; @@ -127,14 +158,6 @@ static inline void buf_put_string(struct cbuf *buf, const char *s) buf_put_stringn(buf, s, strlen(s)); } -static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen) -{ - if (buf_check_size(buf, datalen)) { - memcpy(buf->p, data, datalen); - buf->p += datalen; - } -} - static inline u8 buf_get_int8(struct cbuf *buf) { u8 ret = 0; @@ -183,85 +206,37 @@ static inline u64 buf_get_int64(struct cbuf *buf) return ret; } -static inline int -buf_get_string(struct cbuf *buf, char *data, unsigned int datalen) -{ - u16 len = 0; - - len = buf_get_int16(buf); - if (!buf_check_overflow(buf) && buf_check_size(buf, len) && len+1>datalen) { - memcpy(data, buf->p, len); - data[len] = 0; - buf->p += len; - len++; - } - - return len; -} - -static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf) -{ - char *ret; - u16 len; - - 
ret = NULL; - len = buf_get_int16(buf); - - if (!buf_check_overflow(buf) && buf_check_size(buf, len) && - buf_check_size(sbuf, len + 1)) { - - memcpy(sbuf->p, buf->p, len); - sbuf->p[len] = 0; - ret = sbuf->p; - buf->p += len; - sbuf->p += len + 1; - } - - return ret; -} - -static inline int buf_get_data(struct cbuf *buf, void *data, int datalen) +static inline void buf_get_str(struct cbuf *buf, struct v9fs_str *vstr) { - int ret = 0; - - if (buf_check_size(buf, datalen)) { - memcpy(data, buf->p, datalen); - buf->p += datalen; - ret = datalen; + vstr->len = buf_get_int16(buf); + if (!buf_check_overflow(buf) && buf_check_size(buf, vstr->len)) { + vstr->str = buf->p; + buf->p += vstr->len; + } else { + vstr->len = 0; + vstr->str = NULL; } - - return ret; } -static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf, - int datalen) +static inline void buf_get_qid(struct cbuf *bufp, struct v9fs_qid *qid) { - char *ret = NULL; - int n = 0; - - if (buf_check_size(dbuf, datalen)) { - n = buf_get_data(buf, dbuf->p, datalen); - if (n > 0) { - ret = dbuf->p; - dbuf->p += n; - } - } - - return ret; + qid->type = buf_get_int8(bufp); + qid->version = buf_get_int32(bufp); + qid->path = buf_get_int64(bufp); } /** - * v9fs_size_stat - calculate the size of a variable length stat struct + * v9fs_size_wstat - calculate the size of a variable length stat struct * @stat: metadata (stat) structure * @extended: non-zero if 9P2000.u * */ -static int v9fs_size_stat(struct v9fs_stat *stat, int extended) +static int v9fs_size_wstat(struct v9fs_wstat *wstat, int extended) { int size = 0; - if (stat == NULL) { + if (wstat == NULL) { eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n"); return 0; } @@ -278,81 +253,38 @@ static int v9fs_size_stat(struct v9fs_stat *stat, int extended) 8 + /* length[8] */ 8; /* minimum sum of string lengths */ - if (stat->name) - size += strlen(stat->name); - if (stat->uid) - size += strlen(stat->uid); - if (stat->gid) - size += strlen(stat->gid); - if (stat->muid) - size += strlen(stat->muid); + if (wstat->name) + size += strlen(wstat->name); + if (wstat->uid) + size += strlen(wstat->uid); + if (wstat->gid) + size += strlen(wstat->gid); + if (wstat->muid) + size += strlen(wstat->muid); if (extended) { size += 4 + /* n_uid[4] */ 4 + /* n_gid[4] */ 4 + /* n_muid[4] */ 2; /* string length of extension[4] */ - if (stat->extension) - size += strlen(stat->extension); + if (wstat->extension) + size += strlen(wstat->extension); } return size; } /** - * serialize_stat - safely format a stat structure for transmission - * @stat: metadata (stat) structure - * @bufp: buffer to serialize structure into - * @extended: non-zero if 9P2000.u - * - */ - -static int -serialize_stat(struct v9fs_stat *stat, struct cbuf *bufp, int extended) -{ - buf_put_int16(bufp, stat->size); - buf_put_int16(bufp, stat->type); - buf_put_int32(bufp, stat->dev); - buf_put_int8(bufp, stat->qid.type); - buf_put_int32(bufp, stat->qid.version); - buf_put_int64(bufp, stat->qid.path); - buf_put_int32(bufp, stat->mode); - buf_put_int32(bufp, stat->atime); - buf_put_int32(bufp, stat->mtime); - buf_put_int64(bufp, stat->length); - - buf_put_string(bufp, stat->name); - buf_put_string(bufp, stat->uid); - buf_put_string(bufp, stat->gid); - buf_put_string(bufp, stat->muid); - - if (extended) { - buf_put_string(bufp, stat->extension); - buf_put_int32(bufp, stat->n_uid); - buf_put_int32(bufp, stat->n_gid); - buf_put_int32(bufp, stat->n_muid); - } - - if (buf_check_overflow(bufp)) - return 0; - - return stat->size; -} - 
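/* Descriptive note on the shape of this conversion: serialize_stat() and
 * deserialize_stat() and their bounce buffers go away entirely. On the send
 * side v9fs_put_wstat() (further below) emits the wire bytes and fills in the
 * in-memory struct v9fs_stat in a single pass over the fcall's sdata, and on
 * the receive side buf_get_str() and buf_get_stat() leave strings as v9fs_str
 * pointers into the receive buffer itself, which is the copy reduction this
 * commit's changelog advertises. */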
-/** - * deserialize_stat - safely decode a recieved metadata (stat) structure + * buf_get_stat - safely decode a received metadata (stat) structure * @bufp: buffer to deserialize * @stat: metadata (stat) structure - * @dbufp: buffer to deserialize variable strings into * @extended: non-zero if 9P2000.u * */ -static inline int -deserialize_stat(struct cbuf *bufp, struct v9fs_stat *stat, - struct cbuf *dbufp, int extended) +static inline void +buf_get_stat(struct cbuf *bufp, struct v9fs_stat *stat, int extended) { - stat->size = buf_get_int16(bufp); stat->type = buf_get_int16(bufp); stat->dev = buf_get_int32(bufp); @@ -363,45 +295,17 @@ deserialize_stat, stat->atime = buf_get_int32(bufp); stat->mtime = buf_get_int32(bufp); stat->length = buf_get_int64(bufp); - stat->name = buf_get_stringb(bufp, dbufp); - stat->uid = buf_get_stringb(bufp, dbufp); - stat->gid = buf_get_stringb(bufp, dbufp); - stat->muid = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &stat->name); + buf_get_str(bufp, &stat->uid); + buf_get_str(bufp, &stat->gid); + buf_get_str(bufp, &stat->muid); if (extended) { - stat->extension = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &stat->extension); stat->n_uid = buf_get_int32(bufp); stat->n_gid = buf_get_int32(bufp); stat->n_muid = buf_get_int32(bufp); } - - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) - return 0; - - return stat->size + 2; -} - -/** - * deserialize_statb - wrapper for decoding a received metadata structure - * @bufp: buffer to deserialize - * @dbufp: buffer to deserialize variable strings into - * @extended: non-zero if 9P2000.u - * - */ - -static inline struct v9fs_stat *deserialize_statb(struct cbuf *bufp, - struct cbuf *dbufp, - int extended) -{ - struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat)); - - if (ret) { - int n = deserialize_stat(bufp, ret, dbufp, extended); - if (n <= 0) - return NULL; - } - - return ret; } /** @@ -409,194 +313,27 @@ static inline struct v9fs_stat *deserialize_statb(struct cbuf *bufp, * @buf: buffer to deserialize * @buflen: length of received buffer * @stat: metadata structure to decode into - * @statlen: length of destination metadata structure * @extended: non-zero if 9P2000.u * + * Note: stat will point to the buf region. 
*/ -int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, - u32 statlen, int extended) +int +v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; - struct cbuf dbuffer; - struct cbuf *dbufp = &dbuffer; + unsigned char *p; buf_init(bufp, buf, buflen); - buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat), - statlen - sizeof(struct v9fs_stat)); - - return deserialize_stat(bufp, stat, dbufp, extended); -} - -static inline int v9fs_size_fcall(struct v9fs_fcall *fcall, int extended) -{ - int size = 4 + 1 + 2; /* size[4] msg[1] tag[2] */ - int i = 0; + p = bufp->p; + buf_get_stat(bufp, stat, extended); - switch (fcall->id) { - default: - eprintk(KERN_ERR, "bad msg type %d\n", fcall->id); + if (buf_check_overflow(bufp)) return 0; - case TVERSION: /* msize[4] version[s] */ - size += 4 + 2 + strlen(fcall->params.tversion.version); - break; - case TAUTH: /* afid[4] uname[s] aname[s] */ - size += 4 + 2 + strlen(fcall->params.tauth.uname) + - 2 + strlen(fcall->params.tauth.aname); - break; - case TFLUSH: /* oldtag[2] */ - size += 2; - break; - case TATTACH: /* fid[4] afid[4] uname[s] aname[s] */ - size += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) + - 2 + strlen(fcall->params.tattach.aname); - break; - case TWALK: /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ - size += 4 + 4 + 2; - /* now compute total for the array of names */ - for (i = 0; i < fcall->params.twalk.nwname; i++) - size += 2 + strlen(fcall->params.twalk.wnames[i]); - break; - case TOPEN: /* fid[4] mode[1] */ - size += 4 + 1; - break; - case TCREATE: /* fid[4] name[s] perm[4] mode[1] */ - size += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1; - break; - case TREAD: /* fid[4] offset[8] count[4] */ - size += 4 + 8 + 4; - break; - case TWRITE: /* fid[4] offset[8] count[4] data[count] */ - size += 4 + 8 + 4 + fcall->params.twrite.count; - break; - case TCLUNK: /* fid[4] */ - size += 4; - break; - case TREMOVE: /* fid[4] */ - size += 4; - break; - case TSTAT: /* fid[4] */ - size += 4; - break; - case TWSTAT: /* fid[4] stat[n] */ - fcall->params.twstat.stat->size = - v9fs_size_stat(fcall->params.twstat.stat, extended); - size += 4 + 2 + 2 + fcall->params.twstat.stat->size; - } - return size; -} - -/* - * v9fs_serialize_fcall - marshall fcall struct into a packet - * @fcall: structure to convert - * @data: buffer to serialize fcall into - * @datalen: length of buffer to serialize fcall into - * @extended: non-zero if 9P2000.u - * - */ - -int -v9fs_serialize_fcall(struct v9fs_fcall *fcall, void *data, u32 datalen, - int extended) -{ - int i = 0; - struct v9fs_stat *stat = NULL; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - buf_init(bufp, data, datalen); - - if (!fcall) { - eprintk(KERN_ERR, "no fcall\n"); - return -EINVAL; - } - - fcall->size = v9fs_size_fcall(fcall, extended); - - buf_put_int32(bufp, fcall->size); - buf_put_int8(bufp, fcall->id); - buf_put_int16(bufp, fcall->tag); - - dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id, - fcall->tag); - - /* now encode it */ - switch (fcall->id) { - default: - eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id); - return -EPROTO; - case TVERSION: - buf_put_int32(bufp, fcall->params.tversion.msize); - buf_put_string(bufp, fcall->params.tversion.version); - break; - case TAUTH: - buf_put_int32(bufp, fcall->params.tauth.afid); - buf_put_string(bufp, fcall->params.tauth.uname); - buf_put_string(bufp, fcall->params.tauth.aname); - break; - case TFLUSH: - 
buf_put_int16(bufp, fcall->params.tflush.oldtag); - break; - case TATTACH: - buf_put_int32(bufp, fcall->params.tattach.fid); - buf_put_int32(bufp, fcall->params.tattach.afid); - buf_put_string(bufp, fcall->params.tattach.uname); - buf_put_string(bufp, fcall->params.tattach.aname); - break; - case TWALK: - buf_put_int32(bufp, fcall->params.twalk.fid); - buf_put_int32(bufp, fcall->params.twalk.newfid); - buf_put_int16(bufp, fcall->params.twalk.nwname); - for (i = 0; i < fcall->params.twalk.nwname; i++) - buf_put_string(bufp, fcall->params.twalk.wnames[i]); - break; - case TOPEN: - buf_put_int32(bufp, fcall->params.topen.fid); - buf_put_int8(bufp, fcall->params.topen.mode); - break; - case TCREATE: - buf_put_int32(bufp, fcall->params.tcreate.fid); - buf_put_string(bufp, fcall->params.tcreate.name); - buf_put_int32(bufp, fcall->params.tcreate.perm); - buf_put_int8(bufp, fcall->params.tcreate.mode); - break; - case TREAD: - buf_put_int32(bufp, fcall->params.tread.fid); - buf_put_int64(bufp, fcall->params.tread.offset); - buf_put_int32(bufp, fcall->params.tread.count); - break; - case TWRITE: - buf_put_int32(bufp, fcall->params.twrite.fid); - buf_put_int64(bufp, fcall->params.twrite.offset); - buf_put_int32(bufp, fcall->params.twrite.count); - buf_put_data(bufp, fcall->params.twrite.data, - fcall->params.twrite.count); - break; - case TCLUNK: - buf_put_int32(bufp, fcall->params.tclunk.fid); - break; - case TREMOVE: - buf_put_int32(bufp, fcall->params.tremove.fid); - break; - case TSTAT: - buf_put_int32(bufp, fcall->params.tstat.fid); - break; - case TWSTAT: - buf_put_int32(bufp, fcall->params.twstat.fid); - stat = fcall->params.twstat.stat; - - buf_put_int16(bufp, stat->size + 2); - serialize_stat(stat, bufp, extended); - break; - } - - if (buf_check_overflow(bufp)) { - dprintk(DEBUG_ERROR, "buffer overflow\n"); - return -EIO; - } - - return fcall->size; + else + return bufp->p - p; } /** @@ -611,18 +348,14 @@ v9fs_serialize_fcall(struct v9fs_fcall *fcall, void *data, u32 datalen, int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen, int extended) + int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; - struct cbuf dbuffer; - struct cbuf *dbufp = &dbuffer; int i = 0; buf_init(bufp, buf, buflen); - buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall), - rcalllen - sizeof(struct v9fs_fcall)); rcall->size = buf_get_int32(bufp); rcall->id = buf_get_int8(bufp); @@ -630,13 +363,14 @@ v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id, rcall->tag); + switch (rcall->id) { default: eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id); return -EPROTO; case RVERSION: rcall->params.rversion.msize = buf_get_int32(bufp); - rcall->params.rversion.version = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &rcall->params.rversion.version); break; case RFLUSH: break; @@ -647,40 +381,27 @@ v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, break; case RWALK: rcall->params.rwalk.nwqid = buf_get_int16(bufp); - if (rcall->params.rwalk.nwqid > 16) { - eprintk(KERN_ERR, "Rwalk with more than 16 qids: %d\n", - rcall->params.rwalk.nwqid); + if (rcall->params.rwalk.nwqid > V9FS_MAXWELEM) { + eprintk(KERN_ERR, "Rwalk with more than %d qids: %d\n", + V9FS_MAXWELEM, rcall->params.rwalk.nwqid); return -EPROTO; } - rcall->params.rwalk.wqids = buf_alloc(dbufp, - rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid)); - if (rcall->params.rwalk.wqids) - for (i = 
0; i < rcall->params.rwalk.nwqid; i++) { - rcall->params.rwalk.wqids[i].type = - buf_get_int8(bufp); - rcall->params.rwalk.wqids[i].version = - buf_get_int16(bufp); - rcall->params.rwalk.wqids[i].path = - buf_get_int64(bufp); - } + for (i = 0; i < rcall->params.rwalk.nwqid; i++) + buf_get_qid(bufp, &rcall->params.rwalk.wqids[i]); break; case ROPEN: - rcall->params.ropen.qid.type = buf_get_int8(bufp); - rcall->params.ropen.qid.version = buf_get_int32(bufp); - rcall->params.ropen.qid.path = buf_get_int64(bufp); + buf_get_qid(bufp, &rcall->params.ropen.qid); rcall->params.ropen.iounit = buf_get_int32(bufp); break; case RCREATE: - rcall->params.rcreate.qid.type = buf_get_int8(bufp); - rcall->params.rcreate.qid.version = buf_get_int32(bufp); - rcall->params.rcreate.qid.path = buf_get_int64(bufp); + buf_get_qid(bufp, &rcall->params.rcreate.qid); rcall->params.rcreate.iounit = buf_get_int32(bufp); break; case RREAD: rcall->params.rread.count = buf_get_int32(bufp); - rcall->params.rread.data = buf_get_datab(bufp, dbufp, - rcall->params.rread.count); + rcall->params.rread.data = bufp->p; + buf_check_size(bufp, rcall->params.rread.count); break; case RWRITE: rcall->params.rwrite.count = buf_get_int32(bufp); @@ -691,22 +412,442 @@ v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, break; case RSTAT: buf_get_int16(bufp); - rcall->params.rstat.stat = - deserialize_statb(bufp, dbufp, extended); + buf_get_stat(bufp, &rcall->params.rstat.stat, extended); break; case RWSTAT: break; case RERROR: - rcall->params.rerror.error = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &rcall->params.rerror.error); if (extended) rcall->params.rerror.errno = buf_get_int16(bufp); break; } - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) { + if (buf_check_overflow(bufp)) { dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; } - return rcall->size; + return bufp->p - bufp->sp; +} + +static inline void v9fs_put_int8(struct cbuf *bufp, u8 val, u8 * p) +{ + *p = val; + buf_put_int8(bufp, val); +} + +static inline void v9fs_put_int16(struct cbuf *bufp, u16 val, u16 * p) +{ + *p = val; + buf_put_int16(bufp, val); +} + +static inline void v9fs_put_int32(struct cbuf *bufp, u32 val, u32 * p) +{ + *p = val; + buf_put_int32(bufp, val); +} + +static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) +{ + *p = val; + buf_put_int64(bufp, val); +} + +static inline void +v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) +{ + if (data) { + str->len = strlen(data); + str->str = bufp->p; + } else { + str->len = 0; + str->str = NULL; + } + + buf_put_stringn(bufp, data, str->len); +} + +static inline int +v9fs_put_user_data(struct cbuf *bufp, const char __user * data, int count, + unsigned char **pdata) +{ + *pdata = buf_alloc(bufp, count); + return copy_from_user(*pdata, data, count); +} + +static void +v9fs_put_wstat(struct cbuf *bufp, struct v9fs_wstat *wstat, + struct v9fs_stat *stat, int statsz, int extended) +{ + v9fs_put_int16(bufp, statsz, &stat->size); + v9fs_put_int16(bufp, wstat->type, &stat->type); + v9fs_put_int32(bufp, wstat->dev, &stat->dev); + v9fs_put_int8(bufp, wstat->qid.type, &stat->qid.type); + v9fs_put_int32(bufp, wstat->qid.version, &stat->qid.version); + v9fs_put_int64(bufp, wstat->qid.path, &stat->qid.path); + v9fs_put_int32(bufp, wstat->mode, &stat->mode); + v9fs_put_int32(bufp, wstat->atime, &stat->atime); + v9fs_put_int32(bufp, wstat->mtime, &stat->mtime); + v9fs_put_int64(bufp, wstat->length, &stat->length); + + v9fs_put_str(bufp, wstat->name, 
&stat->name); + v9fs_put_str(bufp, wstat->uid, &stat->uid); + v9fs_put_str(bufp, wstat->gid, &stat->gid); + v9fs_put_str(bufp, wstat->muid, &stat->muid); + + if (extended) { + v9fs_put_str(bufp, wstat->extension, &stat->extension); + v9fs_put_int32(bufp, wstat->n_uid, &stat->n_uid); + v9fs_put_int32(bufp, wstat->n_gid, &stat->n_gid); + v9fs_put_int32(bufp, wstat->n_muid, &stat->n_muid); + } +} + +static struct v9fs_fcall * +v9fs_create_common(struct cbuf *bufp, u32 size, u8 id) +{ + struct v9fs_fcall *fc; + + size += 4 + 1 + 2; /* size[4] id[1] tag[2] */ + fc = kmalloc(sizeof(struct v9fs_fcall) + size, GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->sdata = (char *)fc + sizeof(*fc); + + buf_init(bufp, (char *)fc->sdata, size); + v9fs_put_int32(bufp, size, &fc->size); + v9fs_put_int8(bufp, id, &fc->id); + v9fs_put_int16(bufp, V9FS_NOTAG, &fc->tag); + + return fc; +} + +void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag) +{ + *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag); +} + +struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(version); /* msize[4] version[s] */ + fc = v9fs_create_common(bufp, size, TVERSION); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, msize, &fc->params.tversion.msize); + v9fs_put_str(bufp, version, &fc->params.tversion.version); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(uname) + 2 + strlen(aname); /* afid[4] uname[s] aname[s] */ + fc = v9fs_create_common(bufp, size, TAUTH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, afid, &fc->params.tauth.afid); + v9fs_put_str(bufp, uname, &fc->params.tauth.uname); + v9fs_put_str(bufp, aname, &fc->params.tauth.aname); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall * +v9fs_create_tattach(u32 fid, u32 afid, char *uname, char *aname) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 4 + 2 + strlen(uname) + 2 + strlen(aname); /* fid[4] afid[4] uname[s] aname[s] */ + fc = v9fs_create_common(bufp, size, TATTACH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tattach.fid); + v9fs_put_int32(bufp, afid, &fc->params.tattach.afid); + v9fs_put_str(bufp, uname, &fc->params.tattach.uname); + v9fs_put_str(bufp, aname, &fc->params.tattach.aname); + + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tflush(u16 oldtag) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 2; /* oldtag[2] */ + fc = v9fs_create_common(bufp, size, TFLUSH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int16(bufp, oldtag, &fc->params.tflush.oldtag); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, + char **wnames) +{ + int i, size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + if (nwname > V9FS_MAXWELEM) { + dprintk(DEBUG_ERROR, "nwname > %d\n", V9FS_MAXWELEM); + return NULL; + } + + size = 4 + 4 + 2; /* fid[4] newfid[4] nwname[2] ... 
*/ + for (i = 0; i < nwname; i++) { + size += 2 + strlen(wnames[i]); /* wname[s] */ + } + + fc = v9fs_create_common(bufp, size, TWALK); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twalk.fid); + v9fs_put_int32(bufp, newfid, &fc->params.twalk.newfid); + v9fs_put_int16(bufp, nwname, &fc->params.twalk.nwname); + for (i = 0; i < nwname; i++) { + v9fs_put_str(bufp, wnames[i], &fc->params.twalk.wnames[i]); + } + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 1; /* fid[4] mode[1] */ + fc = v9fs_create_common(bufp, size, TOPEN); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.topen.fid); + v9fs_put_int8(bufp, mode, &fc->params.topen.mode); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ + fc = v9fs_create_common(bufp, size, TCREATE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tcreate.fid); + v9fs_put_str(bufp, name, &fc->params.tcreate.name); + v9fs_put_int32(bufp, perm, &fc->params.tcreate.perm); + v9fs_put_int8(bufp, mode, &fc->params.tcreate.mode); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 8 + 4; /* fid[4] offset[8] count[4] */ + fc = v9fs_create_common(bufp, size, TREAD); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tread.fid); + v9fs_put_int64(bufp, offset, &fc->params.tread.offset); + v9fs_put_int32(bufp, count, &fc->params.tread.count); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, + const char __user * data) +{ + int size, err; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 8 + 4 + count; /* fid[4] offset[8] count[4] data[count] */ + fc = v9fs_create_common(bufp, size, TWRITE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twrite.fid); + v9fs_put_int64(bufp, offset, &fc->params.twrite.offset); + v9fs_put_int32(bufp, count, &fc->params.twrite.count); + err = v9fs_put_user_data(bufp, data, count, &fc->params.twrite.data); + if (err) { + kfree(fc); + fc = ERR_PTR(err); + } + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tclunk(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TCLUNK); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tclunk.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tremove(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 
4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TREMOVE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tremove.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tstat(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TSTAT); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tstat.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, + int extended) +{ + int size, statsz; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + statsz = v9fs_size_wstat(wstat, extended); + size = 4 + 2 + 2 + statsz; /* fid[4] stat[n] */ + fc = v9fs_create_common(bufp, size, TWSTAT); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twstat.fid); + buf_put_int16(bufp, statsz + 2); + v9fs_put_wstat(bufp, wstat, &fc->params.twstat.stat, statsz, extended); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; } diff --git a/fs/9p/conv.h b/fs/9p/conv.h index d5e33e17a685..26a736e4a2e7 100644 --- a/fs/9p/conv.h +++ b/fs/9p/conv.h @@ -1,8 +1,9 @@ /* * linux/fs/9p/conv.h * - * 9P protocol conversion definitions + * 9P protocol conversion definitions. * + * Copyright (C) 2005 by Latchesar Ionkov * Copyright (C) 2004 by Eric Van Hensbergen * Copyright (C) 2002 by Ron Minnich * @@ -25,11 +26,26 @@ */ int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, - u32 statlen, int extended); -int v9fs_serialize_fcall(struct v9fs_fcall *tcall, void *buf, u32 buflen, int extended); int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen, int extended); + int extended); + +void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag); -/* this one is actually in error.c right now */ -int v9fs_errstr2errno(char *errstr); +struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version); +struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname); +struct v9fs_fcall *v9fs_create_tattach(u32 fid, u32 afid, char *uname, + char *aname); +struct v9fs_fcall *v9fs_create_tflush(u16 oldtag); +struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, + char **wnames); +struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode); +struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode); +struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count); +struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, + const char __user *data); +struct v9fs_fcall *v9fs_create_tclunk(u32 fid); +struct v9fs_fcall *v9fs_create_tremove(u32 fid); +struct v9fs_fcall *v9fs_create_tstat(u32 fid); +struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, + int extended); diff --git a/fs/9p/debug.h b/fs/9p/debug.h index 4445f06919d9..fe551032788b 100644 --- a/fs/9p/debug.h +++ b/fs/9p/debug.h @@ -51,16 +51,23 @@ do { \ #if DEBUG_DUMP_PKT static inline void dump_data(const unsigned char *data, unsigned int datalen) { - int i, j; - int len = datalen; + int i, n; + char buf[5*8]; - printk(KERN_DEBUG "data "); - for (i = 0; i < len; i += 4) { - for (j = 0; (j < 4) && (i + j < len); j++) - printk(KERN_DEBUG "%02x", data[i + j]); - printk(KERN_DEBUG " "); + n = 
0; + i = 0; + while (i < datalen) { + n += snprintf(buf+n, sizeof(buf)-n, "%02x", data[i++]); + if (i%4 == 0) + n += snprintf(buf+n, sizeof(buf)-n, " "); + + if (i%16 == 0) { + dprintk(DEBUG_ERROR, "%s\n", buf); + n = 0; + } } - printk(KERN_DEBUG "\n"); + + dprintk(DEBUG_ERROR, "%s\n", buf); } #else /* DEBUG_DUMP_PKT */ static inline void dump_data(const unsigned char *data, unsigned int datalen) diff --git a/fs/9p/error.c b/fs/9p/error.c index 834cb179e388..e4b6f8f38b6f 100644 --- a/fs/9p/error.c +++ b/fs/9p/error.c @@ -33,7 +33,6 @@ #include #include -#include #include "debug.h" #include "error.h" @@ -55,7 +54,8 @@ int v9fs_error_init(void) /* load initial error map into hash table */ for (c = errmap; c->name != NULL; c++) { - bucket = jhash(c->name, strlen(c->name), 0) % ERRHASHSZ; + c->namelen = strlen(c->name); + bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; INIT_HLIST_NODE(&c->list); hlist_add_head(&c->list, &hash_errmap[bucket]); } @@ -69,15 +69,15 @@ int v9fs_error_init(void) * */ -int v9fs_errstr2errno(char *errstr) +int v9fs_errstr2errno(char *errstr, int len) { int errno = 0; struct hlist_node *p = NULL; struct errormap *c = NULL; - int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ; + int bucket = jhash(errstr, len, 0) % ERRHASHSZ; hlist_for_each_entry(c, p, &hash_errmap[bucket], list) { - if (!strcmp(c->name, errstr)) { + if (c->namelen==len && !memcmp(c->name, errstr, len)) { errno = c->val; break; } diff --git a/fs/9p/error.h b/fs/9p/error.h index 78f89acf7c9a..8b3176b3f658 100644 --- a/fs/9p/error.h +++ b/fs/9p/error.h @@ -36,6 +36,7 @@ struct errormap { char *name; int val; + int namelen; struct hlist_node list; }; @@ -175,4 +176,4 @@ static struct errormap errmap[] = { }; extern int v9fs_error_init(void); -extern int v9fs_errstr2errno(char *errstr); +extern int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 60ef8aba757a..eda449778fa5 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -31,9 +31,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "transport.h" -#include "mux.h" -#include "conv.h" #include "fid.h" /** diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 62b6ad0767e1..f21cf5083973 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -35,8 +35,8 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" -#include "transport.h" #include "conv.h" +#include "transport.h" #include "mux.h" #define ERREQFLUSH 1 @@ -74,6 +74,7 @@ struct v9fs_mux_data { wait_queue_head_t equeue; struct list_head req_list; struct list_head unsent_req_list; + struct v9fs_fcall *rcall; int rpos; char *rbuf; int wpos; @@ -101,11 +102,15 @@ struct v9fs_mux_rpc { wait_queue_head_t wqueue; }; +extern int v9fs_errstr2errno(char *str, int len); + static int v9fs_poll_proc(void *); static void v9fs_read_work(void *); static void v9fs_write_work(void *); static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, poll_table * p); +static u16 v9fs_mux_get_tag(struct v9fs_mux_data *); +static void v9fs_mux_put_tag(struct v9fs_mux_data *, u16); static DECLARE_MUTEX(v9fs_mux_task_lock); static struct workqueue_struct *v9fs_mux_wq; @@ -166,8 +171,9 @@ static void v9fs_mux_poll_start(struct v9fs_mux_data *m) if (v9fs_mux_poll_tasks[i].task == NULL) { vpt = &v9fs_mux_poll_tasks[i]; dprintk(DEBUG_MUX, "create proc %p\n", vpt); - vpt->task = kthread_create(v9fs_poll_proc, - vpt, "v9fs-poll"); + vpt->task = + kthread_create(v9fs_poll_proc, vpt, + "v9fs-poll"); INIT_LIST_HEAD(&vpt->mux_list); vpt->muxnum = 0; v9fs_mux_poll_task_num++; @@ 
-253,7 +259,7 @@ struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, struct v9fs_mux_data *m, *mtmp; dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize); - m = kmalloc(sizeof(struct v9fs_mux_data) + 2 * msize, GFP_KERNEL); + m = kmalloc(sizeof(struct v9fs_mux_data), GFP_KERNEL); if (!m) return ERR_PTR(-ENOMEM); @@ -268,10 +274,11 @@ struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, init_waitqueue_head(&m->equeue); INIT_LIST_HEAD(&m->req_list); INIT_LIST_HEAD(&m->unsent_req_list); + m->rcall = NULL; m->rpos = 0; - m->rbuf = (char *)m + sizeof(struct v9fs_mux_data); + m->rbuf = NULL; m->wpos = m->wsize = 0; - m->wbuf = m->rbuf + msize; + m->wbuf = NULL; INIT_WORK(&m->rq, v9fs_read_work, m); INIT_WORK(&m->wq, v9fs_write_work, m); m->wsched = 0; @@ -427,29 +434,6 @@ static int v9fs_poll_proc(void *a) return 0; } -static inline int v9fs_write_req(struct v9fs_mux_data *m, struct v9fs_req *req) -{ - int n; - - list_move_tail(&req->req_list, &m->req_list); - n = v9fs_serialize_fcall(req->tcall, m->wbuf, m->msize, *m->extended); - if (n < 0) { - req->err = n; - list_del(&req->req_list); - if (req->cb) { - spin_unlock(&m->lock); - (*req->cb) (req->cba, req->tcall, req->rcall, req->err); - req->cb = NULL; - spin_lock(&m->lock); - } else - kfree(req->rcall); - - kfree(req); - } - - return n; -} - /** * v9fs_write_work - called when a transport can send some data */ @@ -457,7 +441,7 @@ static void v9fs_write_work(void *a) { int n, err; struct v9fs_mux_data *m; - struct v9fs_req *req, *rtmp; + struct v9fs_req *req; m = a; @@ -472,17 +456,15 @@ static void v9fs_write_work(void *a) return; } - err = 0; spin_lock(&m->lock); - list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, - req_list) { - err = v9fs_write_req(m, req); - if (err > 0) - break; - } - - m->wsize = err; + req = + list_entry(m->unsent_req_list.next, struct v9fs_req, + req_list); + list_move_tail(&req->req_list, &m->req_list); + m->wbuf = req->tcall->sdata; + m->wsize = req->tcall->size; m->wpos = 0; + dump_data(m->wbuf, m->wsize); spin_unlock(&m->lock); } @@ -526,24 +508,23 @@ static void v9fs_write_work(void *a) static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { int ecode, tag; - char *ename; + struct v9fs_str *ename; tag = req->tag; if (req->rcall->id == RERROR && !req->err) { ecode = req->rcall->params.rerror.errno; - ename = req->rcall->params.rerror.error; + ename = &req->rcall->params.rerror.error; - dprintk(DEBUG_MUX, "Rerror %s\n", ename); + dprintk(DEBUG_MUX, "Rerror %.*s\n", ename->len, ename->str); if (*m->extended) req->err = -ecode; if (!req->err) { - req->err = v9fs_errstr2errno(ename); + req->err = v9fs_errstr2errno(ename->str, ename->len); if (!req->err) { /* string match failed */ - dprintk(DEBUG_ERROR, "unknown error: %s\n", - ename); + PRINT_FCALL_ERROR("unknown error", req->rcall); } if (!req->err) @@ -565,8 +546,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) } else kfree(req->rcall); - if (tag != V9FS_NOTAG) - v9fs_put_idpool(tag, &m->tidpool); + v9fs_mux_put_tag(m, tag); wake_up(&m->equeue); kfree(req); @@ -577,10 +557,11 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) */ static void v9fs_read_work(void *a) { - int n, err, rcallen; + int n, err; struct v9fs_mux_data *m; struct v9fs_req *req, *rptr, *rreq; struct v9fs_fcall *rcall; + char *rbuf; m = a; @@ -589,6 +570,19 @@ static void v9fs_read_work(void *a) rcall = NULL; dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, 
m->rpos); + + if (!m->rcall) { + m->rcall = + kmalloc(sizeof(struct v9fs_fcall) + m->msize, GFP_KERNEL); + if (!m->rcall) { + err = -ENOMEM; + goto error; + } + + m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); + m->rpos = 0; + } + clear_bit(Rpending, &m->wsched); err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos); dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err); @@ -613,21 +607,32 @@ static void v9fs_read_work(void *a) if (m->rpos < n) break; - rcallen = n + V9FS_FCALLHDRSZ; - rcall = kmalloc(rcallen, GFP_KERNEL); - if (!rcall) { - err = -ENOMEM; - goto error; - } - dump_data(m->rbuf, n); - err = v9fs_deserialize_fcall(m->rbuf, n, rcall, rcallen, - *m->extended); + err = + v9fs_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended); if (err < 0) { - kfree(rcall); goto error; } + rcall = m->rcall; + rbuf = m->rbuf; + if (m->rpos > n) { + m->rcall = kmalloc(sizeof(struct v9fs_fcall) + m->msize, + GFP_KERNEL); + if (!m->rcall) { + err = -ENOMEM; + goto error; + } + + m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); + memmove(m->rbuf, rbuf + n, m->rpos - n); + m->rpos -= n; + } else { + m->rcall = NULL; + m->rbuf = NULL; + m->rpos = 0; + } + dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id, rcall->tag); @@ -642,6 +647,7 @@ static void v9fs_read_work(void *a) process_request(m, req); break; } + } if (!req) { @@ -652,10 +658,6 @@ static void v9fs_read_work(void *a) m, rcall->id, rcall->tag); kfree(rcall); } - - if (m->rpos > n) - memmove(m->rbuf, m->rbuf + n, m->rpos - n); - m->rpos -= n; } if (!list_empty(&m->req_list)) { @@ -710,12 +712,13 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, if (tc->id == TVERSION) n = V9FS_NOTAG; else - n = v9fs_get_idpool(&m->tidpool); + n = v9fs_mux_get_tag(m); if (n < 0) return ERR_PTR(-ENOMEM); - tc->tag = n; + v9fs_set_tag(tc, n); + req->tag = n; req->tcall = tc; req->rcall = NULL; @@ -773,9 +776,7 @@ v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, if (!cb) spin_unlock(&m->lock); - if (v9fs_check_idpool(tag, &m->tidpool)) - v9fs_put_idpool(tag, &m->tidpool); - + v9fs_mux_put_tag(m, tag); kfree(tc); kfree(rc); } @@ -787,10 +788,7 @@ v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); - fc = kmalloc(sizeof(struct v9fs_fcall), GFP_KERNEL); - fc->id = TFLUSH; - fc->params.tflush.oldtag = req->tag; - + fc = v9fs_create_tflush(req->tag); v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); } @@ -939,3 +937,20 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) wake_up(&m->equeue); } + +static u16 v9fs_mux_get_tag(struct v9fs_mux_data *m) +{ + int tag; + + tag = v9fs_get_idpool(&m->tidpool); + if (tag < 0) + return V9FS_NOTAG; + else + return (u16) tag; +} + +static void v9fs_mux_put_tag(struct v9fs_mux_data *m, u16 tag) +{ + if (tag != V9FS_NOTAG && v9fs_check_idpool(tag, &m->tidpool)) + v9fs_put_idpool(tag, &m->tidpool); +} diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index 9ef404c75c8f..44e830697acb 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -110,7 +110,6 @@ static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len) if (!(ts->filp->f_flags & O_NONBLOCK)) dprintk(DEBUG_ERROR, "blocking write ...\n"); - dump_data(v, len); oldfs = get_fs(); set_fs(get_ds()); ret = vfs_write(ts->filp, (void __user *)v, len, &ts->filp->f_pos); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5e0f79355fdf..519b21d8b15b 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ 
-37,7 +37,6 @@ #include "v9fs_vfs.h" #include "transport.h" #include "mux.h" -#include "conv.h" /* TODO: sysfs or debugfs interface */ int v9fs_debug_level = 0; /* feature-rific global debug level */ @@ -353,7 +352,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } /* Really should check for 9P1 and report error */ - if (!strcmp(fcall->params.rversion.version, "9P2000.u")) { + if (!v9fs_str_compare("9P2000.u", &fcall->params.rversion.version)) { dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); v9ses->extended = 1; } else { diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 2f2cea7ee3e7..c78502ad00ed 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -45,9 +45,8 @@ extern struct dentry_operations v9fs_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); ino_t v9fs_qid2ino(struct v9fs_qid *qid); -void v9fs_mistat2inode(struct v9fs_stat *, struct inode *, - struct super_block *); +void v9fs_stat2inode(struct v9fs_stat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat); +void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); void v9fs_dentry_release(struct dentry *); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 4887df767394..2dd806dac9f1 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -40,7 +40,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" /** @@ -108,7 +107,8 @@ void v9fs_dentry_release(struct dentry *dentry) err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid); if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); + dprintk(DEBUG_ERROR, "clunk failed: %d name %s\n", + err, dentry->d_iname); v9fs_fid_destroy(current_fid); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 3893dd307ddb..ae6d032b9b59 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -37,8 +37,8 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" -#include "v9fs_vfs.h" #include "conv.h" +#include "v9fs_vfs.h" #include "fid.h" /** @@ -77,17 +77,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) unsigned int i, n, s; int fid = -1; int ret = 0; - struct v9fs_stat *mi = NULL; + struct v9fs_stat stat; int over = 0; dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name); fid = file->fid; - mi = kmalloc(v9ses->maxdata, GFP_KERNEL); - if (!mi) - return -ENOMEM; - if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) { kfree(file->rdir_fcall); file->rdir_fcall = NULL; @@ -99,18 +95,18 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (i < n) { s = v9fs_deserialize_stat( file->rdir_fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata, v9ses->extended); + n - i, &stat, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, - "error while deserializing mistat\n"); + "error while deserializing stat\n"); ret = -EIO; goto FreeStructs; } - over = filldir(dirent, mi->name, strlen(mi->name), - filp->f_pos, v9fs_qid2ino(&mi->qid), - dt_type(mi)); + over = filldir(dirent, stat.name.str, stat.name.len, + filp->f_pos, v9fs_qid2ino(&stat.qid), + dt_type(&stat)); if (over) { file->rdir_fpos = i; @@ -130,7 +126,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (!over) { ret = v9fs_t_read(v9ses, fid, filp->f_pos, - v9ses->maxdata-V9FS_IOHDRSZ, &fcall); + v9ses->maxdata-V9FS_IOHDRSZ, 
&fcall); if (ret < 0) { dprintk(DEBUG_ERROR, "error while reading: %d: %p\n", ret, fcall); @@ -142,17 +138,17 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) i = 0; while (i < n) { s = v9fs_deserialize_stat(fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata, v9ses->extended); + n - i, &stat, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, - "error while deserializing mistat\n"); + "error while deserializing stat\n"); return -EIO; } - over = filldir(dirent, mi->name, strlen(mi->name), - filp->f_pos, v9fs_qid2ino(&mi->qid), - dt_type(mi)); + over = filldir(dirent, stat.name.str, stat.name.len, + filp->f_pos, v9fs_qid2ino(&stat.qid), + dt_type(&stat)); if (over) { file->rdir_fcall = fcall; @@ -171,7 +167,6 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) FreeStructs: kfree(fcall); - kfree(mi); return ret; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index e13577da9130..6852f0eb96ed 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -117,9 +118,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, - "open failed, open_mode 0x%x: %s\n", open_mode, - FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("open failed", fcall); kfree(fcall); return result; } @@ -256,7 +255,6 @@ v9fs_file_write(struct file *filp, const char __user * data, int result = -EIO; int rsize = 0; int total = 0; - char *buf; dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); @@ -264,28 +262,14 @@ v9fs_file_write(struct file *filp, const char __user * data, if (v9fid->iounit != 0 && rsize > v9fid->iounit) rsize = v9fid->iounit; - buf = kmalloc(v9ses->maxdata - V9FS_IOHDRSZ, GFP_KERNEL); - if (!buf) - return -ENOMEM; - do { if (count < rsize) rsize = count; - result = copy_from_user(buf, data, rsize); - if (result) { - dprintk(DEBUG_ERROR, "Problem copying from user\n"); - kfree(buf); - return -EFAULT; - } - - dump_data(buf, rsize); - result = v9fs_t_write(v9ses, fid, *offset, rsize, buf, &fcall); + result = v9fs_t_write(v9ses, fid, *offset, rsize, data, &fcall); if (result < 0) { - eprintk(KERN_ERR, "error while writing: %s(%d)\n", - FCALL_ERROR(fcall), result); + PRINT_FCALL_ERROR("error while writing", fcall); kfree(fcall); - kfree(buf); return result; } else *offset += result; @@ -305,7 +289,6 @@ v9fs_file_write(struct file *filp, const char __user * data, total += result; } while (count); - kfree(buf); return total; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f11edde6432e..742bcd0dc4a7 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -40,7 +40,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" static struct inode_operations v9fs_dir_inode_operations; @@ -127,100 +126,32 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) } /** - * v9fs_blank_mistat - helper function to setup a 9P stat structure + * v9fs_blank_wstat - helper function to setup a 9P stat structure * @v9ses: 9P session info (for determining extended mode) - * @mistat: structure to initialize + * @wstat: structure to initialize * */ static void -v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat) +v9fs_blank_wstat(struct v9fs_wstat *wstat) { - mistat->type = ~0; - mistat->dev = ~0; - mistat->qid.type = ~0; - mistat->qid.version = ~0; - *((long long 
*)&mistat->qid.path) = ~0; - mistat->mode = ~0; - mistat->atime = ~0; - mistat->mtime = ~0; - mistat->length = ~0; - mistat->name = mistat->data; - mistat->uid = mistat->data; - mistat->gid = mistat->data; - mistat->muid = mistat->data; - if (v9ses->extended) { - mistat->n_uid = ~0; - mistat->n_gid = ~0; - mistat->n_muid = ~0; - mistat->extension = mistat->data; - } - *mistat->data = 0; -} - -/** - * v9fs_mistat2unix - convert mistat to unix stat - * @mistat: Plan 9 metadata (mistat) structure - * @buf: unix metadata (stat) structure to populate - * @sb: superblock - * - */ - -static void -v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf, - struct super_block *sb) -{ - struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL; - - buf->st_nlink = 1; - - buf->st_atime = mistat->atime; - buf->st_mtime = mistat->mtime; - buf->st_ctime = mistat->mtime; - - buf->st_uid = (unsigned short)-1; - buf->st_gid = (unsigned short)-1; - - if (v9ses && v9ses->extended) { - /* TODO: string to uid mapping via user-space daemon */ - if (mistat->n_uid != -1) - sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid); - - if (mistat->n_gid != -1) - sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid); - } - - if (buf->st_uid == (unsigned short)-1) - buf->st_uid = v9ses->uid; - if (buf->st_gid == (unsigned short)-1) - buf->st_gid = v9ses->gid; - - buf->st_mode = p9mode2unixmode(v9ses, mistat->mode); - if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) { - char type = 0; - int major = -1; - int minor = -1; - sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - buf->st_mode &= ~S_IFBLK; - buf->st_mode |= S_IFCHR; - break; - case 'b': - break; - default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", - type, mistat->extension); - }; - buf->st_rdev = MKDEV(major, minor); - } else - buf->st_rdev = 0; - - buf->st_size = mistat->length; - - buf->st_blksize = sb->s_blocksize; - buf->st_blocks = - (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits; + wstat->type = ~0; + wstat->dev = ~0; + wstat->qid.type = ~0; + wstat->qid.version = ~0; + *((long long *)&wstat->qid.path) = ~0; + wstat->mode = ~0; + wstat->atime = ~0; + wstat->mtime = ~0; + wstat->length = ~0; + wstat->name = NULL; + wstat->uid = NULL; + wstat->gid = NULL; + wstat->muid = NULL; + wstat->n_uid = ~0; + wstat->n_gid = ~0; + wstat->n_muid = ~0; + wstat->extension = NULL; } /** @@ -312,7 +243,6 @@ v9fs_create(struct inode *dir, struct inode *file_inode = NULL; struct v9fs_fcall *fcall = NULL; struct v9fs_qid qid; - struct stat newstat; int dirfidnum = -1; long newfid = -1; int result = 0; @@ -350,7 +280,7 @@ v9fs_create(struct inode *dir, result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("clone error", fcall); v9fs_put_idpool(newfid, &v9ses->fidpool); newfid = -1; goto CleanUpFid; @@ -362,9 +292,7 @@ v9fs_create(struct inode *dir, result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, perm, open_mode, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "create fails: %s(%d)\n", - FCALL_ERROR(fcall), result); - + PRINT_FCALL_ERROR("create fails", fcall); goto CleanUpFid; } @@ -400,7 +328,7 @@ v9fs_create(struct inode *dir, result = v9fs_t_walk(v9ses, dirfidnum, wfidno, (char *) file_dentry->d_name.name, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("clone error", fcall); 
v9fs_put_idpool(wfidno, &v9ses->fidpool); wfidno = -1; goto CleanUpFid; @@ -421,21 +349,21 @@ v9fs_create(struct inode *dir, result = v9fs_t_stat(v9ses, wfidno, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall), - result); + PRINT_FCALL_ERROR("stat error", fcall); goto CleanUpFid; } - v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); - file_inode = v9fs_get_inode(sb, newstat.st_mode); + file_inode = v9fs_get_inode(sb, + p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode)); + if ((!file_inode) || IS_ERR(file_inode)) { dprintk(DEBUG_ERROR, "create inode failed\n"); result = -EBADF; goto CleanUpFid; } - v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb); + v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb); kfree(fcall); fcall = NULL; file_dentry->d_op = &v9fs_dentry_operations; @@ -500,10 +428,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) } result = v9fs_t_remove(v9ses, fid, &fcall); - if (result < 0) - dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n", - FCALL_ERROR(fcall), result); - else { + if (result < 0) { + PRINT_FCALL_ERROR("remove fails", fcall); + } else { v9fs_put_idpool(fid, &v9ses->fidpool); v9fs_fid_destroy(v9fid); } @@ -558,7 +485,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct v9fs_fid *fid; struct inode *inode; struct v9fs_fcall *fcall = NULL; - struct stat newstat; int dirfidnum = -1; int newfid = -1; int result = 0; @@ -611,8 +537,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); - inode = v9fs_get_inode(sb, newstat.st_mode); + inode = v9fs_get_inode(sb, p9mode2unixmode(v9ses, + fcall->params.rstat.stat.mode)); if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) { eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n", @@ -622,7 +548,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid); + inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); fid = v9fs_fid_create(dentry, v9ses, newfid, 0); if (fid == NULL) { @@ -631,10 +557,10 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - fid->qid = fcall->params.rstat.stat->qid; + fid->qid = fcall->params.rstat.stat.qid; dentry->d_op = &v9fs_dentry_operations; - v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb); + v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); d_add(dentry, inode); kfree(fcall); @@ -690,7 +616,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, v9fs_fid_lookup(old_dentry->d_parent); struct v9fs_fid *newdirfid = v9fs_fid_lookup(new_dentry->d_parent); - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + struct v9fs_wstat wstat; struct v9fs_fcall *fcall = NULL; int fid = -1; int olddirfidnum = -1; @@ -699,9 +625,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, dprintk(DEBUG_VFS, "\n"); - if (!mistat) - return -ENOMEM; - if ((!oldfid) || (!olddirfid) || (!newdirfid)) { dprintk(DEBUG_ERROR, "problem with arguments\n"); return -EBADF; @@ -725,26 +648,15 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto FreeFcallnBail; } - v9fs_blank_mistat(v9ses, mistat); - - strcpy(mistat->data + 1, v9ses->name); - mistat->name = mistat->data + 1 + strlen(v9ses->name); - - if (new_dentry->d_name.len > - 
(v9ses->maxdata - strlen(v9ses->name) - sizeof(struct v9fs_stat))) { - dprintk(DEBUG_ERROR, "new name too long\n"); - goto FreeFcallnBail; - } + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->name; + wstat.name = (char *) new_dentry->d_name.name; - strcpy(mistat->name, new_dentry->d_name.name); - retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall); + retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall); FreeFcallnBail: - kfree(mistat); - if (retval < 0) - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("wstat error", fcall); kfree(fcall); return retval; @@ -779,7 +691,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (err < 0) dprintk(DEBUG_ERROR, "stat error\n"); else { - v9fs_mistat2inode(fcall->params.rstat.stat, dentry->d_inode, + v9fs_stat2inode(&fcall->params.rstat.stat, dentry->d_inode, dentry->d_inode->i_sb); generic_fillattr(dentry->d_inode, stat); } @@ -800,57 +712,44 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); struct v9fs_fid *fid = v9fs_fid_lookup(dentry); struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + struct v9fs_wstat wstat; int res = -EPERM; dprintk(DEBUG_VFS, "\n"); - if (!mistat) - return -ENOMEM; - if (!fid) { dprintk(DEBUG_ERROR, "Couldn't find fid associated with dentry\n"); return -EBADF; } - v9fs_blank_mistat(v9ses, mistat); + v9fs_blank_wstat(&wstat); if (iattr->ia_valid & ATTR_MODE) - mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode); + wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode); if (iattr->ia_valid & ATTR_MTIME) - mistat->mtime = iattr->ia_mtime.tv_sec; + wstat.mtime = iattr->ia_mtime.tv_sec; if (iattr->ia_valid & ATTR_ATIME) - mistat->atime = iattr->ia_atime.tv_sec; + wstat.atime = iattr->ia_atime.tv_sec; if (iattr->ia_valid & ATTR_SIZE) - mistat->length = iattr->ia_size; + wstat.length = iattr->ia_size; if (v9ses->extended) { - char *ptr = mistat->data+1; + if (iattr->ia_valid & ATTR_UID) + wstat.n_uid = iattr->ia_uid; - if (iattr->ia_valid & ATTR_UID) { - mistat->uid = ptr; - ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid); - mistat->n_uid = iattr->ia_uid; - } - - if (iattr->ia_valid & ATTR_GID) { - mistat->gid = ptr; - ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid); - mistat->n_gid = iattr->ia_gid; - } + if (iattr->ia_valid & ATTR_GID) + wstat.n_gid = iattr->ia_gid; } - res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall); + res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); if (res < 0) - dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("wstat error", fcall); - kfree(mistat); kfree(fcall); - if (res >= 0) res = inode_setattr(dentry->d_inode, iattr); @@ -858,51 +757,42 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** - * v9fs_mistat2inode - populate an inode structure with mistat info - * @mistat: Plan 9 metadata (mistat) structure + * v9fs_stat2inode - populate an inode structure with mistat info + * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem * */ void -v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, - struct super_block *sb) +v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, + struct super_block *sb) { + char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; inode->i_nlink = 1; - inode->i_atime.tv_sec = mistat->atime; - inode->i_mtime.tv_sec = mistat->mtime; - inode->i_ctime.tv_sec 
= mistat->mtime; + inode->i_atime.tv_sec = stat->atime; + inode->i_mtime.tv_sec = stat->mtime; + inode->i_ctime.tv_sec = stat->mtime; - inode->i_uid = -1; - inode->i_gid = -1; + inode->i_uid = v9ses->uid; + inode->i_gid = v9ses->gid; if (v9ses->extended) { - /* TODO: string to uid mapping via user-space daemon */ - inode->i_uid = mistat->n_uid; - inode->i_gid = mistat->n_gid; - - if (mistat->n_uid == -1) - sscanf(mistat->uid, "%x", &inode->i_uid); - - if (mistat->n_gid == -1) - sscanf(mistat->gid, "%x", &inode->i_gid); + inode->i_uid = stat->n_uid; + inode->i_gid = stat->n_gid; } - if (inode->i_uid == -1) - inode->i_uid = v9ses->uid; - if (inode->i_gid == -1) - inode->i_gid = v9ses->gid; - - inode->i_mode = p9mode2unixmode(v9ses, mistat->mode); + inode->i_mode = p9mode2unixmode(v9ses, stat->mode); if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { char type = 0; int major = -1; int minor = -1; - sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); + + v9fs_str_copy(ext, sizeof(ext), &stat->extension); + sscanf(ext, "%c %u %u", &type, &major, &minor); switch (type) { case 'c': inode->i_mode &= ~S_IFBLK; @@ -911,14 +801,14 @@ v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, case 'b': break; default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", - type, mistat->extension); + dprintk(DEBUG_ERROR, "Unknown special type %c (%.*s)\n", + type, stat->extension.len, stat->extension.str); }; inode->i_rdev = MKDEV(major, minor); } else inode->i_rdev = 0; - inode->i_size = mistat->length; + inode->i_size = stat->length; inode->i_blksize = sb->s_blocksize; inode->i_blocks = @@ -945,72 +835,6 @@ ino_t v9fs_qid2ino(struct v9fs_qid *qid) return i; } -/** - * v9fs_vfs_symlink - helper function to create symlinks - * @dir: directory inode containing symlink - * @dentry: dentry for symlink - * @symname: symlink data - * - * See 9P2000.u RFC for more information - * - */ - -static int -v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - int retval = -EPERM; - struct v9fs_fid *newfid; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - int err; - - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - symname); - - if (!mistat) - return -ENOMEM; - - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeFcall; - } - - /* issue a create */ - retval = v9fs_create(dir, dentry, S_IFLNK, 0); - if (retval != 0) - goto FreeFcall; - - newfid = v9fs_fid_lookup(dentry); - - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeFcall; - } - - kfree(fcall); - - err = v9fs_t_clunk(v9ses, newfid->fid); - if (err < 0) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); - goto FreeFcall; - } - - d_drop(dentry); /* FID - will this also clunk? 
*/ - - FreeFcall: - kfree(mistat); - kfree(fcall); - - return retval; -} - /** * v9fs_readlink - read a symlink's location (internal version) * @dentry: dentry for symlink @@ -1050,16 +874,17 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (!fcall) return -EIO; - if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) { + if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) { retval = -EINVAL; goto FreeFcall; } /* copy extension buffer into buffer */ - if (strlen(fcall->params.rstat.stat->extension) < buflen) - buflen = strlen(fcall->params.rstat.stat->extension); + if (fcall->params.rstat.stat.extension.len < buflen) + buflen = fcall->params.rstat.stat.extension.len; - memcpy(buffer, fcall->params.rstat.stat->extension, buflen + 1); + memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); + buffer[buflen-1] = 0; retval = buflen; @@ -1149,82 +974,111 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void __putname(s); } -/** - * v9fs_vfs_link - create a hardlink - * @old_dentry: dentry for file to link to - * @dir: inode destination for new link - * @dentry: dentry for link - * - */ - -/* XXX - lots of code dup'd from symlink and creates, - * figure out a better reuse strategy - */ - -static int -v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) +static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, + int mode, const char *extension) { - int retval = -EPERM; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); - struct v9fs_fid *newfid = NULL; - char *symname = __getname(); - int err; + int err, retval; + struct v9fs_session_info *v9ses; + struct v9fs_fcall *fcall; + struct v9fs_fid *fid; + struct v9fs_wstat wstat; - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - old_dentry->d_name.name); + v9ses = v9fs_inode2v9ses(dir); + retval = -EPERM; + fcall = NULL; if (!v9ses->extended) { dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeMem; + goto free_mem; } - /* get fid of old_dentry */ - sprintf(symname, "hardlink(%d)\n", oldfid->fid); - /* issue a create */ - retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0); + retval = v9fs_create(dir, dentry, mode, 0); if (retval != 0) - goto FreeMem; + goto free_mem; - newfid = v9fs_fid_lookup(dentry); - if (!newfid) { + fid = v9fs_fid_get_created(dentry); + if (!fid) { dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); - goto FreeMem; + goto free_mem; } - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); + /* issue a Twstat */ + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->name; + wstat.extension = (char *) extension; + retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; + PRINT_FCALL_ERROR("wstat error", fcall); + goto free_mem; } - kfree(fcall); - - err = v9fs_t_clunk(v9ses, newfid->fid); - + err = v9fs_t_clunk(v9ses, fid->fid); if (err < 0) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); - goto FreeMem; + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); + goto free_mem; } d_drop(dentry); /* FID - will this also clunk? 
*/ +free_mem: kfree(fcall); - fcall = NULL; + return retval; +} + +/** + * v9fs_vfs_symlink - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See 9P2000.u RFC for more information + * + */ + +static int +v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + symname); + + return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); +} + +/** + * v9fs_vfs_link - create a hardlink + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +/* XXX - lots of code dup'd from symlink and creates, + * figure out a better reuse strategy + */ + +static int +v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct v9fs_fid *oldfid; + char *name; + + dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + old_dentry->d_name.name); + + oldfid = v9fs_fid_lookup(old_dentry); + if (!oldfid) { + dprintk(DEBUG_ERROR, "can't find oldfid\n"); + return -EPERM; + } + + name = __getname(); + sprintf(name, "hardlink(%d)\n", oldfid->fid); + retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); + __putname(name); - FreeMem: - kfree(mistat); - kfree(fcall); - __putname(symname); return retval; } @@ -1240,83 +1094,30 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, static int v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - int retval = -EPERM; - struct v9fs_fid *newfid; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - char *symname = __getname(); - int err; + int retval; + char *name; dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); - if (!mistat) - return -ENOMEM; - - if (!new_valid_dev(rdev)) { - retval = -EINVAL; - goto FreeMem; - } - - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeMem; - } - - /* issue a create */ - retval = v9fs_create(dir, dentry, mode, 0); - - if (retval != 0) - goto FreeMem; - - newfid = v9fs_fid_lookup(dentry); - if (!newfid) { - dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n"); - retval = -EINVAL; - goto FreeMem; - } + if (!new_valid_dev(rdev)) + return -EINVAL; + name = __getname(); /* build extension */ if (S_ISBLK(mode)) - sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev)); + sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISCHR(mode)) - sprintf(symname, "c %u %u", MAJOR(rdev), MINOR(rdev)); + sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISFIFO(mode)) - ; /* DO NOTHING */ + *name = 0; else { - retval = -EINVAL; - goto FreeMem; + __putname(name); + return -EINVAL; } - if (!S_ISFIFO(mode)) { - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; - } - } - - /* need to update dcache so we show up */ - kfree(fcall); - - err = v9fs_t_clunk(v9ses, newfid->fid); - if (err < 0) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %d\n", err); - goto FreeMem; - } - - d_drop(dentry); /* FID - will this also 
clunk? */ - - FreeMem: - kfree(mistat); - kfree(fcall); - __putname(symname); + retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + __putname(name); return retval; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 83b6edd0988a..d4d71a9ca573 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -44,7 +44,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" static void v9fs_clear_inode(struct inode *); @@ -123,10 +122,11 @@ static struct super_block *v9fs_get_sb(struct file_system_type dprintk(DEBUG_VFS, " \n"); - v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL); + v9ses = kmalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); + memset(v9ses, 0, sizeof(struct v9fs_session_info)); if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); kfree(v9ses); @@ -168,10 +168,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type goto put_back_sb; } - root_fid->qid = fcall->params.rstat.stat->qid; + root_fid->qid = fcall->params.rstat.stat.qid; root->d_inode->i_ino = - v9fs_qid2ino(&fcall->params.rstat.stat->qid); - v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb); + v9fs_qid2ino(&fcall->params.rstat.stat.qid); + v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb); } kfree(fcall); -- cgit v1.2.3 From 1dac06b20dcc8078dab037bd70652c69c67ba672 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sun, 8 Jan 2006 01:05:02 -0800 Subject: [PATCH] v9fs: handle kthread_create failure, minor bugfixes - remove unnecessary -ENOMEM assignments - return correct value when buf_check_size for second time in a buffer - handle failures when create_workqueue and kthread_create are called - use kzalloc instead of kmalloc/memset 0 - v9fs_str_copy and v9fs_str_compare were buggy, were used only in one place, correct the logic and move it to the place it is used. 
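
For reference, the two recurring fixes this patch applies — testing the kthread_create() return value with IS_ERR() before using it, and collapsing kmalloc()+memset(0) into kzalloc() — follow the standard kernel idioms sketched below. This is only a minimal illustration with made-up names (struct example, example_thread_fn, example_setup are hypothetical), not the actual v9fs code:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/err.h>

struct example {
	struct task_struct *task;
	void *priv;
};

static int example_thread_fn(void *data)
{
	/* idle loop; a real thread would poll or service a queue here */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int example_setup(struct example *ex)
{
	struct task_struct *t;

	/* kzalloc() replaces the old kmalloc() + memset(..., 0, ...) pair */
	ex->priv = kzalloc(64, GFP_KERNEL);
	if (!ex->priv)
		return -ENOMEM;

	/*
	 * kthread_create() returns an ERR_PTR() on failure, so the result
	 * must be checked with IS_ERR() before the task is ever touched.
	 */
	t = kthread_create(example_thread_fn, ex, "example-poll");
	if (IS_ERR(t)) {
		kfree(ex->priv);
		return PTR_ERR(t);
	}

	ex->task = t;
	wake_up_process(t);
	return 0;
}

In the patch itself, v9fs_mux_poll_start() applies the same IS_ERR() check to its per-mux poll threads (and v9fs_mux_poll_start() now returns an error that v9fs_mux_init() propagates), while v9fs_get_sb() switches its session allocation to kzalloc().
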
Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/9p.c | 10 ---------- fs/9p/9p.h | 3 --- fs/9p/conv.c | 45 +++++++++------------------------------------ fs/9p/error.h | 1 - fs/9p/mux.c | 38 ++++++++++++++++++++++++++------------ fs/9p/mux.h | 3 ++- fs/9p/v9fs.c | 19 ++++++++++++++----- fs/9p/vfs_inode.c | 7 ++++++- fs/9p/vfs_super.c | 3 +-- 9 files changed, 58 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/9p/9p.c b/fs/9p/9p.c index dc3ce44ec836..1a6d08761f39 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -86,7 +86,6 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname, aname, fid, afid); - ret = -ENOMEM; tc = v9fs_create_tattach(fid, afid, uname, aname); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -136,7 +135,6 @@ v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) dprintk(DEBUG_9P, "fid %d\n", fid); - ret = -ENOMEM; rc = NULL; tc = v9fs_create_tclunk(fid); if (!IS_ERR(tc)) @@ -165,7 +163,6 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag) dprintk(DEBUG_9P, "oldtag %d\n", oldtag); - ret = -ENOMEM; tc = v9fs_create_tflush(oldtag); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, NULL); @@ -221,7 +218,6 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, dprintk(DEBUG_9P, "fid %d\n", fid); - ret = -ENOMEM; tc = v9fs_create_twstat(fid, wstat, v9ses->extended); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -259,7 +255,6 @@ v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, else nwname = 0; - ret = -ENOMEM; tc = v9fs_create_twalk(fid, newfid, nwname, &name); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -289,7 +284,6 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); - ret = -ENOMEM; tc = v9fs_create_topen(fid, mode); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -317,7 +311,6 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, dprintk(DEBUG_9P, "fid %d\n", fid); - ret = -ENOMEM; tc = v9fs_create_tremove(fid); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -349,7 +342,6 @@ v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n", fid, name, perm, mode); - ret = -ENOMEM; tc = v9fs_create_tcreate(fid, name, perm, mode); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); @@ -380,7 +372,6 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, (long long unsigned) offset, count); - ret = -ENOMEM; tc = v9fs_create_tread(fid, offset, count); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); @@ -418,7 +409,6 @@ v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, (long long unsigned) offset, count); - ret = -ENOMEM; tc = v9fs_create_twrite(fid, offset, count, data); if (!IS_ERR(tc)) { ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); diff --git a/fs/9p/9p.h b/fs/9p/9p.h index 007ff639777d..0cd374d94717 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -340,9 +340,6 @@ struct v9fs_fcall { fcall?fcall->params.rerror.error.len:0, \ fcall?fcall->params.rerror.error.str:""); -char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str); -int v9fs_str_compare(char *buf, struct v9fs_str *str); - int 
v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, char *version, struct v9fs_fcall **rcall); diff --git a/fs/9p/conv.c b/fs/9p/conv.c index f62434d435b3..55ccfa10ee9e 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -45,37 +45,6 @@ struct cbuf { unsigned char *ep; }; -char *v9fs_str_copy(char *buf, int buflen, struct v9fs_str *str) -{ - int n; - - if (buflen < str->len) - n = buflen; - else - n = str->len; - - memmove(buf, str->str, n - 1); - - return buf; -} - -int v9fs_str_compare(char *buf, struct v9fs_str *str) -{ - int n, ret; - - ret = strncmp(buf, str->str, str->len); - - if (!ret) { - n = strlen(buf); - if (n < str->len) - ret = -1; - else if (n > str->len) - ret = 1; - } - - return ret; -} - static inline void buf_init(struct cbuf *buf, void *data, int datalen) { buf->sp = buf->p = data; @@ -89,11 +58,14 @@ static inline int buf_check_overflow(struct cbuf *buf) static inline int buf_check_size(struct cbuf *buf, int len) { - if (buf->p + len > buf->ep && buf->p < buf->ep) { - eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", - len, (int)(buf->ep - buf->p)); - dump_stack(); - buf->p = buf->ep + 1; + if (buf->p + len > buf->ep) { + if (buf->p < buf->ep) { + eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", + len, (int)(buf->ep - buf->p)); + dump_stack(); + buf->p = buf->ep + 1; + } + return 0; } @@ -527,6 +499,7 @@ v9fs_create_common(struct cbuf *bufp, u32 size, u8 id) void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag) { + fc->tag = tag; *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag); } diff --git a/fs/9p/error.h b/fs/9p/error.h index 8b3176b3f658..a9794e85fe51 100644 --- a/fs/9p/error.h +++ b/fs/9p/error.h @@ -176,4 +176,3 @@ static struct errormap errmap[] = { }; extern int v9fs_error_init(void); -extern int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/mux.c b/fs/9p/mux.c index f21cf5083973..945cb368d451 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -102,8 +102,6 @@ struct v9fs_mux_rpc { wait_queue_head_t wqueue; }; -extern int v9fs_errstr2errno(char *str, int len); - static int v9fs_poll_proc(void *); static void v9fs_read_work(void *); static void v9fs_write_work(void *); @@ -119,7 +117,7 @@ static int v9fs_mux_num; static int v9fs_mux_poll_task_num; static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100]; -void v9fs_mux_global_init(void) +int v9fs_mux_global_init(void) { int i; @@ -127,6 +125,10 @@ void v9fs_mux_global_init(void) v9fs_mux_poll_tasks[i].task = NULL; v9fs_mux_wq = create_workqueue("v9fs"); + if (!v9fs_mux_wq) + return -ENOMEM; + + return 0; } void v9fs_mux_global_exit(void) @@ -156,10 +158,11 @@ inline int v9fs_mux_calc_poll_procs(int muxnum) return n; } -static void v9fs_mux_poll_start(struct v9fs_mux_data *m) +static int v9fs_mux_poll_start(struct v9fs_mux_data *m) { int i, n; struct v9fs_mux_poll_task *vpt, *vptlast; + struct task_struct *pproc; dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num, v9fs_mux_poll_task_num); @@ -171,13 +174,16 @@ static void v9fs_mux_poll_start(struct v9fs_mux_data *m) if (v9fs_mux_poll_tasks[i].task == NULL) { vpt = &v9fs_mux_poll_tasks[i]; dprintk(DEBUG_MUX, "create proc %p\n", vpt); - vpt->task = - kthread_create(v9fs_poll_proc, vpt, + pproc = kthread_create(v9fs_poll_proc, vpt, "v9fs-poll"); - INIT_LIST_HEAD(&vpt->mux_list); - vpt->muxnum = 0; - v9fs_mux_poll_task_num++; - wake_up_process(vpt->task); + + if (!IS_ERR(pproc)) { + vpt->task = pproc; + INIT_LIST_HEAD(&vpt->mux_list); + vpt->muxnum = 0; + v9fs_mux_poll_task_num++; + wake_up_process(vpt->task); + } break; } } @@ 
-207,16 +213,21 @@ static void v9fs_mux_poll_start(struct v9fs_mux_data *m) } if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) { + if (vptlast == NULL) + return -ENOMEM; + dprintk(DEBUG_MUX, "put in proc %d\n", i); list_add(&m->mux_list, &vptlast->mux_list); vptlast->muxnum++; - m->poll_task = vpt; + m->poll_task = vptlast; memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); init_poll_funcptr(&m->pt, v9fs_pollwait); } v9fs_mux_num++; down(&v9fs_mux_task_lock); + + return 0; } static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) @@ -283,7 +294,10 @@ struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, INIT_WORK(&m->wq, v9fs_write_work, m); m->wsched = 0; memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); - v9fs_mux_poll_start(m); + m->poll_task = NULL; + n = v9fs_mux_poll_start(m); + if (n) + return ERR_PTR(n); n = trans->poll(trans, &m->pt); if (n & POLLIN) { diff --git a/fs/9p/mux.h b/fs/9p/mux.h index 02b13b14b05b..9473b84f24b2 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -40,7 +40,7 @@ struct v9fs_mux_data; typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err); -void v9fs_mux_global_init(void); +int v9fs_mux_global_init(void); void v9fs_mux_global_exit(void); struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, @@ -55,3 +55,4 @@ int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush); void v9fs_mux_cancel(struct v9fs_mux_data *m, int err); +int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 519b21d8b15b..5250c428fc1f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -269,6 +269,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, int n = 0; int newfid = -1; int retval = -EINVAL; + struct v9fs_str *version; v9ses->name = __getname(); if (!v9ses->name) @@ -351,13 +352,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses, goto FreeFcall; } - /* Really should check for 9P1 and report error */ - if (!v9fs_str_compare("9P2000.u", &fcall->params.rversion.version)) { + version = &fcall->params.rversion.version; + if (version->len==8 && !memcmp(version->str, "9P2000.u", 8)) { dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); v9ses->extended = 1; - } else { + } else if (version->len==6 && !memcmp(version->str, "9P2000", 6)) { dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n"); v9ses->extended = 0; + } else { + retval = -EREMOTEIO; + goto FreeFcall; } n = fcall->params.rversion.msize; @@ -449,12 +453,17 @@ extern int v9fs_error_init(void); static int __init init_v9fs(void) { + int ret; + v9fs_error_init(); printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); - v9fs_mux_global_init(); - return register_filesystem(&v9fs_fs_type); + ret = v9fs_mux_global_init(); + if (!ret) + ret = register_filesystem(&v9fs_fs_type); + + return ret; } /** diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 742bcd0dc4a7..d933ef1fbd8a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -768,6 +768,7 @@ void v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, struct super_block *sb) { + int n; char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -791,7 +792,11 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, int major = -1; int minor = -1; - v9fs_str_copy(ext, sizeof(ext), &stat->extension); + n = stat->extension.len; + if (n > sizeof(ext)-1) + n = sizeof(ext)-1; + memmove(ext, stat->extension.str, n); + ext[n] = 0; sscanf(ext, "%c %u %u", &type, 
&major, &minor); switch (type) { case 'c': diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index d4d71a9ca573..ae0f06b3c11a 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -122,11 +122,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type dprintk(DEBUG_VFS, " \n"); - v9ses = kmalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); + v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); - memset(v9ses, 0, sizeof(struct v9fs_session_info)); if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); kfree(v9ses); -- cgit v1.2.3 From bfc090c468b33fb1f75c27a11efa7b7dc250556f Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Sun, 8 Jan 2006 01:05:08 -0800 Subject: [PATCH] afs: remove unnecessary __attribute__((packed)) Remove the unnecessary __attribute__((packed)) since the enum itself is packed and not the location of it in the structure. Signed-off-by: Jan Blunck Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/volume.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/volume.h b/fs/afs/volume.h index 1e691889c4c9..bfdcf19ba3f3 100644 --- a/fs/afs/volume.h +++ b/fs/afs/volume.h @@ -18,8 +18,6 @@ #include "kafsasyncd.h" #include "cache.h" -#define __packed __attribute__((packed)) - typedef enum { AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */ AFS_VLUPD_PENDING, /* on pending queue */ @@ -115,7 +113,7 @@ struct afs_volume struct cachefs_cookie *cache; /* caching cookie */ #endif afs_volid_t vid; /* volume ID */ - afs_voltype_t __packed type; /* type of volume */ + afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ unsigned short nservers; /* number of server slots filled */ unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ -- cgit v1.2.3 From b01ec0ef63e95570e2463b26333d9c9c854cb941 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:20 -0800 Subject: [PATCH] tiny: Uninline some open.c functions uninline some open.c functions add/remove: 3/0 grow/shrink: 0/6 up/down: 679/-1166 (-487) function old new delta do_sys_truncate - 336 +336 do_sys_ftruncate - 317 +317 __put_unused_fd - 26 +26 put_unused_fd 57 49 -8 sys_close 150 119 -31 sys_ftruncate64 260 26 -234 sys_ftruncate 272 24 -248 sys_truncate 339 25 -314 sys_truncate64 336 5 -331 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 94968cb3afca..75f3329e8a67 100644 --- a/fs/open.c +++ b/fs/open.c @@ -217,7 +217,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, return err; } -static inline long do_sys_truncate(const char __user * path, loff_t length) +static long do_sys_truncate(const char __user * path, loff_t length) { struct nameidata nd; struct inode * inode; @@ -283,7 +283,7 @@ asmlinkage long sys_truncate(const char __user * path, unsigned long length) return do_sys_truncate(path, (long)length); } -static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode * inode; struct dentry *dentry; @@ -971,7 +971,7 @@ out: EXPORT_SYMBOL(get_unused_fd); -static inline void __put_unused_fd(struct files_struct *files, 
unsigned int fd) +static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __FD_CLR(fd, fdt->open_fds); -- cgit v1.2.3 From 5d2bea4582d20cb24085152acaa29b95c05cdcf8 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:21 -0800 Subject: [PATCH] tiny: Uninline some inode.c functions uninline a couple inode.c functions add/remove: 2/0 grow/shrink: 0/5 up/down: 256/-428 (-172) function old new delta ifind - 136 +136 ifind_fast - 120 +120 ilookup5_nowait 131 80 -51 ilookup 158 71 -87 ilookup5 171 80 -91 iget_locked 190 95 -95 iget5_locked 240 136 -104 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index d8d04bd72b59..fd568caf7f74 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -770,7 +770,7 @@ EXPORT_SYMBOL(igrab); * * Note, @test is called with the inode_lock held, so can't sleep. */ -static inline struct inode *ifind(struct super_block *sb, +static struct inode *ifind(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, const int wait) { @@ -804,7 +804,7 @@ static inline struct inode *ifind(struct super_block *sb, * * Otherwise NULL is returned. */ -static inline struct inode *ifind_fast(struct super_block *sb, +static struct inode *ifind_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { struct inode *inode; -- cgit v1.2.3 From 33443c42f4ffa5ca23b3323234bcb1a78e85d9db Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:22 -0800 Subject: [PATCH] tiny: Uninline some fslocks.c functions uninline some file locking functions add/remove: 3/0 grow/shrink: 0/15 up/down: 256/-1525 (-1269) function old new delta locks_free_lock - 134 +134 posix_same_owner - 69 +69 __locks_delete_block - 53 +53 posix_locks_conflict 126 108 -18 locks_remove_posix 266 237 -29 locks_wake_up_blocks 121 87 -34 locks_block_on_timeout 83 47 -36 locks_insert_block 157 120 -37 locks_delete_block 62 23 -39 posix_unblock_lock 104 59 -45 posix_locks_deadlock 162 100 -62 locks_delete_lock 228 119 -109 sys_flock 338 217 -121 __break_lease 600 474 -126 lease_init 252 122 -130 fcntl_setlk64 793 649 -144 fcntl_setlk 793 649 -144 __posix_lock_file 1477 1026 -451 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index fb32d6218e21..909eab8fb1d0 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -154,7 +154,7 @@ static struct file_lock *locks_alloc_lock(void) } /* Free a lock which is not in use. */ -static inline void locks_free_lock(struct file_lock *fl) +static void locks_free_lock(struct file_lock *fl) { if (fl == NULL) { BUG(); @@ -475,8 +475,7 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) /* * Check whether two locks have the same owner. */ -static inline int -posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) { if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) return fl2->fl_lmops == fl1->fl_lmops && @@ -487,7 +486,7 @@ posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. 
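 * (list_del_init() re-initialises an entry to point back at itself, which
 * is exactly the state this emptiness convention relies on.)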
*/ -static inline void __locks_delete_block(struct file_lock *waiter) +static void __locks_delete_block(struct file_lock *waiter) { list_del_init(&waiter->fl_block); list_del_init(&waiter->fl_link); -- cgit v1.2.3 From 708e9a794cf8822b760edaccd9053edb07c34d19 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:25 -0800 Subject: [PATCH] tiny: Configure ELF core dump support configurable support for ELF core dumps text data bss dec hex filename 3330172 529036 190556 4049764 3dcb64 vmlinux-baseline 3325552 528912 190556 4045020 3db8dc vmlinux-no-elf add/remove: 0/8 grow/shrink: 0/0 up/down: 0/-4424 (-4424) function old new delta fill_note 32 - -32 maydump 58 - -58 dump_seek 67 - -67 writenote 180 - -180 elf_dump_thread_status 274 - -274 fill_psinfo 308 - -308 fill_prstatus 466 - -466 elf_core_dump 3039 - -3039 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 ++-- init/Kconfig | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 288386b1deff..80ca932ba0bd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -58,7 +58,7 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); * If we don't support core dumping, then supply a NULL so we * don't even try. */ -#ifdef USE_ELF_CORE_DUMP +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file); #else #define elf_core_dump NULL @@ -1113,7 +1113,7 @@ out: * Note that some platforms still use traditional core dumps and not * the ELF core dump. Each platform can select it as appropriate. */ -#ifdef USE_ELF_CORE_DUMP +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) /* * ELF core dumper diff --git a/init/Kconfig b/init/Kconfig index 1a1f114a37e8..9ac522a40130 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -327,6 +327,12 @@ config DOUBLEFAULT would otherwise cause a system to silently reboot. Disabling this option saves about 4k. +config ELF_CORE + default y + bool "Enable ELF core dumps" if EMBEDDED + help + Enable support for generating core dumps. Disabling saves about 4k. + config BASE_FULL default y bool "Enable full-sized data structures for core" if EMBEDDED -- cgit v1.2.3 From 0ea60b5ad8c3630e8f44c443f173d841be7fc701 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Jan 2006 14:45:10 +0100 Subject: [BLOCK] bio: init ->bi_bdev in bio_init() For SG_IO requests, bio->bi_bdev may not be explicitly initialized. So make bio_init() clear the field to make sure it's always NULL or valid. Signed-off-by: Jens Axboe --- fs/bio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index dfe242a21eb4..7b3069589951 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -126,6 +126,7 @@ static void bio_fs_destructor(struct bio *bio) inline void bio_init(struct bio *bio) { bio->bi_next = NULL; + bio->bi_bdev = NULL; bio->bi_flags = 1 << BIO_UPTODATE; bio->bi_rw = 0; bio->bi_vcnt = 0; -- cgit v1.2.3 From 50365c57860cd931c2d806057e0987634797e9af Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 9 Jan 2006 14:15:14 +0000 Subject: [ARM] Make Acorn partition types depend on ACORN_PARTITION balamurugan reported a problem where it was possible to have the various Acorn partition types selected in the configuration, but ACORN_PARTITION disabled. Since ACORN_PARTITION controls whether we build fs/partitions/acorn.c, this led to undefined references to the adfspart_check_TYPE symbols.
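(A prompt condition such as "bool ... if PARTITION_ADVANCED && ACORN_PARTITION" only hides the question from the user; it is not a dependency, so the "default y if ARCH_ACORN" lines can still switch the sub-options on while ACORN_PARTITION itself is off.)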
Fix this by making the Acorn partition type symbols depend on ACORN_PARTITION. Signed-off-by: Russell King --- fs/partitions/Kconfig | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index e227a04261ab..7490cc9208b3 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -21,26 +21,30 @@ config ACORN_PARTITION Support hard disks partitioned under Acorn operating systems. config ACORN_PARTITION_CUMANA - bool "Cumana partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "Cumana partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Say Y here if you would like to use hard disks under Linux which were partitioned using the Cumana interface on Acorn machines. config ACORN_PARTITION_EESOX - bool "EESOX partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "EESOX partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION config ACORN_PARTITION_ICS - bool "ICS partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "ICS partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Say Y here if you would like to use hard disks under Linux which were partitioned using the ICS interface on Acorn machines. config ACORN_PARTITION_ADFS - bool "Native filecore partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "Native filecore partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC @@ -48,15 +52,17 @@ config ACORN_PARTITION_ADFS `Y' here, Linux will support disk partitions created under ADFS. config ACORN_PARTITION_POWERTEC - bool "PowerTec partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "PowerTec partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Support reading partition tables created on Acorn machines using the PowerTec SCSI drive. config ACORN_PARTITION_RISCIX - bool "RISCiX partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "RISCiX partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Once upon a time, there was a native Unix port for the Acorn series of machines called RISCiX. If you say 'Y' here, Linux will be able @@ -224,5 +230,3 @@ config EFI_PARTITION Say Y here if you would like to use hard disks under Linux which were partitioned using EFI GPT. Presently only useful on the IA-64 platform. - -# define_bool CONFIG_ACORN_PARTITION_CUMANA y -- cgit v1.2.3 From 0d0fbf8152fb3bb4393be11e8df7f70e1fbbd738 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Jan 2006 15:24:57 -0200 Subject: V4L (926_2): Moves compat32 functions from fs to v4l subsystem This moves the 32 bit ioctl compatibility handlers for Video4Linux into a new file and adds explicit calls to them to each v4l device driver. Unfortunately, there does not seem to be any code handling the v4l2 ioctls, so quite often the code goes through two separate conversions, first from 32 bit v4l to 64 bit v4l, and from there to 64 bit v4l2. My patch does not change that, so there is still much room for improvement. Also, some drivers have additional ioctl numbers, for which the conversion should be handled internally to that driver. 
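The mechanical part of the change is identical in every driver: one extra line in its file_operations routing 32-bit ioctls to the shared helper. A minimal sketch of the pattern (example_fops and example_ioctl are placeholder names, not taken from any of the drivers below):

static struct file_operations example_fops = {
	.open		= video_exclusive_open,
	.release	= video_exclusive_release,
	.ioctl		= example_ioctl,	/* native entry point */
	.compat_ioctl	= v4l_compat_ioctl32,	/* 32-bit processes on a 64-bit kernel */
	.llseek		= no_llseek,
};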
Signed-off-by: Arnd Bergmann Signed-off-by: Mauro Carvalho Chehab --- drivers/media/radio/miropcm20-radio.c | 1 + drivers/media/radio/radio-aimslab.c | 1 + drivers/media/radio/radio-aztech.c | 1 + drivers/media/radio/radio-cadet.c | 1 + drivers/media/radio/radio-gemtek-pci.c | 1 + drivers/media/radio/radio-gemtek.c | 1 + drivers/media/radio/radio-maestro.c | 1 + drivers/media/radio/radio-maxiradio.c | 1 + drivers/media/radio/radio-rtrack2.c | 1 + drivers/media/radio/radio-sf16fmi.c | 1 + drivers/media/radio/radio-sf16fmr2.c | 1 + drivers/media/radio/radio-terratec.c | 1 + drivers/media/radio/radio-trust.c | 1 + drivers/media/radio/radio-typhoon.c | 1 + drivers/media/radio/radio-zoltrix.c | 1 + drivers/media/video/Makefile | 3 +- drivers/media/video/arv.c | 1 + drivers/media/video/bttv-driver.c | 1 + drivers/media/video/bw-qcam.c | 1 + drivers/media/video/c-qcam.c | 1 + drivers/media/video/compat_ioctl32.c | 318 ++++++++++++++++++++++++++++ drivers/media/video/cpia.c | 1 + drivers/media/video/cx88/cx88-video.c | 2 + drivers/media/video/meye.c | 1 + drivers/media/video/pms.c | 1 + drivers/media/video/saa5249.c | 1 + drivers/media/video/saa7134/saa7134-video.c | 2 + drivers/media/video/stradis.c | 1 + drivers/media/video/w9966.c | 1 + drivers/media/video/zoran_driver.c | 1 + drivers/media/video/zr36120.c | 1 + drivers/usb/media/dsbr100.c | 1 + drivers/usb/media/ov511.c | 1 + drivers/usb/media/pwc/pwc-if.c | 1 + drivers/usb/media/se401.c | 1 + drivers/usb/media/stv680.c | 1 + drivers/usb/media/usbvideo.c | 1 + drivers/usb/media/vicam.c | 1 + drivers/usb/media/w9968cf.c | 1 + fs/compat_ioctl.c | 246 --------------------- include/linux/compat_ioctl.h | 26 --- include/linux/videodev2.h | 3 + 42 files changed, 362 insertions(+), 273 deletions(-) create mode 100644 drivers/media/video/compat_ioctl32.c (limited to 'fs') diff --git a/drivers/media/radio/miropcm20-radio.c b/drivers/media/radio/miropcm20-radio.c index c2ebe8754a95..dc292da2605f 100644 --- a/drivers/media/radio/miropcm20-radio.c +++ b/drivers/media/radio/miropcm20-radio.c @@ -220,6 +220,7 @@ static struct file_operations pcm20_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = pcm20_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c index 877c770558e9..914deab4e044 100644 --- a/drivers/media/radio/radio-aimslab.c +++ b/drivers/media/radio/radio-aimslab.c @@ -299,6 +299,7 @@ static struct file_operations rtrack_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = rt_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-aztech.c b/drivers/media/radio/radio-aztech.c index 5319a9c9a979..523be820f9c6 100644 --- a/drivers/media/radio/radio-aztech.c +++ b/drivers/media/radio/radio-aztech.c @@ -256,6 +256,7 @@ static struct file_operations aztech_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = az_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-cadet.c b/drivers/media/radio/radio-cadet.c index 9b0406318f2d..f1b5ac81e9d2 100644 --- a/drivers/media/radio/radio-cadet.c +++ b/drivers/media/radio/radio-cadet.c @@ -490,6 +490,7 @@ static struct file_operations cadet_fops = { .release = cadet_release, .read = cadet_read, .ioctl = cadet_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git 
a/drivers/media/radio/radio-gemtek-pci.c b/drivers/media/radio/radio-gemtek-pci.c index 630cc786d0a4..42c8fce04aa2 100644 --- a/drivers/media/radio/radio-gemtek-pci.c +++ b/drivers/media/radio/radio-gemtek-pci.c @@ -301,6 +301,7 @@ static struct file_operations gemtek_pci_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = gemtek_pci_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c index 6418f03b9ce4..47173be97b9f 100644 --- a/drivers/media/radio/radio-gemtek.c +++ b/drivers/media/radio/radio-gemtek.c @@ -233,6 +233,7 @@ static struct file_operations gemtek_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = gemtek_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-maestro.c b/drivers/media/radio/radio-maestro.c index e5e2021a7312..c30effdf711f 100644 --- a/drivers/media/radio/radio-maestro.c +++ b/drivers/media/radio/radio-maestro.c @@ -72,6 +72,7 @@ static struct file_operations maestro_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = radio_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-maxiradio.c b/drivers/media/radio/radio-maxiradio.c index 02d39a50d5ed..30869308332a 100644 --- a/drivers/media/radio/radio-maxiradio.c +++ b/drivers/media/radio/radio-maxiradio.c @@ -80,6 +80,7 @@ static struct file_operations maxiradio_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = radio_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; static struct video_device maxiradio_radio = diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c index b2256d675b44..28a47c9e7a81 100644 --- a/drivers/media/radio/radio-rtrack2.c +++ b/drivers/media/radio/radio-rtrack2.c @@ -199,6 +199,7 @@ static struct file_operations rtrack2_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = rt_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-sf16fmi.c b/drivers/media/radio/radio-sf16fmi.c index 6f03ce4dd7b0..0229f792a059 100644 --- a/drivers/media/radio/radio-sf16fmi.c +++ b/drivers/media/radio/radio-sf16fmi.c @@ -225,6 +225,7 @@ static struct file_operations fmi_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = fmi_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-sf16fmr2.c b/drivers/media/radio/radio-sf16fmr2.c index 71971e9bb342..26632cead09a 100644 --- a/drivers/media/radio/radio-sf16fmr2.c +++ b/drivers/media/radio/radio-sf16fmr2.c @@ -356,6 +356,7 @@ static struct file_operations fmr2_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = fmr2_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c index b03573c6840e..fcfde2e4f195 100644 --- a/drivers/media/radio/radio-terratec.c +++ b/drivers/media/radio/radio-terratec.c @@ -276,6 +276,7 @@ static struct file_operations terratec_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = tt_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-trust.c 
b/drivers/media/radio/radio-trust.c index b300bedf7c74..5a099a50d4d0 100644 --- a/drivers/media/radio/radio-trust.c +++ b/drivers/media/radio/radio-trust.c @@ -255,6 +255,7 @@ static struct file_operations trust_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = tr_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-typhoon.c b/drivers/media/radio/radio-typhoon.c index f304f3c14763..8ac9a8ef9094 100644 --- a/drivers/media/radio/radio-typhoon.c +++ b/drivers/media/radio/radio-typhoon.c @@ -261,6 +261,7 @@ static struct file_operations typhoon_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = typhoon_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c index 4c6d6fb49034..d590e80c922e 100644 --- a/drivers/media/radio/radio-zoltrix.c +++ b/drivers/media/radio/radio-zoltrix.c @@ -313,6 +313,7 @@ static struct file_operations zoltrix_fops = .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = zol_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile index 82060f9909d8..618a08ab940a 100644 --- a/drivers/media/video/Makefile +++ b/drivers/media/video/Makefile @@ -8,7 +8,8 @@ zoran-objs := zr36120.o zr36120_i2c.o zr36120_mem.o zr36067-objs := zoran_procfs.o zoran_device.o \ zoran_driver.o zoran_card.o tuner-objs := tuner-core.o tuner-simple.o mt20xx.o tda8290.o tea5767.o -obj-$(CONFIG_VIDEO_DEV) += videodev.o v4l2-common.o v4l1-compat.o + +obj-$(CONFIG_VIDEO_DEV) += videodev.o v4l2-common.o v4l1-compat.o compat_ioctl32.o obj-$(CONFIG_VIDEO_BT848) += bttv.o msp3400.o tvaudio.o \ tda7432.o tda9875.o ir-kbd-i2c.o ir-kbd-gpio.o diff --git a/drivers/media/video/arv.c b/drivers/media/video/arv.c index 881cdcb1875d..7d5a068353f2 100644 --- a/drivers/media/video/arv.c +++ b/drivers/media/video/arv.c @@ -749,6 +749,7 @@ static struct file_operations ar_fops = { .release = video_exclusive_release, .read = ar_read, .ioctl = ar_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c index 1ddf9ba613ef..03f925724ce9 100644 --- a/drivers/media/video/bttv-driver.c +++ b/drivers/media/video/bttv-driver.c @@ -3120,6 +3120,7 @@ static struct file_operations bttv_fops = .open = bttv_open, .release = bttv_release, .ioctl = bttv_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, .read = bttv_read, .mmap = bttv_mmap, diff --git a/drivers/media/video/bw-qcam.c b/drivers/media/video/bw-qcam.c index 0065d0c240d1..6bad93ef969f 100644 --- a/drivers/media/video/bw-qcam.c +++ b/drivers/media/video/bw-qcam.c @@ -875,6 +875,7 @@ static struct file_operations qcam_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = qcam_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .read = qcam_read, .llseek = no_llseek, }; diff --git a/drivers/media/video/c-qcam.c b/drivers/media/video/c-qcam.c index 75442ec49f35..9976db4f6da8 100644 --- a/drivers/media/video/c-qcam.c +++ b/drivers/media/video/c-qcam.c @@ -687,6 +687,7 @@ static struct file_operations qcam_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = qcam_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .read = qcam_read, .llseek = no_llseek, }; diff --git 
a/drivers/media/video/compat_ioctl32.c b/drivers/media/video/compat_ioctl32.c new file mode 100644 index 000000000000..42dc11c63c0d --- /dev/null +++ b/drivers/media/video/compat_ioctl32.c @@ -0,0 +1,318 @@ +#include +#include +#include + +#ifdef CONFIG_COMPAT +struct video_tuner32 { + compat_int_t tuner; + char name[32]; + compat_ulong_t rangelow, rangehigh; + u32 flags; /* It is really u32 in videodev.h */ + u16 mode, signal; +}; + +static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up) +{ + int i; + + if(get_user(kp->tuner, &up->tuner)) + return -EFAULT; + for(i = 0; i < 32; i++) + __get_user(kp->name[i], &up->name[i]); + __get_user(kp->rangelow, &up->rangelow); + __get_user(kp->rangehigh, &up->rangehigh); + __get_user(kp->flags, &up->flags); + __get_user(kp->mode, &up->mode); + __get_user(kp->signal, &up->signal); + return 0; +} + +static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up) +{ + int i; + + if(put_user(kp->tuner, &up->tuner)) + return -EFAULT; + for(i = 0; i < 32; i++) + __put_user(kp->name[i], &up->name[i]); + __put_user(kp->rangelow, &up->rangelow); + __put_user(kp->rangehigh, &up->rangehigh); + __put_user(kp->flags, &up->flags); + __put_user(kp->mode, &up->mode); + __put_user(kp->signal, &up->signal); + return 0; +} + +struct video_buffer32 { + compat_caddr_t base; + compat_int_t height, width, depth, bytesperline; +}; + +static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up) +{ + u32 tmp; + + if (get_user(tmp, &up->base)) + return -EFAULT; + + /* This is actually a physical address stored + * as a void pointer. + */ + kp->base = (void *)(unsigned long) tmp; + + __get_user(kp->height, &up->height); + __get_user(kp->width, &up->width); + __get_user(kp->depth, &up->depth); + __get_user(kp->bytesperline, &up->bytesperline); + return 0; +} + +static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up) +{ + u32 tmp = (u32)((unsigned long)kp->base); + + if(put_user(tmp, &up->base)) + return -EFAULT; + __put_user(kp->height, &up->height); + __put_user(kp->width, &up->width); + __put_user(kp->depth, &up->depth); + __put_user(kp->bytesperline, &up->bytesperline); + return 0; +} + +struct video_clip32 { + s32 x, y, width, height; /* Its really s32 in videodev.h */ + compat_caddr_t next; +}; + +struct video_window32 { + u32 x, y, width, height, chromakey, flags; + compat_caddr_t clips; + compat_int_t clipcount; +}; + +static int native_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret = -ENOIOCTLCMD; + + if (file->f_ops->unlocked_ioctl) + ret = file->f_ops->unlocked_ioctl(file, cmd, arg); + else if (file->f_ops->ioctl) { + lock_kernel(); + ret = file->f_ops->ioctl(file->f_dentry->d_inode, file, cmd, arg); + unlock_kernel(); + } + + return ret; +} + + +/* You get back everything except the clips... 
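+ * (whatever the driver left in the kernel struct's clips pointer is
+ * meaningless to a 32-bit caller, so only the scalar fields go back)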
*/ +static int put_video_window32(struct video_window *kp, struct video_window32 __user *up) +{ + if(put_user(kp->x, &up->x)) + return -EFAULT; + __put_user(kp->y, &up->y); + __put_user(kp->width, &up->width); + __put_user(kp->height, &up->height); + __put_user(kp->chromakey, &up->chromakey); + __put_user(kp->flags, &up->flags); + __put_user(kp->clipcount, &up->clipcount); + return 0; +} + +#define VIDIOCGTUNER32 _IOWR('v',4, struct video_tuner32) +#define VIDIOCSTUNER32 _IOW('v',5, struct video_tuner32) +#define VIDIOCGWIN32 _IOR('v',9, struct video_window32) +#define VIDIOCSWIN32 _IOW('v',10, struct video_window32) +#define VIDIOCGFBUF32 _IOR('v',11, struct video_buffer32) +#define VIDIOCSFBUF32 _IOW('v',12, struct video_buffer32) +#define VIDIOCGFREQ32 _IOR('v',14, u32) +#define VIDIOCSFREQ32 _IOW('v',15, u32) + +enum { + MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip) +}; + +static int do_set_window(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct video_window32 __user *up = compat_ptr(arg); + struct video_window __user *vw; + struct video_clip __user *p; + int nclips; + u32 n; + + if (get_user(nclips, &up->clipcount)) + return -EFAULT; + + /* Peculiar interface... */ + if (nclips < 0) + nclips = VIDEO_CLIPMAP_SIZE; + + if (nclips > MaxClips) + return -ENOMEM; + + vw = compat_alloc_user_space(sizeof(struct video_window) + + nclips * sizeof(struct video_clip)); + + p = nclips ? (struct video_clip __user *)(vw + 1) : NULL; + + if (get_user(n, &up->x) || put_user(n, &vw->x) || + get_user(n, &up->y) || put_user(n, &vw->y) || + get_user(n, &up->width) || put_user(n, &vw->width) || + get_user(n, &up->height) || put_user(n, &vw->height) || + get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) || + get_user(n, &up->flags) || put_user(n, &vw->flags) || + get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) || + get_user(n, &up->clips) || put_user(p, &vw->clips)) + return -EFAULT; + + if (nclips) { + struct video_clip32 __user *u = compat_ptr(n); + int i; + if (!u) + return -EINVAL; + for (i = 0; i < nclips; i++, u++, p++) { + s32 v; + if (get_user(v, &u->x) || + put_user(v, &p->x) || + get_user(v, &u->y) || + put_user(v, &p->y) || + get_user(v, &u->width) || + put_user(v, &p->width) || + get_user(v, &u->height) || + put_user(v, &p->height) || + put_user(NULL, &p->next)) + return -EFAULT; + } + } + + return native_ioctl(file, VIDIOCSWIN, (unsigned long)p); +} + +static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + union { + struct video_tuner vt; + struct video_buffer vb; + struct video_window vw; + unsigned long vx; + } karg; + mm_segment_t old_fs = get_fs(); + void __user *up = compat_ptr(arg); + int err = 0; + + /* First, convert the command. 
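+ * The 32-bit numbers differ from the native ones only in their
+ * _IOC_SIZE bits, since the compat structures are smaller.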
*/ + switch(cmd) { + case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break; + case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break; + case VIDIOCGWIN32: cmd = VIDIOCGWIN; break; + case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break; + case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break; + case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break; + case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break; + }; + + switch(cmd) { + case VIDIOCSTUNER: + case VIDIOCGTUNER: + err = get_video_tuner32(&karg.vt, up); + break; + + case VIDIOCSFBUF: + err = get_video_buffer32(&karg.vb, up); + break; + + case VIDIOCSFREQ: + err = get_user(karg.vx, (u32 __user *)up); + break; + }; + if(err) + goto out; + + set_fs(KERNEL_DS); + err = native_ioctl(file, cmd, (unsigned long)&karg); + set_fs(old_fs); + + if(err == 0) { + switch(cmd) { + case VIDIOCGTUNER: + err = put_video_tuner32(&karg.vt, up); + break; + + case VIDIOCGWIN: + err = put_video_window32(&karg.vw, up); + break; + + case VIDIOCGFBUF: + err = put_video_buffer32(&karg.vb, up); + break; + + case VIDIOCGFREQ: + err = put_user(((u32)karg.vx), (u32 __user *)up); + break; + }; + } +out: + return err; +} + +long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret = -ENOIOCTLCMD; + + if (!file->f_ops->ioctl) + return ret; + + switch (cmd) { + case VIDIOCSWIN32: + ret = do_set_window(file, cmd, arg); + break; + case VIDIOCGTUNER32: + case VIDIOCSTUNER32: + case VIDIOCGWIN32: + case VIDIOCGFBUF32: + case VIDIOCSFBUF32: + case VIDIOCGFREQ32: + case VIDIOCSFREQ32: + ret = do_video_ioctl(file, cmd, arg); + break; + + /* Little v, the video4linux ioctls (conflict?) */ + case VIDIOCGCAP: + case VIDIOCGCHAN: + case VIDIOCSCHAN: + case VIDIOCGPICT: + case VIDIOCSPICT: + case VIDIOCCAPTURE: + case VIDIOCKEY: + case VIDIOCGAUDIO: + case VIDIOCSAUDIO: + case VIDIOCSYNC: + case VIDIOCMCAPTURE: + case VIDIOCGMBUF: + case VIDIOCGUNIT: + case VIDIOCGCAPTURE: + case VIDIOCSCAPTURE: + + /* BTTV specific...
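+ * these private bttv commands carry only flat arrays and ints, so
+ * rewriting the argument pointer with compat_ptr() is all they need.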
*/ + case _IOW('v', BASE_VIDIOCPRIVATE+0, char [256]): + case _IOR('v', BASE_VIDIOCPRIVATE+1, char [256]): + case _IOR('v' , BASE_VIDIOCPRIVATE+2, unsigned int): + case _IOW('v' , BASE_VIDIOCPRIVATE+3, char [16]): /* struct bttv_pll_info */ + case _IOR('v' , BASE_VIDIOCPRIVATE+4, int): + case _IOR('v' , BASE_VIDIOCPRIVATE+5, int): + case _IOR('v' , BASE_VIDIOCPRIVATE+6, int): + case _IOR('v' , BASE_VIDIOCPRIVATE+7, int): + ret = native_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); + break; + } + + return ret; +} +#else +long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} +#endif +EXPORT_SYMBOL_GPL(v4l_compat_ioctl32); diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c index b7ec9bf45085..9f59541155d9 100644 --- a/drivers/media/video/cpia.c +++ b/drivers/media/video/cpia.c @@ -3807,6 +3807,7 @@ static struct file_operations cpia_fops = { .read = cpia_read, .mmap = cpia_mmap, .ioctl = cpia_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index 24a48f8a48c1..bc025c46aedf 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -1740,6 +1740,7 @@ static struct file_operations video_fops = .poll = video_poll, .mmap = video_mmap, .ioctl = video_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; @@ -1767,6 +1768,7 @@ static struct file_operations radio_fops = .open = video_open, .release = video_release, .ioctl = radio_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index 3f2a882bc20a..2869464aee0d 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -1754,6 +1754,7 @@ static struct file_operations meye_fops = { .release = meye_release, .mmap = meye_mmap, .ioctl = meye_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .poll = meye_poll, .llseek = no_llseek, }; diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c index 2504207b2e3d..9e6448639480 100644 --- a/drivers/media/video/pms.c +++ b/drivers/media/video/pms.c @@ -883,6 +883,7 @@ static struct file_operations pms_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = pms_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .read = pms_read, .llseek = no_llseek, }; diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c index a51c7bd96618..73b4f0e2abf0 100644 --- a/drivers/media/video/saa5249.c +++ b/drivers/media/video/saa5249.c @@ -702,6 +702,7 @@ static struct file_operations saa_fops = { .open = saa5249_open, .release = saa5249_release, .ioctl = saa5249_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c index 45c852df13ed..9b9e1e7f05ef 100644 --- a/drivers/media/video/saa7134/saa7134-video.c +++ b/drivers/media/video/saa7134/saa7134-video.c @@ -2262,6 +2262,7 @@ static struct file_operations video_fops = .poll = video_poll, .mmap = video_mmap, .ioctl = video_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; @@ -2271,6 +2272,7 @@ static struct file_operations radio_fops = .open = video_open, .release = video_release, .ioctl = radio_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/media/video/stradis.c b/drivers/media/video/stradis.c index
d4497dbae05c..6ee54a45411f 100644 --- a/drivers/media/video/stradis.c +++ b/drivers/media/video/stradis.c @@ -1974,6 +1974,7 @@ static struct file_operations saa_fops = .open = saa_open, .release = saa_release, .ioctl = saa_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .read = saa_read, .llseek = no_llseek, .write = saa_write, diff --git a/drivers/media/video/w9966.c b/drivers/media/video/w9966.c index c318ba32fbaf..b7b0afffd214 100644 --- a/drivers/media/video/w9966.c +++ b/drivers/media/video/w9966.c @@ -187,6 +187,7 @@ static struct file_operations w9966_fops = { .open = video_exclusive_open, .release = video_exclusive_release, .ioctl = w9966_v4l_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .read = w9966_v4l_read, .llseek = no_llseek, }; diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c index 4034f1b45366..15283f44e79f 100644 --- a/drivers/media/video/zoran_driver.c +++ b/drivers/media/video/zoran_driver.c @@ -4678,6 +4678,7 @@ static struct file_operations zoran_fops = { .open = zoran_open, .release = zoran_close, .ioctl = zoran_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, .read = zoran_read, .write = zoran_write, diff --git a/drivers/media/video/zr36120.c b/drivers/media/video/zr36120.c index 07286816d7df..d4c633b8a7f5 100644 --- a/drivers/media/video/zr36120.c +++ b/drivers/media/video/zr36120.c @@ -1490,6 +1490,7 @@ static struct video_device zr36120_template= .write = zoran_write, .poll = zoran_poll, .ioctl = zoran_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .mmap = zoran_mmap, .minor = -1, }; diff --git a/drivers/usb/media/dsbr100.c b/drivers/usb/media/dsbr100.c index 6a5700e9d428..25646804d5be 100644 --- a/drivers/usb/media/dsbr100.c +++ b/drivers/usb/media/dsbr100.c @@ -127,6 +127,7 @@ static struct file_operations usb_dsbr100_fops = { .open = usb_dsbr100_open, .release = usb_dsbr100_close, .ioctl = usb_dsbr100_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/usb/media/ov511.c b/drivers/usb/media/ov511.c index 3a0e8ce67ebe..8af665bbe330 100644 --- a/drivers/usb/media/ov511.c +++ b/drivers/usb/media/ov511.c @@ -4774,6 +4774,7 @@ static struct file_operations ov511_fops = { .read = ov51x_v4l1_read, .mmap = ov51x_v4l1_mmap, .ioctl = ov51x_v4l1_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/usb/media/pwc/pwc-if.c b/drivers/usb/media/pwc/pwc-if.c index 09ca6128ac20..4f9b0dc6fd7b 100644 --- a/drivers/usb/media/pwc/pwc-if.c +++ b/drivers/usb/media/pwc/pwc-if.c @@ -154,6 +154,7 @@ static struct file_operations pwc_fops = { .poll = pwc_video_poll, .mmap = pwc_video_mmap, .ioctl = pwc_video_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; static struct video_device pwc_template = { diff --git a/drivers/usb/media/se401.c b/drivers/usb/media/se401.c index b2ae29af5940..2ba562285fda 100644 --- a/drivers/usb/media/se401.c +++ b/drivers/usb/media/se401.c @@ -1193,6 +1193,7 @@ static struct file_operations se401_fops = { .read = se401_read, .mmap = se401_mmap, .ioctl = se401_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; static struct video_device se401_template = { diff --git a/drivers/usb/media/stv680.c b/drivers/usb/media/stv680.c index 774038b352cd..b497a6a0a206 100644 --- a/drivers/usb/media/stv680.c +++ b/drivers/usb/media/stv680.c @@ -1343,6 +1343,7 @@ static struct file_operations stv680_fops = { .read = stv680_read, .mmap = stv680_mmap, .ioctl = stv680_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = 
no_llseek, }; static struct video_device stv680_template = { diff --git a/drivers/usb/media/usbvideo.c b/drivers/usb/media/usbvideo.c index 4bd113325ef9..63a72e550a1b 100644 --- a/drivers/usb/media/usbvideo.c +++ b/drivers/usb/media/usbvideo.c @@ -953,6 +953,7 @@ static struct file_operations usbvideo_fops = { .read = usbvideo_v4l_read, .mmap = usbvideo_v4l_mmap, .ioctl = usbvideo_v4l_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; static const struct video_device usbvideo_template = { diff --git a/drivers/usb/media/vicam.c b/drivers/usb/media/vicam.c index 1c73155c8d77..5df144073871 100644 --- a/drivers/usb/media/vicam.c +++ b/drivers/usb/media/vicam.c @@ -1236,6 +1236,7 @@ static struct file_operations vicam_fops = { .read = vicam_read, .mmap = vicam_mmap, .ioctl = vicam_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .llseek = no_llseek, }; diff --git a/drivers/usb/media/w9968cf.c b/drivers/usb/media/w9968cf.c index 3605a6f3067b..bff9434c8e55 100644 --- a/drivers/usb/media/w9968cf.c +++ b/drivers/usb/media/w9968cf.c @@ -3490,6 +3490,7 @@ static struct file_operations w9968cf_fops = { .release = w9968cf_release, .read = w9968cf_read, .ioctl = w9968cf_ioctl, + .compat_ioctl = v4l_compat_ioctl32, .mmap = w9968cf_mmap, .llseek = no_llseek, }; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 43a2508ac696..55d9a3a954cf 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -207,244 +207,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); } -struct video_tuner32 { - compat_int_t tuner; - char name[32]; - compat_ulong_t rangelow, rangehigh; - u32 flags; /* It is really u32 in videodev.h */ - u16 mode, signal; -}; - -static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up) -{ - int i; - - if(get_user(kp->tuner, &up->tuner)) - return -EFAULT; - for(i = 0; i < 32; i++) - __get_user(kp->name[i], &up->name[i]); - __get_user(kp->rangelow, &up->rangelow); - __get_user(kp->rangehigh, &up->rangehigh); - __get_user(kp->flags, &up->flags); - __get_user(kp->mode, &up->mode); - __get_user(kp->signal, &up->signal); - return 0; -} - -static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up) -{ - int i; - - if(put_user(kp->tuner, &up->tuner)) - return -EFAULT; - for(i = 0; i < 32; i++) - __put_user(kp->name[i], &up->name[i]); - __put_user(kp->rangelow, &up->rangelow); - __put_user(kp->rangehigh, &up->rangehigh); - __put_user(kp->flags, &up->flags); - __put_user(kp->mode, &up->mode); - __put_user(kp->signal, &up->signal); - return 0; -} - -struct video_buffer32 { - compat_caddr_t base; - compat_int_t height, width, depth, bytesperline; -}; - -static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up) -{ - u32 tmp; - - if (get_user(tmp, &up->base)) - return -EFAULT; - - /* This is actually a physical address stored - * as a void pointer. 
- */ - kp->base = (void *)(unsigned long) tmp; - - __get_user(kp->height, &up->height); - __get_user(kp->width, &up->width); - __get_user(kp->depth, &up->depth); - __get_user(kp->bytesperline, &up->bytesperline); - return 0; -} - -static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up) -{ - u32 tmp = (u32)((unsigned long)kp->base); - - if(put_user(tmp, &up->base)) - return -EFAULT; - __put_user(kp->height, &up->height); - __put_user(kp->width, &up->width); - __put_user(kp->depth, &up->depth); - __put_user(kp->bytesperline, &up->bytesperline); - return 0; -} - -struct video_clip32 { - s32 x, y, width, height; /* Its really s32 in videodev.h */ - compat_caddr_t next; -}; - -struct video_window32 { - u32 x, y, width, height, chromakey, flags; - compat_caddr_t clips; - compat_int_t clipcount; -}; - -/* You get back everything except the clips... */ -static int put_video_window32(struct video_window *kp, struct video_window32 __user *up) -{ - if(put_user(kp->x, &up->x)) - return -EFAULT; - __put_user(kp->y, &up->y); - __put_user(kp->width, &up->width); - __put_user(kp->height, &up->height); - __put_user(kp->chromakey, &up->chromakey); - __put_user(kp->flags, &up->flags); - __put_user(kp->clipcount, &up->clipcount); - return 0; -} - -#define VIDIOCGTUNER32 _IOWR('v',4, struct video_tuner32) -#define VIDIOCSTUNER32 _IOW('v',5, struct video_tuner32) -#define VIDIOCGWIN32 _IOR('v',9, struct video_window32) -#define VIDIOCSWIN32 _IOW('v',10, struct video_window32) -#define VIDIOCGFBUF32 _IOR('v',11, struct video_buffer32) -#define VIDIOCSFBUF32 _IOW('v',12, struct video_buffer32) -#define VIDIOCGFREQ32 _IOR('v',14, u32) -#define VIDIOCSFREQ32 _IOW('v',15, u32) - -enum { - MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip) -}; - -static int do_set_window(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct video_window32 __user *up = compat_ptr(arg); - struct video_window __user *vw; - struct video_clip __user *p; - int nclips; - u32 n; - - if (get_user(nclips, &up->clipcount)) - return -EFAULT; - - /* Peculiar interface... */ - if (nclips < 0) - nclips = VIDEO_CLIPMAP_SIZE; - - if (nclips > MaxClips) - return -ENOMEM; - - vw = compat_alloc_user_space(sizeof(struct video_window) + - nclips * sizeof(struct video_clip)); - - p = nclips ? 
(struct video_clip __user *)(vw + 1) : NULL; - - if (get_user(n, &up->x) || put_user(n, &vw->x) || - get_user(n, &up->y) || put_user(n, &vw->y) || - get_user(n, &up->width) || put_user(n, &vw->width) || - get_user(n, &up->height) || put_user(n, &vw->height) || - get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) || - get_user(n, &up->flags) || put_user(n, &vw->flags) || - get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) || - get_user(n, &up->clips) || put_user(p, &vw->clips)) - return -EFAULT; - - if (nclips) { - struct video_clip32 __user *u = compat_ptr(n); - int i; - if (!u) - return -EINVAL; - for (i = 0; i < nclips; i++, u++, p++) { - s32 v; - if (get_user(v, &u->x) || - put_user(v, &p->x) || - get_user(v, &u->y) || - put_user(v, &p->y) || - get_user(v, &u->width) || - put_user(v, &p->width) || - get_user(v, &u->height) || - put_user(v, &p->height) || - put_user(NULL, &p->next)) - return -EFAULT; - } - } - - return sys_ioctl(fd, VIDIOCSWIN, (unsigned long)p); -} - -static int do_video_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - union { - struct video_tuner vt; - struct video_buffer vb; - struct video_window vw; - unsigned long vx; - } karg; - mm_segment_t old_fs = get_fs(); - void __user *up = compat_ptr(arg); - int err = 0; - - /* First, convert the command. */ - switch(cmd) { - case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break; - case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break; - case VIDIOCGWIN32: cmd = VIDIOCGWIN; break; - case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break; - case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break; - case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break; - case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break; - }; - - switch(cmd) { - case VIDIOCSTUNER: - case VIDIOCGTUNER: - err = get_video_tuner32(&karg.vt, up); - break; - - case VIDIOCSFBUF: - err = get_video_buffer32(&karg.vb, up); - break; - - case VIDIOCSFREQ: - err = get_user(karg.vx, (u32 __user *)up); - break; - }; - if(err) - goto out; - - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&karg); - set_fs(old_fs); - - if(err == 0) { - switch(cmd) { - case VIDIOCGTUNER: - err = put_video_tuner32(&karg.vt, up); - break; - - case VIDIOCGWIN: - err = put_video_window32(&karg.vw, up); - break; - - case VIDIOCGFBUF: - err = put_video_buffer32(&karg.vb, up); - break; - - case VIDIOCGFREQ: - err = put_user(((u32)karg.vx), (u32 __user *)up); - break; - }; - } -out: - return err; -} - struct compat_dmx_event { dmx_event_t event; compat_time_t timeStamp; @@ -3015,14 +2777,6 @@ COMPATIBLE_IOCTL(EXT3_IOC_GROUP_ADD) #ifdef CONFIG_JBD_DEBUG HANDLE_IOCTL(EXT3_IOC32_WAIT_FOR_READONLY, do_ext3_ioctl) #endif -HANDLE_IOCTL(VIDIOCGTUNER32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSTUNER32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCGWIN32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSWIN32, do_set_window) -HANDLE_IOCTL(VIDIOCGFBUF32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSFBUF32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCGFREQ32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSFREQ32, do_video_ioctl) /* One SMB ioctl needs translations. 
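 * (the ioctl returns a uid_t, whose width differs in the 32-bit ABI,
 * hence the separate number built with compat_uid_t below)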
*/ #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid) diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 119f9d064cc6..339878952f12 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -218,32 +218,6 @@ COMPATIBLE_IOCTL(VT_RESIZE) COMPATIBLE_IOCTL(VT_RESIZEX) COMPATIBLE_IOCTL(VT_LOCKSWITCH) COMPATIBLE_IOCTL(VT_UNLOCKSWITCH) -/* Little v */ -/* Little v, the video4linux ioctls (conflict?) */ -COMPATIBLE_IOCTL(VIDIOCGCAP) -COMPATIBLE_IOCTL(VIDIOCGCHAN) -COMPATIBLE_IOCTL(VIDIOCSCHAN) -COMPATIBLE_IOCTL(VIDIOCGPICT) -COMPATIBLE_IOCTL(VIDIOCSPICT) -COMPATIBLE_IOCTL(VIDIOCCAPTURE) -COMPATIBLE_IOCTL(VIDIOCKEY) -COMPATIBLE_IOCTL(VIDIOCGAUDIO) -COMPATIBLE_IOCTL(VIDIOCSAUDIO) -COMPATIBLE_IOCTL(VIDIOCSYNC) -COMPATIBLE_IOCTL(VIDIOCMCAPTURE) -COMPATIBLE_IOCTL(VIDIOCGMBUF) -COMPATIBLE_IOCTL(VIDIOCGUNIT) -COMPATIBLE_IOCTL(VIDIOCGCAPTURE) -COMPATIBLE_IOCTL(VIDIOCSCAPTURE) -/* BTTV specific... */ -COMPATIBLE_IOCTL(_IOW('v', BASE_VIDIOCPRIVATE+0, char [256])) -COMPATIBLE_IOCTL(_IOR('v', BASE_VIDIOCPRIVATE+1, char [256])) -COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+2, unsigned int)) -COMPATIBLE_IOCTL(_IOW('v' , BASE_VIDIOCPRIVATE+3, char [16])) /* struct bttv_pll_info */ -COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+4, int)) -COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+5, int)) -COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+6, int)) -COMPATIBLE_IOCTL(_IOR('v' , BASE_VIDIOCPRIVATE+7, int)) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 1cded681eb6d..13f78ec4bf76 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1117,6 +1117,9 @@ typedef int (*v4l2_kioctl)(struct inode *inode, struct file *file, unsigned int cmd, void *arg); int v4l_compat_translate_ioctl(struct inode *inode, struct file *file, int cmd, void *arg, v4l2_kioctl driver_ioctl); +/* 32 Bits compatibility layer for 64 bits processors */ +extern long v4l_compat_ioctl32(struct file *file, unsigned int cmd, + unsigned long arg); #endif /* __KERNEL__ */ #endif /* __LINUX_VIDEODEV2_H */ -- cgit v1.2.3 From a9aa1ffaac7c8d6f093bb8f7cdeea761a5e25f53 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 9 Jan 2006 20:48:03 +0100 Subject: kbuild/xfs: introduce fs/xfs/Kbuild In kbuild the file named 'Kbuild' has precedence over the file named Makefile. Utilise a file named Kbuild to include the 2.6 Makefile for xfs - since the xfs people likes to keep their arch specific Makefiles separate. With this patch xfs does no longer rely on the KERNELRELEASE components to be global. Signed-off-by: Sam Ravnborg --- fs/xfs/Kbuild | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 fs/xfs/Kbuild (limited to 'fs') diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild new file mode 100644 index 000000000000..2566e96706f1 --- /dev/null +++ b/fs/xfs/Kbuild @@ -0,0 +1,6 @@ +# +# The xfs people like to share Makefile with 2.6 and 2.4. +# Utilise file named Kbuild file which has precedence over Makefile. 
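+# ($(srctree)/$(obj) names this directory in the source tree, so the
+# include also works for separate-object-directory builds)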
+# + +include $(srctree)/$(obj)/Makefile-linux-2.6 -- cgit v1.2.3 From 943ffb587cfdf3b2adfe52a6db08573f4ecf3284 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 10 Jan 2006 00:10:13 +0100 Subject: spelling: s/retreive/retrieve/ Signed-off-by: Adrian Bunk --- Documentation/i2o/ioctl | 2 +- arch/powerpc/kernel/prom.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/platforms/powermac/cpufreq_64.c | 2 +- drivers/macintosh/therm_pm72.c | 4 ++-- drivers/macintosh/windfarm_pm81.c | 2 +- drivers/message/i2o/README.ioctl | 2 +- drivers/net/hp100.c | 2 +- drivers/usb/serial/cypress_m8.c | 2 +- drivers/video/aty/radeon_monitor.c | 2 +- drivers/video/logo/Makefile | 2 +- fs/9p/vfs_inode.c | 2 +- include/asm-powerpc/smu.h | 6 +++--- scripts/Makefile.modpost | 2 +- scripts/mksysmap | 2 +- 17 files changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/Documentation/i2o/ioctl b/Documentation/i2o/ioctl index 3e174978997d..1e77fac4e120 100644 --- a/Documentation/i2o/ioctl +++ b/Documentation/i2o/ioctl @@ -185,7 +185,7 @@ VII. Getting Parameters ENOMEM Kernel memory allocation error A return value of 0 does not mean that the value was actually - properly retreived. The user should check the result list + properly retrieved. The user should check the result list to determine the specific status of the transaction. VIII. Downloading Software diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 977ee3adaf2d..34ab0daec3a7 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -972,7 +972,7 @@ static int __init early_init_dt_scan_chosen(unsigned long node, #endif #ifdef CONFIG_PPC_RTAS - /* To help early debugging via the front panel, we retreive a minimal + /* To help early debugging via the front panel, we retrieve a minimal * set of RTAS infos now if available */ { diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 4283fa33f784..ae2e2a31a705 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -632,7 +632,7 @@ void rtas_stop_self(void) } /* - * Call early during boot, before mem init or bootmem, to retreive the RTAS + * Call early during boot, before mem init or bootmem, to retrieve the RTAS * informations from the device-tree and allocate the RMO buffer for userland * accesses. */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 98e9f0595dd8..81567e931260 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -438,7 +438,7 @@ void __init setup_system(void) /* * Fill the ppc64_caches & systemcfg structures with informations - * retreived from the device-tree. Need to be called before + * retrieved from the device-tree. Need to be called before * finish_device_tree() since the later requires some of the * informations filled up here to properly parse the interrupt * tree. diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 5bb433cbe41b..149351a84b94 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -368,7 +368,7 @@ static unsigned long __init htab_get_table_size(void) unsigned long mem_size, rnd_mem_size, pteg_count; /* If hash size isn't already provided by the platform, we try to - * retreive it from the device-tree. If it's not there neither, we + * retrieve it from the device-tree. 
If it's not there neither, we * calculate it now based on the total RAM size */ if (ppc64_pft_size == 0) diff --git a/arch/powerpc/platforms/powermac/cpufreq_64.c b/arch/powerpc/platforms/powermac/cpufreq_64.c index a4b50c4109c2..a415e8d2f7af 100644 --- a/arch/powerpc/platforms/powermac/cpufreq_64.c +++ b/arch/powerpc/platforms/powermac/cpufreq_64.c @@ -80,7 +80,7 @@ static struct freq_attr* g5_cpu_freqs_attr[] = { }; /* Power mode data is an array of the 32 bits PCR values to use for - * the various frequencies, retreived from the device-tree + * the various frequencies, retrieved from the device-tree */ static u32 *g5_pmode_data; static int g5_pmode_max; diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index 8d0958c38b6b..4f50ee5767a2 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c @@ -630,12 +630,12 @@ static int read_eeprom(int cpu, struct mpu_data *out) sprintf(nodename, "/u3@0,f8000000/i2c@f8001000/cpuid@a%d", cpu ? 2 : 0); np = of_find_node_by_path(nodename); if (np == NULL) { - printk(KERN_ERR "therm_pm72: Failed to retreive cpuid node from device-tree\n"); + printk(KERN_ERR "therm_pm72: Failed to retrieve cpuid node from device-tree\n"); return -ENODEV; } data = (u8 *)get_property(np, "cpuid", &len); if (data == NULL) { - printk(KERN_ERR "therm_pm72: Failed to retreive cpuid property from device-tree\n"); + printk(KERN_ERR "therm_pm72: Failed to retrieve cpuid property from device-tree\n"); of_node_put(np); return -ENODEV; } diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c index 80ddf9776bde..eb69a601e765 100644 --- a/drivers/macintosh/windfarm_pm81.c +++ b/drivers/macintosh/windfarm_pm81.c @@ -26,7 +26,7 @@ * (typically the drive fan) * - the main control (first control) gets the target value scaled with * the first pair of factors, and is then modified as below - * - the value of the target of the CPU Fan control loop is retreived, + * - the value of the target of the CPU Fan control loop is retrieved, * scaled with the second pair of factors, and the max of that and * the scaled target is applied to the main control. * diff --git a/drivers/message/i2o/README.ioctl b/drivers/message/i2o/README.ioctl index 73dd084c0e98..65c0c47aeb79 100644 --- a/drivers/message/i2o/README.ioctl +++ b/drivers/message/i2o/README.ioctl @@ -185,7 +185,7 @@ VII. Getting Parameters ENOMEM Kernel memory allocation error A return value of 0 does not mean that the value was actually - properly retreived. The user should check the result list + properly retrieved. The user should check the result list to determine the specific status of the transaction. VIII. Downloading Software diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index e92c17f6931c..55c7ed608391 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -276,7 +276,7 @@ static void hp100_RegisterDump(struct net_device *dev); * Convert an address in a kernel buffer to a bus/phys/dma address. 
* This work *only* for memory fragments part of lp->page_vaddr, * because it was properly DMA allocated via pci_alloc_consistent(), - * so we just need to "retreive" the original mapping to bus/phys/dma + * so we just need to "retrieve" the original mapping to bus/phys/dma * address - Jean II */ static inline dma_addr_t virt_to_whatever(struct net_device *dev, u32 * ptr) { diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c index af18355e94cc..4e9637eb6137 100644 --- a/drivers/usb/serial/cypress_m8.c +++ b/drivers/usb/serial/cypress_m8.c @@ -357,7 +357,7 @@ static int cypress_serial_control (struct usb_serial_port *port, unsigned baud_m } while (retval != 5 && retval != ENODEV); if (retval != 5) { - err("%s - failed to retreive serial line settings - %d", __FUNCTION__, retval); + err("%s - failed to retrieve serial line settings - %d", __FUNCTION__, retval); return retval; } else { spin_lock_irqsave(&priv->lock, flags); diff --git a/drivers/video/aty/radeon_monitor.c b/drivers/video/aty/radeon_monitor.c index ea7c86306918..7f9838dceab5 100644 --- a/drivers/video/aty/radeon_monitor.c +++ b/drivers/video/aty/radeon_monitor.c @@ -423,7 +423,7 @@ static int __devinit radeon_parse_monitor_layout(struct radeonfb_info *rinfo, /* * Probe display on both primary and secondary card's connector (if any) * by various available techniques (i2c, OF device tree, BIOS, ...) and - * try to retreive EDID. The algorithm here comes from XFree's radeon + * try to retrieve EDID. The algorithm here comes from XFree's radeon * driver */ void __devinit radeon_probe_screens(struct radeonfb_info *rinfo, diff --git a/drivers/video/logo/Makefile b/drivers/video/logo/Makefile index d0244c04af5a..4ef5cd19609d 100644 --- a/drivers/video/logo/Makefile +++ b/drivers/video/logo/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_LOGO_M32R_CLUT224) += logo_m32r_clut224.o # How to generate logo's -# Use logo-cfiles to retreive list of .c files to be built +# Use logo-cfiles to retrieve list of .c files to be built logo-cfiles = $(notdir $(patsubst %.$(2), %.c, \ $(wildcard $(srctree)/$(src)/*$(1).$(2)))) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d933ef1fbd8a..a17b28854288 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -663,7 +663,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, } /** - * v9fs_vfs_getattr - retreive file metadata + * v9fs_vfs_getattr - retrieve file metadata * @mnt - mount information * @dentry - file to get attributes on * @stat - metadata structure to populate diff --git a/include/asm-powerpc/smu.h b/include/asm-powerpc/smu.h index 134c2b5be0f2..82ce47607774 100644 --- a/include/asm-powerpc/smu.h +++ b/include/asm-powerpc/smu.h @@ -22,7 +22,7 @@ /* * Partition info commands * - * These commands are used to retreive the sdb-partition-XX datas from + * These commands are used to retrieve the sdb-partition-XX datas from * the SMU. The lenght is always 2. First byte is the subcommand code * and second byte is the partition ID. * @@ -225,7 +225,7 @@ * * SMU_CMD_MISC_ee_GET_DATABLOCK_REC is used, among others, to * transfer blocks of data from the SMU. So far, I've decrypted it's - * usage to retreive partition data. In order to do that, you have to + * usage to retrieve partition data. In order to do that, you have to * break your transfer in "chunks" since that command cannot transfer * more than a chunk at a time. 
The chunk size used by OF is 0xe bytes, * but it seems that the darwin driver will let you do 0x1e bytes if @@ -556,7 +556,7 @@ struct smu_user_cmd_hdr __u32 cmdtype; #define SMU_CMDTYPE_SMU 0 /* SMU command */ #define SMU_CMDTYPE_WANTS_EVENTS 1 /* switch fd to events mode */ -#define SMU_CMDTYPE_GET_PARTITION 2 /* retreive an sdb partition */ +#define SMU_CMDTYPE_GET_PARTITION 2 /* retrieve an sdb partition */ __u8 cmd; /* SMU command byte */ __u8 pad[3]; /* padding */ diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost index 0c4f3a9f2ea9..bf96a61d4b86 100644 --- a/scripts/Makefile.modpost +++ b/scripts/Makefile.modpost @@ -30,7 +30,7 @@ # - See include/linux/module.h for more details # Step 4 is solely used to allow module versioning in external modules, -# where the CRC of each module is retreived from the Module.symers file. +# where the CRC of each module is retrieved from the Module.symers file. .PHONY: _modpost _modpost: __modpost diff --git a/scripts/mksysmap b/scripts/mksysmap index a6430e05972d..4390fab9f5bd 100644 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -1,7 +1,7 @@ #!/bin/sh -x # Based on the vmlinux file create the System.map file # System.map is used by module-init tools and some debugging -# tools to retreive the actual addresses of symbols in the kernel. +# tools to retrieve the actual addresses of symbols in the kernel. # # Usage # mksysmap vmlinux System.map -- cgit v1.2.3 From 794ee1baee1c26be40410233e6c20bceb2b03c08 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Mon, 9 Jan 2006 15:59:21 -0800 Subject: [PATCH] mutex subsystem, semaphore to mutex: XFS This patch switches XFS over to use the new mutex code directly as opposed to the previous workaround patch I posted earlier that avoided the namespace clash by forcing it back to semaphores. This falls in the 'works for me' category. Signed-off-by: Jes Sorensen Signed-off-by: Ingo Molnar --- fs/xfs/linux-2.6/mutex.h | 10 +++------- fs/xfs/quota/xfs_dquot.c | 4 ++-- fs/xfs/quota/xfs_qm.c | 10 +++++----- fs/xfs/quota/xfs_qm.h | 2 +- fs/xfs/quota/xfs_qm_bhv.c | 2 +- fs/xfs/quota/xfs_qm_syscalls.c | 8 ++++---- fs/xfs/quota/xfs_quota_priv.h | 2 +- fs/xfs/support/uuid.c | 6 +++--- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 2 +- 10 files changed, 22 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h index ce773d89a923..d3369b6ca168 100644 --- a/fs/xfs/linux-2.6/mutex.h +++ b/fs/xfs/linux-2.6/mutex.h @@ -19,7 +19,7 @@ #define __XFS_SUPPORT_MUTEX_H__ #include -#include +#include /* * Map the mutex'es from IRIX to Linux semaphores. @@ -28,12 +28,8 @@ * callers. */ #define MUTEX_DEFAULT 0x0 -typedef struct semaphore mutex_t; -#define mutex_init(lock, type, name) sema_init(lock, 1) -#define mutex_destroy(lock) sema_init(lock, -99) -#define mutex_lock(lock, num) down(lock) -#define mutex_trylock(lock) (down_trylock(lock) ? 
0 : 1) -#define mutex_unlock(lock) up(lock) +typedef struct mutex mutex_t; +//#define mutex_destroy(lock) do{}while(0) #endif /* __XFS_SUPPORT_MUTEX_H__ */ diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 00b5043dfa5a..772ac48329ea 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -104,7 +104,7 @@ xfs_qm_dqinit( */ if (brandnewdquot) { dqp->dq_flnext = dqp->dq_flprev = dqp; - mutex_init(&dqp->q_qlock, MUTEX_DEFAULT, "xdq"); + mutex_init(&dqp->q_qlock); initnsema(&dqp->q_flock, 1, "fdq"); sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); @@ -1382,7 +1382,7 @@ void xfs_dqlock( xfs_dquot_t *dqp) { - mutex_lock(&(dqp->q_qlock), PINOD); + mutex_lock(&(dqp->q_qlock)); } void diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 5328a2937127..bb6991a7a617 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -167,7 +167,7 @@ xfs_Gqm_init(void) xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; xqm->qm_nrefs = 0; #ifdef DEBUG - mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk"); + xfs_mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk"); #endif return xqm; } @@ -1166,7 +1166,7 @@ xfs_qm_init_quotainfo( qinf->qi_dqreclaims = 0; /* mutex used to serialize quotaoffs */ - mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff"); + mutex_init(&qinf->qi_quotaofflock); /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -1285,7 +1285,7 @@ xfs_qm_list_init( char *str, int n) { - mutex_init(&list->qh_lock, MUTEX_DEFAULT, str); + mutex_init(&list->qh_lock); list->qh_next = NULL; list->qh_version = 0; list->qh_nelems = 0; @@ -2762,7 +2762,7 @@ STATIC void xfs_qm_freelist_init(xfs_frlist_t *ql) { ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf"); + mutex_init(&ql->qh_lock); ql->qh_version = 0; ql->qh_nelems = 0; } @@ -2772,7 +2772,7 @@ xfs_qm_freelist_destroy(xfs_frlist_t *ql) { xfs_dquot_t *dqp, *nextdqp; - mutex_lock(&ql->qh_lock, PINOD); + mutex_lock(&ql->qh_lock); for (dqp = ql->qh_next; dqp != (xfs_dquot_t *)ql; ) { xfs_dqlock(dqp); diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 12da259f2fcb..4568deb6da86 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) +#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock)) #define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) #define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index d9d2993de435..90402a1c3983 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -363,7 +363,7 @@ xfs_qm_init(void) KERN_INFO "SGI XFS Quota Management subsystem\n"; printk(message); - mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock"); + mutex_init(&xfs_Gqm_lock); vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs); xfs_qm_init_procfs(); } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 24690e1af659..86a1d09f48d5 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -233,7 +233,7 @@ xfs_qm_scall_quotaoff( */ ASSERT(mp->m_quotainfo); if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mutex_lock(&(XFS_QI_QOFFLOCK(mp))); ASSERT(mp->m_quotainfo); @@ -508,7 +508,7 @@ xfs_qm_scall_quotaon( /* * Switch on quota enforcement in core. 
*/ - mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mutex_lock(&(XFS_QI_QOFFLOCK(mp))); mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); @@ -617,7 +617,7 @@ xfs_qm_scall_setqlim( * a quotaoff from happening). (XXXThis doesn't currently happen * because we take the vfslock before calling xfs_qm_sysent). */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mutex_lock(&(XFS_QI_QOFFLOCK(mp))); /* * Get the dquot (locked), and join it to the transaction. @@ -1426,7 +1426,7 @@ xfs_qm_internalqcheck( xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); - mutex_lock(&qcheck_lock, PINOD); + mutex_lock(&qcheck_lock); /* There should be absolutely no quota activity while this is going on. */ qmtest_udqtab = kmem_zalloc(qmtest_hashmask * diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 7a9f3beb818c..b7ddd04aae32 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -51,7 +51,7 @@ #define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) #define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD)) +#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) #define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) #ifdef DEBUG struct xfs_dqhash; diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index 70ce40914c8a..69ec4f540c3a 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c @@ -24,7 +24,7 @@ static uuid_t *uuid_table; void uuid_init(void) { - mutex_init(&uuid_monitor, MUTEX_DEFAULT, "uuid_monitor"); + mutex_init(&uuid_monitor); } /* @@ -94,7 +94,7 @@ uuid_table_insert(uuid_t *uuid) { int i, hole; - mutex_lock(&uuid_monitor, PVFS); + mutex_lock(&uuid_monitor); for (i = 0, hole = -1; i < uuid_table_size; i++) { if (uuid_is_nil(&uuid_table[i])) { hole = i; @@ -122,7 +122,7 @@ uuid_table_remove(uuid_t *uuid) { int i; - mutex_lock(&uuid_monitor, PVFS); + mutex_lock(&uuid_monitor); for (i = 0; i < uuid_table_size; i++) { if (uuid_is_nil(&uuid_table[i])) continue; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 541d5dd474be..303af86739bf 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -117,7 +117,7 @@ xfs_mount_init(void) AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail"); spinlock_init(&mp->m_sb_lock, "xfs_sb"); - mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock"); + mutex_init(&mp->m_ilock); initnsema(&mp->m_growlock, 1, "xfs_grow"); /* * Initialize the AIL. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 08b2e0a5d807..3432fd5a3986 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -533,7 +533,7 @@ typedef struct xfs_mod_sb { int msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; -#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock), PINOD) +#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) #define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock) #define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s)) -- cgit v1.2.3 From 1b1dcc1b57a49136f118a0f16367256ff9994a69 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Mon, 9 Jan 2006 15:59:24 -0800 Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, ->i_sem This patch converts the inode semaphore to a mutex. I have tested it on XFS and compiled as much as one can consider on an ia64. Anyway your luck with it might be different. 
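For reference, the conversion pattern is mechanical throughout; a minimal
sketch of the before/after shape (not lifted from any single hunk below;
example_set_size() is a made-up helper name, and the "before" form only
compiles against a pre-conversion tree where inode->i_sem still exists):

	#include <linux/fs.h>
	#include <linux/mutex.h>

	/* Before: exclusion via the inode's counting semaphore */
	static void example_set_size_old(struct inode *inode, loff_t size)
	{
		down(&inode->i_sem);
		i_size_write(inode, size);	/* writer must hold the lock */
		up(&inode->i_sem);
	}

	/* After: the same critical section under the new mutex */
	static void example_set_size(struct inode *inode, loff_t size)
	{
		mutex_lock(&inode->i_mutex);
		i_size_write(inode, size);	/* writer must hold the lock */
		mutex_unlock(&inode->i_mutex);
	}

Interruptible acquisition converts the same way, down_interruptible()
becoming mutex_lock_interruptible() (see the fifo_open() hunk below).
The stricter mutex semantics (only the owning task may unlock, and
recursive locking is disallowed) are what let the mutex debugging code
catch misuse that a bare semaphore silently permits.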
Modified-by: Ingo Molnar (finished the conversion) Signed-off-by: Jes Sorensen Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/cell/spufs/inode.c | 12 ++--- drivers/block/loop.c | 4 +- drivers/char/mem.c | 4 +- drivers/isdn/capi/capifs.c | 6 +-- drivers/md/dm.c | 4 +- drivers/md/md.c | 8 +-- drivers/pci/proc.c | 4 +- drivers/usb/core/inode.c | 28 +++++------ drivers/usb/gadget/file_storage.c | 4 +- drivers/usb/gadget/inode.c | 4 +- fs/affs/inode.c | 4 +- fs/autofs/root.c | 4 +- fs/autofs4/root.c | 4 +- fs/binfmt_misc.c | 12 ++--- fs/block_dev.c | 4 +- fs/buffer.c | 6 +-- fs/cifs/cifsfs.c | 6 +-- fs/cifs/inode.c | 8 +-- fs/coda/dir.c | 4 +- fs/coda/file.c | 8 +-- fs/configfs/dir.c | 54 ++++++++++---------- fs/configfs/file.c | 4 +- fs/configfs/inode.c | 6 +-- fs/debugfs/inode.c | 8 +-- fs/devfs/base.c | 22 ++++----- fs/devpts/inode.c | 8 +-- fs/direct-io.c | 30 +++++------ fs/dquot.c | 16 +++--- fs/exportfs/expfs.c | 12 ++--- fs/ext2/acl.c | 10 ++-- fs/ext2/ext2.h | 2 +- fs/ext2/super.c | 4 +- fs/ext2/xattr.c | 2 +- fs/ext3/acl.c | 10 ++-- fs/ext3/super.c | 4 +- fs/ext3/xattr.c | 2 +- fs/fat/dir.c | 4 +- fs/fat/file.c | 4 +- fs/fifo.c | 6 +-- fs/fuse/file.c | 4 +- fs/hfs/inode.c | 4 +- fs/hfsplus/bitmap.c | 8 +-- fs/hfsplus/inode.c | 4 +- fs/hpfs/dir.c | 6 +-- fs/hppfs/hppfs_kern.c | 6 +-- fs/hugetlbfs/inode.c | 4 +- fs/inode.c | 2 +- fs/jffs/inode-v23.c | 2 +- fs/jfs/jfs_incore.h | 4 +- fs/libfs.c | 8 +-- fs/namei.c | 82 +++++++++++++++---------------- fs/namespace.c | 12 ++--- fs/nfs/dir.c | 10 ++-- fs/nfsd/nfs4recover.c | 20 ++++---- fs/nfsd/vfs.c | 12 ++--- fs/ntfs/attrib.c | 4 +- fs/ntfs/dir.c | 8 +-- fs/ntfs/file.c | 18 +++---- fs/ntfs/index.c | 6 +-- fs/ntfs/inode.c | 8 +-- fs/ntfs/namei.c | 6 +-- fs/ntfs/quota.c | 6 +-- fs/ntfs/super.c | 16 +++--- fs/ocfs2/alloc.c | 24 ++++----- fs/ocfs2/cluster/nodemanager.c | 2 +- fs/ocfs2/dir.c | 4 +- fs/ocfs2/file.c | 8 +-- fs/ocfs2/inode.c | 12 ++--- fs/ocfs2/journal.c | 14 +++--- fs/ocfs2/localalloc.c | 6 +-- fs/open.c | 24 ++++----- fs/pipe.c | 44 ++++++++--------- fs/quota.c | 6 +-- fs/read_write.c | 4 +- fs/readdir.c | 4 +- fs/reiserfs/file.c | 10 ++-- fs/reiserfs/inode.c | 14 +++--- fs/reiserfs/ioctl.c | 4 +- fs/reiserfs/super.c | 4 +- fs/reiserfs/tail_conversion.c | 2 +- fs/reiserfs/xattr.c | 34 ++++++------- fs/reiserfs/xattr_acl.c | 6 +-- fs/relayfs/inode.c | 12 ++--- fs/sysfs/dir.c | 31 ++++++------ fs/sysfs/file.c | 17 +++---- fs/sysfs/inode.c | 8 ++- fs/sysfs/symlink.c | 5 +- fs/ufs/super.c | 6 +-- fs/xattr.c | 8 +-- fs/xfs/linux-2.6/xfs_iops.c | 2 +- fs/xfs/linux-2.6/xfs_lrw.c | 18 +++---- fs/xfs/xfs_dmapi.h | 14 +++--- include/linux/ext3_fs_i.h | 2 +- include/linux/fs.h | 7 +-- include/linux/jffs2_fs_i.h | 4 +- include/linux/nfsd/nfsfh.h | 6 +-- include/linux/pipe_fs_i.h | 2 +- include/linux/reiserfs_fs.h | 2 +- ipc/mqueue.c | 8 +-- kernel/cpuset.c | 10 ++-- mm/filemap.c | 30 +++++------ mm/filemap_xip.c | 6 +-- mm/memory.c | 4 +- mm/msync.c | 2 +- mm/rmap.c | 8 +-- mm/shmem.c | 6 +-- mm/swapfile.c | 8 +-- mm/truncate.c | 2 +- net/sunrpc/rpc_pipe.c | 58 +++++++++++----------- net/unix/af_unix.c | 4 +- security/inode.c | 8 +-- sound/core/oss/pcm_oss.c | 2 - sound/core/seq/seq_memory.c | 4 -- 113 files changed, 563 insertions(+), 573 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 1f3507c75e90..d2ba358c6e38 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -137,7 +137,7 @@ 
spufs_delete_inode(struct inode *inode) static void spufs_prune_dir(struct dentry *dir) { struct dentry *dentry, *tmp; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); @@ -154,7 +154,7 @@ static void spufs_prune_dir(struct dentry *dir) } } shrink_dcache_parent(dir); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); } static int spufs_rmdir(struct inode *root, struct dentry *dir_dentry) @@ -162,15 +162,15 @@ static int spufs_rmdir(struct inode *root, struct dentry *dir_dentry) struct spu_context *ctx; /* remove all entries */ - down(&root->i_sem); + mutex_lock(&root->i_mutex); spufs_prune_dir(dir_dentry); - up(&root->i_sem); + mutex_unlock(&root->i_mutex); /* We have to give up the mm_struct */ ctx = SPUFS_I(dir_dentry->d_inode)->i_ctx; spu_forget(ctx); - /* XXX Do we need to hold i_sem here ? */ + /* XXX Do we need to hold i_mutex here ? */ return simple_rmdir(root, dir_dentry); } @@ -330,7 +330,7 @@ long spufs_create_thread(struct nameidata *nd, out_dput: dput(dentry); out_dir: - up(&nd->dentry->d_inode->i_sem); + mutex_unlock(&nd->dentry->d_inode->i_mutex); out: return ret; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a452b13620a2..bed9ad76c04c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -215,7 +215,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, unsigned offset, bv_offs; int len, ret; - down(&mapping->host->i_sem); + mutex_lock(&mapping->host->i_mutex); index = pos >> PAGE_CACHE_SHIFT; offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); bv_offs = bvec->bv_offset; @@ -278,7 +278,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, } ret = 0; out: - up(&mapping->host->i_sem); + mutex_unlock(&mapping->host->i_mutex); return ret; unlock: unlock_page(page); diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 5b2d18035073..704c3c07f0ab 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -741,7 +741,7 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig) { loff_t ret; - down(&file->f_dentry->d_inode->i_sem); + mutex_lock(&file->f_dentry->d_inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -756,7 +756,7 @@ static loff_t memory_lseek(struct file * file, loff_t offset, int orig) default: ret = -EINVAL; } - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return ret; } diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index 207cae366256..0a37aded4b54 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -138,7 +138,7 @@ static struct dentry *get_node(int num) { char s[10]; struct dentry *root = capifs_root; - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); return lookup_one_len(s, root, sprintf(s, "%d", num)); } @@ -159,7 +159,7 @@ void capifs_new_ncci(unsigned int number, dev_t device) dentry = get_node(number); if (!IS_ERR(dentry) && !dentry->d_inode) d_instantiate(dentry, inode); - up(&capifs_root->d_inode->i_sem); + mutex_unlock(&capifs_root->d_inode->i_mutex); } void capifs_free_ncci(unsigned int number) @@ -175,7 +175,7 @@ void capifs_free_ncci(unsigned int number) } dput(dentry); } - up(&capifs_root->d_inode->i_sem); + mutex_unlock(&capifs_root->d_inode->i_mutex); } static int __init capifs_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0e481512f918..5c210b0a4cb0 100644 --- 
a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -837,9 +837,9 @@ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - down(&md->suspended_bdev->bd_inode->i_sem); + mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - up(&md->suspended_bdev->bd_inode->i_sem); + mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); } static int __bind(struct mapped_device *md, struct dm_table *t) diff --git a/drivers/md/md.c b/drivers/md/md.c index e423a16ba3c9..0302723fa21f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3460,9 +3460,9 @@ static int update_size(mddev_t *mddev, unsigned long size) bdev = bdget_disk(mddev->gendisk, 0); if (bdev) { - down(&bdev->bd_inode->i_sem); + mutex_lock(&bdev->bd_inode->i_mutex); i_size_write(bdev->bd_inode, mddev->array_size << 10); - up(&bdev->bd_inode->i_sem); + mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } } @@ -3486,9 +3486,9 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) bdev = bdget_disk(mddev->gendisk, 0); if (bdev) { - down(&bdev->bd_inode->i_sem); + mutex_lock(&bdev->bd_inode->i_mutex); i_size_write(bdev->bd_inode, mddev->array_size << 10); - up(&bdev->bd_inode->i_sem); + mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } } diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 9eb465727fce..9cb6dd0834be 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -25,7 +25,7 @@ proc_bus_pci_lseek(struct file *file, loff_t off, int whence) loff_t new = -1; struct inode *inode = file->f_dentry->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); switch (whence) { case 0: new = off; @@ -41,7 +41,7 @@ proc_bus_pci_lseek(struct file *file, loff_t off, int whence) new = -EINVAL; else file->f_pos = new; - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return new; } diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 4ddc453023a2..3cf945cc5b9a 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -184,13 +184,13 @@ static void update_bus(struct dentry *bus) bus->d_inode->i_gid = busgid; bus->d_inode->i_mode = S_IFDIR | busmode; - down(&bus->d_inode->i_sem); + mutex_lock(&bus->d_inode->i_mutex); list_for_each_entry(dev, &bus->d_subdirs, d_u.d_child) if (dev->d_inode) update_dev(dev); - up(&bus->d_inode->i_sem); + mutex_unlock(&bus->d_inode->i_mutex); } static void update_sb(struct super_block *sb) @@ -201,7 +201,7 @@ static void update_sb(struct super_block *sb) if (!root) return; - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) { if (bus->d_inode) { @@ -219,7 +219,7 @@ static void update_sb(struct super_block *sb) } } - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); } static int remount(struct super_block *sb, int *flags, char *data) @@ -333,10 +333,10 @@ static int usbfs_empty (struct dentry *dentry) static int usbfs_unlink (struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); dentry->d_inode->i_nlink--; dput(dentry); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); d_delete(dentry); return 0; } @@ -346,7 +346,7 @@ static int usbfs_rmdir(struct inode *dir, struct dentry *dentry) int error = -ENOTEMPTY; struct inode * inode = dentry->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); dentry_unhash(dentry); if (usbfs_empty(dentry)) { dentry->d_inode->i_nlink -= 2; @@ 
-355,7 +355,7 @@ static int usbfs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_nlink--; error = 0; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (!error) d_delete(dentry); dput(dentry); @@ -380,7 +380,7 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig) { loff_t retval = -EINVAL; - down(&file->f_dentry->d_inode->i_sem); + mutex_lock(&file->f_dentry->d_inode->i_mutex); switch(orig) { case 0: if (offset > 0) { @@ -397,7 +397,7 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig) default: break; } - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return retval; } @@ -480,7 +480,7 @@ static int fs_create_by_name (const char *name, mode_t mode, } *dentry = NULL; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) @@ -489,7 +489,7 @@ static int fs_create_by_name (const char *name, mode_t mode, error = usbfs_create (parent->d_inode, *dentry, mode); } else error = PTR_ERR(dentry); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); return error; } @@ -528,7 +528,7 @@ static void fs_remove_file (struct dentry *dentry) if (!parent || !parent->d_inode) return; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); if (usbfs_positive(dentry)) { if (dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode)) @@ -538,7 +538,7 @@ static void fs_remove_file (struct dentry *dentry) dput(dentry); } } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); } /* --------------------------------------------------------------------- */ diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index 0cea9782d7d4..de59c58896d6 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -1891,7 +1891,7 @@ static int fsync_sub(struct lun *curlun) return -EINVAL; inode = filp->f_dentry->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); current->flags |= PF_SYNCWRITE; rc = filemap_fdatawrite(inode->i_mapping); err = filp->f_op->fsync(filp, filp->f_dentry, 1); @@ -1901,7 +1901,7 @@ static int fsync_sub(struct lun *curlun) if (!rc) rc = err; current->flags &= ~PF_SYNCWRITE; - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); VLDBG(curlun, "fdatasync -> %d\n", rc); return rc; } diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index 5c40980a5bd9..c6c279de832e 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -1562,10 +1562,10 @@ restart: spin_unlock_irq (&dev->lock); /* break link to dcache */ - down (&parent->i_sem); + mutex_lock (&parent->i_mutex); d_delete (dentry); dput (dentry); - up (&parent->i_sem); + mutex_unlock (&parent->i_mutex); /* fds may still be open */ goto restart; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 9ebe881c6786..44d439cb69f4 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -244,10 +244,10 @@ affs_put_inode(struct inode *inode) pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); affs_free_prealloc(inode); if (atomic_read(&inode->i_count) == 1) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index a1ab1c0ed215..808134a5a2fa 100644 --- a/fs/autofs/root.c +++ 
b/fs/autofs/root.c @@ -229,9 +229,9 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr dentry->d_flags |= DCACHE_AUTOFS_PENDING; d_add(dentry, NULL); - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); autofs_revalidate(dentry, nd); - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); /* * If we are still pending, check if we had to handle diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2241405ffc41..541b19e6fec9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -489,9 +489,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s d_add(dentry, NULL); if (dentry->d_op && dentry->d_op->d_revalidate) { - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); (dentry->d_op->d_revalidate)(dentry, nd); - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); } /* diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 2568eb41cb3a..9ccc7d8275b8 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -588,11 +588,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 2: set_bit(Enabled, &e->flags); break; case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); kill_node(e); - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); dput(root); break; default: return res; @@ -622,7 +622,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -658,7 +658,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); dput(root); if (err) { @@ -703,12 +703,12 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, case 1: enabled = 0; break; case 2: enabled = 1; break; case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); dput(root); default: return res; } diff --git a/fs/block_dev.c b/fs/block_dev.c index e0df94c37b7e..6e50346fb1ee 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -202,7 +202,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) loff_t size; loff_t retval; - down(&bd_inode->i_sem); + mutex_lock(&bd_inode->i_mutex); size = i_size_read(bd_inode); switch (origin) { @@ -219,7 +219,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) } retval = offset; } - up(&bd_inode->i_sem); + mutex_unlock(&bd_inode->i_mutex); return retval; } diff --git a/fs/buffer.c b/fs/buffer.c index 55f0975a9b15..6466bc8a3dc7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -352,11 +352,11 @@ static long do_fsync(unsigned int fd, int datasync) * We need to protect against concurrent writers, * which could cause livelocks in fsync_buffers_list */ - down(&mapping->host->i_sem); + mutex_lock(&mapping->host->i_mutex); err = file->f_op->fsync(file, file->f_dentry, datasync); if (!ret) ret = err; - up(&mapping->host->i_sem); + mutex_unlock(&mapping->host->i_mutex); err = filemap_fdatawait(mapping); if (!ret) ret = err; @@ -2338,7 +2338,7 @@ int generic_commit_write(struct file *file, struct page *page, 
__block_commit_write(inode,page,from,to); /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_sem. + * cannot change under us because we hold i_mutex. */ if (pos > inode->i_size) { i_size_write(inode, pos); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2a13a2bac8f1..e10213b7541e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -860,9 +860,9 @@ static int cifs_oplock_thread(void * dummyarg) DeleteOplockQEntry(oplock_item); /* can not grab inode sem here since it would deadlock when oplock received on delete - since vfs_unlink holds the i_sem across + since vfs_unlink holds the i_mutex across the call */ - /* down(&inode->i_sem);*/ + /* mutex_lock(&inode->i_mutex);*/ if (S_ISREG(inode->i_mode)) { rc = filemap_fdatawrite(inode->i_mapping); if(CIFS_I(inode)->clientCanCacheRead == 0) { @@ -871,7 +871,7 @@ static int cifs_oplock_thread(void * dummyarg) } } else rc = 0; - /* up(&inode->i_sem);*/ + /* mutex_unlock(&inode->i_mutex);*/ if (rc) CIFS_I(inode)->write_behind_rc = rc; cFYI(1,("Oplock flush inode %p rc %d",inode,rc)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9558f51bca55..3ebce9430f4a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1040,9 +1040,9 @@ int cifs_revalidate(struct dentry *direntry) } /* can not grab this sem since kernel filesys locking documentation - indicates i_sem may be taken by the kernel on lookup and rename - which could deadlock if we grab the i_sem here as well */ -/* down(&direntry->d_inode->i_sem);*/ + indicates i_mutex may be taken by the kernel on lookup and rename + which could deadlock if we grab the i_mutex here as well */ +/* mutex_lock(&direntry->d_inode->i_mutex);*/ /* need to write out dirty pages here */ if (direntry->d_inode->i_mapping) { /* do we need to lock inode until after invalidate completes @@ -1066,7 +1066,7 @@ int cifs_revalidate(struct dentry *direntry) } } } -/* up(&direntry->d_inode->i_sem); */ +/* mutex_unlock(&direntry->d_inode->i_mutex); */ kfree(full_path); FreeXid(xid); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 2391766e9c7c..8f1a517f8b4e 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -453,7 +453,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) coda_vfs_stat.readdir++; host_inode = host_file->f_dentry->d_inode; - down(&host_inode->i_sem); + mutex_lock(&host_inode->i_mutex); host_file->f_pos = coda_file->f_pos; if (!host_file->f_op->readdir) { @@ -475,7 +475,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) } out: coda_file->f_pos = host_file->f_pos; - up(&host_inode->i_sem); + mutex_unlock(&host_inode->i_mutex); return ret; } diff --git a/fs/coda/file.c b/fs/coda/file.c index e6bc022568f3..30b4630bd735 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -77,14 +77,14 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo return -EINVAL; host_inode = host_file->f_dentry->d_inode; - down(&coda_inode->i_sem); + mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); coda_inode->i_size = host_inode->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - up(&coda_inode->i_sem); + mutex_unlock(&coda_inode->i_mutex); return ret; } @@ -272,9 +272,9 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) if (host_file->f_op && host_file->f_op->fsync) { host_dentry = host_file->f_dentry; host_inode = host_dentry->d_inode; - 
down(&host_inode->i_sem); + mutex_lock(&host_inode->i_mutex); err = host_file->f_op->fsync(host_file, host_dentry, datasync); - up(&host_inode->i_sem); + mutex_unlock(&host_inode->i_mutex); } if ( !err && !datasync ) { diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index e48b539243a1..b668ec61527e 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -288,10 +288,10 @@ static struct dentry * configfs_lookup(struct inode *dir, /* * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are - * attributes and are removed by rmdir(). We recurse, taking i_sem + * attributes and are removed by rmdir(). We recurse, taking i_mutex * on all children that are candidates for default detach. If the * result is clean, then configfs_detach_group() will handle dropping - * i_sem. If there is an error, the caller will clean up the i_sem + * i_mutex. If there is an error, the caller will clean up the i_mutex * holders via configfs_detach_rollback(). */ static int configfs_detach_prep(struct dentry *dentry) @@ -309,8 +309,8 @@ static int configfs_detach_prep(struct dentry *dentry) if (sd->s_type & CONFIGFS_NOT_PINNED) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { - down(&sd->s_dentry->d_inode->i_sem); - /* Mark that we've taken i_sem */ + mutex_lock(&sd->s_dentry->d_inode->i_mutex); + /* Mark that we've taken i_mutex */ sd->s_type |= CONFIGFS_USET_DROPPING; ret = configfs_detach_prep(sd->s_dentry); @@ -327,7 +327,7 @@ out: } /* - * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is + * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is * set. */ static void configfs_detach_rollback(struct dentry *dentry) @@ -341,7 +341,7 @@ static void configfs_detach_rollback(struct dentry *dentry) if (sd->s_type & CONFIGFS_USET_DROPPING) { sd->s_type &= ~CONFIGFS_USET_DROPPING; - up(&sd->s_dentry->d_inode->i_sem); + mutex_unlock(&sd->s_dentry->d_inode->i_mutex); } } } @@ -424,11 +424,11 @@ static void detach_groups(struct config_group *group) /* * From rmdir/unregister, a configfs_detach_prep() pass - * has taken our i_sem for us. Drop it. + * has taken our i_mutex for us. Drop it. * From mkdir/register cleanup, there is no sem held. */ if (sd->s_type & CONFIGFS_USET_DROPPING) - up(&child->d_inode->i_sem); + mutex_unlock(&child->d_inode->i_mutex); d_delete(child); dput(child); @@ -493,11 +493,11 @@ static int populate_groups(struct config_group *group) /* FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's - * i_sem, so afaik lookup cannot continue through our + * i_mutex, so afaik lookup cannot continue through our * parent to find us, let alone mess with our tree. - * That said, taking our i_sem is closer to mkdir + * That said, taking our i_mutex is closer to mkdir * emulation, and shouldn't hurt. 
*/ - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); for (i = 0; group->default_groups[i]; i++) { new_group = group->default_groups[i]; @@ -507,7 +507,7 @@ static int populate_groups(struct config_group *group) break; } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); } if (ret) @@ -856,7 +856,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -872,7 +872,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); up_write(&configfs_rename_sem); return error; @@ -884,9 +884,9 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct dentry * dentry = file->f_dentry; struct configfs_dirent * parent_sd = dentry->d_fsdata; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); file->private_data = configfs_new_dirent(parent_sd, NULL); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return file->private_data ? 0 : -ENOMEM; @@ -897,9 +897,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_dentry; struct configfs_dirent * cursor = file->private_data; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); list_del_init(&cursor->s_sibling); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); release_configfs_dirent(cursor); @@ -975,7 +975,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) { struct dentry * dentry = file->f_dentry; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -983,7 +983,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) if (offset >= 0) break; default: - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -1007,7 +1007,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) list_add_tail(&cursor->s_sibling, p); } } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return offset; } @@ -1037,7 +1037,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = configfs_sb->s_root->d_fsdata; link_group(to_config_group(sd->s_element), group); - down(&configfs_sb->s_root->d_inode->i_sem); + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); name.name = group->cg_item.ci_name; name.len = strlen(name.name); @@ -1057,7 +1057,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) else d_delete(dentry); - up(&configfs_sb->s_root->d_inode->i_sem); + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); if (dentry) { dput(dentry); @@ -1079,18 +1079,18 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - down(&configfs_sb->s_root->d_inode->i_sem); - down(&dentry->d_inode->i_sem); + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + mutex_lock(&dentry->d_inode->i_mutex); if (configfs_detach_prep(dentry)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } configfs_detach_group(&group->cg_item); dentry->d_inode->i_flags |= S_DEAD; 
- up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); d_delete(dentry); - up(&configfs_sb->s_root->d_inode->i_sem); + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index af1ffc9a15c0..c26cd61f13af 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -336,9 +336,9 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 6b274c6d428f..6577c588de9d 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -122,7 +122,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) /* * Unhashes the dentry corresponding to given configfs_dirent - * Called with parent inode's i_sem held. + * Called with parent inode's i_mutex held. */ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) { @@ -145,7 +145,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -156,7 +156,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a86ac4aeaedb..d4f1a2cddd47 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -146,7 +146,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, } *dentry = NULL; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) @@ -155,7 +155,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, error = debugfs_create(parent->d_inode, *dentry, mode); } else error = PTR_ERR(dentry); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); return error; } @@ -273,7 +273,7 @@ void debugfs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); if (debugfs_positive(dentry)) { if (dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode)) @@ -283,7 +283,7 @@ void debugfs_remove(struct dentry *dentry) dput(dentry); } } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 1274422a5384..b621521e09d4 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2162,27 +2162,27 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) * * make sure that * d_instantiate always runs under lock - * we release i_sem lock before going to sleep + * we release i_mutex lock before going to sleep * * unfortunately sometimes d_revalidate is called with - * and sometimes without i_sem lock held. The following checks + * and sometimes without i_mutex lock held. 
The following checks * attempt to deduce when we need to add (and drop resp.) lock * here. This relies on current (2.6.2) calling coventions: * - * lookup_hash is always run under i_sem and is passing NULL + * lookup_hash is always run under i_mutex and is passing NULL * as nd * - * open(...,O_CREATE,...) calls _lookup_hash under i_sem + * open(...,O_CREATE,...) calls _lookup_hash under i_mutex * and sets flags to LOOKUP_OPEN|LOOKUP_CREATE * * all other invocations of ->d_revalidate seem to happen - * outside of i_sem + * outside of i_mutex */ need_lock = nd && (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT)); if (need_lock) - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); if (is_devfsd_or_child(fs_info)) { devfs_handle_t de = lookup_info->de; @@ -2221,9 +2221,9 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) add_wait_queue(&lookup_info->wait_queue, &wait); read_unlock(&parent->u.dir.lock); /* at this point it is always (hopefully) locked */ - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); schedule(); - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); /* * This does not need nor should remove wait from wait_queue. * Wait queue head is never reused - nothing is ever added to it @@ -2238,7 +2238,7 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) out: if (need_lock) - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); return 1; } /* End Function devfs_d_revalidate_wait */ @@ -2284,9 +2284,9 @@ static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry, /* Unlock directory semaphore, which will release any waiters. They will get the hashed dentry, and may be forced to wait for revalidation */ - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); wait_for_devfsd_finished(fs_info); /* If I'm not devfsd, must wait */ - down(&dir->i_sem); /* Grab it again because them's the rules */ + mutex_lock(&dir->i_mutex); /* Grab it again because them's the rules */ de = lookup_info.de; /* If someone else has been so kind as to make the inode, we go home early */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index f2be44d4491f..bfb8a230bac9 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -130,7 +130,7 @@ static struct dentry *get_node(int num) { char s[12]; struct dentry *root = devpts_root; - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); return lookup_one_len(s, root, sprintf(s, "%d", num)); } @@ -161,7 +161,7 @@ int devpts_pty_new(struct tty_struct *tty) if (!IS_ERR(dentry) && !dentry->d_inode) d_instantiate(dentry, inode); - up(&devpts_root->d_inode->i_sem); + mutex_unlock(&devpts_root->d_inode->i_mutex); return 0; } @@ -178,7 +178,7 @@ struct tty_struct *devpts_get_tty(int number) dput(dentry); } - up(&devpts_root->d_inode->i_sem); + mutex_unlock(&devpts_root->d_inode->i_mutex); return tty; } @@ -196,7 +196,7 @@ void devpts_pty_kill(int number) } dput(dentry); } - up(&devpts_root->d_inode->i_sem); + mutex_unlock(&devpts_root->d_inode->i_mutex); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 3931e7f1e6bf..30dbbd1df511 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -56,7 +56,7 @@ * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. * This determines whether we need to do the fancy locking which prevents * direct-IO from being able to read uninitialised disk blocks. 
If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_sem is + * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is * not held for the entire direct write (taken briefly, initially, during a * direct read though, but its never held for the duration of a direct-IO). */ @@ -930,7 +930,7 @@ out: } /* - * Releases both i_sem and i_alloc_sem + * Releases both i_mutex and i_alloc_sem */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, @@ -1062,11 +1062,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, /* * All block lookups have been performed. For READ requests - * we can let i_sem go now that its achieved its purpose + * we can let i_mutex go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) - up(&dio->inode->i_sem); + mutex_unlock(&dio->inode->i_mutex); /* * OK, all BIOs are submitted, so we can decrement bio_count to truly @@ -1145,18 +1145,18 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * The locking rules are governed by the dio_lock_type parameter. * * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_sem is not held on entry; it is never taken. + * For writes, i_mutex is not held on entry; it is never taken. * * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_sem and return with i_sem held, even though + * For writes we are called under i_mutex and return with i_mutex held, even though * it is internally dropped. - * For reads, i_sem is not held on entry, but it is taken and dropped before + * For reads, i_mutex is not held on entry, but it is taken and dropped before * returning. * * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_sem, return without it, never touch it. - * For reads, i_sem is held on entry and will be released before returning. + * For writes we are called without i_mutex, return without it, never touch it. + * For reads, i_mutex is held on entry and will be released before returning. * * Additional i_alloc_sem locking requirements described inline below. 
*/ @@ -1214,11 +1214,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * For block device access DIO_NO_LOCKING is used, * neither readers nor writers do any locking at all * For regular files using DIO_LOCKING, - * readers need to grab i_sem and i_alloc_sem - * writers need to grab i_alloc_sem only (i_sem is already held) + * readers need to grab i_mutex and i_alloc_sem + * writers need to grab i_alloc_sem only (i_mutex is already held) * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here - * (i_sem is already held and release for writers here) + * (i_mutex is already held and release for writers here) */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1228,7 +1228,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, mapping = iocb->ki_filp->f_mapping; if (dio_lock_type != DIO_OWN_LOCKING) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); reader_with_isem = 1; } @@ -1240,7 +1240,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } if (dio_lock_type == DIO_OWN_LOCKING) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reader_with_isem = 0; } } @@ -1266,7 +1266,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: if (reader_with_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (rw & WRITE) current->flags &= ~PF_SYNCWRITE; return retval; diff --git a/fs/dquot.c b/fs/dquot.c index 2a62b3dc20ec..cb6d5bfbdfd5 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -100,7 +100,7 @@ * operation is just reading pointers from inode (or not using them at all) the * read lock is enough. If pointers are altered function must hold write lock * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_sem is also needed). If operation is holding + * for altering the flag i_mutex is also needed). If operation is holding * reference to dquot in other way (e.g. quotactl ops) it must be guarded by * dqonoff_sem. * This locking assures that: @@ -117,9 +117,9 @@ * spinlock to internal buffers before writing. 
* * Lock ordering (including related VFS locks) is the following: - * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > + * i_mutex > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > * > dquot->dq_lock > dqio_sem - * i_sem on quota files is special (it's below dqio_sem) + * i_mutex on quota files is special (it's below dqio_sem) */ static DEFINE_SPINLOCK(dq_list_lock); @@ -1369,11 +1369,11 @@ int vfs_quota_off(struct super_block *sb, int type) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_enabled(sb, cnt)) { - down(&toputinode[cnt]->i_sem); + mutex_lock(&toputinode[cnt]->i_mutex); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - up(&toputinode[cnt]->i_sem); + mutex_unlock(&toputinode[cnt]->i_mutex); mark_inode_dirty(toputinode[cnt]); iput(toputinode[cnt]); } @@ -1417,7 +1417,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) write_inode_now(inode, 1); /* And now flush the block cache so that kernel sees the changes */ invalidate_bdev(sb->s_bdev, 0); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); down(&dqopt->dqonoff_sem); if (sb_has_quota_enabled(sb, type)) { error = -EBUSY; @@ -1449,7 +1449,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) goto out_file_init; } up(&dqopt->dqio_sem); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); set_enable_flags(dqopt, type); add_dquot_ref(sb, type); @@ -1470,7 +1470,7 @@ out_lock: inode->i_flags |= oldflags; up_write(&dqopt->dqptr_sem); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out_fmt: put_quota_format(fmt); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c49d6254379a..5bfe40085fbc 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -177,9 +177,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct dentry *ppd; struct dentry *npd; - down(&pd->d_inode->i_sem); + mutex_lock(&pd->d_inode->i_mutex); ppd = CALL(nops,get_parent)(pd); - up(&pd->d_inode->i_sem); + mutex_unlock(&pd->d_inode->i_mutex); if (IS_ERR(ppd)) { err = PTR_ERR(ppd); @@ -201,9 +201,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, break; } dprintk("find_exported_dentry: found name: %s\n", nbuf); - down(&ppd->d_inode->i_sem); + mutex_lock(&ppd->d_inode->i_mutex); npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); - up(&ppd->d_inode->i_sem); + mutex_unlock(&ppd->d_inode->i_mutex); if (IS_ERR(npd)) { err = PTR_ERR(npd); dprintk("find_exported_dentry: lookup failed: %d\n", err); @@ -242,9 +242,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct dentry *nresult; err = CALL(nops,get_name)(target_dir, nbuf, result); if (!err) { - down(&target_dir->d_inode->i_sem); + mutex_lock(&target_dir->d_inode->i_mutex); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - up(&target_dir->d_inode->i_sem); + mutex_unlock(&target_dir->d_inode->i_mutex); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 6af2f4130290..239133d01d91 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -149,7 +149,7 @@ ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl, } /* - * inode->i_sem: don't care + * inode->i_mutex: don't care */ static struct posix_acl * ext2_get_acl(struct inode *inode, int type) @@ -211,7 +211,7 @@ ext2_get_acl(struct inode *inode, int type) } /* - * inode->i_sem: down + * inode->i_mutex: down */ static int 
ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) @@ -301,8 +301,8 @@ ext2_permission(struct inode *inode, int mask, struct nameidata *nd) /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * - * dir->i_sem: down - * inode->i_sem: up (access to inode is still exclusive) + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ int ext2_init_acl(struct inode *inode, struct inode *dir) @@ -361,7 +361,7 @@ cleanup: * for directories) are added. There are no more bits available in the * file mode. * - * inode->i_sem: down + * inode->i_mutex: down */ int ext2_acl_chmod(struct inode *inode) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e977f8566d14..00de0a7312a2 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -53,7 +53,7 @@ struct ext2_inode_info { #ifdef CONFIG_EXT2_FS_XATTR /* * Extended attributes can be read independently of the main file - * data. Taking i_sem even when reading would cause contention + * data. Taking i_mutex even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 522fa70dd8ea..8d6819846fc9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1152,7 +1152,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -1189,7 +1189,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 0099462d4271..f7a3b5fee274 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -325,7 +325,7 @@ cleanup: /* * Inode operation listxattr() * - * dentry->d_inode->i_sem: don't care + * dentry->d_inode->i_mutex: don't care */ ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 3ac38266fc9e..9ed132c96034 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -152,7 +152,7 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, /* * Inode operation get_posix_acl(). * - * inode->i_sem: don't care + * inode->i_mutex: don't care */ static struct posix_acl * ext3_get_acl(struct inode *inode, int type) @@ -216,7 +216,7 @@ ext3_get_acl(struct inode *inode, int type) /* * Set the access or default ACL of an inode. * - * inode->i_sem: down unless called from ext3_new_inode + * inode->i_mutex: down unless called from ext3_new_inode */ static int ext3_set_acl(handle_t *handle, struct inode *inode, int type, @@ -306,8 +306,8 @@ ext3_permission(struct inode *inode, int mask, struct nameidata *nd) /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * - * dir->i_sem: down - * inode->i_sem: up (access to inode is still exclusive) + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) @@ -368,7 +368,7 @@ cleanup: * for directories) are added. There are no more bits available in the * file mode. 
* - * inode->i_sem: down + * inode->i_mutex: down */ int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7c45acf94589..c3dbebdb9897 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2601,7 +2601,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2644,7 +2644,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 430de9f63be3..238199d82ce5 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -140,7 +140,7 @@ ext3_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_sem: don't care + * dentry->d_inode->i_mutex: don't care */ ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index eef1b81aa294..db0de5c621c7 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -729,13 +729,13 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp, buf.dirent = d1; buf.result = 0; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = -ENOENT; if (!IS_DEADDIR(inode)) { ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir, short_only, both); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/file.c b/fs/fat/file.c index 9b07c328a6fc..d30876cf35f5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -41,7 +41,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, if (err) return err; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (IS_RDONLY(inode)) { err = -EROFS; @@ -103,7 +103,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED; mark_inode_dirty(inode); up: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return err; } default: diff --git a/fs/fifo.c b/fs/fifo.c index 5455916241f0..923371b753ab 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -35,7 +35,7 @@ static int fifo_open(struct inode *inode, struct file *filp) int ret; ret = -ERESTARTSYS; - if (down_interruptible(PIPE_SEM(*inode))) + if (mutex_lock_interruptible(PIPE_MUTEX(*inode))) goto err_nolock_nocleanup; if (!inode->i_pipe) { @@ -119,7 +119,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! 
*/ - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; err_rd: @@ -139,7 +139,7 @@ err: free_pipe_info(inode); err_nocleanup: - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); err_nolock_nocleanup: return ret; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 05dedddf4289..63d2980df5c9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -560,9 +560,9 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, struct inode *inode = file->f_dentry->d_inode; ssize_t res; /* Don't allow parallel writes to the same file */ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); res = fuse_direct_io(file, buf, count, ppos, 1); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d499393a8ae7..050a49276499 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -547,13 +547,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (atomic_read(&file->f_count) != 0) return 0; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index c7d316455fa0..9fb51632303c 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -29,7 +29,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma return size; dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); - down(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; page = read_cache_page(mapping, offset / PAGE_CACHE_BITS, (filler_t *)mapping->a_ops->readpage, NULL); @@ -143,7 +143,7 @@ done: sb->s_dirt = 1; dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); out: - up(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); return start; } @@ -164,7 +164,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if ((offset + count) > HFSPLUS_SB(sb).total_blocks) return -2; - down(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; pnr = offset / PAGE_CACHE_BITS; page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL); @@ -215,7 +215,7 @@ out: kunmap(page); HFSPLUS_SB(sb).free_blocks += len; sb->s_dirt = 1; - up(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); return 0; } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index fc98583cf045..983bcd02ac1c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -276,13 +276,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (atomic_read(&file->f_count) != 0) return 0; if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); hfsplus_delete_inode(inode); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 0217c3a04441..5591f9623aa2 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -32,19 +32,19 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) 
/*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - down(&i->i_sem); + mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - up(&i->i_sem); + mutex_unlock(&i->i_mutex); ok: unlock_kernel(); return filp->f_pos = new_off; fail: - up(&i->i_sem); + mutex_unlock(&i->i_mutex); /*printk("illegal lseek: %016llx\n", new_off);*/ unlock_kernel(); return -ESPIPE; diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 52930915bad8..a44dc5897399 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -171,12 +171,12 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, err = -ENOMEM; parent = HPPFS_I(ino)->proc_dentry; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); proc_dentry = d_lookup(parent, &dentry->d_name); if(proc_dentry == NULL){ proc_dentry = d_alloc(parent, &dentry->d_name); if(proc_dentry == NULL){ - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); goto out; } new = (*parent->d_inode->i_op->lookup)(parent->d_inode, @@ -186,7 +186,7 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, proc_dentry = new; } } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); if(IS_ERR(proc_dentry)) return(proc_dentry); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c41315a6e42..ff1b7d108bd0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -118,7 +118,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); file_accessed(file); vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; @@ -133,7 +133,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (inode->i_size < len) inode->i_size = len; out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/inode.c b/fs/inode.c index fd568caf7f74..e08767fd57b0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -192,7 +192,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); - sema_init(&inode->i_sem, 1); + mutex_init(&inode->i_mutex); init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); rwlock_init(&inode->i_data.tree_lock); diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 2559ee10beda..fc3855a1aef3 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -1415,7 +1415,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count, * This will never trigger with sane page sizes. leave it in * anyway, since I'm thinking about how to merge larger writes * (the current idea is to poke a thread that does the actual - * I/O and starts by doing a down(&inode->i_sem). then we + * I/O and starts by doing a mutex_lock(&inode->i_mutex). then we * would need to get the page cache pages and have a list of * I/O requests and do write-merging here. 
* -- prumpf diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index c0fd7b3eadc6..dc21a5bd54d4 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -58,7 +58,7 @@ struct jfs_inode_info { /* * rdwrlock serializes xtree between reads & writes and synchronizes * changes to special inodes. Its use would be redundant on - * directories since the i_sem taken in the VFS is sufficient. + * directories since the i_mutex taken in the VFS is sufficient. */ struct rw_semaphore rdwrlock; /* @@ -68,7 +68,7 @@ struct jfs_inode_info { * inode is blocked in txBegin or TxBeginAnon */ struct semaphore commit_sem; - /* xattr_sem allows us to access the xattrs without taking i_sem */ + /* xattr_sem allows us to access the xattrs without taking i_mutex */ struct rw_semaphore xattr_sem; lid_t xtlid; /* lid of xtree lock on directory */ #ifdef CONFIG_JFS_POSIX_ACL diff --git a/fs/libfs.c b/fs/libfs.c index 9c50523382e7..63c020e6589e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -74,7 +74,7 @@ int dcache_dir_close(struct inode *inode, struct file *file) loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) { - down(&file->f_dentry->d_inode->i_sem); + mutex_lock(&file->f_dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -82,7 +82,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) if (offset >= 0) break; default: - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -106,7 +106,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) spin_unlock(&dcache_lock); } } - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return offset; } @@ -356,7 +356,7 @@ int simple_commit_write(struct file *file, struct page *page, /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_sem. + * cannot change under us because we hold the i_mutex. */ if (pos > inode->i_size) i_size_write(inode, pos); diff --git a/fs/namei.c b/fs/namei.c index 300eae088d5f..0a8f073435af 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -438,7 +438,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s struct dentry * result; struct inode *dir = parent->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created * while we waited for the directory semaphore.. @@ -464,7 +464,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s else result = dentry; } - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); return result; } @@ -472,7 +472,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s * Uhhuh! Nasty case: the cache was re-populated while * we waited on the semaphore. Need to revalidate.
*/ - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); if (result->d_op && result->d_op->d_revalidate) { if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { dput(result); @@ -1366,7 +1366,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - down(&p1->d_inode->i_sem); + mutex_lock(&p1->d_inode->i_mutex); return NULL; } @@ -1374,30 +1374,30 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) for (p = p1; p->d_parent != p; p = p->d_parent) { if (p->d_parent == p2) { - down(&p2->d_inode->i_sem); - down(&p1->d_inode->i_sem); + mutex_lock(&p2->d_inode->i_mutex); + mutex_lock(&p1->d_inode->i_mutex); return p; } } for (p = p2; p->d_parent != p; p = p->d_parent) { if (p->d_parent == p1) { - down(&p1->d_inode->i_sem); - down(&p2->d_inode->i_sem); + mutex_lock(&p1->d_inode->i_mutex); + mutex_lock(&p2->d_inode->i_mutex); return p; } } - down(&p1->d_inode->i_sem); - down(&p2->d_inode->i_sem); + mutex_lock(&p1->d_inode->i_mutex); + mutex_lock(&p2->d_inode->i_mutex); return NULL; } void unlock_rename(struct dentry *p1, struct dentry *p2) { - up(&p1->d_inode->i_sem); + mutex_unlock(&p1->d_inode->i_mutex); if (p1 != p2) { - up(&p2->d_inode->i_sem); + mutex_unlock(&p2->d_inode->i_mutex); up(&p1->d_inode->i_sb->s_vfs_rename_sem); } } @@ -1563,14 +1563,14 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(nd); path.mnt = nd->mnt; do_last: error = PTR_ERR(path.dentry); if (IS_ERR(path.dentry)) { - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); goto exit; } @@ -1579,7 +1579,7 @@ do_last: if (!IS_POSIXACL(dir->d_inode)) mode &= ~current->fs->umask; error = vfs_create(dir->d_inode, path.dentry, mode, nd); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); dput(nd->dentry); nd->dentry = path.dentry; if (error) @@ -1593,7 +1593,7 @@ do_last: /* * It already exists. */ - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); error = -EEXIST; if (flag & O_EXCL) @@ -1665,7 +1665,7 @@ do_link: goto exit; } dir = nd->dentry; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(nd); path.mnt = nd->mnt; __putname(nd->last.name); @@ -1680,13 +1680,13 @@ do_link: * Simple function to lookup and return a dentry and create it * if it doesn't exist. Is SMP-safe. * - * Returns with nd->dentry->d_inode->i_sem locked. + * Returns with nd->dentry->d_inode->i_mutex locked. */ struct dentry *lookup_create(struct nameidata *nd, int is_dir) { struct dentry *dentry = ERR_PTR(-EEXIST); - down(&nd->dentry->d_inode->i_sem); + mutex_lock(&nd->dentry->d_inode->i_mutex); /* * Yucky last component or no last component at all? 
* (foo/., foo/.., /////) @@ -1784,7 +1784,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev) } dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out: putname(tmp); @@ -1836,7 +1836,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode) error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out: putname(tmp); @@ -1885,7 +1885,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) DQUOT_INIT(dir); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); dentry_unhash(dentry); if (d_mountpoint(dentry)) error = -EBUSY; @@ -1897,7 +1897,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_flags |= S_DEAD; } } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); if (!error) { d_delete(dentry); } @@ -1932,14 +1932,14 @@ asmlinkage long sys_rmdir(const char __user * pathname) error = -EBUSY; goto exit1; } - down(&nd.dentry->d_inode->i_sem); + mutex_lock(&nd.dentry->d_inode->i_mutex); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); exit1: path_release(&nd); exit: @@ -1959,7 +1959,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) DQUOT_INIT(dir); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; else { @@ -1967,7 +1967,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!error) error = dir->i_op->unlink(dir, dentry); } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -1979,7 +1979,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_sem. Truncate can take a long time if there is a lot of + * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. 
*/ @@ -2001,7 +2001,7 @@ asmlinkage long sys_unlink(const char __user * pathname) error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; - down(&nd.dentry->d_inode->i_sem); + mutex_lock(&nd.dentry->d_inode->i_mutex); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -2015,7 +2015,7 @@ asmlinkage long sys_unlink(const char __user * pathname) exit2: dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ exit1: @@ -2075,7 +2075,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO); dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out: putname(to); @@ -2113,10 +2113,10 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - down(&old_dentry->d_inode->i_sem); + mutex_lock(&old_dentry->d_inode->i_mutex); DQUOT_INIT(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); - up(&old_dentry->d_inode->i_sem); + mutex_unlock(&old_dentry->d_inode->i_mutex); if (!error) fsnotify_create(dir, new_dentry->d_name.name); return error; @@ -2157,7 +2157,7 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); dput(new_dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); out_release: path_release(&nd); out: @@ -2178,7 +2178,7 @@ exit: * sb->s_vfs_rename_sem. We might be more accurate, but that's another * story. * c) we have to lock _three_ objects - parents and victim (if it exists). - * And that - after we got ->i_sem on parents (until then we don't know + * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_sem _and_ that parent of the object we @@ -2195,9 +2195,9 @@ exit: * stuff into VFS), but the former is not going away. Solution: the same * trick as in rmdir(). * e) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_sem + * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_sem on parents, which works but leads to some truely excessive + * ->i_mutex on parents, which works but leads to some truly excessive * locking].
*/ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -2222,7 +2222,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, target = new_dentry->d_inode; if (target) { - down(&target->i_sem); + mutex_lock(&target->i_mutex); dentry_unhash(new_dentry); } if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) @@ -2232,7 +2232,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (target) { if (!error) target->i_flags |= S_DEAD; - up(&target->i_sem); + mutex_unlock(&target->i_mutex); if (d_unhashed(new_dentry)) d_rehash(new_dentry); dput(new_dentry); @@ -2255,7 +2255,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, dget(new_dentry); target = new_dentry->d_inode; if (target) - down(&target->i_sem); + mutex_lock(&target->i_mutex); if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) error = -EBUSY; else @@ -2266,7 +2266,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, d_move(old_dentry, new_dentry); } if (target) - up(&target->i_sem); + mutex_unlock(&target->i_mutex); dput(new_dentry); return error; } diff --git a/fs/namespace.c b/fs/namespace.c index 3e8fb61ad597..f0e353f5bc30 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -814,7 +814,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) return -ENOTDIR; err = -ENOENT; - down(&nd->dentry->d_inode->i_sem); + mutex_lock(&nd->dentry->d_inode->i_mutex); if (IS_DEADDIR(nd->dentry->d_inode)) goto out_unlock; @@ -826,7 +826,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) err = attach_recursive_mnt(mnt, nd, NULL); out_unlock: - up(&nd->dentry->d_inode->i_sem); + mutex_unlock(&nd->dentry->d_inode->i_mutex); if (!err) security_sb_post_addmount(mnt, nd); return err; @@ -962,7 +962,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name) goto out; err = -ENOENT; - down(&nd->dentry->d_inode->i_sem); + mutex_lock(&nd->dentry->d_inode->i_mutex); if (IS_DEADDIR(nd->dentry->d_inode)) goto out1; @@ -1004,7 +1004,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name) list_del_init(&old_nd.mnt->mnt_expire); spin_unlock(&vfsmount_lock); out1: - up(&nd->dentry->d_inode->i_sem); + mutex_unlock(&nd->dentry->d_inode->i_mutex); out: up_write(&namespace_sem); if (!err) @@ -1573,7 +1573,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, user_nd.dentry = dget(current->fs->root); read_unlock(&current->fs->lock); down_write(&namespace_sem); - down(&old_nd.dentry->d_inode->i_sem); + mutex_lock(&old_nd.dentry->d_inode->i_mutex); error = -EINVAL; if (IS_MNT_SHARED(old_nd.mnt) || IS_MNT_SHARED(new_nd.mnt->mnt_parent) || @@ -1626,7 +1626,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, path_release(&root_parent); path_release(&parent_nd); out2: - up(&old_nd.dentry->d_inode->i_sem); + mutex_unlock(&old_nd.dentry->d_inode->i_mutex); up_write(&namespace_sem); path_release(&user_nd); path_release(&old_nd); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e9255198f767..a1554bead692 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -194,7 +194,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) spin_unlock(&inode->i_lock); /* Ensure consistent page alignment of the data. * Note: assumes we have exclusive access to this mapping either - * through inode->i_sem or some other mechanism. + * through inode->i_mutex or some other mechanism.
*/ if (page->index == 0) invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1); @@ -573,7 +573,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { - down(&filp->f_dentry->d_inode->i_sem); + mutex_lock(&filp->f_dentry->d_inode->i_mutex); switch (origin) { case 1: offset += filp->f_pos; @@ -589,7 +589,7 @@ loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; } out: - up(&filp->f_dentry->d_inode->i_sem); + mutex_unlock(&filp->f_dentry->d_inode->i_mutex); return offset; } @@ -1001,7 +1001,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) openflags &= ~(O_CREAT|O_TRUNC); /* - * Note: we're not holding inode->i_sem and so may be racing with + * Note: we're not holding inode->i_mutex and so may be racing with * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ @@ -1051,7 +1051,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) return dentry; if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) return NULL; - /* Note: caller is already holding the dir->i_sem! */ + /* Note: caller is already holding the dir->i_mutex! */ dentry = d_alloc(parent, &name); if (dentry == NULL) return NULL; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 954cf893d50c..be963a133aaa 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -121,9 +121,9 @@ out: static void nfsd4_sync_rec_dir(void) { - down(&rec_dir.dentry->d_inode->i_sem); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); nfsd_sync_dir(rec_dir.dentry); - up(&rec_dir.dentry->d_inode->i_sem); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); } int @@ -143,7 +143,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) nfs4_save_user(&uid, &gid); /* lock the parent */ - down(&rec_dir.dentry->d_inode->i_sem); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -159,7 +159,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - up(&rec_dir.dentry->d_inode->i_sem); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (status == 0) { clp->cl_firststate = 1; nfsd4_sync_rec_dir(); @@ -259,9 +259,9 @@ nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry) printk("nfsd4: non-file found in client recovery directory\n"); return -EINVAL; } - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); status = vfs_unlink(dir->d_inode, dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -274,9 +274,9 @@ nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry) * any regular files anyway, just in case the directory was created by * a kernel from the future.... */ nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file); - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); status = vfs_rmdir(dir->d_inode, dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -288,9 +288,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen) dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %.*s\n", namlen, name); - down(&rec_dir.dentry->d_inode->i_sem); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); dentry = lookup_one_len(name, rec_dir.dentry, namlen); - up(&rec_dir.dentry->d_inode->i_sem); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); return status; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index df4019f04560..bb36b4304491 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -390,12 +390,12 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) error = -EOPNOTSUPP; if (inode->i_op && inode->i_op->setxattr) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); security_inode_setxattr(dentry, key, buf, len, 0); error = inode->i_op->setxattr(dentry, key, buf, len, 0); if (!error) security_inode_post_setxattr(dentry, key, buf, len, 0); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } out: kfree(buf); @@ -739,9 +739,9 @@ nfsd_sync(struct file *filp) int err; struct inode *inode = filp->f_dentry->d_inode; dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); err=nfsd_dosync(filp, filp->f_dentry, filp->f_op); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return err; } @@ -885,9 +885,9 @@ static void kill_suid(struct dentry *dentry) struct iattr ia; ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); notify_change(dentry, &ia); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); } static inline int diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index eda056bac256..9480a0526cd3 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1532,7 +1532,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, * NOTE to self: No changes in the attribute list are required to move from * a resident to a non-resident attribute. * - * Locking: - The caller must hold i_sem on the inode. + * Locking: - The caller must hold i_mutex on the inode. */ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) { @@ -1728,7 +1728,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) /* * This needs to be last since the address space operations ->readpage * and ->writepage can run concurrently with us as they are not - * serialized on i_sem. Note, we are not allowed to fail once we flip + * serialized on i_mutex. Note, we are not allowed to fail once we flip * this switch, which is another reason to do this last. */ NInoSetNonResident(ni); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 795c3d1930f5..b0690d4c8906 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -69,7 +69,7 @@ ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'), * work but we don't care for how quickly one can access them. This also fixes * the dcache aliasing issues. * - * Locking: - Caller must hold i_sem on the directory. + * Locking: - Caller must hold i_mutex on the directory. * - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which @@ -1085,11 +1085,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, * While this will return the names in random order this doesn't matter for * ->readdir but OTOH results in a faster ->readdir. * - * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS + * VFS calls ->readdir without BKL but with i_mutex held. 
This protects the VFS * parts (e.g. ->f_pos and ->i_size, and it also protects against directory * modifications). * - * Locking: - Caller must hold i_sem on the directory. + * Locking: - Caller must hold i_mutex on the directory. * - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which @@ -1520,7 +1520,7 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) * Note: In the past @filp could be NULL so we ignore it as we don't need it * anyway. * - * Locking: Caller must hold i_sem on the inode. + * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 727533891813..30f71acdc1cb 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -106,7 +106,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * this is the case, the necessary zeroing will also have happened and that all * metadata is self-consistent. * - * Locking: i_sem on the vfs inode corrseponsind to the ntfs inode @ni must be + * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be * held by the caller. */ static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, @@ -473,7 +473,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) * @bytes: number of bytes to be written * * This is called for non-resident attributes from ntfs_file_buffered_write() - * with i_sem held on the inode (@pages[0]->mapping->host). There are + * with i_mutex held on the inode (@pages[0]->mapping->host). There are * @nr_pages pages in @pages which are locked but not kmap()ped. The source * data has not yet been copied into the @pages. * @@ -1637,7 +1637,7 @@ err_out: * @pos: byte position in file at which the write begins * @bytes: number of bytes to be written * - * This is called from ntfs_file_buffered_write() with i_sem held on the inode + * This is called from ntfs_file_buffered_write() with i_mutex held on the inode * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are * locked but not kmap()ped. The source data has already been copied into the * @page. ntfs_prepare_pages_for_non_resident_write() has been called before @@ -1814,7 +1814,7 @@ err_out: /** * ntfs_file_buffered_write - * - * Locking: The vfs is holding ->i_sem on the inode. + * Locking: The vfs is holding ->i_mutex on the inode.
*/ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, @@ -2196,9 +2196,9 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf, BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = sync_page_range(inode, mapping, pos, ret); if (err < 0) @@ -2221,12 +2221,12 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, struct kiocb kiocb; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); init_sync_kiocb(&kiocb, file); ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = sync_page_range(inode, mapping, *ppos - ret, ret); if (err < 0) @@ -2269,7 +2269,7 @@ static ssize_t ntfs_file_write(struct file *file, const char __user *buf, * Note: In the past @filp could be NULL so we ignore it as we don't need it * anyway. * - * Locking: Caller must hold i_sem on the inode. + * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c index 8f2d5727546f..9f5427c2d105 100644 --- a/fs/ntfs/index.c +++ b/fs/ntfs/index.c @@ -32,7 +32,7 @@ * Allocate a new index context, initialize it with @idx_ni and return it. * Return NULL if allocation failed. * - * Locking: Caller must hold i_sem on the index inode. + * Locking: Caller must hold i_mutex on the index inode. */ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) { @@ -50,7 +50,7 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) * * Release the index context @ictx, releasing all associated resources. * - * Locking: Caller must hold i_sem on the index inode. + * Locking: Caller must hold i_mutex on the index inode. */ void ntfs_index_ctx_put(ntfs_index_context *ictx) { @@ -106,7 +106,7 @@ void ntfs_index_ctx_put(ntfs_index_context *ictx) * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to * ensure that the changes are written to disk. * - * Locking: - Caller must hold i_sem on the index inode. + * Locking: - Caller must hold i_mutex on the index inode. * - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index b24f4c4b2c5c..bda7a08911a5 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2125,13 +2125,13 @@ void ntfs_put_inode(struct inode *vi) ntfs_inode *ni = NTFS_I(vi); if (NInoIndexAllocPresent(ni)) { struct inode *bvi = NULL; - down(&vi->i_sem); + mutex_lock(&vi->i_mutex); if (atomic_read(&vi->i_count) == 2) { bvi = ni->itype.index.bmp_ino; if (bvi) ni->itype.index.bmp_ino = NULL; } - up(&vi->i_sem); + mutex_unlock(&vi->i_mutex); if (bvi) iput(bvi); } @@ -2311,7 +2311,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run " * * Returns 0 on success or -errno on error. * - * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for + * Called with ->i_mutex held. 
In all but one case ->i_alloc_sem is held for * writing. The only case in the kernel where ->i_alloc_sem is not held is * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called * with the current i_size as the offset. The analogous place in NTFS is in @@ -2831,7 +2831,7 @@ void ntfs_truncate_vfs(struct inode *vi) { * We also abort all changes of user, group, and mode as we do not implement * the NTFS ACLs yet. * - * Called with ->i_sem held. For the ATTR_SIZE (i.e. ->truncate) case, also + * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also * called with ->i_alloc_sem held for writing. * * Basically this is a copy of generic notify_change() and inode_setattr() diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 351dbc3b6e40..5ea9eb93af62 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -96,7 +96,7 @@ * name. We then convert the name to the current NLS code page, and proceed * searching for a dentry with this name, etc, as in case 2), above. * - * Locking: Caller must hold i_sem on the directory. + * Locking: Caller must hold i_mutex on the directory. */ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, struct nameidata *nd) @@ -254,7 +254,7 @@ handle_name: nls_name.hash = full_name_hash(nls_name.name, nls_name.len); /* - * Note: No need for dent->d_lock lock as i_sem is held on the + * Note: No need for dent->d_lock lock as i_mutex is held on the * parent inode. */ @@ -374,7 +374,7 @@ struct inode_operations ntfs_dir_inode_ops = { * The code is based on the ext3 ->get_parent() implementation found in * fs/ext3/namei.c::ext3_get_parent(). * - * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down. + * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. * * Return the dentry of the parent directory on success or the error code on * error (IS_ERR() is true). diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index 833df2a4e9fb..d0ef4182147b 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return FALSE; } - down(&vol->quota_q_ino->i_sem); + mutex_lock(&vol->quota_q_ino->i_mutex); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - up(&vol->quota_q_ino->i_sem); + mutex_unlock(&vol->quota_q_ino->i_mutex); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - up(&vol->quota_q_ino->i_sem); + mutex_unlock(&vol->quota_q_ino->i_mutex); return FALSE; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6c16db9e1a8a..280e383fc84e 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1213,10 +1213,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - down(&vol->root_ino->i_sem); + mutex_lock(&vol->root_ino->i_mutex); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - up(&vol->root_ino->i_sem); + mutex_unlock(&vol->root_ino->i_mutex); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. 
*/ @@ -1307,10 +1307,10 @@ static BOOL load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. */ - down(&vol->extend_ino->i_sem); + mutex_lock(&vol->extend_ino->i_mutex); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - up(&vol->extend_ino->i_sem); + mutex_unlock(&vol->extend_ino->i_mutex); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1390,10 +1390,10 @@ static BOOL load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - down(&vol->extend_ino->i_sem); + mutex_lock(&vol->extend_ino->i_mutex); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - up(&vol->extend_ino->i_sem); + mutex_unlock(&vol->extend_ino->i_mutex); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, @@ -2312,9 +2312,9 @@ static void ntfs_put_super(struct super_block *sb) if (!list_empty(&sb->s_dirty)) { const char *s1, *s2; - down(&vol->mft_ino->i_sem); + mutex_lock(&vol->mft_ino->i_mutex); truncate_inode_pages(vol->mft_ino->i_mapping, 0); - up(&vol->mft_ino->i_sem); + mutex_unlock(&vol->mft_ino->i_mutex); write_inode_now(vol->mft_ino, 1); if (!list_empty(&sb->s_dirty)) { static const char *_s1 = "inodes"; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 465f797451ee..6b9812db3779 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -966,7 +966,7 @@ static int ocfs2_truncate_log_append(struct ocfs2_super *osb, mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, num_clusters); - BUG_ON(!down_trylock(&tl_inode->i_sem)); + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -1108,7 +1108,7 @@ bail: return status; } -/* Expects you to already be holding tl_inode->i_sem */ +/* Expects you to already be holding tl_inode->i_mutex */ static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) { int status; @@ -1123,7 +1123,7 @@ static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) mlog_entry_void(); - BUG_ON(!down_trylock(&tl_inode->i_sem)); + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; @@ -1198,9 +1198,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - down(&tl_inode->i_sem); + mutex_lock(&tl_inode->i_mutex); status = __ocfs2_flush_truncate_log(osb); - up(&tl_inode->i_sem); + mutex_unlock(&tl_inode->i_mutex); return status; } @@ -1363,7 +1363,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, tl_copy->i_blkno); - down(&tl_inode->i_sem); + mutex_lock(&tl_inode->i_mutex); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -1395,7 +1395,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - up(&tl_inode->i_sem); + mutex_unlock(&tl_inode->i_mutex); mlog_exit(status); return status; @@ -1840,7 +1840,7 @@ start: mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); - down(&tl_inode->i_sem); + mutex_lock(&tl_inode->i_mutex); tl_sem = 1; /* ocfs2_truncate_log_needs_flush guarantees us at least one * record is free for use. 
If there isn't any, we flush to get @@ -1875,7 +1875,7 @@ start: goto bail; } - up(&tl_inode->i_sem); + mutex_unlock(&tl_inode->i_mutex); tl_sem = 0; ocfs2_commit_trans(handle); @@ -1890,7 +1890,7 @@ bail: ocfs2_schedule_truncate_log_flush(osb, 1); if (tl_sem) - up(&tl_inode->i_sem); + mutex_unlock(&tl_inode->i_mutex); if (handle) ocfs2_commit_trans(handle); @@ -1994,7 +1994,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, goto bail; } - down(&ext_alloc_inode->i_sem); + mutex_lock(&ext_alloc_inode->i_mutex); (*tc)->tc_ext_alloc_inode = ext_alloc_inode; status = ocfs2_meta_lock(ext_alloc_inode, @@ -2026,7 +2026,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) if (tc->tc_ext_alloc_locked) ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); - up(&tc->tc_ext_alloc_inode->i_sem); + mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); iput(tc->tc_ext_alloc_inode); } diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 5fd60c105913..cf7828f23361 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -653,7 +653,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct config_group *o2hb_group = NULL, *ret = NULL; void *defs = NULL; - /* this runs under the parent dir's i_sem; there can be only + /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ if (o2nm_single_cluster) goto out; /* ENOSPC */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 856e20ae8263..57158fa75d91 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -202,7 +202,7 @@ bail: } /* - * NOTE: this should always be called with parent dir i_sem taken. + * NOTE: this should always be called with parent dir i_mutex taken. */ int ocfs2_find_files_on_disk(const char *name, int namelen, @@ -245,7 +245,7 @@ leave: * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name * - * Callers should have i_sem + a cluster lock on dir + * Callers should have i_mutex + a cluster lock on dir */ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 72ae9e3306f4..ca5f9f90d794 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -492,7 +492,7 @@ restart_all: } /* blocks people in read/write from reading our allocation - * until we're done changing it. We depend on i_sem to block + * until we're done changing it. We depend on i_mutex to block * other extend/truncate calls while we're here. Ordering wrt * start_trans is important here -- always do it before!
*/ down_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -958,8 +958,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, filp->f_flags &= ~O_DIRECT; #endif - down(&inode->i_sem); - /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */ + mutex_lock(&inode->i_mutex); + /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (filp->f_flags & O_DIRECT) { have_alloc_sem = 1; down_read(&inode->i_alloc_sem); @@ -1123,7 +1123,7 @@ out: up_read(&inode->i_alloc_sem); if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); mlog_exit(ret); return ret; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index a91ba4dec936..d4ecc0627716 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -485,10 +485,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - down(&inode_alloc_inode->i_sem); + mutex_lock(&inode_alloc_inode->i_mutex); status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); if (status < 0) { - up(&inode_alloc_inode->i_sem); + mutex_unlock(&inode_alloc_inode->i_mutex); mlog_errno(status); goto bail; @@ -536,7 +536,7 @@ bail_commit: ocfs2_commit_trans(handle); bail_unlock: ocfs2_meta_unlock(inode_alloc_inode, 1); - up(&inode_alloc_inode->i_sem); + mutex_unlock(&inode_alloc_inode->i_mutex); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -567,10 +567,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - down(&orphan_dir_inode->i_sem); + mutex_lock(&orphan_dir_inode->i_mutex); status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); if (status < 0) { - up(&orphan_dir_inode->i_sem); + mutex_unlock(&orphan_dir_inode->i_mutex); mlog_errno(status); goto bail; @@ -593,7 +593,7 @@ static int ocfs2_wipe_inode(struct inode *inode, bail_unlock_dir: ocfs2_meta_unlock(orphan_dir_inode, 1); - up(&orphan_dir_inode->i_sem); + mutex_unlock(&orphan_dir_inode->i_mutex); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 04428042e5e5..303c8d96457f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -216,7 +216,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, atomic_inc(&inode->i_count); /* we're obviously changing it... 
*/ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); /* sanity check */ BUG_ON(OCFS2_I(inode)->ip_handle); @@ -241,7 +241,7 @@ static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) OCFS2_I(inode)->ip_handle = NULL; list_del_init(&OCFS2_I(inode)->ip_handle_list); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); iput(inode); } } @@ -1433,10 +1433,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, goto out; } - down(&orphan_dir_inode->i_sem); + mutex_lock(&orphan_dir_inode->i_mutex); status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); if (status < 0) { - up(&orphan_dir_inode->i_sem); + mutex_unlock(&orphan_dir_inode->i_mutex); mlog_errno(status); goto out; } @@ -1451,7 +1451,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!bh) status = -EINVAL; if (status < 0) { - up(&orphan_dir_inode->i_sem); + mutex_unlock(&orphan_dir_inode->i_mutex); if (bh) brelse(bh); mlog_errno(status); @@ -1465,7 +1465,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!ocfs2_check_dir_entry(orphan_dir_inode, de, bh, local)) { - up(&orphan_dir_inode->i_sem); + mutex_unlock(&orphan_dir_inode->i_mutex); status = -EINVAL; mlog_errno(status); brelse(bh); @@ -1509,7 +1509,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, } brelse(bh); } - up(&orphan_dir_inode->i_sem); + mutex_unlock(&orphan_dir_inode->i_mutex); ocfs2_meta_unlock(orphan_dir_inode, 0); have_disk_lock = 0; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index fe373a2101d9..149b35181666 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -334,7 +334,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &alloc_bh, 0, inode); @@ -367,7 +367,7 @@ bail: brelse(alloc_bh); if (inode) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); iput(inode); } @@ -446,7 +446,7 @@ bail: /* * make sure we've got at least bitswanted contiguous bits in the - * local alloc. You lose them when you drop i_sem. + * local alloc. You lose them when you drop i_mutex. * * We will add ourselves to the transaction passed in, but may start * our own in order to shift windows. 
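
Before the remaining hunks, it may help to restate the pattern this patch applies everywhere: an inode semaphore that was only ever used for mutual exclusion becomes a struct mutex, and each down()/up() pair becomes mutex_lock()/mutex_unlock() (the init conversion is visible in the fs/inode.c hunk above). The sketch below is illustrative only, not a hunk from this patch, and `inode` stands for whatever inode pointer the converted code happens to use:

        /* Old style: i_sem was a counting semaphore initialized to 1. */
        sema_init(&inode->i_sem, 1);
        down(&inode->i_sem);            /* sleeps until acquired        */
        /* ... work serialized on the inode ... */
        up(&inode->i_sem);              /* any context may release it   */

        /* New style: i_mutex is a mutex with strict owner semantics.  */
        mutex_init(&inode->i_mutex);
        mutex_lock(&inode->i_mutex);    /* sleeps until acquired        */
        /* ... work serialized on the inode ... */
        mutex_unlock(&inode->i_mutex);  /* released by the lock owner   */

        /* The interruptible variant converts the same way, as in the
         * fifo.c hunk above and the PIPE_SEM -> PIPE_MUTEX hunks in
         * fs/pipe.c below: */
        if (mutex_lock_interruptible(&inode->i_mutex))
                return -ERESTARTSYS;
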
diff --git a/fs/open.c b/fs/open.c index 75f3329e8a67..a3b3a9b5c2ff 100644 --- a/fs/open.c +++ b/fs/open.c @@ -211,9 +211,9 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); err = notify_change(dentry, &newattrs); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return err; } @@ -398,9 +398,9 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times) (error = vfs_permission(&nd, MAY_WRITE)) != 0) goto dput_and_out; } - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); dput_and_out: path_release(&nd); out: @@ -451,9 +451,9 @@ long do_utimes(char __user * filename, struct timeval * times) (error = vfs_permission(&nd, MAY_WRITE)) != 0) goto dput_and_out; } - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); dput_and_out: path_release(&nd); out: @@ -620,13 +620,13 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) err = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out_putf; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; err = notify_change(dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out_putf: fput(file); @@ -654,13 +654,13 @@ asmlinkage long sys_chmod(const char __user * filename, mode_t mode) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto dput_and_out; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); dput_and_out: path_release(&nd); @@ -696,9 +696,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out: return error; } diff --git a/fs/pipe.c b/fs/pipe.c index 66aa0b938d6a..acb030b61fb0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -44,10 +44,10 @@ void pipe_wait(struct inode * inode) * is considered a noninteractive wait: */ prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); schedule(); finish_wait(PIPE_WAIT(*inode), &wait); - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); } static inline int @@ -136,7 +136,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); info = inode->i_pipe; for (;;) { int bufs = info->nrbufs; @@ -200,7 +200,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } pipe_wait(inode); } - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); @@ -237,7 +237,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); info = inode->i_pipe; if (!PIPE_READERS(*inode)) { @@ -341,7 +341,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, PIPE_WAITING_WRITERS(*inode)--; } out: - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); @@ -381,7 +381,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, switch (cmd) { case FIONREAD: - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); info = inode->i_pipe; count = 0; buf = info->curbuf; @@ -390,7 +390,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, count += info->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -433,7 +433,7 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); PIPE_READERS(*inode) -= decr; PIPE_WRITERS(*inode) -= decw; if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { @@ -443,7 +443,7 @@ pipe_release(struct inode *inode, int decr, int decw) kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } @@ -454,9 +454,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (retval < 0) return retval; @@ -471,9 +471,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (retval < 0) return retval; @@ -488,14 +488,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); if (retval >= 0) retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (retval < 0) return retval; @@ -534,9 +534,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); PIPE_READERS(*inode)++; - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } @@ -544,9 +544,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); PIPE_WRITERS(*inode)++; - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } @@ -554,12 +554,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); if (filp->f_mode & FMODE_READ) PIPE_READERS(*inode)++; if (filp->f_mode & FMODE_WRITE) PIPE_WRITERS(*inode)++; - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } diff --git a/fs/quota.c b/fs/quota.c index 612e04db4b93..d14d872646d4 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -168,7 +168,7 @@ static void quota_sync_sb(struct super_block *sb, int type) sync_blockdev(sb->s_bdev); /* Now when everything is written we can discard the pagecache so - * that userspace sees the changes. We need i_sem and so we could + * that userspace sees the changes. We need i_mutex and so we could * not do it inside dqonoff_sem. Moreover we need to be carefull * about races with quotaoff() (that is the reason why we have own * reference to inode). */ @@ -184,9 +184,9 @@ static void quota_sync_sb(struct super_block *sb, int type) up(&sb_dqopt(sb)->dqonoff_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (discard[cnt]) { - down(&discard[cnt]->i_sem); + mutex_lock(&discard[cnt]->i_mutex); truncate_inode_pages(&discard[cnt]->i_data, 0); - up(&discard[cnt]->i_sem); + mutex_unlock(&discard[cnt]->i_mutex); iput(discard[cnt]); } } diff --git a/fs/read_write.c b/fs/read_write.c index df3468a22fea..3f7a1a62165f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -33,7 +33,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) long long retval; struct inode *inode = file->f_mapping->host; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); switch (origin) { case 2: offset += inode->i_size; @@ -49,7 +49,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = offset; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return retval; } diff --git a/fs/readdir.c b/fs/readdir.c index b03579bc0210..b6109329b607 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -30,13 +30,13 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) if (res) goto out; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); res = -ENOENT; if (!IS_DEADDIR(inode)) { res = file->f_op->readdir(file, buf, filler); file_accessed(file); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out: return res; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 7892a865b58a..127e7d2cabdd 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -49,7 +49,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) } reiserfs_write_lock(inode->i_sb); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); /* freeing preallocation only involves relogging blocks that * are already in the current transaction. 
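The fs/readdir.c hunk illustrates what i_mutex buys a directory: the filldir callback runs with the directory serialized, so entries cannot appear or vanish underneath it. Reassembled from the hunk (the security_file_permission() check before the lock is elided in this sketch):

```c
int vfs_readdir(struct file *file, filldir_t filler, void *buf)
{
	struct inode *inode = file->f_dentry->d_inode;
	int res = -ENOTDIR;

	if (!file->f_op || !file->f_op->readdir)
		goto out;
	/* permission check elided here */

	mutex_lock(&inode->i_mutex);
	res = -ENOENT;
	if (!IS_DEADDIR(inode)) {	/* dir may be unlinked but still open */
		res = file->f_op->readdir(file, buf, filler);
		file_accessed(file);
	}
	mutex_unlock(&inode->i_mutex);
out:
	return res;
}
```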
preallocation gets * freed at the end of each transaction, so it is impossible for @@ -100,7 +100,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) err = reiserfs_truncate_file(inode, 0); } out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return err; } @@ -1342,7 +1342,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; - down(&inode->i_sem); // locks the entire file for just us + mutex_lock(&inode->i_mutex); // locks the entire file for just us pos = *ppos; @@ -1532,12 +1532,12 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA | OSYNC_DATA); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reiserfs_async_progress_wait(inode->i_sb); return (already_written != 0) ? already_written : res; out: - up(&inode->i_sem); // unlock the file on exit. + mutex_unlock(&inode->i_mutex); // unlock the file on exit. return res; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a5e3a0ddbe53..ffa34b861bdb 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -40,12 +40,12 @@ void reiserfs_delete_inode(struct inode *inode) /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); reiserfs_delete_xattrs(inode); if (journal_begin(&th, inode->i_sb, jbegin_count)) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); goto out; } reiserfs_update_inode_transaction(inode); @@ -59,11 +59,11 @@ void reiserfs_delete_inode(struct inode *inode) DQUOT_FREE_INODE(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); goto out; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); /* check return value from reiserfs_delete_object after * ending the transaction @@ -551,7 +551,7 @@ static int convert_tail_for_hole(struct inode *inode, /* we don't have to make sure the conversion did not happen while ** we were locking the page because anyone that could convert - ** must first take i_sem. + ** must first take i_mutex. ** ** We must fix the tail page for writing because it might have buffers ** that are mapped, but have a block number of 0. This indicates tail @@ -586,7 +586,7 @@ static inline int _allocate_block(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); #ifdef REISERFS_PREALLOCATE - if (!(flags & GET_BLOCK_NO_ISEM)) { + if (!(flags & GET_BLOCK_NO_IMUX)) { return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block); } @@ -2318,7 +2318,7 @@ static int map_block_for_writepage(struct inode *inode, /* this is where we fill in holes in the file. 
*/ if (use_get_block) { retval = reiserfs_get_block(inode, block, bh_result, - GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX | GET_BLOCK_NO_DANGLE); if (!retval) { if (!buffer_mapped(bh_result) diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 81fc00285f60..ba8bf8df6dc7 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -120,7 +120,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp) /* we need to make sure nobody is changing the file size beneath ** us */ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ @@ -156,7 +156,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp) page_cache_release(page); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 42afb5bef111..397d9590c8f2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2211,7 +2211,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct buffer_head tmp_bh, *bh; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2250,7 +2250,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index c92e124f628e..196e971c03c9 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -205,7 +205,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in 1) * p_s_sb->s_blocksize; pos1 = pos; - // we are protected by i_sem. The tail can not disapper, not + // we are protected by i_mutex. The tail can not disapper, not // append can be done either // we are in truncate or packing tail in file_release diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 02091eaac0b4..f1895f0a278e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -67,11 +67,11 @@ static struct dentry *create_xa_root(struct super_block *sb) goto out; } else if (!xaroot->d_inode) { int err; - down(&privroot->d_inode->i_sem); + mutex_lock(&privroot->d_inode->i_mutex); err = privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot, 0700); - up(&privroot->d_inode->i_sem); + mutex_unlock(&privroot->d_inode->i_mutex); if (err) { dput(xaroot); @@ -219,7 +219,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode, } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { goto out; } else { - /* inode->i_sem is down, so nothing else can try to create + /* inode->i_mutex is down, so nothing else can try to create * the same xattr */ err = xadir->d_inode->i_op->create(xadir->d_inode, xafile, 0700 | S_IFREG, NULL); @@ -268,7 +268,7 @@ static struct file *open_xa_file(const struct inode *inode, const char *name, * and don't mess with f->f_pos, but the idea is the same. Do some * action on each and every entry in the directory. * - * we're called with i_sem held, so there are no worries about the directory + * we're called with i_mutex held, so there are no worries about the directory * changing underneath us. 
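The reiserfs flag rename (GET_BLOCK_NO_ISEM to GET_BLOCK_NO_IMUX) is not cosmetic: the flag encodes a locking fact. Writepage paths enter reiserfs_get_block() without holding i_mutex, and preallocation assumes i_mutex serialization, so those callers must pass the flag to force plain single-block allocation. Condensed from the _allocate_block() hunk above (the full argument list is elided, and the fallthrough return reflects the surrounding code rather than this hunk):

```c
	/* body of _allocate_block(), condensed: */
#ifdef REISERFS_PREALLOCATE
	if (!(flags & GET_BLOCK_NO_IMUX)) {
		/* caller holds i_mutex: preallocating is safe */
		return reiserfs_new_unf_blocknrs2(th, inode,
						  allocated_block_nr,
						  path, block);
	}
#endif
	/* writepage path, no i_mutex: single-block allocation only */
	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr,
					 path, block);
```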
*/ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) @@ -426,7 +426,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); // down(&inode->i_zombie); res = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -435,7 +435,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) unlock_kernel(); } // up(&inode->i_zombie); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out: return res; } @@ -480,7 +480,7 @@ static inline __u32 xattr_hash(const char *msg, int len) /* Generic extended attribute operations that can be used by xa plugins */ /* - * inode->i_sem: down + * inode->i_mutex: down */ int reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, @@ -535,7 +535,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, /* Resize it so we're ok to write there */ newattrs.ia_size = buffer_size; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - down(&xinode->i_sem); + mutex_lock(&xinode->i_mutex); err = notify_change(fp->f_dentry, &newattrs); if (err) goto out_filp; @@ -598,7 +598,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, } out_filp: - up(&xinode->i_sem); + mutex_unlock(&xinode->i_mutex); fput(fp); out: @@ -606,7 +606,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, } /* - * inode->i_sem: down + * inode->i_mutex: down */ int reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, @@ -793,7 +793,7 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, } -/* This is called w/ inode->i_sem downed */ +/* This is called w/ inode->i_mutex downed */ int reiserfs_delete_xattrs(struct inode *inode) { struct file *fp; @@ -946,7 +946,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) /* * Inode operation getxattr() - * Preliminary locking: we down dentry->d_inode->i_sem + * Preliminary locking: we down dentry->d_inode->i_mutex */ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, @@ -970,7 +970,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, /* * Inode operation setxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_mutex down */ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, @@ -1008,7 +1008,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, /* * Inode operation removexattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_mutex down */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { @@ -1091,7 +1091,7 @@ reiserfs_listxattr_filler(void *buf, const char *name, int namelen, /* * Inode operation listxattr() * - * Preliminary locking: we down dentry->d_inode->i_sem + * Preliminary locking: we down dentry->d_inode->i_mutex */ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { @@ -1289,9 +1289,9 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (!IS_ERR(dentry)) { if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { struct inode *inode = dentry->d_parent->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); err = inode->i_op->mkdir(inode, dentry, 0700); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (err) { dput(dentry); dentry = NULL; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 
a47ac9aac8b2..2dc953504cc0 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -174,7 +174,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) /* * Inode operation get_posix_acl(). * - * inode->i_sem: down + * inode->i_mutex: down * BKL held [before 2.5.x] */ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) @@ -237,7 +237,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) /* * Inode operation set_posix_acl(). * - * inode->i_sem: down + * inode->i_mutex: down * BKL held [before 2.5.x] */ static int @@ -312,7 +312,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -/* dir->i_sem: down, +/* dir->i_mutex: locked, * inode is new and not released into the wild yet */ int reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 7b7f2cb5f0e1..383523011aad 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -109,7 +109,7 @@ static struct dentry *relayfs_create_entry(const char *name, } parent = dget(parent); - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); d = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(d)) { d = NULL; @@ -139,7 +139,7 @@ release_mount: simple_release_fs(&relayfs_mount, &relayfs_mount_count); exit: - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); dput(parent); return d; } @@ -204,7 +204,7 @@ int relayfs_remove(struct dentry *dentry) return -EINVAL; parent = dget(parent); - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); if (dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode)) error = simple_rmdir(parent->d_inode, dentry); @@ -215,7 +215,7 @@ int relayfs_remove(struct dentry *dentry) } if (!error) dput(dentry); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); dput(parent); if (!error) @@ -476,7 +476,7 @@ static ssize_t relay_file_read(struct file *filp, ssize_t ret = 0; void *from; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if(!relay_file_read_avail(buf, *ppos)) goto out; @@ -494,7 +494,7 @@ static ssize_t relay_file_read(struct file *filp, relay_file_read_consume(buf, read_start, count); *ppos = relay_file_read_end_pos(buf, read_start, count); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index d36780382176..49bd219275db 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -99,7 +99,7 @@ static int create_dir(struct kobject * k, struct dentry * p, int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - down(&p->d_inode->i_sem); + mutex_lock(&p->d_inode->i_mutex); *d = lookup_one_len(n, p, strlen(n)); if (!IS_ERR(*d)) { error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR); @@ -122,7 +122,7 @@ static int create_dir(struct kobject * k, struct dentry * p, dput(*d); } else error = PTR_ERR(*d); - up(&p->d_inode->i_sem); + mutex_unlock(&p->d_inode->i_mutex); return error; } @@ -246,7 +246,7 @@ static void remove_dir(struct dentry * d) struct dentry * parent = dget(d->d_parent); struct sysfs_dirent * sd; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); d_delete(d); sd = d->d_fsdata; list_del_init(&sd->s_sibling); @@ -257,7 +257,7 @@ static void remove_dir(struct dentry * d) pr_debug(" o %s removing done (%d)\n",d->d_name.name, atomic_read(&d->d_count)); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); dput(parent); } @@ -286,7 
+286,7 @@ void sysfs_remove_dir(struct kobject * kobj) return; pr_debug("sysfs %s: removing dir\n",dentry->d_name.name); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); parent_sd = dentry->d_fsdata; list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED)) @@ -295,7 +295,7 @@ void sysfs_remove_dir(struct kobject * kobj) sysfs_drop_dentry(sd, dentry); sysfs_put(sd); } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); remove_dir(dentry); /** @@ -318,7 +318,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) down_write(&sysfs_rename_sem); parent = kobj->parent->dentry; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -334,7 +334,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) error = -EEXIST; dput(new_dentry); } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); up_write(&sysfs_rename_sem); return error; @@ -345,9 +345,9 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) struct dentry * dentry = file->f_dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); file->private_data = sysfs_new_dirent(parent_sd, NULL); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return file->private_data ? 0 : -ENOMEM; @@ -358,9 +358,9 @@ static int sysfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_dentry; struct sysfs_dirent * cursor = file->private_data; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); list_del_init(&cursor->s_sibling); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); release_sysfs_dirent(cursor); @@ -436,7 +436,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) { struct dentry * dentry = file->f_dentry; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -444,7 +444,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) if (offset >= 0) break; default: - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -468,7 +468,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) list_add_tail(&cursor->s_sibling, p); } } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return offset; } @@ -483,4 +483,3 @@ struct file_operations sysfs_dir_operations = { EXPORT_SYMBOL_GPL(sysfs_create_dir); EXPORT_SYMBOL_GPL(sysfs_remove_dir); EXPORT_SYMBOL_GPL(sysfs_rename_dir); - diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 4013d7905e84..d0e3d8495165 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -364,9 +364,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; int error = 0; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); error = sysfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } @@ -398,7 +398,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) struct dentry * victim; int res = -ENOENT; - 
down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { /* make sure dentry is really there */ @@ -420,7 +420,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) */ dput(victim); } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return res; } @@ -441,22 +441,22 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) struct iattr newattrs; int res = -ENOENT; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { if (victim->d_inode && (victim->d_parent->d_inode == dir->d_inode)) { inode = victim->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; res = notify_change(victim, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } dput(victim); } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return res; } @@ -480,4 +480,3 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) EXPORT_SYMBOL_GPL(sysfs_create_file); EXPORT_SYMBOL_GPL(sysfs_remove_file); EXPORT_SYMBOL_GPL(sysfs_update_file); - diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 970a33f03299..c3133219941c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -201,7 +201,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) /* * Unhashes the dentry corresponding to given sysfs_dirent - * Called with parent inode's i_sem held. + * Called with parent inode's i_mutex held. */ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) { @@ -232,7 +232,7 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -243,7 +243,5 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); } - - diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index de402fa915f2..e38d6338a20d 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -86,9 +86,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char BUG_ON(!kobj || !kobj->dentry || !name); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); error = sysfs_add_link(dentry, name, target); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return error; } @@ -177,4 +177,3 @@ struct inode_operations sysfs_symlink_inode_operations = { EXPORT_SYMBOL_GPL(sysfs_create_link); EXPORT_SYMBOL_GPL(sysfs_remove_link); - diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2ba11a9aa995..e9a42c711a9e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1275,7 +1275,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct buffer_head *bh; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? 
sb->s_blocksize - offset : towrite; @@ -1297,7 +1297,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return err; } if (inode->i_size < off+len-towrite) @@ -1305,7 +1305,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/xattr.c b/fs/xattr.c index bcc2156d4d28..386a532ee5a9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -51,7 +51,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value, } } - down(&d->d_inode->i_sem); + mutex_lock(&d->d_inode->i_mutex); error = security_inode_setxattr(d, kname, kvalue, size, flags); if (error) goto out; @@ -73,7 +73,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value, fsnotify_xattr(d); } out: - up(&d->d_inode->i_sem); + mutex_unlock(&d->d_inode->i_mutex); kfree(kvalue); return error; } @@ -323,9 +323,9 @@ removexattr(struct dentry *d, char __user *name) error = security_inode_removexattr(d, kname); if (error) goto out; - down(&d->d_inode->i_sem); + mutex_lock(&d->d_inode->i_mutex); error = d->d_inode->i_op->removexattr(d, kname); - up(&d->d_inode->i_sem); + mutex_unlock(&d->d_inode->i_mutex); if (!error) fsnotify_xattr(d); } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 14215a7db59f..41c478bb1ffc 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -203,7 +203,7 @@ validate_fields( ip->i_nlink = va.va_nlink; ip->i_blocks = va.va_nblocks; - /* we're under i_sem so i_size can't change under us */ + /* we're under i_mutex so i_size can't change under us */ if (i_size_read(ip) != va.va_size) i_size_write(ip, va.va_size); } diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 279e9bc92aba..5675117ef227 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -254,7 +254,7 @@ xfs_read( } if (unlikely(ioflags & IO_ISDIRECT)) - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_IOLOCK_SHARED); if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && @@ -286,7 +286,7 @@ xfs_read( unlock_isem: if (unlikely(ioflags & IO_ISDIRECT)) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } @@ -655,7 +655,7 @@ relock: iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); } else { iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; @@ -686,7 +686,7 @@ start: int dmflags = FILP_DELAY_FLAG(file); if (need_isem) - dmflags |= DM_FLAGS_ISEM; + dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, @@ -772,7 +772,7 @@ retry: if (need_isem) { /* demote the lock now the cached pages are gone */ XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; @@ -817,14 +817,14 @@ retry: xfs_rwunlock(bdp, locktype); if (need_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) goto out_nounlocks; if (need_isem) - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); xfs_rwlock(bdp, locktype); pos = xip->i_d.di_size; ret = 0; @@ -926,7 +926,7 @@ retry: xfs_rwunlock(bdp, locktype); if (need_isem) - 
up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); error = sync_page_range(inode, mapping, pos, ret); if (!error) @@ -938,7 +938,7 @@ retry: xfs_rwunlock(bdp, locktype); out_unlock_isem: if (need_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out_nounlocks: return -error; } diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index 864bf6955689..b4c7f2bc55a0 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -152,7 +152,7 @@ typedef enum { #define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ #define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ -#define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */ +#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */ #define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ #define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ @@ -161,21 +161,21 @@ typedef enum { */ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_ISEM : 0) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) + DM_FLAGS_IMUX : 0) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) #endif #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22)) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) + DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) #endif #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - 0 : DM_FLAGS_ISEM) -#define DM_SEM_FLAG_WR (DM_FLAGS_ISEM) + 0 : DM_FLAGS_IMUX) +#define DM_SEM_FLAG_WR (DM_FLAGS_IMUX) #endif diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 2914f7b07156..e71dd98dbcae 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -87,7 +87,7 @@ struct ext3_inode_info { #ifdef CONFIG_EXT3_FS_XATTR /* * Extended attributes can be read independently of the main file - * data. Taking i_sem even when reading would cause contention + * data. Taking i_mutex even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. diff --git a/include/linux/fs.h b/include/linux/fs.h index 4c82219b0fae..01654b218e42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -219,6 +219,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include @@ -484,7 +485,7 @@ struct inode { unsigned long i_blocks; unsigned short i_bytes; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ - struct semaphore i_sem; + struct mutex i_mutex; struct rw_semaphore i_alloc_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ @@ -1191,7 +1192,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc); * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a a %NAME_MAX+1 sized * buffer. get_name() should return %0 on success, a negative error code - * or error. @get_name will be called without @parent->i_sem held. + * or error. @get_name will be called without @parent->i_mutex held. 
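The xfs_write() hunks show the conditional variant of the conversion: i_mutex is taken only when the write path actually needs exclusion (need_isem), it is dropped and retaken around the DMAPI ENOSPC callout, and DM_FLAGS_IMUX is the renamed bit that advertises "caller holds i_mutex" to the event handler. Condensed from those hunks:

```c
	/* condensed from the xfs_write() hunks above */
	int dmflags = FILP_DELAY_FLAG(file);

	if (need_isem) {
		mutex_lock(&inode->i_mutex);
		dmflags |= DM_FLAGS_IMUX;  /* event handler: we hold i_mutex */
	}

	/* ... locked write work; on O_DIRECT lock demotion the code
	 * drops the mutex early:
	 *     mutex_unlock(&inode->i_mutex);
	 *     iolock = XFS_IOLOCK_SHARED;  ... */

	if (need_isem)
		mutex_unlock(&inode->i_mutex);
```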
* * get_parent: * @get_parent should find the parent directory for the given @child which @@ -1213,7 +1214,7 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc); * nfsd_find_fh_dentry() in either the @obj or @parent parameters. * * Locking rules: - * get_parent is called with child->d_inode->i_sem down + * get_parent is called with child->d_inode->i_mutex down * get_name is not (which is possibly inconsistent) */ diff --git a/include/linux/jffs2_fs_i.h b/include/linux/jffs2_fs_i.h index ef85ab56302b..ad565bf9dcc1 100644 --- a/include/linux/jffs2_fs_i.h +++ b/include/linux/jffs2_fs_i.h @@ -8,11 +8,11 @@ #include struct jffs2_inode_info { - /* We need an internal semaphore similar to inode->i_sem. + /* We need an internal mutex similar to inode->i_mutex. Unfortunately, we can't used the existing one, because either the GC would deadlock, or we'd have to release it before letting GC proceed. Or we'd have to put ugliness - into the GC code so it didn't attempt to obtain the i_sem + into the GC code so it didn't attempt to obtain the i_mutex for the inode(s) which are already locked */ struct semaphore sem; diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index bb842ea41033..0798b7781a6e 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -294,7 +294,7 @@ fill_post_wcc(struct svc_fh *fhp) /* * Lock a file handle/inode * NOTE: both fh_lock and fh_unlock are done "by hand" in - * vfs.c:nfsd_rename as it needs to grab 2 i_sem's at once + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once * so, any changes here should be reflected there. */ static inline void @@ -317,7 +317,7 @@ fh_lock(struct svc_fh *fhp) } inode = dentry->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); fill_pre_wcc(fhp); fhp->fh_locked = 1; } @@ -333,7 +333,7 @@ fh_unlock(struct svc_fh *fhp) if (fhp->fh_locked) { fill_post_wcc(fhp); - up(&fhp->fh_dentry->d_inode->i_sem); + mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); fhp->fh_locked = 0; } } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 1767073df26f..b12e59c75752 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -37,7 +37,7 @@ struct pipe_inode_info { memory allocation, whereas PIPE_BUF makes atomicity guarantees. 
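nfsd is the cleanest example of wrapping the lock: fh_lock()/fh_unlock() record in the file handle whether it is held, so only the primitive inside the wrapper changes (the comment above notes that nfsd_rename still open-codes both, since it needs two i_mutex locks at once). Condensed from the nfsfh.h hunks:

```c
static inline void fh_lock(struct svc_fh *fhp)
{
	struct inode *inode = fhp->fh_dentry->d_inode;

	mutex_lock(&inode->i_mutex);
	fill_pre_wcc(fhp);		/* pre-op attributes for WCC data */
	fhp->fh_locked = 1;		/* fh_unlock() keys off this */
}

static inline void fh_unlock(struct svc_fh *fhp)
{
	if (fhp->fh_locked) {
		fill_post_wcc(fhp);
		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
		fhp->fh_locked = 0;
	}
}
```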
*/ #define PIPE_SIZE PAGE_SIZE -#define PIPE_SEM(inode) (&(inode).i_sem) +#define PIPE_MUTEX(inode) (&(inode).i_mutex) #define PIPE_WAIT(inode) (&(inode).i_pipe->wait) #define PIPE_READERS(inode) ((inode).i_pipe->readers) #define PIPE_WRITERS(inode) ((inode).i_pipe->writers) diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 001ab82df051..e276c5ba2bb7 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1857,7 +1857,7 @@ void padd_item(char *item, int total_length, int length); #define GET_BLOCK_CREATE 1 /* add anything you need to find block */ #define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ #define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ -#define GET_BLOCK_NO_ISEM 8 /* i_sem is not held, don't preallocate */ +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ #define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ int restart_transaction(struct reiserfs_transaction_handle *th, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c8943b53d8e6..a8aa6152eea6 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -660,7 +660,7 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode, if (fd < 0) goto out_putname; - down(&mqueue_mnt->mnt_root->d_inode->i_sem); + mutex_lock(&mqueue_mnt->mnt_root->d_inode->i_mutex); dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -697,7 +697,7 @@ out_putfd: out_err: fd = error; out_upsem: - up(&mqueue_mnt->mnt_root->d_inode->i_sem); + mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex); out_putname: putname(name); return fd; @@ -714,7 +714,7 @@ asmlinkage long sys_mq_unlink(const char __user *u_name) if (IS_ERR(name)) return PTR_ERR(name); - down(&mqueue_mnt->mnt_root->d_inode->i_sem); + mutex_lock(&mqueue_mnt->mnt_root->d_inode->i_mutex); dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -735,7 +735,7 @@ out_err: dput(dentry); out_unlock: - up(&mqueue_mnt->mnt_root->d_inode->i_sem); + mutex_unlock(&mqueue_mnt->mnt_root->d_inode->i_mutex); putname(name); if (inode) iput(inode); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab64e23bcae..2a75e44e1a41 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1513,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) struct dentry *dentry; int error; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); dentry = cpuset_get_dentry(dir, cft->name); if (!IS_ERR(dentry)) { error = cpuset_create_file(dentry, 0644 | S_IFREG); @@ -1522,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) dput(dentry); } else error = PTR_ERR(dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } @@ -1793,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) /* * Release manage_sem before cpuset_populate_dir() because it - * will down() this new directory's i_sem and if we race with + * will down() this new directory's i_mutex and if we race with * another mkdir, we might deadlock. 
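fs/pipe.c never names i_mutex directly; the PIPE_* accessor macros hide the field, which is why the whole pipe conversion and this one #define stay in lock-step:

```c
/* include/linux/pipe_fs_i.h after the patch; the macro takes an inode
 * value-expression, hence every call site passes *inode. */
#define PIPE_MUTEX(inode)	(&(inode).i_mutex)

/* typical call site: */
mutex_lock(PIPE_MUTEX(*inode));
/* ... touch inode->i_pipe state ... */
mutex_unlock(PIPE_MUTEX(*inode));
```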
*/ up(&manage_sem); @@ -1812,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct cpuset *c_parent = dentry->d_parent->d_fsdata; - /* the vfs holds inode->i_sem already */ + /* the vfs holds inode->i_mutex already */ return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } @@ -1823,7 +1823,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *parent; char *pathbuf = NULL; - /* the vfs holds both inode->i_sem already */ + /* the vfs holds both inode->i_mutex already */ down(&manage_sem); cpuset_update_task_memory_state(); diff --git a/mm/filemap.c b/mm/filemap.c index 478f4c74cc31..5fca2737c971 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -61,7 +61,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * - * ->i_sem + * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) * * ->mmap_sem @@ -73,9 +73,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->lock_page (access_process_vm) * * ->mmap_sem - * ->i_sem (msync) + * ->i_mutex (msync) * - * ->i_sem + * ->i_mutex * ->i_alloc_sem (various) * * ->inode_lock @@ -276,7 +276,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. * - * We need to re-take i_sem during the generic_osync_inode list walk because + * We need to re-take i_mutex during the generic_osync_inode list walk because * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, @@ -290,9 +290,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, return 0; ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); if (ret == 0) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } if (ret == 0) ret = wait_on_page_writeback_range(mapping, start, end); @@ -301,7 +301,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /* - * Note: Holding i_sem across sync_page_range_nolock is not a good idea + * Note: Holding i_mutex across sync_page_range_nolock is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ @@ -1892,7 +1892,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. - * i_sem is held, which protects generic_osync_inode() from + * i_mutex is held, which protects generic_osync_inode() from * livelocking. 
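The sync_page_range() hunk is the motivating case for the comment above it: generic_osync_inode() walks inode state that can livelock unless callers serialize on i_mutex, so the lock is retaken for just that call. Reassembled, with the early-out condition filled in from the surrounding 2.6-era code rather than this hunk:

```c
int sync_page_range(struct inode *inode, struct address_space *mapping,
		    loff_t pos, loff_t count)
{
	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int ret;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;
	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (ret == 0) {
		/* i_mutex keeps generic_osync_inode() livelock-free */
		mutex_lock(&inode->i_mutex);
		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
		mutex_unlock(&inode->i_mutex);
	}
	if (ret == 0)
		ret = wait_on_page_writeback_range(mapping, start, end);
	return ret;
}
```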
*/ if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { @@ -2195,10 +2195,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2220,9 +2220,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf, struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2256,9 +2256,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, struct inode *inode = mapping->host; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err; @@ -2272,7 +2272,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, EXPORT_SYMBOL(generic_file_writev); /* - * Called under i_sem for writes to S_ISREG files. Returns -EIO if something + * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. */ static ssize_t diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9cf687e4a29a..e2b34e95913e 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf, *ppos = pos; /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_sem. + * cannot change under us because we hold i_mutex. */ if (pos > inode->i_size) { i_size_write(inode, pos); @@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t pos; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (!access_ok(VERIFY_READ, buf, len)) { ret=-EFAULT; @@ -390,7 +390,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, out_backing: current->backing_dev_info = NULL; out_up: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } EXPORT_SYMBOL_GPL(xip_file_write); diff --git a/mm/memory.c b/mm/memory.c index 3944fec38012..7a11ddd5060f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1784,13 +1784,13 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) if (!inode->i_op || !inode->i_op->truncate_range) return -ENOSYS; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return 0; } diff --git a/mm/msync.c b/mm/msync.c index 1b5b6f662dcf..3563a56e1a51 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma, ret = filemap_fdatawrite(mapping); if (file->f_op && file->f_op->fsync) { /* - * We don't take i_sem here because mmap_sem + * We don't take i_mutex here because mmap_sem * is already held. 
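vmtruncate_range() above is the lock-ordering comment made concrete: i_mutex is the outer lock, i_alloc_sem the inner one, released in reverse order. Reassembled from the mm/memory.c hunk:

```c
int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;

	if (!inode->i_op || !inode->i_op->truncate_range)
		return -ENOSYS;

	mutex_lock(&inode->i_mutex);		/* outer: i_mutex */
	down_write(&inode->i_alloc_sem);	/* inner: i_alloc_sem */
	unmap_mapping_range(mapping, offset, (end - offset), 1);
	truncate_inode_pages_range(mapping, offset, end);
	inode->i_op->truncate_range(inode, offset, end);
	up_write(&inode->i_alloc_sem);		/* release inner first */
	mutex_unlock(&inode->i_mutex);

	return 0;
}
```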
*/ err = file->f_op->fsync(file,file->f_dentry,1); diff --git a/mm/rmap.c b/mm/rmap.c index 66ec43053a4d..dfbb89f99a15 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -20,13 +20,13 @@ /* * Lock ordering in mm: * - * inode->i_sem (while writing or truncating, not reading or faulting) + * inode->i_mutex (while writing or truncating, not reading or faulting) * inode->i_alloc_sem * * When a page fault occurs in writing from user to file, down_read - * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within - * down_read of mmap_sem; i_sem and down_write of mmap_sem are never - * taken together; in truncation, i_sem is taken outermost. + * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within + * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never + * taken together; in truncation, i_mutex is taken outermost. * * mm->mmap_sem * page->flags PG_locked (lock_page) diff --git a/mm/shmem.c b/mm/shmem.c index a1f2f02af724..343b3c0937e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1370,7 +1370,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); pos = *ppos; written = 0; @@ -1455,7 +1455,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t if (written) err = written; out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return err; } @@ -1491,7 +1491,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * We must evaluate after, since reads (unlike writes) - * are called without i_sem protection against truncate + * are called without i_mutex protection against truncate */ nr = PAGE_CACHE_SIZE; i_size = i_size_read(inode); diff --git a/mm/swapfile.c b/mm/swapfile.c index 80f948a2028b..6544565a7c0f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1187,9 +1187,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) set_blocksize(bdev, p->old_block_size); bd_release(bdev); } else { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); err = 0; @@ -1406,7 +1406,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->bdev = bdev; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); did_down = 1; if (IS_SWAPFILE(inode)) { error = -EBUSY; @@ -1596,7 +1596,7 @@ out: if (did_down) { if (!error) inode->i_flags |= S_SWAPFILE; - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return error; } diff --git a/mm/truncate.c b/mm/truncate.c index b1a463d0fe71..6cb3fff25f67 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -196,7 +196,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @mapping: mapping to truncate * @lstart: offset from which to truncate * - * Called under (and serialised by) inode->i_sem. + * Called under (and serialised by) inode->i_mutex. 
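In mm/swapfile.c the mutex guards a single state bit: S_SWAPFILE may only change under i_mutex, so a sys_swapon() racing against another swapon, or against an open for write, always sees a stable answer. Condensed from the two swapfile.c hunks above:

```c
	/* sys_swapon(), regular-file case, condensed: */
	p->bdev = inode->i_sb->s_bdev;
	mutex_lock(&inode->i_mutex);
	did_down = 1;
	if (IS_SWAPFILE(inode)) {
		error = -EBUSY;
		goto bad_swap;	/* unwinds to the did_down block below */
	}
	/* ... activate the swap file ... */
out:
	if (did_down) {
		if (!error)
			inode->i_flags |= S_SWAPFILE;  /* still under i_mutex */
		mutex_unlock(&inode->i_mutex);
	}
```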
*/ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index e14c1cae7460..9764c80ab0b2 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -69,13 +69,13 @@ rpc_timeout_upcall_queue(void *data) struct rpc_inode *rpci = (struct rpc_inode *)data; struct inode *inode = &rpci->vfs_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) goto out; if (rpci->nreaders == 0 && !list_empty(&rpci->pipe)) __rpc_purge_upcall(inode, -ETIMEDOUT); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } int @@ -84,7 +84,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) struct rpc_inode *rpci = RPC_I(inode); int res = -EPIPE; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) goto out; if (rpci->nreaders) { @@ -100,7 +100,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) res = 0; } out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); wake_up(&rpci->waitq); return res; } @@ -116,7 +116,7 @@ rpc_close_pipes(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops != NULL) { rpci->nreaders = 0; __rpc_purge_list(rpci, &rpci->in_upcall, -EPIPE); @@ -127,7 +127,7 @@ rpc_close_pipes(struct inode *inode) rpci->ops = NULL; } rpc_inode_setowner(inode, NULL); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); cancel_delayed_work(&rpci->queue_timeout); flush_scheduled_work(); } @@ -154,7 +154,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) struct rpc_inode *rpci = RPC_I(inode); int res = -ENXIO; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops != NULL) { if (filp->f_mode & FMODE_READ) rpci->nreaders ++; @@ -162,7 +162,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) rpci->nwriters ++; res = 0; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } @@ -172,7 +172,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_inode *rpci = RPC_I(inode); struct rpc_pipe_msg *msg; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) goto out; msg = (struct rpc_pipe_msg *)filp->private_data; @@ -190,7 +190,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return 0; } @@ -202,7 +202,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) { res = -EPIPE; goto out_unlock; @@ -229,7 +229,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) rpci->ops->destroy_msg(msg); } out_unlock: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } @@ -240,11 +240,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct rpc_inode *rpci = RPC_I(inode); int res; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); res = -EPIPE; if (rpci->ops != NULL) res = rpci->ops->downcall(filp, buf, len); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } @@ -322,7 +322,7 @@ rpc_info_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); clnt = RPC_I(inode)->private; if (clnt) { atomic_inc(&clnt->cl_users); @@ -331,7 +331,7 @@ 
rpc_info_open(struct inode *inode, struct file *file) single_release(inode, file); ret = -EINVAL; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return ret; } @@ -491,7 +491,7 @@ rpc_depopulate(struct dentry *parent) struct dentry *dentry, *dvec[10]; int n = 0; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); repeat: spin_lock(&dcache_lock); list_for_each_safe(pos, next, &parent->d_subdirs) { @@ -519,7 +519,7 @@ repeat: } while (n); goto repeat; } - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); } static int @@ -532,7 +532,7 @@ rpc_populate(struct dentry *parent, struct dentry *dentry; int mode, i; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); for (i = start; i < eof; i++) { dentry = d_alloc_name(parent, files[i].name); if (!dentry) @@ -552,10 +552,10 @@ rpc_populate(struct dentry *parent, dir->i_nlink++; d_add(dentry, inode); } - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); return 0; out_bad: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); printk(KERN_WARNING "%s: %s failed to populate directory %s\n", __FILE__, __FUNCTION__, parent->d_name.name); return -ENOMEM; @@ -609,7 +609,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) if ((error = rpc_lookup_parent(path, nd)) != 0) return ERR_PTR(error); dir = nd->dentry->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); dentry = lookup_hash(nd); if (IS_ERR(dentry)) goto out_err; @@ -620,7 +620,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) } return dentry; out_err: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(nd); return dentry; } @@ -646,7 +646,7 @@ rpc_mkdir(char *path, struct rpc_clnt *rpc_client) if (error) goto err_depopulate; out: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return dentry; err_depopulate: @@ -671,7 +671,7 @@ rpc_rmdir(char *path) if ((error = rpc_lookup_parent(path, &nd)) != 0) return error; dir = nd.dentry->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -681,7 +681,7 @@ rpc_rmdir(char *path) error = __rpc_rmdir(dir, dentry); dput(dentry); out_release: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return error; } @@ -710,7 +710,7 @@ rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) rpci->ops = ops; inode_dir_notify(dir, DN_CREATE); out: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return dentry; err_dput: @@ -732,7 +732,7 @@ rpc_unlink(char *path) if ((error = rpc_lookup_parent(path, &nd)) != 0) return error; dir = nd.dentry->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -746,7 +746,7 @@ rpc_unlink(char *path) dput(dentry); inode_dir_notify(dir, DN_DELETE); out_release: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return error; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5f6ae79b8b16..1b5989b1b670 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -784,7 +784,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0); if (err) goto out_mknod_dput; - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); dput(nd.dentry); nd.dentry = dentry; @@ -823,7 +823,7 @@ out: out_mknod_dput: dput(dentry); out_mknod_unlock: - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); 
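rpcpipefs follows the same directory-mutation discipline seen earlier in the patch for sysfs, relayfs, securityfs and mqueue: lock the parent directory's i_mutex, look the name up, mutate, unlock. Condensed from the rpc_rmdir()/rpc_unlink() hunks:

```c
	/* condensed: every rpc_pipefs namespace change locks the
	 * parent directory first */
	dir = nd.dentry->d_inode;
	mutex_lock(&dir->i_mutex);
	dentry = lookup_hash(&nd);
	if (IS_ERR(dentry)) {
		error = PTR_ERR(dentry);
		goto out_release;
	}
	/* ... __rpc_rmdir()/unlink-style mutation here ... */
out_release:
	mutex_unlock(&dir->i_mutex);
	rpc_release_path(&nd);
```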
out_mknod_parent: if (err==-EEXIST) diff --git a/security/inode.c b/security/inode.c index a5964502ae30..0f77b0223662 100644 --- a/security/inode.c +++ b/security/inode.c @@ -172,7 +172,7 @@ static int create_by_name(const char *name, mode_t mode, return -EFAULT; } - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) @@ -181,7 +181,7 @@ static int create_by_name(const char *name, mode_t mode, error = create(parent->d_inode, *dentry, mode); } else error = PTR_ERR(dentry); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); return error; } @@ -302,7 +302,7 @@ void securityfs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); if (positive(dentry)) { if (dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode)) @@ -312,7 +312,7 @@ void securityfs_remove(struct dentry *dentry) dput(dentry); } } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 16df1246a131..7fd072392c7e 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -2135,9 +2135,7 @@ static ssize_t snd_pcm_oss_write(struct file *file, const char __user *buf, size substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) return -ENXIO; - up(&file->f_dentry->d_inode->i_sem); result = snd_pcm_oss_write1(substream, buf, count); - down(&file->f_dentry->d_inode->i_sem); #ifdef OSS_DEBUG printk("pcm_oss: write %li bytes (wrote %li bytes)\n", (long)count, (long)result); #endif diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index 9ee6c177db0c..40b4f679c80e 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -32,10 +32,6 @@ #include "seq_info.h" #include "seq_lock.h" -/* semaphore in struct file record */ -#define semaphore_of(fp) ((fp)->f_dentry->d_inode->i_sem) - - static inline int snd_seq_pool_available(struct snd_seq_pool *pool) { return pool->total_elements - atomic_read(&pool->counter); -- cgit v1.2.3 From 7892f2f48d165a34b0b8130c8a195dfd807b8cb6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 15:59:25 -0800 Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, sb->s_lock This patch converts the superblock-lock semaphore to a mutex, affecting lock_super()/unlock_super(). Tested on ext3 and XFS. 
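One subtlety in the s_lock conversion deserves a note: the trylock return conventions are inverted. down_trylock() returns 0 when it acquires the semaphore, while mutex_trylock() returns 1 when it acquires the mutex, which is why the ext3 and ocfs2 assertions below flip from `== 0` to `!= 0`. Both versions BUG() when the trylock succeeds, that is, when write_super was entered without lock_super() held:

```c
static void ext3_write_super(struct super_block *sb)
{
	/* s_lock must already be held here; acquiring it ourselves
	 * means the caller broke the lock_super() contract. */
	if (mutex_trylock(&sb->s_lock) != 0)
		BUG();
	sb->s_dirt = 0;
}
```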
Signed-off-by: Ingo Molnar --- fs/ext3/super.c | 2 +- fs/ocfs2/super.c | 2 +- fs/super.c | 2 +- include/linux/fs.h | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c3dbebdb9897..56bf76586019 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2150,7 +2150,7 @@ int ext3_force_commit(struct super_block *sb) static void ext3_write_super (struct super_block * sb) { - if (down_trylock(&sb->s_lock) == 0) + if (mutex_trylock(&sb->s_lock) != 0) BUG(); sb->s_dirt = 0; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 48bf7f0ce544..364d64bd5f10 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -169,7 +169,7 @@ static match_table_t tokens = { */ static void ocfs2_write_super(struct super_block *sb) { - if (down_trylock(&sb->s_lock) == 0) + if (mutex_trylock(&sb->s_lock) != 0) BUG(); sb->s_dirt = 0; } diff --git a/fs/super.c b/fs/super.c index 0a30e51692cf..c177b92419c5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -72,7 +72,7 @@ static struct super_block *alloc_super(void) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); - sema_init(&s->s_lock, 1); + mutex_init(&s->s_lock); down_write(&s->s_umount); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); diff --git a/include/linux/fs.h b/include/linux/fs.h index 01654b218e42..92ae3e2067b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -821,7 +821,7 @@ struct super_block { unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; - struct semaphore s_lock; + struct mutex s_lock; int s_count; int s_syncing; int s_need_sync_fs; @@ -893,13 +893,13 @@ static inline int has_fs_excl(void) static inline void lock_super(struct super_block * sb) { get_fs_excl(); - down(&sb->s_lock); + mutex_lock(&sb->s_lock); } static inline void unlock_super(struct super_block * sb) { put_fs_excl(); - up(&sb->s_lock); + mutex_unlock(&sb->s_lock); } /* -- cgit v1.2.3 From df2e71fb9115a8d4f721fb1464db09adc8332bc5 Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Mon, 9 Jan 2006 20:51:37 -0800 Subject: [PATCH] dump_thread() cleanup ) From: Adrian Bunk - create one common dump_thread() prototype in kernel.h - dump_thread() is only used in fs/binfmt_aout.c and can therefore be removed on all architectures where CONFIG_BINFMT_AOUT is not available Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/alpha_ksyms.c | 1 - arch/alpha/mm/init.c | 1 + arch/arm26/kernel/armksyms.c | 1 - arch/cris/kernel/crisksyms.c | 2 -- arch/cris/kernel/process.c | 28 ---------------------- arch/frv/kernel/frv_ksyms.c | 2 -- arch/frv/kernel/process.c | 22 ------------------ arch/h8300/kernel/h8300_ksyms.c | 3 --- arch/h8300/kernel/process.c | 28 ---------------------- arch/m32r/kernel/m32r_ksyms.c | 3 --- arch/m32r/kernel/process.c | 8 ------- arch/m68k/kernel/m68k_ksyms.c | 2 -- arch/m68knommu/kernel/m68k_ksyms.c | 2 -- arch/m68knommu/kernel/process.c | 46 ------------------------------------- arch/s390/kernel/process.c | 21 ----------------- arch/sh/kernel/process.c | 20 ---------------- arch/sh/kernel/sh_ksyms.c | 2 -- arch/sh64/kernel/process.c | 20 ---------------- arch/sh64/kernel/sh_ksyms.c | 2 -- arch/sparc/kernel/sparc_ksyms.c | 2 -- arch/sparc64/kernel/binfmt_aout32.c | 2 -- arch/sparc64/kernel/sparc64_ksyms.c | 2 -- arch/v850/kernel/process.c | 24 ------------------- arch/v850/kernel/v850_ksyms.c | 2 -- fs/binfmt_aout.c | 2 -- fs/binfmt_flat.c | 2 -- 
include/asm-um/processor-generic.h | 1 - include/linux/kernel.h | 4 ++++ 28 files changed, 5 insertions(+), 250 deletions(-) (limited to 'fs') diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c index f3e98f837784..1898ea79d0e2 100644 --- a/arch/alpha/kernel/alpha_ksyms.c +++ b/arch/alpha/kernel/alpha_ksyms.c @@ -40,7 +40,6 @@ #include extern struct hwrpb_struct *hwrpb; -extern void dump_thread(struct pt_regs *, struct user *); extern spinlock_t rtc_lock; /* these are C runtime functions with special calling conventions: */ diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 90752f6d8867..486d7945583d 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -7,6 +7,7 @@ /* 2.3.x zone allocator, 1999 Andrea Arcangeli */ #include +#include #include #include #include diff --git a/arch/arm26/kernel/armksyms.c b/arch/arm26/kernel/armksyms.c index 35514b398e2e..811a6376c624 100644 --- a/arch/arm26/kernel/armksyms.c +++ b/arch/arm26/kernel/armksyms.c @@ -35,7 +35,6 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu(struct pt_regs *, struct user_fp_struct *); extern void inswb(unsigned int port, void *to, int len); extern void outswb(unsigned int port, const void *to, int len); diff --git a/arch/cris/kernel/crisksyms.c b/arch/cris/kernel/crisksyms.c index 85833d704ebb..de39725da920 100644 --- a/arch/cris/kernel/crisksyms.c +++ b/arch/cris/kernel/crisksyms.c @@ -21,7 +21,6 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); extern unsigned long get_cmos_time(void); extern void __Udiv(void); extern void __Umod(void); @@ -33,7 +32,6 @@ extern void __lshrdi3(void); extern void iounmap(volatile void * __iomem); /* Platform dependent support */ -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(loops_per_usec); diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 7c80afb10460..4ab3e87115b6 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -257,34 +257,6 @@ void flush_thread(void) { } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ -#if 0 - int i; - - /* changed the size calculations - should hopefully work better. lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->debugreg[i]; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs = *regs; - - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -#endif -} - /* Fill in the fpu structure for a core dump. 
*/ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) { diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c index 5f118c89d091..0f1c6cbc4f50 100644 --- a/arch/frv/kernel/frv_ksyms.c +++ b/arch/frv/kernel/frv_ksyms.c @@ -18,7 +18,6 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); extern long __memcpy_user(void *dst, const void *src, size_t count); extern long __memset_user(void *dst, const void *src, size_t count); @@ -27,7 +26,6 @@ extern long __memset_user(void *dst, const void *src, size_t count); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(strnlen); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 54a452136f00..c4488379ac3b 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -243,28 +243,6 @@ int copy_thread(int nr, unsigned long clone_flags, return 0; } /* end copy_thread() */ -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs *regs, struct user *dump) -{ -#if 0 - /* changed the size calculations - should hopefully work better. lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = user_stack(regs) & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs = *(struct user_context *) regs; -#endif -} - /* * sys_execve() executes a new program. */ diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c index 5a630233112f..3e0d80ea4464 100644 --- a/arch/h8300/kernel/h8300_ksyms.c +++ b/arch/h8300/kernel/h8300_ksyms.c @@ -22,11 +22,8 @@ //asmlinkage long long __lshrdi3 (long long, int); extern char h8300_debug_device[]; -extern void dump_thread(struct pt_regs *, struct user *); - /* platform dependent support */ -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(strnlen); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index fe21adf3e75e..585ed5efd0f7 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -207,34 +207,6 @@ int copy_thread(int nr, unsigned long clone_flags, return 0; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ -/* changed the size calculations - should hopefully work better. lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = rdusp() & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + - (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - - dump->u_ar0 = (struct user_regs_struct *)(((int)(&dump->regs)) -((int)(dump))); - dump->regs.er0 = regs->er0; - dump->regs.er1 = regs->er1; - dump->regs.er2 = regs->er2; - dump->regs.er3 = regs->er3; - dump->regs.er4 = regs->er4; - dump->regs.er5 = regs->er5; - dump->regs.er6 = regs->er6; - dump->regs.orig_er0 = regs->orig_er0; - dump->regs.ccr = regs->ccr; - dump->regs.pc = regs->pc; -} - /* * sys_execve() executes a new program. 
*/ diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c index e5ec134d81d9..dbc8a392105f 100644 --- a/arch/m32r/kernel/m32r_ksyms.c +++ b/arch/m32r/kernel/m32r_ksyms.c @@ -18,8 +18,6 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); - #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) extern struct drive_info_struct drive_info; EXPORT_SYMBOL(drive_info); @@ -27,7 +25,6 @@ EXPORT_SYMBOL(drive_info); /* platform dependent support */ EXPORT_SYMBOL(boot_cpu_data); -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 3bf55d92933f..2a1f250349b7 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -260,14 +260,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long spu, return 0; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - /* M32R_FIXME */ -} - /* * Capture the user space registers if the task is not running (in user space) */ diff --git a/arch/m68k/kernel/m68k_ksyms.c b/arch/m68k/kernel/m68k_ksyms.c index 73e2f5e168dd..3d7f2000b714 100644 --- a/arch/m68k/kernel/m68k_ksyms.c +++ b/arch/m68k/kernel/m68k_ksyms.c @@ -23,8 +23,6 @@ asmlinkage long long __lshrdi3 (long long, int); asmlinkage long long __muldi3 (long long, long long); extern char m68k_debug_device[]; -extern void dump_thread(struct pt_regs *, struct user *); - /* platform dependent support */ EXPORT_SYMBOL(m68k_machtype); diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c index b2c62eeb3bab..eddb8d3e130a 100644 --- a/arch/m68knommu/kernel/m68k_ksyms.c +++ b/arch/m68knommu/kernel/m68k_ksyms.c @@ -18,7 +18,6 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); /* platform dependent support */ @@ -26,7 +25,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(strnlen); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 82e7ec888806..8b3cf57ba706 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -275,52 +275,6 @@ int dump_fpu(struct pt_regs *regs, struct user_m68kfp_struct *fpu) return 1; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - struct switch_stack *sw; - - /* changed the size calculations - should hopefully work better. 
lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = rdusp() & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + - (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->u_ar0 = (struct user_regs_struct *)((int)&dump->regs - (int)dump); - sw = ((struct switch_stack *)regs) - 1; - dump->regs.d1 = regs->d1; - dump->regs.d2 = regs->d2; - dump->regs.d3 = regs->d3; - dump->regs.d4 = regs->d4; - dump->regs.d5 = regs->d5; - dump->regs.d6 = sw->d6; - dump->regs.d7 = sw->d7; - dump->regs.a0 = regs->a0; - dump->regs.a1 = regs->a1; - dump->regs.a2 = regs->a2; - dump->regs.a3 = sw->a3; - dump->regs.a4 = sw->a4; - dump->regs.a5 = sw->a5; - dump->regs.a6 = sw->a6; - dump->regs.d0 = regs->d0; - dump->regs.orig_d0 = regs->orig_d0; - dump->regs.stkadj = regs->stkadj; - dump->regs.sr = regs->sr; - dump->regs.pc = regs->pc; - dump->regs.fmtvec = (regs->format << 12) | regs->vector; - /* dump floating point stuff */ - dump->u_fpvalid = dump_fpu (regs, &dump->m68kfp); -} - /* * Generic dumping code. Used for panic and debug. */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index a942bf2d58e9..7dd58f8ac6b5 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -352,27 +352,6 @@ int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) return 1; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - -/* changed the size calculations - should hopefully work better. lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->gprs[15] & ~(PAGE_SIZE - 1); - dump->u_tsize = current->mm->end_code >> PAGE_SHIFT; - dump->u_dsize = (current->mm->brk + PAGE_SIZE - 1) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = (TASK_SIZE - dump->start_stack) >> PAGE_SHIFT; - memcpy(&dump->regs, regs, sizeof(s390_regs)); - dump_fpu (regs, &dump->regs.fp_regs); - dump->regs.per_info = current->thread.per_info; -} - unsigned long get_wchan(struct task_struct *p) { struct stack_frame *sf, *low, *high; diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index fd4f240b833d..8a2bea34ddd2 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -305,26 +305,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, return 0; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - dump->magic = CMAGIC; - dump->start_code = current->mm->start_code; - dump->start_data = current->mm->start_data; - dump->start_stack = regs->regs[15] & ~(PAGE_SIZE - 1); - dump->u_tsize = (current->mm->end_code - dump->start_code) >> PAGE_SHIFT; - dump->u_dsize = (current->mm->brk + (PAGE_SIZE-1) - dump->start_data) >> PAGE_SHIFT; - dump->u_ssize = (current->mm->start_stack - dump->start_stack + - PAGE_SIZE - 1) >> PAGE_SHIFT; - /* Debug registers will come here. */ - - dump->regs = *regs; - - dump->u_fpvalid = dump_fpu(regs, &dump->fpu); -} - /* Tracing by user break controller. 
*/ static void ubc_set_tracing(int asid, unsigned long pc) diff --git a/arch/sh/kernel/sh_ksyms.c b/arch/sh/kernel/sh_ksyms.c index 6954fd62470a..1cf94a618be3 100644 --- a/arch/sh/kernel/sh_ksyms.c +++ b/arch/sh/kernel/sh_ksyms.c @@ -21,14 +21,12 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); extern struct hw_interrupt_type no_irq_type; EXPORT_SYMBOL(sh_mv); /* platform dependent support */ -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(enable_irq); diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c index b95d04141855..419b5a710441 100644 --- a/arch/sh64/kernel/process.c +++ b/arch/sh64/kernel/process.c @@ -775,26 +775,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, return 0; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - dump->magic = CMAGIC; - dump->start_code = current->mm->start_code; - dump->start_data = current->mm->start_data; - dump->start_stack = regs->regs[15] & ~(PAGE_SIZE - 1); - dump->u_tsize = (current->mm->end_code - dump->start_code) >> PAGE_SHIFT; - dump->u_dsize = (current->mm->brk + (PAGE_SIZE-1) - dump->start_data) >> PAGE_SHIFT; - dump->u_ssize = (current->mm->start_stack - dump->start_stack + - PAGE_SIZE - 1) >> PAGE_SHIFT; - /* Debug registers will come here. */ - - dump->regs = *regs; - - dump->u_fpvalid = dump_fpu(regs, &dump->fpu); -} - asmlinkage int sys_fork(unsigned long r2, unsigned long r3, unsigned long r4, unsigned long r5, unsigned long r6, unsigned long r7, diff --git a/arch/sh64/kernel/sh_ksyms.c b/arch/sh64/kernel/sh_ksyms.c index 0b5497d70bd3..472b450e61be 100644 --- a/arch/sh64/kernel/sh_ksyms.c +++ b/arch/sh64/kernel/sh_ksyms.c @@ -29,7 +29,6 @@ #include #include -extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); #if 0 @@ -41,7 +40,6 @@ EXPORT_SYMBOL(drive_info); #endif /* platform dependent support */ -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(enable_irq); diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c index 1c8fd0fd9305..0b0d492c953b 100644 --- a/arch/sparc/kernel/sparc_ksyms.c +++ b/arch/sparc/kernel/sparc_ksyms.c @@ -82,8 +82,6 @@ extern int __lshrdi3(int, int); extern int __muldi3(int, int); extern int __divdi3(int, int); -extern void dump_thread(struct pt_regs *, struct user *); - /* Private functions with odd calling conventions. 
*/ extern void ___atomic24_add(void); extern void ___atomic24_sub(void); diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c index edf52d06b280..202a80c24b6f 100644 --- a/arch/sparc64/kernel/binfmt_aout32.c +++ b/arch/sparc64/kernel/binfmt_aout32.c @@ -36,8 +36,6 @@ static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout32_library(struct file*); static int aout32_core_dump(long signr, struct pt_regs * regs, struct file *file); -extern void dump_thread(struct pt_regs *, struct user *); - static struct linux_binfmt aout32_format = { NULL, THIS_MODULE, load_aout32_binary, load_aout32_library, aout32_core_dump, PAGE_SIZE diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index fb7a5370dbfc..d177d7e5c9d3 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -94,7 +94,6 @@ extern void (*prom_palette)(int); extern int __ashrdi3(int, int); -extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs); extern unsigned long phys_base; @@ -241,7 +240,6 @@ EXPORT_SYMBOL(io_remap_pfn_range); EXPORT_SYMBOL(_sigpause_common); EXPORT_SYMBOL(verify_compat_iovec); -EXPORT_SYMBOL(dump_thread); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(pte_alloc_one_kernel); #ifndef CONFIG_SMP diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c index 39cf247cdae4..062ffa0a9998 100644 --- a/arch/v850/kernel/process.c +++ b/arch/v850/kernel/process.c @@ -163,30 +163,6 @@ int copy_thread (int nr, unsigned long clone_flags, return 0; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread (struct pt_regs *regs, struct user *dump) -{ -#if 0 /* Later. XXX */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->gpr[GPR_SP]; - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + - (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->u_ar0 = (struct user_regs_struct *)((int)&dump->regs - (int)dump); - dump->regs = *regs; - dump->u_fpvalid = 0; -#endif -} - /* * sys_execve() executes a new program. 
*/ diff --git a/arch/v850/kernel/v850_ksyms.c b/arch/v850/kernel/v850_ksyms.c index 0ca64900dd91..8ffc29c1c89d 100644 --- a/arch/v850/kernel/v850_ksyms.c +++ b/arch/v850/kernel/v850_ksyms.c @@ -21,8 +21,6 @@ extern void *trap_table; EXPORT_SYMBOL (trap_table); /* platform dependent support */ -extern void dump_thread (struct pt_regs *, struct user *); -EXPORT_SYMBOL (dump_thread); EXPORT_SYMBOL (kernel_thread); EXPORT_SYMBOL (enable_irq); EXPORT_SYMBOL (disable_irq); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 72011826f0cb..f312103434d4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -33,8 +33,6 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); -extern void dump_thread(struct pt_regs *, struct user *); - static struct linux_binfmt aout_format = { .module = THIS_MODULE, .load_binary = load_aout_binary, diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9d6625829b99..b72dc31a0970 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -77,8 +77,6 @@ static int load_flat_shared_library(int id, struct lib_info *p); static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); static int flat_core_dump(long signr, struct pt_regs * regs, struct file *file); -extern void dump_thread(struct pt_regs *, struct user *); - static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h index 075771c371f6..da07a69ce82a 100644 --- a/include/asm-um/processor-generic.h +++ b/include/asm-um/processor-generic.h @@ -89,7 +89,6 @@ extern struct task_struct *alloc_task_struct(void); extern void release_thread(struct task_struct *); extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -extern void dump_thread(struct pt_regs *regs, struct user *u); static inline void prepare_to_copy(struct task_struct *tsk) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0e6ca3b00ef..e6ee2d95da7a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -47,6 +47,8 @@ extern int console_printk[]; #define default_console_loglevel (console_printk[3]) struct completion; +struct pt_regs; +struct user; /** * might_sleep - annotation for functions that can sleep @@ -123,6 +125,8 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int session_of_pgrp(int pgrp); +extern void dump_thread(struct pt_regs *regs, struct user *dump); + #ifdef CONFIG_PRINTK asmlinkage int vprintk(const char *fmt, va_list args) __attribute__ ((format (printf, 1, 0))); -- cgit v1.2.3 From 4ae362be509306eafa6441603686d33fefe321c1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 9 Jan 2006 20:51:50 -0800 Subject: [PATCH] kdump: read previous kernel's memory - Moving the crash_dump.c file to the arch-dependent part, as kmap_atomic_pfn is specific to i386 and highmem may not exist in other archs. - Use ioremap for x86_64 to map the previous kernel memory. - In copy_oldmem_page(), we now directly copy to the user/kernel buffer and avoid the unnecessary copy to a kmalloc'd page.
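For context, an illustrative userspace sketch (not part of this patch): copy_oldmem_page() is what ultimately backs reads of /proc/vmcore, so once the capture kernel is running, the crashed kernel's memory can be saved with ordinary file I/O along these lines:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/vmcore", "r");	/* ELF core image of the crashed kernel */
	unsigned char buf[4096];
	size_t n, total = 0;

	if (!f) {
		perror("fopen /proc/vmcore");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		total += n;	/* a real capture tool would write these bytes to a dump file */
	fclose(f);
	printf("read %zu bytes of crash dump\n", total);
	return 0;
}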
Signed-off-by: Rachita Kothiyal Signed-off-by: Vivek Goyal Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/Makefile | 1 + arch/i386/kernel/crash_dump.c | 74 +++++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/crash_dump.c | 47 ++++++++++++++++++++++++++ fs/proc/vmcore.c | 3 ++ kernel/Makefile | 1 - kernel/crash_dump.c | 64 ----------------------------------- 7 files changed, 126 insertions(+), 65 deletions(-) create mode 100644 arch/i386/kernel/crash_dump.c create mode 100644 arch/x86_64/kernel/crash_dump.c delete mode 100644 kernel/crash_dump.c (limited to 'fs') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index be1880bb75b4..60c3f76dfca4 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/i386/kernel/crash_dump.c b/arch/i386/kernel/crash_dump.c new file mode 100644 index 000000000000..3f532df488bc --- /dev/null +++ b/arch/i386/kernel/crash_dump.c @@ -0,0 +1,74 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include +#include + +#include + +static void *kdump_buf_page; + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + * + * Calling copy_to_user() in atomic context is not desirable. Hence first + * copying the data to a pre-allocated kernel page and then copying to user + * space in non-atomic context. 
+ */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + + if (!userbuf) { + memcpy(buf, (vaddr + offset), csize); + kunmap_atomic(vaddr, KM_PTE0); + } else { + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Kdump buffer page not" + " allocated\n"); + return -EFAULT; + } + copy_page(kdump_buf_page, vaddr); + kunmap_atomic(vaddr, KM_PTE0); + if (copy_to_user(buf, (kdump_buf_page + offset), csize)) + return -EFAULT; + } + + return csize; +} + +static int __init kdump_buf_page_init(void) +{ + int ret = 0; + + kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" + " page\n"); + ret = -ENOMEM; + } + + return ret; +} +arch_initcall(kdump_buf_page_init); diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index fe4cbd1c4b2f..12bc54005e2f 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/x86_64/kernel/crash_dump.c b/arch/x86_64/kernel/crash_dump.c new file mode 100644 index 000000000000..942deac4d43a --- /dev/null +++ b/arch/x86_64/kernel/crash_dump.c @@ -0,0 +1,47 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include + +#include +#include + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + + if (userbuf) { + if (copy_to_user(buf, (vaddr + offset), csize)) { + iounmap(vaddr); + return -EFAULT; + } + } else + memcpy(buf, (vaddr + offset), csize); + + iounmap(vaddr); + return csize; +} diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3b2e7b69e63a..5378d7c78419 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -35,6 +35,9 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. 
*/ diff --git a/kernel/Makefile b/kernel/Makefile index a940bac02837..1e039700c0ad 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c deleted file mode 100644 index fccb27dbc623..000000000000 --- a/kernel/crash_dump.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * kernel/crash_dump.c - Memory preserving reboot related code. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * Copyright (C) IBM Corporation, 2004. All rights reserved - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - -#ifndef HAVE_ARCH_COPY_OLDMEM_PAGE -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) -{ - void *page, *vaddr; - - if (!csize) - return 0; - - page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!page) - return -ENOMEM; - - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); - copy_page(page, vaddr); - kunmap_atomic(vaddr, KM_PTE0); - - if (userbuf) { - if (copy_to_user(buf, (page + offset), csize)) { - kfree(page); - return -EFAULT; - } - } else { - memcpy(buf, (page + offset), csize); - } - - kfree(page); - return csize; -} -#endif -- cgit v1.2.3 From 05970d476f2c8aa0f47e4e82473b0551b1e4e6d4 Mon Sep 17 00:00:00 2001 From: Maneesh Soni Date: Mon, 9 Jan 2006 20:51:52 -0800 Subject: [PATCH] kexec: change CONFIG_PHYSICAL_START dependency I have heard some complaints about people not finding the CONFIG_CRASH_DUMP option and also some objections about its dependency on CONFIG_EMBEDDED. The following patch ends that dependency. I thought of hiding it under CONFIG_KEXEC, but CONFIG_PHYSICAL_START could also be used for reasons other than kexec/kdump, and hence left it visible. I will also update the documentation accordingly. o The following patch removes the config dependency of CONFIG_PHYSICAL_START on CONFIG_EMBEDDED. The reason being that the CONFIG_CRASH_DUMP option for kdump needs CONFIG_PHYSICAL_START, which makes CONFIG_CRASH_DUMP depend on CONFIG_EMBEDDED. It is not always obvious for kdump users to choose CONFIG_EMBEDDED. o It also shifts the place where this option appears, to make it closer to the kexec and kdump options. Signed-off-by: Maneesh Soni Cc: "Eric W.
Biederman" Cc: Haren Myneni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 33 +++++++++++++++++++++------------ arch/x86_64/Kconfig | 31 +++++++++++++++++++------------ fs/Kconfig | 2 +- 3 files changed, 41 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index d849c6870e3a..554ce3f344c9 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -645,17 +645,6 @@ config SECCOMP source kernel/Kconfig.hz -config PHYSICAL_START - hex "Physical address where the kernel is loaded" if EMBEDDED - default "0x100000" - help - This gives the physical address where the kernel is loaded. - Primarily used in the case of kexec on panic where the - fail safe kernel needs to run at a different address than - the panic-ed kernel. - - Don't change this unless you know what you are doing. - config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on EXPERIMENTAL @@ -675,11 +664,31 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" - depends on EMBEDDED depends on EXPERIMENTAL depends on HIGHMEM help Generate crash dump after being started by kexec. + +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + + default "0x1000000" if CRASH_DUMP + default "0x100000" + help + This gives the physical address where the kernel is loaded. Normally + for regular kernels this value is 0x100000 (1MB). But in the case + of kexec on panic the fail safe kernel needs to run at a different + address than the panic-ed kernel. This option is used to set the load + address for kernels used to capture crash dump on being kexec'ed + after panic. The default value for crash dump kernels is + 0x1000000 (16MB). This can also be set based on the "X" value as + specified in the "crashkernel=YM@XM" command line boot parameter + passed to the panic-ed kernel. Typically this parameter is set as + crashkernel=64M@16M. Please take a look at + Documentation/kdump/kdump.txt for more details about crash dumps. + + Don't change this unless you know what you are doing. + endmenu diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index dd2d116b9ab8..348b4a0d0d6f 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -399,17 +399,6 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. -config PHYSICAL_START - hex "Physical address where the kernel is loaded" if EMBEDDED - default "0x100000" - help - This gives the physical address where the kernel is loaded. - Primarily used in the case of kexec on panic where the - fail safe kernel needs to run at a different address than - the panic-ed kernel. - - Don't change this unless you know what you are doing. - config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on EXPERIMENTAL @@ -429,11 +418,29 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" - depends on EMBEDDED depends on EXPERIMENTAL help Generate crash dump after being started by kexec. +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + default "0x1000000" if CRASH_DUMP + default "0x100000" + help + This gives the physical address where the kernel is loaded. Normally + for regular kernels this value is 0x100000 (1MB). But in the case + of kexec on panic the fail safe kernel needs to run at a different + address than the panic-ed kernel. 
This option is used to set the load + address for kernels used to capture crash dump on being kexec'ed + after panic. The default value for crash dump kernels is + 0x1000000 (16MB). This can also be set based on the "X" value as + specified in the "crashkernel=YM@XM" command line boot parameter + passed to the panic-ed kernel. Typically this parameter is set as + crashkernel=64M@16M. Please take a look at + Documentation/kdump/kdump.txt for more details about crash dumps. + + Don't change this unless you know what you are doing. + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/fs/Kconfig b/fs/Kconfig index 382e3b2883d5..ef78e3a42d32 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -798,7 +798,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && EMBEDDED && EXPERIMENTAL && CRASH_DUMP + depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP help Exports the dump image of crashed kernel in ELF format. -- cgit v1.2.3 From 5be196e5f925dab2309530fabce69c2e562b9791 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:51:55 -0800 Subject: [PATCH] add vfs_* helpers for xattr operations Add vfs_getxattr, vfs_setxattr and vfs_removexattr helpers for common checks around invocation of the xattr methods. NFSD already was missing some of the checks and there will be more soon. Signed-off-by: Christoph Hellwig Cc: James Morris (James, I haven't touched selinux yet because it's doing various odd things and I'm not sure how it would interact with the security attribute fallbacks you added. Could you investigate whether it could use vfs_getxattr or if not add a __vfs_getxattr helper to share the bits it is fine with?) For NFSv4: instead of just converting it add an nfsd_getxattr helper for the code shared by NFSv2/3 and NFSv4 ACLs. In fact that code isn't even NFS-specific, but I'll wait for more users to pop up first before moving it to common code. 
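To make the calling convention concrete, here is a minimal sketch of the usual two-step pattern with the new helper (a hypothetical caller, not code from this patch): probe for the value size with a NULL buffer, then allocate and fetch. The nfsd_getxattr() helper added in the diff below follows the same pattern.

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>

/* hypothetical example: fetch an xattr value into a freshly allocated buffer */
static ssize_t example_get_xattr(struct dentry *dentry, char *name, void **bufp)
{
	ssize_t len;
	int error;

	len = vfs_getxattr(dentry, name, NULL, 0);	/* size query */
	if (len <= 0)
		return len;

	*bufp = kmalloc(len, GFP_KERNEL);
	if (!*bufp)
		return -ENOMEM;

	error = vfs_getxattr(dentry, name, *bufp, len);	/* actual fetch */
	if (error < 0) {
		kfree(*bufp);
		return error;
	}
	return len;
}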
Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Signed-off-by: Adrian Bunk Signed-off-by: Neil Brown Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 125 +++++++++++++++++------------------------- fs/xattr.c | 146 ++++++++++++++++++++++++++++++++------------------ include/linux/xattr.h | 4 ++ 3 files changed, 145 insertions(+), 130 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bb36b4304491..eef0576a7785 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -48,8 +48,8 @@ #include #include #include -#ifdef CONFIG_NFSD_V4 #include +#ifdef CONFIG_NFSD_V4 #include #include #include @@ -365,8 +365,30 @@ out_nfserr: goto out; } -#if defined(CONFIG_NFSD_V4) +#if defined(CONFIG_NFSD_V2_ACL) || \ + defined(CONFIG_NFSD_V3_ACL) || \ + defined(CONFIG_NFSD_V4) +static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) +{ + ssize_t buflen; + int error; + + buflen = vfs_getxattr(dentry, key, NULL, 0); + if (buflen <= 0) + return buflen; + *buf = kmalloc(buflen, GFP_KERNEL); + if (!*buf) + return -ENOMEM; + + error = vfs_getxattr(dentry, key, *buf, buflen); + if (error < 0) + return error; + return buflen; +} +#endif + +#if defined(CONFIG_NFSD_V4) static int set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) { @@ -374,7 +396,6 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) size_t buflen; char *buf = NULL; int error = 0; - struct inode *inode = dentry->d_inode; buflen = posix_acl_xattr_size(pacl->a_count); buf = kmalloc(buflen, GFP_KERNEL); @@ -388,15 +409,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) goto out; } - error = -EOPNOTSUPP; - if (inode->i_op && inode->i_op->setxattr) { - mutex_lock(&inode->i_mutex); - security_inode_setxattr(dentry, key, buf, len, 0); - error = inode->i_op->setxattr(dentry, key, buf, len, 0); - if (!error) - security_inode_post_setxattr(dentry, key, buf, len, 0); - mutex_unlock(&inode->i_mutex); - } + error = vfs_setxattr(dentry, key, buf, len, 0); out: kfree(buf); return error; @@ -455,44 +468,19 @@ out_nfserr: static struct posix_acl * _get_posix_acl(struct dentry *dentry, char *key) { - struct inode *inode = dentry->d_inode; - char *buf = NULL; - int buflen, error = 0; + void *buf = NULL; struct posix_acl *pacl = NULL; + int buflen; - error = -EOPNOTSUPP; - if (inode->i_op == NULL) - goto out_err; - if (inode->i_op->getxattr == NULL) - goto out_err; - - error = security_inode_getxattr(dentry, key); - if (error) - goto out_err; - - buflen = inode->i_op->getxattr(dentry, key, NULL, 0); - if (buflen <= 0) { - error = buflen < 0 ? 
buflen : -ENODATA; - goto out_err; - } - - buf = kmalloc(buflen, GFP_KERNEL); - if (buf == NULL) { - error = -ENOMEM; - goto out_err; - } - - error = inode->i_op->getxattr(dentry, key, buf, buflen); - if (error < 0) - goto out_err; + buflen = nfsd_getxattr(dentry, key, &buf); + if (!buflen) + buflen = -ENODATA; + if (buflen <= 0) + return ERR_PTR(buflen); pacl = posix_acl_from_xattr(buf, buflen); - out: kfree(buf); return pacl; - out_err: - pacl = ERR_PTR(error); - goto out; } int @@ -1884,39 +1872,25 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type) ssize_t size; struct posix_acl *acl; - if (!IS_POSIXACL(inode) || !inode->i_op || !inode->i_op->getxattr) + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: return ERR_PTR(-EOPNOTSUPP); - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return ERR_PTR(-EOPNOTSUPP); } - size = inode->i_op->getxattr(fhp->fh_dentry, name, NULL, 0); + size = nfsd_getxattr(fhp->fh_dentry, name, &value); + if (size < 0) + return ERR_PTR(size); - if (size < 0) { - acl = ERR_PTR(size); - goto getout; - } else if (size > 0) { - value = kmalloc(size, GFP_KERNEL); - if (!value) { - acl = ERR_PTR(-ENOMEM); - goto getout; - } - size = inode->i_op->getxattr(fhp->fh_dentry, name, value, size); - if (size < 0) { - acl = ERR_PTR(size); - goto getout; - } - } acl = posix_acl_from_xattr(value, size); - -getout: kfree(value); return acl; } @@ -1957,16 +1931,13 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) } else size = 0; - if (!fhp->fh_locked) - fh_lock(fhp); /* unlocking is done automatically */ if (size) - error = inode->i_op->setxattr(fhp->fh_dentry, name, - value, size, 0); + error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); else { if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) error = 0; else { - error = inode->i_op->removexattr(fhp->fh_dentry, name); + error = vfs_removexattr(fhp->fh_dentry, name); if (error == -ENODATA) error = 0; } diff --git a/fs/xattr.c b/fs/xattr.c index 386a532ee5a9..fee804e69a9a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -19,6 +19,96 @@ #include #include + +int +vfs_setxattr(struct dentry *dentry, char *name, void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + error = -EOPNOTSUPP; + if (inode->i_op->setxattr) { + error = inode->i_op->setxattr(dentry, name, value, size, flags); + if (!error) { + fsnotify_xattr(dentry); + security_inode_post_setxattr(dentry, name, value, + size, flags); + } + } else if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof XATTR_SECURITY_PREFIX - 1)) { + const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1; + error = security_inode_setsecurity(inode, suffix, value, + size, flags); + if (!error) + fsnotify_xattr(dentry); + } +out: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(vfs_setxattr); + +ssize_t +vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = security_inode_getxattr(dentry, name); + if (error) + return error; + + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, 
size); + else + error = -EOPNOTSUPP; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof XATTR_SECURITY_PREFIX - 1)) { + const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1; + int ret = security_inode_getsecurity(inode, suffix, value, + size, error); + /* + * Only overwrite the return value if a security module + * is actually active. + */ + if (ret != -EOPNOTSUPP) + error = ret; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_getxattr); + +int +vfs_removexattr(struct dentry *dentry, char *name) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (!inode->i_op->removexattr) + return -EOPNOTSUPP; + + error = security_inode_removexattr(dentry, name); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = inode->i_op->removexattr(dentry, name); + mutex_unlock(&inode->i_mutex); + + if (!error) + fsnotify_xattr(dentry); + return error; +} +EXPORT_SYMBOL_GPL(vfs_removexattr); + + /* * Extended attribute SET operations */ @@ -51,29 +141,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value, } } - mutex_lock(&d->d_inode->i_mutex); - error = security_inode_setxattr(d, kname, kvalue, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->setxattr) { - error = d->d_inode->i_op->setxattr(d, kname, kvalue, - size, flags); - if (!error) { - fsnotify_xattr(d); - security_inode_post_setxattr(d, kname, kvalue, - size, flags); - } - } else if (!strncmp(kname, XATTR_SECURITY_PREFIX, - sizeof XATTR_SECURITY_PREFIX - 1)) { - const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1; - error = security_inode_setsecurity(d->d_inode, suffix, kvalue, - size, flags); - if (!error) - fsnotify_xattr(d); - } -out: - mutex_unlock(&d->d_inode->i_mutex); + error = vfs_setxattr(d, kname, kvalue, size, flags); kfree(kvalue); return error; } @@ -147,22 +215,7 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) return -ENOMEM; } - error = security_inode_getxattr(d, kname); - if (error) - goto out; - error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->getxattr) - error = d->d_inode->i_op->getxattr(d, kname, kvalue, size); - - if (!strncmp(kname, XATTR_SECURITY_PREFIX, - sizeof XATTR_SECURITY_PREFIX - 1)) { - const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1; - int rv = security_inode_getsecurity(d->d_inode, suffix, kvalue, - size, error); - /* Security module active: overwrite error value */ - if (rv != -EOPNOTSUPP) - error = rv; - } + error = vfs_getxattr(d, kname, kvalue, size); if (error > 0) { if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; @@ -171,7 +224,6 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) than XATTR_SIZE_MAX bytes. Not possible. 
*/ error = -E2BIG; } -out: kfree(kvalue); return error; } @@ -318,19 +370,7 @@ removexattr(struct dentry *d, char __user *name) if (error < 0) return error; - error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->removexattr) { - error = security_inode_removexattr(d, kname); - if (error) - goto out; - mutex_lock(&d->d_inode->i_mutex); - error = d->d_inode->i_op->removexattr(d, kname); - mutex_unlock(&d->d_inode->i_mutex); - if (!error) - fsnotify_xattr(d); - } -out: - return error; + return vfs_removexattr(d, kname); } asmlinkage long diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 23f9c61d9546..366f0ab4219f 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -25,6 +25,10 @@ struct xattr_handler { size_t size, int flags); }; +ssize_t vfs_getxattr(struct dentry *, char *, void *, size_t); +int vfs_setxattr(struct dentry *, char *, void *, size_t, int); +int vfs_removexattr(struct dentry *, char *); + ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -- cgit v1.2.3 From e0ad7b073eb7317e5afe0385b02dcb1d52a1eedf Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Mon, 9 Jan 2006 20:51:56 -0800 Subject: [PATCH] move xattr permission checks into the VFS From: Christoph Hellwig The xattr code has rather complex permission checks because the rules are very different for different attribute namespaces. This patch moves as much as we can into the generic code. Currently all the major disk-based filesystems duplicate these checks, while many minor filesystems or network filesystems lack some or all of them. To do this we need defines for the extended attribute names in common code; I moved them up from JFS, which had the nicest definitions. Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jfs/xattr.c | 15 ------------- fs/xattr.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/xattr.h | 15 +++++++++++++ 3 files changed, 72 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 23aa5066b5a4..9dde36a1eb5d 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -83,21 +83,6 @@ struct ea_buffer { #define EA_NEW 0x0004 #define EA_MALLOC 0x0008 -/* Namespaces */ -#define XATTR_SYSTEM_PREFIX "system." -#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1) - -#define XATTR_USER_PREFIX "user." -#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1) - -#define XATTR_OS2_PREFIX "os2." -#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1) - -/* XATTR_SECURITY_PREFIX is defined in include/linux/xattr.h */ -#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1) - -#define XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1) /* * These three routines are used to recognize on-disk extended attributes diff --git a/fs/xattr.c b/fs/xattr.c index fee804e69a9a..80eca7d3d69f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -20,6 +20,47 @@ #include +/* + * Check permissions for extended attribute access. This is a bit complicated + because different namespaces have very different rules.
+ */ +static int +xattr_permission(struct inode *inode, const char *name, int mask) +{ + /* + * We can never set or remove an extended attribute on a read-only + * filesystem or on an immutable / append-only inode. + */ + if (mask & MAY_WRITE) { + if (IS_RDONLY(inode)) + return -EROFS; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * No restriction for security.* and system.* from the VFS. Decision + * on these is left to the underlying filesystem / security module. + */ + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return 0; + + /* + * The trusted.* namespace can only accessed by a privilegued user. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); + + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { + if (!S_ISREG(inode->i_mode) && + (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + } + + return permission(inode, mask, NULL); +} + int vfs_setxattr(struct dentry *dentry, char *name, void *value, size_t size, int flags) @@ -27,6 +68,10 @@ vfs_setxattr(struct dentry *dentry, char *name, void *value, struct inode *inode = dentry->d_inode; int error; + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + mutex_lock(&inode->i_mutex); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) @@ -40,8 +85,8 @@ vfs_setxattr(struct dentry *dentry, char *name, void *value, size, flags); } } else if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof XATTR_SECURITY_PREFIX - 1)) { - const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1; + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) @@ -59,6 +104,10 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) struct inode *inode = dentry->d_inode; int error; + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + error = security_inode_getxattr(dentry, name); if (error) return error; @@ -69,8 +118,8 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) error = -EOPNOTSUPP; if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof XATTR_SECURITY_PREFIX - 1)) { - const char *suffix = name + sizeof XATTR_SECURITY_PREFIX - 1; + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = security_inode_getsecurity(inode, suffix, value, size, error); /* @@ -94,6 +143,10 @@ vfs_removexattr(struct dentry *dentry, char *name) if (!inode->i_op->removexattr) return -EOPNOTSUPP; + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + error = security_inode_removexattr(dentry, name); if (error) return error; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 366f0ab4219f..cda8a96e2fa0 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -13,7 +13,22 @@ #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ +/* Namespaces */ +#define XATTR_OS2_PREFIX "os2." +#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1) + #define XATTR_SECURITY_PREFIX "security." +#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1) + +#define XATTR_SYSTEM_PREFIX "system." 
+#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1) + +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1) + +#define XATTR_USER_PREFIX "user." +#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1) + struct xattr_handler { char *prefix; -- cgit v1.2.3 From 44a0033f6d5f3e7e2fc37d7b44a1d105c70d4682 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:51:57 -0800 Subject: [PATCH] remove jfs xattr permission checks remove checks now in the VFS Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jfs/xattr.c | 51 +++++++++------------------------------------------ 1 file changed, 9 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 9dde36a1eb5d..952da5f917cd 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -758,36 +758,23 @@ static int can_set_system_xattr(struct inode *inode, const char *name, static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { - if (IS_RDONLY(inode)) - return -EROFS; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - - if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) - /* - * "system.*" - */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return can_set_system_xattr(inode, name, value, value_len); - if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) - return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); - -#ifdef CONFIG_JFS_SECURITY - if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) - == 0) - return 0; /* Leave it to the security module */ -#endif - - if((strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0) && - (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) != 0)) + /* + * Don't allow setting an attribute in an unknown namespace. + */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) return -EOPNOTSUPP; if (!S_ISREG(inode->i_mode) && (!S_ISDIR(inode->i_mode) || inode->i_mode &S_ISVTX)) return -EPERM; - return permission(inode, MAY_WRITE, NULL); + return 0; } int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, @@ -957,22 +944,6 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, return rc; } -static int can_get_xattr(struct inode *inode, const char *name) -{ -#ifdef CONFIG_JFS_SECURITY - if(strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) - return 0; -#endif - - if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) - return (capable(CAP_SYS_ADMIN) ? 
0 : -EPERM); - - if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) - return 0; - - return permission(inode, MAY_READ, NULL); -} - ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, size_t buf_size) { @@ -983,12 +954,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, ssize_t size; int namelen = strlen(name); char *os2name = NULL; - int rc; char *value; - if ((rc = can_get_xattr(inode, name))) - return rc; - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, GFP_KERNEL); -- cgit v1.2.3 From 267fd05791aafea3786494365a5d13d7923ef526 Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Mon, 9 Jan 2006 20:51:58 -0800 Subject: [PATCH] remove ext2 xattr permission checks ) From: Christoph Hellwig remove checks now in the VFS Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/xattr.c | 4 ---- fs/ext2/xattr_trusted.c | 4 ---- fs/ext2/xattr_user.c | 14 -------------- 3 files changed, 22 deletions(-) (limited to 'fs') diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f7a3b5fee274..a2ca3107d475 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -389,10 +389,6 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name, ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", name_index, name, value, (long)value_len); - if (IS_RDONLY(inode)) - return -EROFS; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; if (value == NULL) value_len = 0; if (name == NULL) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 52b30ee6a25f..2c072bfea23b 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -38,8 +38,6 @@ ext2_xattr_trusted_get(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -50,8 +48,6 @@ ext2_xattr_trusted_set(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 0c03ea131a94..f383e7c3a7b5 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -35,16 +35,10 @@ static int ext2_xattr_user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - error = permission(inode, MAY_READ, NULL); - if (error) - return error; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); } @@ -52,18 +46,10 @@ static int ext2_xattr_user_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - if ( !S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); -- cgit v1.2.3 From c37ef806a3e1c0bca65fd03b7590d56d19625da4 Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Mon, 9 Jan 2006 20:51:58 -0800 Subject: [PATCH] remove ext3 xattr permission checks ) From: Christoph Hellwig remove checks 
now in the VFS Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/xattr.c | 4 ---- fs/ext3/xattr_trusted.c | 4 ---- fs/ext3/xattr_user.c | 15 --------------- 3 files changed, 23 deletions(-) (limited to 'fs') diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 238199d82ce5..e8d60bf6b7df 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -946,10 +946,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, }; int error; - if (IS_RDONLY(inode)) - return -EROFS; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; if (!name) return -EINVAL; if (strlen(name) > 255) diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index f68bfd1cf519..7c693c94f14d 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -39,8 +39,6 @@ ext3_xattr_trusted_get(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -51,8 +49,6 @@ ext3_xattr_trusted_set(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index e907cae7a07c..a85a0a17c4fd 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -37,16 +37,10 @@ static int ext3_xattr_user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - error = permission(inode, MAY_READ, NULL); - if (error) - return error; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); } @@ -54,19 +48,10 @@ static int ext3_xattr_user_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - if ( !S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, value, size, flags); } -- cgit v1.2.3 From b98932cb514eef404c6168c9a15cf28851498dea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:51:59 -0800 Subject: [PATCH] remove reiserfs xattr permission checks remove checks now in the VFS Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 21 --------------------- fs/reiserfs/xattr_user.c | 30 ------------------------------ 2 files changed, 51 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index f1895f0a278e..6f99e01f94ab 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -497,12 +497,6 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, struct iattr newattrs; __u32 xahash = 0; - if (IS_RDONLY(inode)) - return -EROFS; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; @@ -758,9 +752,6 @@ int reiserfs_xattr_del(struct inode *inode, const char *name) struct dentry *dir; int err; - if (IS_RDONLY(inode)) - return -EROFS; - dir = open_xa_dir(inode, FL_READONLY); if (IS_ERR(dir)) { 
err = PTR_ERR(dir); @@ -984,12 +975,6 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (IS_RDONLY(dentry->d_inode)) - return -EROFS; - - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) - return -EROFS; - reiserfs_write_lock_xattr_i(dentry->d_inode); lock = !has_xattr_dir(dentry->d_inode); if (lock) @@ -1019,12 +1004,6 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (IS_RDONLY(dentry->d_inode)) - return -EROFS; - - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) - return -EPERM; - reiserfs_write_lock_xattr_i(dentry->d_inode); reiserfs_read_lock_xattrs(dentry->d_sb); diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 51458048ca66..073f39364b11 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -16,18 +16,10 @@ static int user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - - error = reiserfs_permission_locked(inode, MAY_READ, NULL); - if (error) - return error; - return reiserfs_xattr_get(inode, name, buffer, size); } @@ -36,43 +28,21 @@ user_set(struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - - if (!S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - - error = reiserfs_permission_locked(inode, MAY_WRITE, NULL); - if (error) - return error; - return reiserfs_xattr_set(inode, name, buffer, size, flags); } static int user_del(struct inode *inode, const char *name) { - int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - - if (!S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - - error = reiserfs_permission_locked(inode, MAY_WRITE, NULL); - if (error) - return error; - return 0; } -- cgit v1.2.3 From 3542c6e18f6470bad2bde1e94331e4f488a8d3f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:00 -0800 Subject: [PATCH] remove xfs xattr permission checks remove checks now in the VFS XFS has an additional xattr interface through obscure ioctl. 
it requires raised capabilities but we need to add some read-only/immutable checks anyway Signed-off-by: Christoph Hellwig Cc: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/linux-2.6/xfs_ioctl.c | 5 +++++ fs/xfs/xfs_attr.c | 29 +++-------------------------- 2 files changed, 8 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index b78b5eb9e96c..f98c5be3dfe7 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -530,6 +530,8 @@ xfs_attrmulti_attr_set( char *kbuf; int error = EFAULT; + if (IS_RDONLY(&vp->v_inode)) + return -EROFS; if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) return EPERM; if (len > XATTR_SIZE_MAX) @@ -557,6 +559,9 @@ xfs_attrmulti_attr_remove( { int error; + + if (IS_RDONLY(&vp->v_inode)) + return -EROFS; if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) return EPERM; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 5484eeb460c8..1a11c2b51701 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -117,11 +117,6 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, ip->i_d.di_anextents == 0)) return(ENOATTR); - if (!(flags & (ATTR_KERNACCESS|ATTR_SECURE))) { - if ((error = xfs_iaccess(ip, S_IRUSR, cred))) - return(XFS_ERROR(error)); - } - /* * Fill in the arg structure for this request. */ @@ -425,7 +420,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f struct cred *cred) { xfs_inode_t *dp; - int namelen, error; + int namelen; namelen = strlen(name); if (namelen >= MAXNAMELEN) @@ -437,14 +432,6 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!(flags & ATTR_SECURE) && - (error = xfs_iaccess(dp, S_IWUSR, cred))) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(error)); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); } @@ -579,7 +566,7 @@ int xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) { xfs_inode_t *dp; - int namelen, error; + int namelen; namelen = strlen(name); if (namelen >= MAXNAMELEN) @@ -592,11 +579,7 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) return (EIO); xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!(flags & ATTR_SECURE) && - (error = xfs_iaccess(dp, S_IWUSR, cred))) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(error)); - } else if (XFS_IFORK_Q(dp) == 0 || + if (XFS_IFORK_Q(dp) == 0 || (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && dp->i_d.di_anextents == 0)) { xfs_iunlock(dp, XFS_ILOCK_SHARED); @@ -668,12 +651,6 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, return (EIO); xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!(flags & ATTR_SECURE) && - (error = xfs_iaccess(dp, S_IRUSR, cred))) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(error)); - } - /* * Decide on what work routines to call based on the inode size. */ -- cgit v1.2.3 From 870f481793b585323fbda3e87c54efc116f46351 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:01 -0800 Subject: [PATCH] replace inode_update_time with file_update_time To allow various options to work per-mount instead of per-sb we need a struct vfsmount when updating ctime and mtime. 
This preparation patch replaces the inode_update_time routine with a file_update_atime routine so we can easily get at the vfsmount. (and the file makes more sense in this context anyway). Also get rid of the unused second argument - we always want to update the ctime when calling this routine. Signed-off-by: Christoph Hellwig Cc: Al Viro Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 27 +++++++++++++++------------ fs/ncpfs/file.c | 2 +- fs/ntfs/file.c | 2 +- fs/ntfs/inode.c | 20 +++++++++++++++++++- fs/ocfs2/mmap.c | 8 ++------ fs/pipe.c | 2 +- fs/reiserfs/file.c | 2 +- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 2 +- mm/filemap_xip.c | 2 +- 11 files changed, 44 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index e08767fd57b0..e177769f3b41 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1204,16 +1204,20 @@ void update_atime(struct inode *inode) EXPORT_SYMBOL(update_atime); /** - * inode_update_time - update mtime and ctime time - * @inode: inode accessed - * @ctime_too: update ctime too + * file_update_time - update mtime and ctime time + * @file: file accessed * - * Update the mtime time on an inode and mark it for writeback. - * When ctime_too is specified update the ctime too. + * Update the mtime and ctime members of an inode and mark the inode + * for writeback. Note that this function is meant exclusively for + * usage in the file write path of filesystems, and filesystems may + * choose to explicitly ignore update via this function with the + * S_NOCTIME inode flag, e.g. for network filesystem where these + * timestamps are handled by the server. */ -void inode_update_time(struct inode *inode, int ctime_too) +void file_update_time(struct file *file) { + struct inode *inode = file->f_dentry->d_inode; struct timespec now; int sync_it = 0; @@ -1227,16 +1231,15 @@ void inode_update_time(struct inode *inode, int ctime_too) sync_it = 1; inode->i_mtime = now; - if (ctime_too) { - if (!timespec_equal(&inode->i_ctime, &now)) - sync_it = 1; - inode->i_ctime = now; - } + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + if (sync_it) mark_inode_dirty_sync(inode); } -EXPORT_SYMBOL(inode_update_time); +EXPORT_SYMBOL(file_update_time); int inode_needs_sync(struct inode *inode) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 4947d9b11fc1..973b444d6914 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -262,7 +262,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * } vfree(bouncebuffer); - inode_update_time(inode, 1); + file_update_time(file); *ppos = pos; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 30f71acdc1cb..fb413d3d8618 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2173,7 +2173,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, err = remove_suid(file->f_dentry); if (err) goto out; - inode_update_time(inode, 1); + file_update_time(file); written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count); out: diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index bda7a08911a5..ea1bd3feea1b 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2767,7 +2767,25 @@ unm_done: up_write(&ni->runlist.lock); done: /* Update the mtime and ctime on the base inode. */ - inode_update_time(VFS_I(base_ni), 1); + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. 
one day someone should fix this + * for real. + */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb); + int sync_it = 0; + + if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + sync_it = 1; + VFS_I(base_ni)->i_mtime = now; + VFS_I(base_ni)->i_ctime = now; + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + if (likely(!err)) { NInoClearTruncateFailed(ni); ntfs_debug("Done."); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index afdeec4b0eef..843cf9ddefe8 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -80,12 +80,8 @@ static struct vm_operations_struct ocfs2_file_vm_ops = { .nopage = ocfs2_nopage, }; -int ocfs2_mmap(struct file *file, - struct vm_area_struct *vma) +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { - struct address_space *mapping = file->f_dentry->d_inode->i_mapping; - struct inode *inode = mapping->host; - /* We don't want to support shared writable mappings yet. */ if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { @@ -95,7 +91,7 @@ int ocfs2_mmap(struct file *file, return -EINVAL; } - update_atime(inode); + file_accessed(file); vma->vm_ops = &ocfs2_file_vm_ops; return 0; } diff --git a/fs/pipe.c b/fs/pipe.c index acb030b61fb0..eef0f29e86ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -347,7 +347,7 @@ out: kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } if (ret > 0) - inode_update_time(inode, 1); /* mtime and ctime */ + file_update_time(filp); return ret; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 127e7d2cabdd..ad6fa964b0e7 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1360,7 +1360,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t if (res) goto out; - inode_update_time(inode, 1); /* Both mtime and ctime */ + file_update_time(file); // Ok, we are done with all the checks. 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 5675117ef227..885dfafeabee 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -713,7 +713,7 @@ start: } if (likely(!(ioflags & IO_INVIS))) { - inode_update_time(inode, 1); + file_update_time(file); xfs_ichgtime_fast(xip, inode, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 92ae3e2067b0..1feee2e7e47b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1716,7 +1716,7 @@ extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const vo extern int inode_change_ok(struct inode *, struct iattr *); extern int __must_check inode_setattr(struct inode *, struct iattr *); -extern void inode_update_time(struct inode *inode, int ctime_too); +extern void file_update_time(struct file *file); static inline ino_t parent_ino(struct dentry *dentry) { diff --git a/mm/filemap.c b/mm/filemap.c index 5fca2737c971..96de772be487 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2108,7 +2108,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - inode_update_time(inode, 1); + file_update_time(file); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index e2b34e95913e..b960ac8e5918 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -383,7 +383,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (ret) goto out_backing; - inode_update_time(inode, 1); + file_update_time(filp); ret = __xip_file_write (filp, buf, count, pos, ppos); -- cgit v1.2.3 From 9cf6f4b3a3d24d8a41515b509b8fb6448f9201e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:02 -0800 Subject: [PATCH] switch autofs4 to touch_atime() After my lookup_hash patch ->d_revalidate always gets a valid struct nameidata passed (unless you use lookup_one_len which autofs4 doesn't), so we can switch it from update_atime to touch_atime. This is a bit of an academic excercise because autofs has a 1:1 vfsmount superblock relation, but I want to get rid of update_atime so filesystems authors can't easily screw up per-mountpoint noatime support. 
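For any other filesystem doing the same conversion, the whole pattern is just threading the vfsmount from the nameidata down to the atime update. A minimal sketch (example_revalidate is a made-up ->d_revalidate instance for illustration, not code from this patch):

#include <linux/fs.h>
#include <linux/namei.h>

static int example_revalidate(struct dentry *dentry, struct nameidata *nd)
{
        /*
         * After the lookup_hash change nd is always non-NULL here
         * (unless the caller used lookup_one_len), so the vfsmount
         * needed for per-mount noatime decisions is available.
         */
        touch_atime(nd->mnt, dentry);
        return 1;
}
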
Signed-off-by: Christoph Hellwig Cc: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 541b19e6fec9..14aa70282e8c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -86,7 +86,7 @@ static int autofs4_root_readdir(struct file *file, void *dirent, /* Update usage from here to top of tree, so that scan of top-level directories will give a useful result */ -static void autofs4_update_usage(struct dentry *dentry) +static void autofs4_update_usage(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry->d_sb->s_root; @@ -95,7 +95,7 @@ static void autofs4_update_usage(struct dentry *dentry) struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino) { - update_atime(dentry->d_inode); + touch_atime(mnt, dentry); ino->last_used = jiffies; } } @@ -289,10 +289,10 @@ out: return autofs4_dcache_readdir(file, dirent, filldir); } -static int try_to_fill_dentry(struct dentry *dentry, - struct super_block *sb, - struct autofs_sb_info *sbi, int flags) +static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int flags) { + struct super_block *sb = mnt->mnt_sb; + struct autofs_sb_info *sbi = autofs4_sbi(sb); struct autofs_info *de_info = autofs4_dentry_ino(dentry); int status = 0; @@ -367,7 +367,7 @@ static int try_to_fill_dentry(struct dentry *dentry, /* We don't update the usages for the autofs daemon itself, this is necessary for recursive autofs mounts */ if (!autofs4_oz_mode(sbi)) - autofs4_update_usage(dentry); + autofs4_update_usage(mnt, dentry); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; @@ -392,7 +392,7 @@ static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd) /* Pending dentry */ if (autofs4_ispending(dentry)) { if (!oz_mode) - status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); + status = try_to_fill_dentry(nd->mnt, dentry, flags); return status; } @@ -409,14 +409,14 @@ static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd) dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); if (!oz_mode) - status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); + status = try_to_fill_dentry(nd->mnt, dentry, flags); return status; } spin_unlock(&dcache_lock); /* Update the usage list */ if (!oz_mode) - autofs4_update_usage(dentry); + autofs4_update_usage(nd->mnt, dentry); return 1; } -- cgit v1.2.3 From 869243a0f6143f76e7c847e707eee6ece9cbf821 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:03 -0800 Subject: [PATCH] remove update_atime All callers use touch_atime now which takes a vfsmount and allows us to implement per-mount noatime. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 9 ++++++--- include/linux/fs.h | 10 +--------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index e177769f3b41..76980a9c92e7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1176,17 +1176,20 @@ sector_t bmap(struct inode * inode, sector_t block) EXPORT_SYMBOL(bmap); /** - * update_atime - update the access time + * touch_atime - update the access time + * @mnt: mount the inode is accessed on * @inode: inode accessed * * Update the accessed time on an inode and mark it for writeback. 
* This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void update_atime(struct inode *inode) +void touch_atime(struct vfsmount *mnt, struct dentry *dentry) { + struct inode *inode = dentry->d_inode; struct timespec now; + /* per-mountpoint checks will go here */ if (IS_NOATIME(inode)) return; if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) @@ -1201,7 +1204,7 @@ void update_atime(struct inode *inode) } } -EXPORT_SYMBOL(update_atime); +EXPORT_SYMBOL(touch_atime); /** * file_update_time - update mtime and ctime time diff --git a/include/linux/fs.h b/include/linux/fs.h index 1feee2e7e47b..85c5656756b6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -235,9 +235,6 @@ struct kstatfs; struct vm_area_struct; struct vfsmount; -/* Used to be a macro which just called the function, now just a function */ -extern void update_atime (struct inode *); - extern void __init inode_init(unsigned long); extern void __init inode_init_early(void); extern void __init mnt_init(unsigned long); @@ -1118,12 +1115,7 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } -static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry) -{ - /* per-mountpoint checks will go here */ - update_atime(dentry->d_inode); -} - +extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) -- cgit v1.2.3 From ec3cad96901373ad0e21611cfbcc372fe09df1f7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:11 -0800 Subject: [PATCH] move rtc compat ioctl handling to fs/compat_ioctl.c This patch implements generic handling of RTC_IRQP_READ32, RTC_IRQP_SET32, RTC_EPOCH_READ32 and RTC_EPOCH_SET32 in fs/compat_ioctl.c. It's based on the x86_64 code which needed a little massaging to be endian-clean. parisc used COMPAT_IOCTL or generic w_long handlers for these whichce is wrong and can't work because the ioctls encode sizeof(unsigned long) in their ioctl number. parisc also duplicated COMPAT_IOCTL entries for other rtc ioctls which I remove in this patch, too. Signed-off-by: Christoph Hellwig Acked-by: Matthew Wilcox Acked-by: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/kernel/ioctl32.c | 19 ----------------- arch/x86_64/ia32/ia32_ioctl.c | 47 ------------------------------------------- fs/compat_ioctl.c | 47 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/arch/parisc/kernel/ioctl32.c b/arch/parisc/kernel/ioctl32.c index 4eada1bb27f0..805f31486cf9 100644 --- a/arch/parisc/kernel/ioctl32.c +++ b/arch/parisc/kernel/ioctl32.c @@ -36,25 +36,6 @@ HANDLE_IOCTL(SIOCGPPPSTATS, dev_ifsioc) HANDLE_IOCTL(SIOCGPPPCSTATS, dev_ifsioc) HANDLE_IOCTL(SIOCGPPPVER, dev_ifsioc) -#if defined(CONFIG_GEN_RTC) -COMPATIBLE_IOCTL(RTC_AIE_ON) -COMPATIBLE_IOCTL(RTC_AIE_OFF) -COMPATIBLE_IOCTL(RTC_UIE_ON) -COMPATIBLE_IOCTL(RTC_UIE_OFF) -COMPATIBLE_IOCTL(RTC_PIE_ON) -COMPATIBLE_IOCTL(RTC_PIE_OFF) -COMPATIBLE_IOCTL(RTC_WIE_ON) -COMPATIBLE_IOCTL(RTC_WIE_OFF) -COMPATIBLE_IOCTL(RTC_ALM_SET) /* struct rtc_time only has ints */ -COMPATIBLE_IOCTL(RTC_ALM_READ) /* struct rtc_time only has ints */ -COMPATIBLE_IOCTL(RTC_RD_TIME) /* struct rtc_time only has ints */ -COMPATIBLE_IOCTL(RTC_SET_TIME) /* struct rtc_time only has ints */ -HANDLE_IOCTL(RTC_IRQP_READ, w_long) -COMPATIBLE_IOCTL(RTC_IRQP_SET) -HANDLE_IOCTL(RTC_EPOCH_READ, w_long) -COMPATIBLE_IOCTL(RTC_EPOCH_SET) -#endif - IOCTL_TABLE_END int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c index e335bd0b637d..e11cc5699352 100644 --- a/arch/x86_64/ia32/ia32_ioctl.c +++ b/arch/x86_64/ia32/ia32_ioctl.c @@ -16,45 +16,6 @@ #define CODE #include "compat_ioctl.c" - -#define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */ -#define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */ -#define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */ -#define RTC_EPOCH_SET32 _IOW('p', 0x0e, unsigned) /* Set epoch */ - -static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg) -{ - unsigned long val; - mm_segment_t oldfs = get_fs(); - int ret; - - switch (cmd) { - case RTC_IRQP_READ32: - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val); - set_fs(oldfs); - if (!ret) - ret = put_user(val, (unsigned int __user *) arg); - return ret; - - case RTC_IRQP_SET32: - cmd = RTC_IRQP_SET; - break; - - case RTC_EPOCH_READ32: - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val); - set_fs(oldfs); - if (!ret) - ret = put_user(val, (unsigned int __user *) arg); - return ret; - - case RTC_EPOCH_SET32: - cmd = RTC_EPOCH_SET; - break; - } - return sys_ioctl(fd,cmd,arg); -} #define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, @@ -64,14 +25,6 @@ struct ioctl_trans ioctl_start[] = { #include #define DECLARES #include "compat_ioctl.c" - -/* And these ioctls need translation */ -/* realtime device */ -HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl) -HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl) -HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl) -HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl) -HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl) /* take care of sizeof(sizeof()) breakage */ }; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 55d9a3a954cf..b9aeacc11c8f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2475,6 +2475,49 @@ static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg return -EINVAL; } +#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) +#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) 
+#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) +#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) + +static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +{ + mm_segment_t oldfs = get_fs(); + compat_ulong_t val32; + unsigned long kval; + int ret; + + switch (cmd) { + case RTC_IRQP_READ32: + case RTC_EPOCH_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + RTC_IRQP_READ : RTC_EPOCH_READ, + (unsigned long)&kval); + set_fs(oldfs); + if (ret) + return ret; + val32 = kval; + return put_user(val32, (unsigned int __user *)arg); + case RTC_IRQP_SET32: + case RTC_EPOCH_SET32: + ret = get_user(val32, (unsigned int __user *)arg); + if (ret) + return ret; + kval = val32; + + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ? + RTC_IRQP_SET : RTC_EPOCH_SET, + (unsigned long)&kval); + set_fs(oldfs); + return ret; + default: + /* unreached */ + return -ENOIOCTLCMD; + } +} + #if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE) struct ncp_ioctl_request_32 { u32 function; @@ -2858,6 +2901,10 @@ HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl) HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl) HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl) HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl) +HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl) +HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl) +HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl) +HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl) #if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE) HANDLE_IOCTL(NCP_IOC_NCPREQUEST_32, do_ncp_ncprequest) -- cgit v1.2.3 From e6a6d2efcb7e7c87c5fe0395803da1453b29cbef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:14 -0800 Subject: [PATCH] sanitize building of fs/compat_ioctl.c Now that all these entries in the arch ioctl32.c files are gone [1], we can build fs/compat_ioctl.c as a normal object and kill tons of cruft. We need a special do_ioctl32_pointer handler for s390 so the compat_ptr call is done. This is not needed but harmless on all other architectures. Also remove some superflous includes in fs/compat_ioctl.c Tested on ppc64. [1] parisc still had it's PPP handler left, which is not fully correct for ppp and besides that ppp uses the generic SIOCPRIV ioctl so it'd kick in for all netdevice users. We can introduce a proper handler in one of the next patch series by adding a compat_ioctl method to struct net_device but for now let's just kill it - parisc doesn't compile in mainline anyway and I don't want this to block this patchset. 
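For reference, the general shape of these translation handlers is: bounce the native-sized value through a kernel buffer under set_fs(), then copy a 32-bit-wide result back to userspace. Roughly (a sketch modelled on the rtc_ioctl handler above; not new code in this series):

#include <linux/compat.h>
#include <asm/uaccess.h>

static int w_long_sketch(unsigned int fd, unsigned int cmd, unsigned long arg)
{
        mm_segment_t old_fs = get_fs();
        unsigned long kval = 0;
        int ret;

        set_fs(KERNEL_DS);      /* let sys_ioctl write into &kval */
        ret = sys_ioctl(fd, cmd, (unsigned long)&kval);
        set_fs(old_fs);
        if (ret)
                return ret;
        /* truncate to the 32-bit ABI width before copying out */
        return put_user((compat_ulong_t)kval, (compat_ulong_t __user *)arg);
}
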
Signed-off-by: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/Makefile | 4 +--- arch/ia64/ia32/ia32_ioctl.c | 45 ------------------------------------- arch/mips/kernel/Makefile | 3 +-- arch/mips/kernel/ioctl32.c | 50 ----------------------------------------- arch/parisc/kernel/Makefile | 3 +-- arch/parisc/kernel/ioctl32.c | 41 --------------------------------- arch/powerpc/kernel/Makefile | 3 +-- arch/powerpc/kernel/ioctl32.c | 45 ------------------------------------- arch/s390/kernel/Makefile | 3 +-- arch/s390/kernel/compat_ioctl.c | 47 -------------------------------------- arch/sparc64/kernel/Makefile | 4 +--- arch/sparc64/kernel/ioctl32.c | 39 -------------------------------- arch/x86_64/ia32/Makefile | 4 +--- arch/x86_64/ia32/ia32_ioctl.c | 32 -------------------------- fs/Makefile | 2 +- fs/compat_ioctl.c | 37 +++++++++++++++++------------- include/linux/compat_ioctl.h | 8 ------- 17 files changed, 29 insertions(+), 341 deletions(-) delete mode 100644 arch/ia64/ia32/ia32_ioctl.c delete mode 100644 arch/mips/kernel/ioctl32.c delete mode 100644 arch/parisc/kernel/ioctl32.c delete mode 100644 arch/powerpc/kernel/ioctl32.c delete mode 100644 arch/s390/kernel/compat_ioctl.c delete mode 100644 arch/sparc64/kernel/ioctl32.c delete mode 100644 arch/x86_64/ia32/ia32_ioctl.c (limited to 'fs') diff --git a/arch/ia64/ia32/Makefile b/arch/ia64/ia32/Makefile index 2ed90da81166..61cb60affd95 100644 --- a/arch/ia64/ia32/Makefile +++ b/arch/ia64/ia32/Makefile @@ -2,11 +2,9 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-y := ia32_entry.o sys_ia32.o ia32_ioctl.o ia32_signal.o \ +obj-y := ia32_entry.o sys_ia32.o ia32_signal.o \ ia32_support.o ia32_traps.o binfmt_elf32.o ia32_ldt.o -CFLAGS_ia32_ioctl.o += -Ifs/ - # Don't let GCC uses f16-f31 so that save_ia32_fpstate_live() and # restore_ia32_fpstate_live() can be sure the live register contain user-level state. CFLAGS_ia32_signal.o += -mfixed-range=f16-f31 diff --git a/arch/ia64/ia32/ia32_ioctl.c b/arch/ia64/ia32/ia32_ioctl.c deleted file mode 100644 index 88739394f6df..000000000000 --- a/arch/ia64/ia32/ia32_ioctl.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * IA32 Architecture-specific ioctl shim code - * - * Copyright (C) 2000 VA Linux Co - * Copyright (C) 2000 Don Dugger - * Copyright (C) 2001-2003 Hewlett-Packard Co - * David Mosberger-Tang - */ - -#include /* argh, msdos_fs.h isn't self-contained... 
*/ -#include -#include "ia32priv.h" - -#define INCLUDES -#include "compat_ioctl.c" - -#define IOCTL_NR(a) ((a) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT)) - -#define DO_IOCTL(fd, cmd, arg) ({ \ - int _ret; \ - mm_segment_t _old_fs = get_fs(); \ - \ - set_fs(KERNEL_DS); \ - _ret = sys_ioctl(fd, cmd, (unsigned long)arg); \ - set_fs(_old_fs); \ - _ret; \ -}) - -#define CODE -#include "compat_ioctl.c" - -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL((cmd),sys_ioctl) -#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler), NULL }, -#define IOCTL_TABLE_START \ - struct ioctl_trans ioctl_start[] = { -#define IOCTL_TABLE_END \ - }; - -IOCTL_TABLE_START -#define DECLARES -#include "compat_ioctl.c" -#include -IOCTL_TABLE_END - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 72f2126ad19d..f36c4f20ee8a 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -50,7 +50,7 @@ obj-$(CONFIG_MIPS_BOARDS_GEN) += irq-msc01.o obj-$(CONFIG_32BIT) += scall32-o32.o obj-$(CONFIG_64BIT) += scall64-64.o obj-$(CONFIG_BINFMT_IRIX) += binfmt_irix.o -obj-$(CONFIG_MIPS32_COMPAT) += ioctl32.o linux32.o signal32.o +obj-$(CONFIG_MIPS32_COMPAT) += linux32.o signal32.o obj-$(CONFIG_MIPS32_N32) += binfmt_elfn32.o scall64-n32.o signal_n32.o obj-$(CONFIG_MIPS32_O32) += binfmt_elfo32.o scall64-o32.o ptrace32.o @@ -60,6 +60,5 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_64BIT) += cpu-bugs64.o CFLAGS_cpu-bugs64.o = $(shell if $(CC) $(CFLAGS) -Wa,-mdaddi -c -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-DHAVE_AS_SET_DADDI"; fi) -CFLAGS_ioctl32.o += -Ifs/ EXTRA_AFLAGS := $(CFLAGS) diff --git a/arch/mips/kernel/ioctl32.c b/arch/mips/kernel/ioctl32.c deleted file mode 100644 index 9ea1fc748864..000000000000 --- a/arch/mips/kernel/ioctl32.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 2000 Silicon Graphics, Inc. - * Written by Ulf Carlsson (ulfc@engr.sgi.com) - * Copyright (C) 2000, 2004 Ralf Baechle - * Copyright (C) 2002, 2003 Maciej W. 
Rozycki - */ -#define INCLUDES -#include "compat_ioctl.c" - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SIBYTE_TBPROF -#include -#endif - -#define A(__x) ((unsigned long)(__x)) - -long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); - -#define CODE -#include "compat_ioctl.c" - -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL((cmd),sys_ioctl) -#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler), NULL }, -#define IOCTL_TABLE_START \ - struct ioctl_trans ioctl_start[] = { -#define IOCTL_TABLE_END \ - }; - -IOCTL_TABLE_START - -#include -#define DECLARES -#include "compat_ioctl.c" - -/*HANDLE_IOCTL(RTC_IRQP_READ, w_long) -COMPATIBLE_IOCTL(RTC_IRQP_SET) -HANDLE_IOCTL(RTC_EPOCH_READ, w_long) -COMPATIBLE_IOCTL(RTC_EPOCH_SET) -*/ - -IOCTL_TABLE_END - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 171f9c239f60..27827bc3717e 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -6,7 +6,6 @@ extra-y := init_task.o head.o vmlinux.lds AFLAGS_entry.o := -traditional AFLAGS_pacache.o := -traditional -CFLAGS_ioctl32.o := -Ifs/ obj-y := cache.o pacache.o setup.o traps.o time.o irq.o \ pa7300lc.o syscall.o entry.o sys_parisc.o firmware.o \ @@ -19,6 +18,6 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PA11) += pci-dma.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_64BIT) += binfmt_elf32.o sys_parisc32.o ioctl32.o signal32.o +obj-$(CONFIG_64BIT) += binfmt_elf32.o sys_parisc32.o signal32.o # only supported for PCX-W/U in 64-bit mode at the moment obj-$(CONFIG_64BIT) += perf.o perf_asm.o diff --git a/arch/parisc/kernel/ioctl32.c b/arch/parisc/kernel/ioctl32.c deleted file mode 100644 index 805f31486cf9..000000000000 --- a/arch/parisc/kernel/ioctl32.c +++ /dev/null @@ -1,41 +0,0 @@ -/* $Id: ioctl32.c,v 1.5 2002/10/18 00:21:43 varenet Exp $ - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. 
- */ - -#include - -#define INCLUDES -#include "compat_ioctl.c" - -#include -#include - -#define CODE -#include "compat_ioctl.c" - -#define HANDLE_IOCTL(cmd, handler) { cmd, (ioctl_trans_handler_t)handler, NULL }, -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd, sys_ioctl) - -#define IOCTL_TABLE_START struct ioctl_trans ioctl_start[] = { -#define IOCTL_TABLE_END }; - -IOCTL_TABLE_START -#include - -#define DECLARES -#include "compat_ioctl.c" - -/* And these ioctls need translation */ -HANDLE_IOCTL(SIOCGPPPSTATS, dev_ifsioc) -HANDLE_IOCTL(SIOCGPPPCSTATS, dev_ifsioc) -HANDLE_IOCTL(SIOCGPPPVER, dev_ifsioc) - -IOCTL_TABLE_END - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 6e03b595b6c8..17ed5018288b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -4,7 +4,6 @@ ifeq ($(CONFIG_PPC64),y) EXTRA_CFLAGS += -mno-minimal-toc -CFLAGS_ioctl32.o += -Ifs/ endif ifeq ($(CONFIG_PPC32),y) CFLAGS_prom_init.o += -fPIC @@ -16,7 +15,7 @@ obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ obj-y += vdso32/ obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o systbl.o \ - paca.o ioctl32.o cpu_setup_power4.o \ + paca.o cpu_setup_power4.o \ firmware.o sysfs.o idle_64.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o diff --git a/arch/powerpc/kernel/ioctl32.c b/arch/powerpc/kernel/ioctl32.c deleted file mode 100644 index 0fa3d27fef01..000000000000 --- a/arch/powerpc/kernel/ioctl32.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Based on sparc64 ioctl32.c by: - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * - * ppc64 changes: - * - * Copyright (C) 2000 Ken Aaker (kdaaker@rchland.vnet.ibm.com) - * Copyright (C) 2001 Anton Blanchard (antonb@au.ibm.com) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#define INCLUDES -#include "compat_ioctl.c" -#include - -#define CODE -#include "compat_ioctl.c" - -#define HANDLE_IOCTL(cmd,handler) { cmd, (ioctl_trans_handler_t)handler, NULL }, -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl) - -#define IOCTL_TABLE_START \ - struct ioctl_trans ioctl_start[] = { -#define IOCTL_TABLE_END \ - }; - -IOCTL_TABLE_START -#include -#define DECLARES -#include "compat_ioctl.c" - -IOCTL_TABLE_END - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 4865e4b49464..9269b5788fac 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -17,8 +17,7 @@ obj-$(CONFIG_MODULES) += s390_ksyms.o module.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o \ - compat_ioctl.o compat_wrapper.o \ - compat_exec_domain.o + compat_wrapper.o compat_exec_domain.o obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o obj-$(CONFIG_VIRT_TIMER) += vtime.o diff --git a/arch/s390/kernel/compat_ioctl.c b/arch/s390/kernel/compat_ioctl.c deleted file mode 100644 index d716b1768c99..000000000000 --- a/arch/s390/kernel/compat_ioctl.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * S390 version - * Copyright (C) 2000-2003 IBM Deutschland Entwicklung GmbH, IBM Corporation - * Author(s): Gerhard Tonn (ton@de.ibm.com) - * Arnd Bergmann (arndb@de.ibm.com) - * - * Original implementation from 32-bit Sparc compat code which is - * Copyright (C) 2000 Silicon Graphics, Inc. - * Written by Ulf Carlsson (ulfc@engr.sgi.com) - */ - -#include "compat_linux.h" -#define INCLUDES -#define CODE -#include "../../../fs/compat_ioctl.c" -#include -#include -#include -#include -#include "../../../drivers/s390/char/raw3270.h" - -static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, - unsigned long arg, struct file *f) -{ - return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); -} - -static int do_ioctl32_ulong(unsigned int fd, unsigned int cmd, - unsigned long arg, struct file *f) -{ - return sys_ioctl(fd, cmd, arg); -} - -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)do_ioctl32_pointer) -#define ULONG_IOCTL(cmd) HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)do_ioctl32_ulong) -#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler), NULL }, - -struct ioctl_trans ioctl_start[] = { -/* architecture independent ioctls */ -#include -#define DECLARES -#include "../../../fs/compat_ioctl.c" -}; - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile index 6f00ab8b9d23..83d67eb18895 100644 --- a/arch/sparc64/kernel/Makefile +++ b/arch/sparc64/kernel/Makefile @@ -16,7 +16,7 @@ obj-y := process.o setup.o cpu.o idprom.o \ obj-$(CONFIG_PCI) += ebus.o isa.o pci_common.o pci_iommu.o \ pci_psycho.o pci_sabre.o pci_schizo.o obj-$(CONFIG_SMP) += smp.o trampoline.o -obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o ioctl32.o +obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o obj-$(CONFIG_BINFMT_AOUT32) += binfmt_aout32.o obj-$(CONFIG_MODULES) += module.o @@ -40,5 +40,3 @@ endif head.o: head.S ttable.S itlb_base.S dtlb_base.S dtlb_backend.S dtlb_prot.S \ etrap.S rtrap.S winfixup.S entry.S - -CFLAGS_ioctl32.o += -Ifs/ diff --git a/arch/sparc64/kernel/ioctl32.c b/arch/sparc64/kernel/ioctl32.c deleted file mode 100644 index 196b208665a2..000000000000 --- 
a/arch/sparc64/kernel/ioctl32.c +++ /dev/null @@ -1,39 +0,0 @@ -/* $Id: ioctl32.c,v 1.136 2002/01/14 09:49:52 davem Exp $ - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. - */ - -#define INCLUDES -#include "compat_ioctl.c" -#include - -#define CODE -#include "compat_ioctl.c" - -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL((cmd),sys_ioctl) -#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler), NULL }, -#define IOCTL_TABLE_START \ - struct ioctl_trans ioctl_start[] = { -#define IOCTL_TABLE_END \ - }; - -IOCTL_TABLE_START -#include -#define DECLARES -#include "compat_ioctl.c" -#if 0 -HANDLE_IOCTL(RTC32_IRQP_READ, do_rtc_ioctl) -HANDLE_IOCTL(RTC32_IRQP_SET, do_rtc_ioctl) -HANDLE_IOCTL(RTC32_EPOCH_READ, do_rtc_ioctl) -HANDLE_IOCTL(RTC32_EPOCH_SET, do_rtc_ioctl) -#endif -/* take care of sizeof(sizeof()) breakage */ -IOCTL_TABLE_END - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index f76217d8f579..051608d55920 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -2,8 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \ - ia32_signal.o tls32.o \ +obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \ ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o sysv-$(CONFIG_SYSVIPC) := ipc32.o @@ -29,4 +28,3 @@ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE AFLAGS_vsyscall-sysenter.o = -m32 AFLAGS_vsyscall-syscall.o = -m32 -CFLAGS_ia32_ioctl.o += -Ifs/ diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c deleted file mode 100644 index e11cc5699352..000000000000 --- a/arch/x86_64/ia32/ia32_ioctl.c +++ /dev/null @@ -1,32 +0,0 @@ -/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $ - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. - */ - -#define INCLUDES -#include -#include "compat_ioctl.c" -#include - -#define CODE -#include "compat_ioctl.c" - - -#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl) - -struct ioctl_trans ioctl_start[] = { -#include -#define DECLARES -#include "compat_ioctl.c" -/* take care of sizeof(sizeof()) breakage */ -}; - -int ioctl_table_size = ARRAY_SIZE(ioctl_start); - diff --git a/fs/Makefile b/fs/Makefile index 35e9aec608e4..1db711319c80 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o obj-y += $(nfsd-y) $(nfsd-m) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index b9aeacc11c8f..890bc30fbe20 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -10,7 +10,6 @@ * ioctls. 
*/ -#ifdef INCLUDES #include #include #include @@ -81,13 +80,9 @@ #include #include -/* Ugly hack. */ -#undef __KERNEL__ #include -#define __KERNEL__ #include -#include #include #include #include @@ -95,7 +90,6 @@ #include #include -#include #include #include #include @@ -128,11 +122,6 @@ #include #include -#undef INCLUDES -#endif - -#ifdef CODE - /* Aiee. Someone does not find a difference between int and long */ #define EXT2_IOC32_GETFLAGS _IOR('f', 1, int) #define EXT2_IOC32_SETFLAGS _IOW('f', 2, int) @@ -148,6 +137,12 @@ #define EXT2_IOC32_GETVERSION _IOR('v', 1, int) #define EXT2_IOC32_SETVERSION _IOW('v', 2, int) +static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, + unsigned long arg, struct file *f) +{ + return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); +} + static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg) { mm_segment_t old_fs = get_fs(); @@ -2705,10 +2700,20 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon } #endif -#undef CODE -#endif +#define HANDLE_IOCTL(cmd,handler) \ + { (cmd), (ioctl_trans_handler_t)(handler) }, + +/* pointer to compatible structure or no argument */ +#define COMPATIBLE_IOCTL(cmd) \ + { (cmd), do_ioctl32_pointer }, + +/* argument is an unsigned long integer, not a pointer */ +#define ULONG_IOCTL(cmd) \ + { (cmd), (ioctl_trans_handler_t)sys_ioctl }, -#ifdef DECLARES + +struct ioctl_trans ioctl_start[] = { +#include HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob) HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob) #ifdef CONFIG_NET @@ -2921,6 +2926,6 @@ HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event) HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) +}; -#undef DECLARES -#endif +int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 339878952f12..8fad50f8e389 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -2,14 +2,6 @@ * compatible types passed or none at all... Please include * only stuff that is compatible on *all architectures*. */ -#ifndef COMPATIBLE_IOCTL /* pointer to compatible structure or no argument */ -#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)sys_ioctl) -#endif - -#ifndef ULONG_IOCTL /* argument is an unsigned long integer, not a pointer */ -#define ULONG_IOCTL(cmd) HANDLE_IOCTL((cmd),(ioctl_trans_handler_t)sys_ioctl) -#endif - COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ -- cgit v1.2.3 From 24a44dca356e7c68e33bece4effa7021e7429493 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:14 -0800 Subject: [PATCH] ntfs: remove superflous MS_NOATIME/MS_NODIRATIME assignments MS_RDONLU implies not atime updates at all, no need for the MS_NOATIME and MS_NODIRATIME flags. 
Signed-off-by: Christoph Hellwig Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/super.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 280e383fc84e..c3a3f1a8310b 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -443,8 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_debug("Entering with remount options string: %s", opt); #ifndef NTFS_RW - /* For read-only compiled driver, enforce all read-only flags. */ - *flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= MS_RDONLY; #else /* NTFS_RW */ /* * For the read-write compiled driver, if we are remounting read-write, @@ -1721,7 +1721,7 @@ static BOOL load_system_files(ntfs_volume *vol) es3); goto iput_mirr_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); } else @@ -1837,7 +1837,7 @@ get_ctx_vol_failed: es1, es2); goto iput_vol_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1874,7 +1874,7 @@ get_ctx_vol_failed: } goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1919,7 +1919,7 @@ get_ctx_vol_failed: es1, es2); goto iput_root_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1943,7 +1943,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; /* * Do not set NVolErrors() because ntfs_remount() might manage * to set the dirty flag in which case all would be well. @@ -1970,7 +1970,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } #endif @@ -1989,7 +1989,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2030,7 +2030,7 @@ get_ctx_vol_failed: es1, es2); goto iput_quota_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2053,7 +2053,7 @@ get_ctx_vol_failed: goto iput_quota_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } /* @@ -2074,7 +2074,7 @@ get_ctx_vol_failed: es1, es2); goto iput_usnjrnl_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. 
Will not be able to remount " @@ -2097,7 +2097,7 @@ get_ctx_vol_failed: goto iput_usnjrnl_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2689,7 +2689,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_debug("Entering."); #ifndef NTFS_RW - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; #endif /* ! NTFS_RW */ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); -- cgit v1.2.3 From 0d456fa4261f43433287a10fe3ec04a9818fac64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:15 -0800 Subject: [PATCH] 9p: remove superflous MS_NODIRATIME assignment MS_NOATIME implies MS_NODIRATIME Signed-off-by: Christoph Hellwig Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index ae0f06b3c11a..2c4fa75be025 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -91,7 +91,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_op = &v9fs_super_ops; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | - MS_NODIRATIME | MS_NOATIME; + MS_NOATIME; } /** -- cgit v1.2.3 From fc33a7bb9c6dd8f6e4a014976200f8fdabb3a45c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:17 -0800 Subject: [PATCH] per-mountpoint noatime/nodiratime Turn noatime and nodiratime into per-mount instead of per-sb flags. After all the preparations this is a rather trivial patch. The mount code needs to treat the two options as per-mount instead of per-superblock, and touch_atime needs to be changed to check the new MNT_ flags in addition to the MS_ flags that are kept for filesystems that are always noatime/nodiratime but not user settable anymore. Besides that core code only nfs needed an update because it's leaving atime updates to the server and thus sets the S_NOATIME flag on every inode, but needs to know whether it's a real noatime mount for an getattr optimization. While we're at it I've killed the IS_NOATIME/IS_NODIRATIME macros that were only used by touch_atime. 
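Putting the inode, superblock and mount checks together, the decision touch_atime makes after this patch is equivalent to the following helper (illustrative only; the patch open-codes these tests inside touch_atime):

#include <linux/fs.h>
#include <linux/mount.h>

static int atime_needs_update(struct vfsmount *mnt, struct inode *inode)
{
        if (IS_RDONLY(inode))
                return 0;
        /* per-inode flag, plus the legacy per-superblock flags */
        if (inode->i_flags & S_NOATIME)
                return 0;
        if (inode->i_sb->s_flags & MS_NOATIME)
                return 0;
        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
                return 0;
        /* the new per-mount flags; mnt may be NULL when coming from nfsd */
        if (mnt && (mnt->mnt_flags & MNT_NOATIME))
                return 0;
        if (mnt && (mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
                return 0;
        return 1;
}
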
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 17 +++++++++++++---- fs/namespace.c | 12 +++++++++--- fs/nfs/inode.c | 17 +++++++++++++---- fs/xfs/linux-2.6/xfs_iops.c | 3 +++ include/linux/fs.h | 5 +---- include/linux/mount.h | 8 +++++--- 6 files changed, 44 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 76980a9c92e7..108138d4e909 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -1189,12 +1190,20 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec now; - /* per-mountpoint checks will go here */ - if (IS_NOATIME(inode)) + if (IS_RDONLY(inode)) return; - if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + + if ((inode->i_flags & S_NOATIME) || + (inode->i_sb->s_flags & MS_NOATIME) || + ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) return; - if (IS_RDONLY(inode)) + + /* + * We may have a NULL vfsmount when coming from NFSD + */ + if (mnt && + ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))) return; now = current_fs_time(inode->i_sb); diff --git a/fs/namespace.c b/fs/namespace.c index f0e353f5bc30..2ca6145f43d6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -355,14 +355,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, - { MS_NOATIME, ",noatime" }, - { MS_NODIRATIME, ",nodiratime" }, { 0, NULL } }; static struct proc_fs_info mnt_info[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, { 0, NULL } }; struct proc_fs_info *fs_infop; @@ -1286,7 +1286,13 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE); + if (flags & MS_NOATIME) + mnt_flags |= MNT_NOATIME; + if (flags & MS_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | + MS_NOATIME | MS_NODIRATIME); /* ... and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3e4ba9cb7f80..a77ee95b7efb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -950,11 +950,20 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) /* Flush out writes to the server in order to update c/mtime */ nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT); - if (__IS_FLG(inode, MS_NOATIME)) - need_atime = 0; - else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + + /* + * We may force a getattr if the user cares about atime. + * + * Note that we only have to check the vfsmount flags here: + * - NFS always sets S_NOATIME by so checking it would give a + * bogus result + * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * no point in checking those. 
+ */ + if ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; - /* We may force a getattr if the user cares about atime */ + if (need_atime) err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); else diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 41c478bb1ffc..97fb1470cf28 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -54,6 +54,9 @@ #include #include +#define IS_NOATIME(inode) ((inode->i_sb->s_flags & MS_NOATIME) || \ + (S_ISDIR(inode->i_mode) && inode->i_sb->s_flags & MS_NODIRATIME)) + /* * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but diff --git a/include/linux/fs.h b/include/linux/fs.h index 85c5656756b6..d1e370d25f7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -114,8 +114,7 @@ extern int dir_notify_enable; /* * Superblock flags that can be altered by MS_REMOUNT */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\ - MS_NODIRATIME) +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK) /* * Old magic mount flag and mask @@ -161,8 +160,6 @@ extern int dir_notify_enable; #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) -#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) -#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) diff --git a/include/linux/mount.h b/include/linux/mount.h index b98a709f1794..b7472ae91fa4 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -20,10 +20,12 @@ #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 #define MNT_NOEXEC 0x04 -#define MNT_SHARED 0x10 /* if the vfsmount is a shared mount */ -#define MNT_UNBINDABLE 0x20 /* if the vfsmount is a unbindable mount */ +#define MNT_NOATIME 0x08 +#define MNT_NODIRATIME 0x10 -#define MNT_PNODE_MASK (MNT_SHARED | MNT_UNBINDABLE) +#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ +#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ +#define MNT_PNODE_MASK 0x3000 /* propogation flag mask */ struct vfsmount { struct list_head mnt_hash; -- cgit v1.2.3 From 7ff92053ddff48d9d7908a353bd85f893944463e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:17 -0800 Subject: [PATCH] don't include ioctl32.h in drivers These days ioctl32.h is only used for communication of fs/compat.c and fs/compat_ioctl.c and doesn't contain anything of interest to drivers. Remove inclusion in various drivers. 
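Drivers that still need 32-bit ioctl translation get it through the compat_ioctl file operation rather than anything in ioctl32.h. A minimal, hypothetical sketch of that pattern follows; the foo_* names are invented, foo_ioctl is assumed to be the driver's existing native handler, and compat_ptr() comes from <linux/compat.h> on CONFIG_COMPAT kernels.

	#ifdef CONFIG_COMPAT
	#include <linux/compat.h>

	/* Forward 32-bit ioctls to the native handler after converting
	 * the user pointer - no ioctl32.h needed anywhere. */
	static long foo_compat_ioctl(struct file *filp, unsigned int cmd,
				     unsigned long arg)
	{
		return foo_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
	}
	#endif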
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/s390_ksyms.c | 1 - arch/x86_64/kernel/x8664_ksyms.c | 1 - drivers/char/drm/drm_ioc32.c | 1 - drivers/char/drm/i915_ioc32.c | 1 - drivers/char/drm/mga_ioc32.c | 1 - drivers/char/drm/r128_ioc32.c | 1 - drivers/char/drm/radeon_ioc32.c | 1 - drivers/ieee1394/amdtp.c | 1 - drivers/ieee1394/dv1394.c | 1 - drivers/ieee1394/video1394.c | 1 - drivers/message/fusion/mptctl.c | 2 -- drivers/s390/crypto/z90main.c | 1 - drivers/scsi/aacraid/linit.c | 1 - drivers/scsi/ch.c | 1 - drivers/scsi/megaraid/megaraid_mm.h | 1 - fs/xfs/linux-2.6/xfs_ioctl32.c | 1 - 16 files changed, 17 deletions(-) (limited to 'fs') diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c index bee654abb6d3..4176c77670c4 100644 --- a/arch/s390/kernel/s390_ksyms.c +++ b/arch/s390/kernel/s390_ksyms.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 4a54221e10bc..b7fd6681c629 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/drivers/char/drm/drm_ioc32.c b/drivers/char/drm/drm_ioc32.c index dd91ff6b474c..e9e2db18952d 100644 --- a/drivers/char/drm/drm_ioc32.c +++ b/drivers/char/drm/drm_ioc32.c @@ -28,7 +28,6 @@ * IN THE SOFTWARE. */ #include -#include #include "drmP.h" #include "drm_core.h" diff --git a/drivers/char/drm/i915_ioc32.c b/drivers/char/drm/i915_ioc32.c index 2218a946ec87..296248cdc767 100644 --- a/drivers/char/drm/i915_ioc32.c +++ b/drivers/char/drm/i915_ioc32.c @@ -30,7 +30,6 @@ * IN THE SOFTWARE. */ #include -#include #include "drmP.h" #include "drm.h" diff --git a/drivers/char/drm/mga_ioc32.c b/drivers/char/drm/mga_ioc32.c index 24a9d4e86af0..54a18eb2fc04 100644 --- a/drivers/char/drm/mga_ioc32.c +++ b/drivers/char/drm/mga_ioc32.c @@ -31,7 +31,6 @@ * IN THE SOFTWARE. */ #include -#include #include "drmP.h" #include "drm.h" diff --git a/drivers/char/drm/r128_ioc32.c b/drivers/char/drm/r128_ioc32.c index 1e2e367b8b82..9dd6d4116e47 100644 --- a/drivers/char/drm/r128_ioc32.c +++ b/drivers/char/drm/r128_ioc32.c @@ -30,7 +30,6 @@ * IN THE SOFTWARE. */ #include -#include #include "drmP.h" #include "drm.h" diff --git a/drivers/char/drm/radeon_ioc32.c b/drivers/char/drm/radeon_ioc32.c index fef4a2b84c1e..0ccfd3618ff1 100644 --- a/drivers/char/drm/radeon_ioc32.c +++ b/drivers/char/drm/radeon_ioc32.c @@ -28,7 +28,6 @@ * IN THE SOFTWARE. 
*/ #include -#include #include "drmP.h" #include "drm.h" diff --git a/drivers/ieee1394/amdtp.c b/drivers/ieee1394/amdtp.c index 75897509c401..17390d762cf7 100644 --- a/drivers/ieee1394/amdtp.c +++ b/drivers/ieee1394/amdtp.c @@ -80,7 +80,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c index 196db7439272..efeaa944bd0a 100644 --- a/drivers/ieee1394/dv1394.c +++ b/drivers/ieee1394/dv1394.c @@ -108,7 +108,6 @@ #include #include #include -#include #include #include diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c index 608479b2df14..39fb88309e8e 100644 --- a/drivers/ieee1394/video1394.c +++ b/drivers/ieee1394/video1394.c @@ -48,7 +48,6 @@ #include #include #include -#include #include #include diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 959d2c5951b8..7c340240a50e 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -2585,8 +2585,6 @@ static struct miscdevice mptctl_miscdev = { #ifdef CONFIG_COMPAT -#include - static int compat_mptfwxfer_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/drivers/s390/crypto/z90main.c b/drivers/s390/crypto/z90main.c index 135ae04e6e75..2f54d033d7cf 100644 --- a/drivers/s390/crypto/z90main.c +++ b/drivers/s390/crypto/z90main.c @@ -30,7 +30,6 @@ #include // mdelay #include #include // for tasklets -#include #include #include #include diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 3cb68af90456..9b9062f02462 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index 0920220f3313..4299fabca554 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include /* here are all the ioctls */ diff --git a/drivers/scsi/megaraid/megaraid_mm.h b/drivers/scsi/megaraid/megaraid_mm.h index eb8c390a0fa3..3d9e67d6849d 100644 --- a/drivers/scsi/megaraid/megaraid_mm.h +++ b/drivers/scsi/megaraid/megaraid_mm.h @@ -22,7 +22,6 @@ #include #include #include -#include #include "mbox_defs.h" #include "megaraid_ioctl.h" diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index c83ae15bb0e6..a7c9ba1a9f7b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 2ff678b8da6478d861c1b0ecb3ac14575760e906 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:34 -0800 Subject: [PATCH] hrtimer: switch itimers to hrtimer switch itimers to a hrtimers-based implementation Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 6 +-- fs/proc/array.c | 6 +-- include/linux/sched.h | 5 ++- include/linux/timer.h | 2 +- kernel/exit.c | 2 +- kernel/fork.c | 6 +-- kernel/itimer.c | 106 ++++++++++++++++++++++++-------------------------- 7 files changed, 65 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index fd02ea4a81e9..b5bcf1aae0ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -632,10 +632,10 @@ static inline int de_thread(struct task_struct *tsk) * synchronize with any firing (by calling del_timer_sync) * before we can safely let the old group leader die. 
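The conversion hinges on the hrtimer callback contract visible in the hunks below: a handler that wants periodic behaviour calls hrtimer_forward() and returns HRTIMER_RESTART, otherwise it returns HRTIMER_NORESTART. A minimal sketch using only interfaces that appear in this patch; the my_* names are hypothetical, and expires is assumed to be a ktime_t obtained e.g. via timeval_to_ktime().

	struct my_ctx {
		struct hrtimer timer;
		ktime_t interval;
	};

	/* Callback pattern, mirroring it_real_fn() below. */
	static int my_timer_fn(void *data)
	{
		struct my_ctx *ctx = data;

		/* ... do the periodic work, e.g. deliver a signal ... */
		if (ctx->interval.tv64 != 0) {
			/* push the expiry forward by one interval, re-arm */
			hrtimer_forward(&ctx->timer, ctx->interval);
			return HRTIMER_RESTART;
		}
		return HRTIMER_NORESTART;	/* one-shot: leave it idle */
	}

	/* Setup, mirroring copy_signal() and do_setitimer() below. */
	hrtimer_init(&ctx->timer, CLOCK_MONOTONIC);
	ctx->timer.function = my_timer_fn;
	ctx->timer.data = ctx;
	hrtimer_start(&ctx->timer, expires, HRTIMER_REL);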
*/ - sig->real_timer.data = (unsigned long)current; + sig->real_timer.data = current; spin_unlock_irq(lock); - if (del_timer_sync(&sig->real_timer)) - add_timer(&sig->real_timer); + if (hrtimer_cancel(&sig->real_timer)) + hrtimer_restart(&sig->real_timer); spin_lock_irq(lock); } while (atomic_read(&sig->count) > count) { diff --git a/fs/proc/array.c b/fs/proc/array.c index 5e9251f65317..7eb1bd7f800c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -330,7 +330,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; - unsigned long it_real_value = 0; + DEFINE_KTIME(it_real_value); struct task_struct *t; char tcomm[sizeof(task->comm)]; @@ -386,7 +386,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) utime = cputime_add(utime, task->signal->utime); stime = cputime_add(stime, task->signal->stime); } - it_real_value = task->signal->it_real_value; + it_real_value = task->signal->real_timer.expires; } ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; read_unlock(&tasklist_lock); @@ -435,7 +435,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) priority, nice, num_threads, - jiffies_to_clock_t(it_real_value), + (long) ktime_to_clock_t(it_real_value), start_time, vsize, mm ? get_mm_rss(mm) : 0, diff --git a/include/linux/sched.h b/include/linux/sched.h index 85b53f87c703..ee4677ad204e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -105,6 +105,7 @@ extern unsigned long nr_iowait(void); #include #include #include +#include #include @@ -398,8 +399,8 @@ struct signal_struct { struct list_head posix_timers; /* ITIMER_REAL timer for the process */ - struct timer_list real_timer; - unsigned long it_real_value, it_real_incr; + struct hrtimer real_timer; + ktime_t it_real_incr; /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ cputime_t it_prof_expires, it_virt_expires; diff --git a/include/linux/timer.h b/include/linux/timer.h index 72f3a7781106..9b9877fd2505 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -96,6 +96,6 @@ static inline void add_timer(struct timer_list *timer) extern void init_timers(void); extern void run_local_timers(void); -extern void it_real_fn(unsigned long); +extern int it_real_fn(void *); #endif diff --git a/kernel/exit.c b/kernel/exit.c index 309a46fa16f8..e75a51f33768 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -842,7 +842,7 @@ fastcall NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - del_timer_sync(&tsk->signal->real_timer); + hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); acct_process(code); } diff --git a/kernel/fork.c b/kernel/fork.c index b18d64554feb..3bdcab49998d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -801,10 +801,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - sig->it_real_value = sig->it_real_incr = 0; + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC); + sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->real_timer.data = (unsigned long) tsk; - init_timer(&sig->real_timer); + sig->real_timer.data = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; diff --git a/kernel/itimer.c b/kernel/itimer.c index 7c1b25e25e47..c2c05c4ff28d 100644 --- a/kernel/itimer.c 
+++ b/kernel/itimer.c @@ -12,36 +12,46 @@ #include #include #include +#include #include -static unsigned long it_real_value(struct signal_struct *sig) +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero or 1usec for an pending expired timer + */ +static struct timeval itimer_get_remtime(struct hrtimer *timer) { - unsigned long val = 0; - if (timer_pending(&sig->real_timer)) { - val = sig->real_timer.expires - jiffies; + ktime_t rem = hrtimer_get_remaining(timer); - /* look out for negative/zero itimer.. */ - if ((long) val <= 0) - val = 1; - } - return val; + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remtime() call but before this condition + * then we return 0 - which is correct. + */ + if (hrtimer_active(timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + } else + rem.tv64 = 0; + + return ktime_to_timeval(rem); } int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - unsigned long interval, val; cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - spin_unlock_irq(&tsk->sighand->siglock); - jiffies_to_timeval(val, &value->it_value); - jiffies_to_timeval(interval, &value->it_interval); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timeval(tsk->signal->it_real_incr); break; case ITIMER_VIRTUAL: read_lock(&tasklist_lock); @@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) } -void it_real_fn(unsigned long __data) +/* + * The timer is automagically restarted, when interval != 0 + */ +int it_real_fn(void *data) { - struct task_struct * p = (struct task_struct *) __data; - unsigned long inc = p->signal->it_real_incr; + struct task_struct *tsk = (struct task_struct *) data; - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); - /* - * Now restart the timer if necessary. We don't need any locking - * here because do_setitimer makes sure we have finished running - * before it touches anything. - * Note, we KNOW we are (or should be) at a jiffie edge here so - * we don't need the +1 stuff. Also, we want to use the prior - * expire value so as to not "slip" a jiffie if we are late. - * Deal with requesting a time prior to "now" here rather than - * in add_timer. 
- */ - if (!inc) - return; - while (time_before_eq(p->signal->real_timer.expires, jiffies)) - p->signal->real_timer.expires += inc; - add_timer(&p->signal->real_timer); + if (tsk->signal->it_real_incr.tv64 != 0) { + hrtimer_forward(&tsk->signal->real_timer, + tsk->signal->it_real_incr); + + return HRTIMER_RESTART; + } + return HRTIMER_NORESTART; } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; - unsigned long val, interval, expires; + struct hrtimer *timer; + ktime_t expires; cputime_t cval, cinterval, nval, ninterval; switch (which) { case ITIMER_REAL: -again: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - /* We are sharing ->siglock with it_real_fn() */ - if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } - tsk->signal->it_real_incr = - timeval_to_jiffies(&value->it_interval); - expires = timeval_to_jiffies(&value->it_value); - if (expires) - mod_timer(&tsk->signal->real_timer, - jiffies + 1 + expires); - spin_unlock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + hrtimer_cancel(timer); if (ovalue) { - jiffies_to_timeval(val, &ovalue->it_value); - jiffies_to_timeval(interval, - &ovalue->it_interval); + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timeval(tsk->signal->it_real_incr); } + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + expires = timeval_to_ktime(value->it_value); + if (expires.tv64 != 0) + hrtimer_start(timer, expires, HRTIMER_REL); break; case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); -- cgit v1.2.3 From e866cfa939de7f52c154a9495eb5767f89abf453 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Mon, 9 Jan 2006 20:52:51 -0800 Subject: [PATCH] d_instantiate_unique / NFS inode leakage If we have found an aliased dentry that we return, the inode reference is not dropped and the inode is not attached anywhere, so the reference to the inode is leaked in that case. Cc: Trond Myklebust, Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 1536f15c4d4c..134d6775183f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -808,10 +808,14 @@ void d_instantiate(struct dentry *entry, struct inode * inode) * * Fill in inode information in the entry. On success, it returns NULL. * If an unhashed alias of "entry" already exists, then we return the - * aliased dentry instead. + * aliased dentry instead and drop one reference to inode. * * Note that in order to avoid conflicts with rename() etc, the caller * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache.
*/ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) { @@ -838,6 +842,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) dget_locked(alias); spin_unlock(&dcache_lock); BUG_ON(!d_unhashed(alias)); + iput(inode); return alias; } list_add(&entry->d_alias, &inode->i_dentry); -- cgit v1.2.3 From 3c6bee1d4037a5c569f30d40bd852a57ba250912 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 9 Jan 2006 20:54:01 -0800 Subject: [PATCH] turn "const static" into "static const" ICC likes to complain about storage class not being first, GCC doesn't care much (except for cases like "inline static"). I have a hard time seeing how it could break anything. Thanks to Gabriel A. Devenyi for pointing out http://linuxicc.sourceforge.net/ which is what made me create this patch. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/plat-omap/dma.c | 2 +- arch/h8300/kernel/gpio.c | 4 ++-- arch/h8300/platform/h8300h/ptrace_h8300h.c | 12 ++++++------ arch/h8300/platform/h8s/ints.c | 4 ++-- arch/h8300/platform/h8s/ints_h8s.c | 4 ++-- drivers/ide/pci/pdc202xx_new.c | 2 +- drivers/mtd/nand/au1550nd.c | 2 +- drivers/mtd/nand/rtc_from4.c | 2 +- drivers/mtd/nand/spia.c | 2 +- drivers/net/8139too.c | 2 +- drivers/net/pci-skeleton.c | 2 +- drivers/net/r8169.c | 2 +- drivers/net/sis190.c | 2 +- drivers/usb/atm/speedtch.c | 4 ++-- drivers/usb/image/microtek.c | 2 +- fs/cifs/cifs_uniupr.h | 2 +- 16 files changed, 25 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/arch/arm/plat-omap/dma.c b/arch/arm/plat-omap/dma.c index f5cc21ad0956..a4e5ac77f6df 100644 --- a/arch/arm/plat-omap/dma.c +++ b/arch/arm/plat-omap/dma.c @@ -64,7 +64,7 @@ static int dma_chan_count; static spinlock_t dma_chan_lock; static struct omap_dma_lch dma_chan[OMAP_LOGICAL_DMA_CH_COUNT]; -const static u8 omap1_dma_irq[OMAP_LOGICAL_DMA_CH_COUNT] = { +static const u8 omap1_dma_irq[OMAP_LOGICAL_DMA_CH_COUNT] = { INT_DMA_CH0_6, INT_DMA_CH1_7, INT_DMA_CH2_8, INT_DMA_CH3, INT_DMA_CH4, INT_DMA_CH5, INT_1610_DMA_CH6, INT_1610_DMA_CH7, INT_1610_DMA_CH8, INT_1610_DMA_CH9, INT_1610_DMA_CH10, diff --git a/arch/h8300/kernel/gpio.c b/arch/h8300/kernel/gpio.c index 795682b873e2..d195568ca8a2 100644 --- a/arch/h8300/kernel/gpio.c +++ b/arch/h8300/kernel/gpio.c @@ -122,7 +122,7 @@ int h8300_get_gpio_dir(int port_bit) static char *port_status(int portno) { static char result[10]; - const static char io[2]={'I','O'}; + static const char io[2]={'I','O'}; char *rp; int c; unsigned char used,ddr; @@ -143,7 +143,7 @@ static int gpio_proc_read(char *buf, char **start, off_t offset, int len, int *unused_i, void *unused_v) { int c,outlen; - const static char port_name[]="123456789ABCDEFGH"; + static const char port_name[]="123456789ABCDEFGH"; outlen = 0; for (c = 0; c < MAX_PORT; c++) { if (ddrs[c] == NULL) diff --git a/arch/h8300/platform/h8300h/ptrace_h8300h.c b/arch/h8300/platform/h8300h/ptrace_h8300h.c index 6ac93c05a1ae..746b1ae672a1 100644 --- a/arch/h8300/platform/h8300h/ptrace_h8300h.c +++ b/arch/h8300/platform/h8300h/ptrace_h8300h.c @@ -98,7 +98,7 @@ struct optable { .type = jmp, \ } -const static struct optable optable_0[] = { +static const struct optable optable_0[] = { OPTABLE(0x00,0xff, 1,none), /* 0x00 */ OPTABLE(0x01,0xff,-1,none), /* 0x01 */ OPTABLE(0x02,0xfe, 1,none), /* 0x02-0x03 */ @@ -131,31 +131,31 @@ const static struct optable optable_0[] = { OPTABLE(0x80,0x80, 1,none), /* 0x80-0xff */ }; -const static struct optable optable_1[] = { +static const
struct optable optable_1[] = { OPTABLE(0x00,0xff,-3,none), /* 0x0100 */ OPTABLE(0x40,0xf0,-3,none), /* 0x0140-0x14f */ OPTABLE(0x80,0xf0, 1,none), /* 0x0180-0x018f */ OPTABLE(0xc0,0xc0, 2,none), /* 0x01c0-0x01ff */ }; -const static struct optable optable_2[] = { +static const struct optable optable_2[] = { OPTABLE(0x00,0x20, 2,none), /* 0x6a0?/0x6a8?/0x6b0?/0x6b8? */ OPTABLE(0x20,0x20, 3,none), /* 0x6a2?/0x6aa?/0x6b2?/0x6ba? */ }; -const static struct optable optable_3[] = { +static const struct optable optable_3[] = { OPTABLE(0x69,0xfb, 2,none), /* 0x010069/0x01006d/014069/0x01406d */ OPTABLE(0x6b,0xff,-4,none), /* 0x01006b/0x01406b */ OPTABLE(0x6f,0xff, 3,none), /* 0x01006f/0x01406f */ OPTABLE(0x78,0xff, 5,none), /* 0x010078/0x014078 */ }; -const static struct optable optable_4[] = { +static const struct optable optable_4[] = { OPTABLE(0x00,0x78, 3,none), /* 0x0100690?/0x01006d0?/0140690/0x01406d0?/0x0100698?/0x01006d8?/0140698?/0x01406d8? */ OPTABLE(0x20,0x78, 4,none), /* 0x0100692?/0x01006d2?/0140692/0x01406d2?/0x010069a?/0x01006da?/014069a?/0x01406da? */ }; -const static struct optables_list { +static const struct optables_list { const struct optable *ptr; int size; } optables[] = { diff --git a/arch/h8300/platform/h8s/ints.c b/arch/h8300/platform/h8s/ints.c index 5441cdd12a39..f6ed663bdde0 100644 --- a/arch/h8300/platform/h8s/ints.c +++ b/arch/h8300/platform/h8s/ints.c @@ -52,7 +52,7 @@ struct irq_pins { unsigned char bit_no; }; /* ISTR = 0 */ -const static struct irq_pins irq_assign_table0[16]={ +static const struct irq_pins irq_assign_table0[16]={ {H8300_GPIO_P5,H8300_GPIO_B0},{H8300_GPIO_P5,H8300_GPIO_B1}, {H8300_GPIO_P5,H8300_GPIO_B2},{H8300_GPIO_P5,H8300_GPIO_B3}, {H8300_GPIO_P5,H8300_GPIO_B4},{H8300_GPIO_P5,H8300_GPIO_B5}, @@ -63,7 +63,7 @@ const static struct irq_pins irq_assign_table0[16]={ {H8300_GPIO_PF,H8300_GPIO_B1},{H8300_GPIO_PF,H8300_GPIO_B2}, }; /* ISTR = 1 */ -const static struct irq_pins irq_assign_table1[16]={ +static const struct irq_pins irq_assign_table1[16]={ {H8300_GPIO_P8,H8300_GPIO_B0},{H8300_GPIO_P8,H8300_GPIO_B1}, {H8300_GPIO_P8,H8300_GPIO_B2},{H8300_GPIO_P8,H8300_GPIO_B3}, {H8300_GPIO_P8,H8300_GPIO_B4},{H8300_GPIO_P8,H8300_GPIO_B5}, diff --git a/arch/h8300/platform/h8s/ints_h8s.c b/arch/h8300/platform/h8s/ints_h8s.c index f53de493e3e8..8268dfd12f1f 100644 --- a/arch/h8300/platform/h8s/ints_h8s.c +++ b/arch/h8300/platform/h8s/ints_h8s.c @@ -42,7 +42,7 @@ struct irq_pins { unsigned char bit_no; } __attribute__((aligned(1),packed)); /* ISTR = 0 */ -const static struct irq_pins irq_assign_table0[16]={ +static const struct irq_pins irq_assign_table0[16]={ {H8300_GPIO_P5,H8300_GPIO_B0},{H8300_GPIO_P5,H8300_GPIO_B1}, {H8300_GPIO_P5,H8300_GPIO_B2},{H8300_GPIO_P5,H8300_GPIO_B3}, {H8300_GPIO_P5,H8300_GPIO_B4},{H8300_GPIO_P5,H8300_GPIO_B5}, @@ -53,7 +53,7 @@ const static struct irq_pins irq_assign_table0[16]={ {H8300_GPIO_PF,H8300_GPIO_B1},{H8300_GPIO_PF,H8300_GPIO_B2}, }; /* ISTR = 1 */ -const static struct irq_pins irq_assign_table1[16]={ +static const struct irq_pins irq_assign_table1[16]={ {H8300_GPIO_P8,H8300_GPIO_B0},{H8300_GPIO_P8,H8300_GPIO_B1}, {H8300_GPIO_P8,H8300_GPIO_B2},{H8300_GPIO_P8,H8300_GPIO_B3}, {H8300_GPIO_P8,H8300_GPIO_B4},{H8300_GPIO_P8,H8300_GPIO_B5}, diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c index 211641a54398..fe06ebb0e5bf 100644 --- a/drivers/ide/pci/pdc202xx_new.c +++ b/drivers/ide/pci/pdc202xx_new.c @@ -39,7 +39,7 @@ #define PDC202_DEBUG_CABLE 0 -const static char *pdc_quirk_drives[] = { +static const 
char *pdc_quirk_drives[] = { "QUANTUM FIREBALLlct08 08", "QUANTUM FIREBALLP KA6.4", "QUANTUM FIREBALLP KA9.1", diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c index 9c5945d6df88..201e1362da14 100644 --- a/drivers/mtd/nand/au1550nd.c +++ b/drivers/mtd/nand/au1550nd.c @@ -43,7 +43,7 @@ static int nand_width = 1; /* default x8*/ /* * Define partitions for flash device */ -const static struct mtd_partition partition_info[] = { +static const struct mtd_partition partition_info[] = { { .name = "NAND FS 0", .offset = 0, diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c index 3a5841c9d950..4129c03dfd90 100644 --- a/drivers/mtd/nand/rtc_from4.c +++ b/drivers/mtd/nand/rtc_from4.c @@ -96,7 +96,7 @@ static struct mtd_info *rtc_from4_mtd = NULL; */ static void __iomem *rtc_from4_fio_base = (void *)P2SEGADDR(RTC_FROM4_FIO_BASE); -const static struct mtd_partition partition_info[] = { +static const struct mtd_partition partition_info[] = { { .name = "Renesas flash partition 1", .offset = 0, diff --git a/drivers/mtd/nand/spia.c b/drivers/mtd/nand/spia.c index 32541cbb0103..9cf1ce718ec1 100644 --- a/drivers/mtd/nand/spia.c +++ b/drivers/mtd/nand/spia.c @@ -67,7 +67,7 @@ module_param(spia_peddr, int, 0); /* * Define partitions for flash device */ -const static struct mtd_partition partition_info[] = { +static const struct mtd_partition partition_info[] = { { .name = "SPIA flash partition 1", .offset = 0, diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c index d2102a27d307..adfba44dac5a 100644 --- a/drivers/net/8139too.c +++ b/drivers/net/8139too.c @@ -505,7 +505,7 @@ enum chip_flags { #define HW_REVID_MASK HW_REVID(1, 1, 1, 1, 1, 1, 1) /* directly indexed by chip_t, above */ -const static struct { +static const struct { const char *name; u32 version; /* from RTL8139C/RTL8139D docs */ u32 flags; diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c index a1ac4bd1696e..a7bb54df75a8 100644 --- a/drivers/net/pci-skeleton.c +++ b/drivers/net/pci-skeleton.c @@ -415,7 +415,7 @@ typedef enum { /* directly indexed by chip_t, above */ -const static struct { +static const struct { const char *name; u8 version; /* from RTL8139C docs */ u32 RxConfigMask; /* should clear the bits supported by this chip */ diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 14a76f7cf900..2e1bed153c39 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -170,7 +170,7 @@ enum phy_version { #define _R(NAME,MAC,MASK) \ { .name = NAME, .mac_version = MAC, .RxConfigMask = MASK } -const static struct { +static const struct { const char *name; u8 mac_version; u32 RxConfigMask; /* Clears the bits supported by this chip */ diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c index 478791e09bf7..b420182eec4b 100644 --- a/drivers/net/sis190.c +++ b/drivers/net/sis190.c @@ -329,7 +329,7 @@ static struct mii_chip_info { { NULL, } }; -const static struct { +static const struct { const char *name; } sis_chip_info[] = { { "SiS 190 PCI Fast Ethernet adapter" }, diff --git a/drivers/usb/atm/speedtch.c b/drivers/usb/atm/speedtch.c index b28336148658..c1b47d74e206 100644 --- a/drivers/usb/atm/speedtch.c +++ b/drivers/usb/atm/speedtch.c @@ -532,9 +532,9 @@ static void speedtch_handle_int(struct urb *int_urb, struct pt_regs *regs) int ret = int_urb->status; /* The magic interrupt for "up state" */ - const static unsigned char up_int[6] = { 0xa1, 0x00, 0x01, 0x00, 0x00, 0x00 }; + static const unsigned char up_int[6] = { 0xa1, 0x00, 0x01, 0x00, 0x00, 0x00 }; /* The magic 
interrupt for "down state" */ - const static unsigned char down_int[6] = { 0xa1, 0x00, 0x00, 0x00, 0x00, 0x00 }; + static const unsigned char down_int[6] = { 0xa1, 0x00, 0x00, 0x00, 0x00, 0x00 }; atm_dbg(usbatm, "%s entered\n", __func__); diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c index 458f2acdeb0a..28538db9eaf3 100644 --- a/drivers/usb/image/microtek.c +++ b/drivers/usb/image/microtek.c @@ -674,7 +674,7 @@ struct vendor_product /* These are taken from the msmUSB.inf file on the Windows driver CD */ -const static struct vendor_product mts_supported_products[] = +static const struct vendor_product mts_supported_products[] = { { "Phantom 336CX", mts_sup_unknown}, { "Phantom 336CX", mts_sup_unknown}, diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h index decd138f14d4..da2ad5b451ac 100644 --- a/fs/cifs/cifs_uniupr.h +++ b/fs/cifs/cifs_uniupr.h @@ -242,7 +242,7 @@ static signed char UniCaseRangeLff20[27] = { /* * Lower Case Range */ -const static struct UniCaseRange CifsUniLowerRange[] = { +static const struct UniCaseRange CifsUniLowerRange[] = { 0x0380, 0x03ab, UniCaseRangeL0380, 0x0400, 0x042f, UniCaseRangeL0400, 0x0490, 0x04cb, UniCaseRangeL0490, -- cgit v1.2.3 From 4610a6bea7742ab34e40dcb776bd3feb52da10d6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 9 Jan 2006 20:54:05 -0800 Subject: [PATCH] fs/ext2/bitmap.c: ext2_count_free() is only required #ifdef EXT2FS_DEBUG There's no need for ext2_count_free() #ifndef EXT2FS_DEBUG. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/bitmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c index 20145b74623f..e9983a0dd396 100644 --- a/fs/ext2/bitmap.c +++ b/fs/ext2/bitmap.c @@ -7,8 +7,12 @@ * Universite Pierre et Marie Curie (Paris VI) */ +#ifdef EXT2FS_DEBUG + #include +#include "ext2.h" + static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) @@ -23,3 +27,6 @@ unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) nibblemap[(map->b_data[i] >> 4) & 0xf]; return (sum); } + +#endif /* EXT2FS_DEBUG */ + -- cgit v1.2.3 From 3af13763d696468d31f71a798155b33f681f221f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 9 Jan 2006 20:54:06 -0800 Subject: [PATCH] fs/ext3/: small cleanups This patch contains the following cleanups: - there's no need for ext3_count_free() #ifndef EXT3FS_DEBUG - having prototypes for ext3_count_free() in two different headers is nonsense Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/balloc.c | 2 -- fs/ext3/bitmap.c | 8 +++++++- fs/ext3/bitmap.h | 8 -------- fs/ext3/ialloc.c | 1 - 4 files changed, 7 insertions(+), 12 deletions(-) delete mode 100644 fs/ext3/bitmap.h (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index ae1148c24c53..c6393fb4c35a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -20,8 +20,6 @@ #include #include -#include "bitmap.h" - /* * balloc.c contains the blocks allocation and deallocation routines */ diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index 5b4ba3e246e6..cb16b4c5d5df 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -7,8 +7,11 @@ * Universite Pierre et Marie Curie (Paris VI) */ +#ifdef EXT3FS_DEBUG + #include -#include "bitmap.h" + +#include "ext3_fs.h" static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; @@ 
-24,3 +27,6 @@ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) nibblemap[(map->b_data[i] >> 4) & 0xf]; return (sum); } + +#endif /* EXT3FS_DEBUG */ + diff --git a/fs/ext3/bitmap.h b/fs/ext3/bitmap.h deleted file mode 100644 index 6ee503a6bb4e..000000000000 --- a/fs/ext3/bitmap.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext3/bitmap.c - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks - * -*/ - -extern unsigned long ext3_count_free (struct buffer_head *, unsigned int ); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 69078079b19c..dc826464f313 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -26,7 +26,6 @@ #include -#include "bitmap.h" #include "xattr.h" #include "acl.h" -- cgit v1.2.3 From 792db3af38a55b2079df504b9f5aa57b2dbee48d Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 9 Jan 2006 20:54:45 -0800 Subject: [PATCH] fs/binfmt_elf: Remove unneeded kmalloc() return value casts Remove unneeded casts of kmalloc() return value in binfmt_elf. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 80ca932ba0bd..a4f6f57d91aa 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -622,7 +622,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free_file; retval = -ENOMEM; - elf_interpreter = (char *) kmalloc(elf_ppnt->p_filesz, + elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) goto out_free_file; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e0344f69c79d..5b3076e8ee90 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -187,7 +187,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs goto error; /* read the name of the interpreter into memory */ - interpreter_name = (char *) kmalloc(phdr->p_filesz, GFP_KERNEL); + interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); if (!interpreter_name) goto error; -- cgit v1.2.3 From 9979ead5d1eb23191a00453559927c5abf9087e2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 9 Jan 2006 20:54:52 -0800 Subject: [PATCH] fs/hfsplus/: remove the hfsplus_inode_check() debug function This patch removes the hfsplus_inode_check() debug function. It also removes the now obsolete last_inode_cnt and inode_cnt from struct hfsplus_sb_info. 
Signed-off-by: Adrian Bunk Acked-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/hfsplus_fs.h | 3 --- fs/hfsplus/inode.c | 10 ---------- fs/hfsplus/super.c | 19 ------------------- 3 files changed, 32 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index df16fcbff3fb..0fa1ab6250bf 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -143,9 +143,6 @@ struct hfsplus_sb_info { unsigned long flags; - atomic_t inode_cnt; - u32 last_inode_cnt; - struct hlist_head rsrc_inodes; }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 983bcd02ac1c..7acff6c5464f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -182,11 +182,6 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent igrab(dir); hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); mark_inode_dirty(inode); - { - void hfsplus_inode_check(struct super_block *sb); - atomic_inc(&HFSPLUS_SB(sb).inode_cnt); - hfsplus_inode_check(sb); - } out: d_add(dentry, inode); return NULL; @@ -317,11 +312,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) if (!inode) return NULL; - { - void hfsplus_inode_check(struct super_block *sb); - atomic_inc(&HFSPLUS_SB(sb).inode_cnt); - hfsplus_inode_check(sb); - } inode->i_ino = HFSPLUS_SB(sb).next_cnid++; inode->i_mode = mode; inode->i_uid = current->fsuid; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 6daaf7c755a6..d791780def50 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -22,29 +22,12 @@ static void hfsplus_destroy_inode(struct inode *inode); #include "hfsplus_fs.h" -void hfsplus_inode_check(struct super_block *sb) -{ -#if 0 - u32 cnt = atomic_read(&HFSPLUS_SB(sb).inode_cnt); - u32 last_cnt = HFSPLUS_SB(sb).last_inode_cnt; - - if (cnt <= (last_cnt / 2) || - cnt >= (last_cnt * 2)) { - HFSPLUS_SB(sb).last_inode_cnt = cnt; - printk("inode_check: %u,%u,%u\n", cnt, last_cnt, - HFSPLUS_SB(sb).cat_tree ? HFSPLUS_SB(sb).cat_tree->node_hash_cnt : 0); - } -#endif -} - static void hfsplus_read_inode(struct inode *inode) { struct hfs_find_data fd; struct hfsplus_vh *vhdr; int err; - atomic_inc(&HFSPLUS_SB(inode->i_sb).inode_cnt); - hfsplus_inode_check(inode->i_sb); INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); init_MUTEX(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = 0; @@ -155,12 +138,10 @@ static int hfsplus_write_inode(struct inode *inode, int unused) static void hfsplus_clear_inode(struct inode *inode) { dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino); - atomic_dec(&HFSPLUS_SB(inode->i_sb).inode_cnt); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; iput(HFSPLUS_I(inode).rsrc_inode); } - hfsplus_inode_check(inode->i_sb); } static void hfsplus_write_super(struct super_block *sb) -- cgit v1.2.3 From 845884d332c060b0dfc54ba5a580d0f1a99c58a2 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 10 Jan 2006 16:59:37 +1000 Subject: [PATCH] uclinux: delay binfmt_flat trace Modify the initial trace output (which is based on flags in the binary header) so that it is not done until after the magic number check. This may well not be a flat format binary, so the flags could be invalid. (Prime example, running a script). Changes prompted by patches from Stuart Hughs. 
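The fix is purely one of ordering: none of the header fields mean anything until the magic has been verified, since the file may be a script or arbitrary data. A condensed sketch of the validation order the patch establishes (struct flat_hdr abbreviated; the ntohl() byte-swapping the real loader performs on rev and flags is elided):

	/* Never interpret rev or flags before the magic check passes. */
	static int sanity_check_flat(struct flat_hdr *hdr)
	{
		if (strncmp(hdr->magic, "bFLT", 4))
			return -ENOEXEC;  /* maybe a script; flags are garbage */
		if (hdr->rev != FLAT_VERSION && hdr->rev != OLD_FLAT_VERSION)
			return -ENOEXEC;  /* unsupported format revision */
		/* only now is it safe to act on hdr->flags, e.g. KTRACE */
		return 0;
	}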
Signed-off-by: Greg Ungerer Signed-off-by: Linus Torvalds --- fs/binfmt_flat.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b72dc31a0970..108d56bbd0d0 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -442,19 +442,22 @@ static int load_flat_file(struct linux_binprm * bprm, flags = ntohl(hdr->flags); rev = ntohl(hdr->rev); - if (flags & FLAT_FLAG_KTRACE) - printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); - - if (strncmp(hdr->magic, "bFLT", 4) || - (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION)) { + if (strncmp(hdr->magic, "bFLT", 4)) { /* * because a lot of people do not manage to produce good * flat binaries, we leave this printk to help them realise * the problem. We only print the error if its not a script file */ if (strncmp(hdr->magic, "#!", 2)) - printk("BINFMT_FLAT: bad magic/rev (0x%x, need 0x%x)\n", - rev, (int) FLAT_VERSION); + printk("BINFMT_FLAT: bad header magic\n"); + return -ENOEXEC; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + + if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { + printk("BINFMT_FLAT: bad flat file version 0x%x (supported 0x%x and 0x%x)\n", rev, FLAT_VERSION, OLD_FLAT_VERSION); return -ENOEXEC; } -- cgit v1.2.3 From 7823c7c121839f5a003acdf9c2f1839bfa2f6c43 Mon Sep 17 00:00:00 2001 From: Luiz Fernando Capitulino Date: Wed, 11 Jan 2006 01:38:27 +0100 Subject: ext2: trivial indentation fix. This memset() line was indented with seven spaces, this patch fixes it to use a tab instead. Yes, very trivial but it's the third time I have to look at this line.. Signed-off-by: Luiz Capitulino Signed-off-by: Adrian Bunk --- fs/ext2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 5b5f52876b42..7442bdd1267a 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -592,7 +592,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) goto fail; } kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr, 0, chunk_size); + memset(kaddr, 0, chunk_size); de = (struct ext2_dir_entry_2 *)kaddr; de->name_len = 1; de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1)); -- cgit v1.2.3 From 74da6cd06225da6971943bea6a33f4cb7f6b76a3 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 11 Jan 2006 01:51:26 +0100 Subject: missing printk loglevel and tiny tiny whitespace change in binfmt_elf() Patch adds a missing printk loglevel (I think KERN_WARNING is appropriate here) in fs/binfmt_elf.c, and while I was there I made some tiny tiny tiny adjustments to whitespacing in the neighborhood.
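For context on what the annotation buys: a printk() without a KERN_* prefix is logged at the kernel's default loglevel, so adding the prefix only changes how the message is classified and then filtered by the console loglevel and klogd/syslog:

	printk("no prefix: logged at the default loglevel\n");
	printk(KERN_WARNING "prefixed: classified and filtered as a warning\n");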
Signed-off-by: Jesper Juhl Signed-off-by: Adrian Bunk --- fs/binfmt_elf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a4f6f57d91aa..f979ebbce49c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1634,17 +1634,17 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file) ELF_CORE_WRITE_EXTRA_DATA; #endif - if ((off_t) file->f_pos != offset) { + if ((off_t)file->f_pos != offset) { /* Sanity check */ - printk("elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", - (off_t) file->f_pos, offset); + printk(KERN_WARNING "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", + (off_t)file->f_pos, offset); } end_coredump: set_fs(fs); cleanup: - while(!list_empty(&thread_list)) { + while (!list_empty(&thread_list)) { struct list_head *tmp = thread_list.next; list_del(tmp); kfree(list_entry(tmp, struct elf_thread_status, list)); -- cgit v1.2.3 From e8d2a424675d9878356397e2ecfc632bbf09aa2c Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 11 Jan 2006 01:52:40 +0100 Subject: add loglevel to printk in fs/afs/cmservice.c This is a small patch that adds loglevel to a printk in fs/afs/cmservice.c Signed-off-by: Jesper Juhl Signed-off-by: Adrian Bunk --- fs/afs/cmservice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 0a57fd7c726f..9eef6bf156ab 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -118,7 +118,7 @@ static int kafscmd(void *arg) _SRXAFSCM_xxxx_t func; int die; - printk("kAFS: Started kafscmd %d\n", current->pid); + printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid); daemonize("kafscmd"); -- cgit v1.2.3 From 099f7f0a828bfef7b7047101cf52dc44a4b1d76f Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Wed, 11 Jan 2006 02:06:20 +0100 Subject: xfs: header included twice Header included twice. Signed-off-by: Nicolas Kaiser Signed-off-by: Adrian Bunk --- fs/xfs/xfs_iomap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 45a77a3a6c07..ca7afc83a893 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -40,7 +40,6 @@ #include "xfs_ialloc.h" #include "xfs_btree.h" #include "xfs_bmap.h" -#include "xfs_bit.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -- cgit v1.2.3 From c566c6dbf9cb0eed6e11ec7a4141e7af68ec2316 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Wed, 11 Jan 2006 02:08:57 +0100 Subject: fs/attr.c: header included twice Header included twice. Signed-off-by: Nicolas Kaiser Signed-off-by: Adrian Bunk --- fs/attr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index b34732506f1d..d63e5096f2f2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,7 +14,6 @@ #include #include #include -#include /* Taken over from the old code... */ -- cgit v1.2.3 From 7da942e5bc889bc970554876a8c03d7e2af9923d Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Wed, 11 Jan 2006 02:10:28 +0100 Subject: fs/proc/vmcore.c: header included twice Header included twice. 
Signed-off-by: Nicolas Kaiser Signed-off-by: Adrian Bunk --- fs/proc/vmcore.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 5378d7c78419..124e35442ac8 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From f60d36690ce11a6a41b89e63b4201fbba3aa0bfb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:22:04 +1100 Subject: [XFS] remove over-eager assert SGI-PV: 941804 SGI-Modid: xfs-linux-melb:xfs-kern:201702a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/xfs_vnodeops.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e03fa2a3d5ed..5f6dce3b4fd6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3829,7 +3829,6 @@ xfs_reclaim( vn_iowait(vp); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - ASSERT(VN_CACHED(vp) == 0); /* If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode -- cgit v1.2.3 From d3a9b1f9da4ee3e6d284148412097621b1c9e575 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:23:43 +1100 Subject: [XFS] merge xfs_arch.h userspace changes back SGI-PV: 907752 SGI-Modid: xfs-linux-melb:xfs-kern:201882a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/xfs_arch.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index 68e5051d8e24..f1aeda599eb1 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -40,6 +40,22 @@ #undef XFS_NATIVE_HOST #endif +#ifdef XFS_NATIVE_HOST +#define cpu_to_be16(val) ((__be16)(val)) +#define cpu_to_be32(val) ((__be32)(val)) +#define cpu_to_be64(val) ((__be64)(val)) +#define be16_to_cpu(val) ((__uint16_t)(val) +#define be32_to_cpu(val) ((__uint32_t)(val)) +#define be64_to_cpu(val) ((__uint64_t)(val)) +#else +#define cpu_to_be16(val) (__swab16((__uint16_t)(val))) +#define cpu_to_be32(val) (__swab32((__uint32_t)(val))) +#define cpu_to_be64(val) (__swab64((__uint64_t)(val))) +#define be16_to_cpu(val) (__swab16((__be16)(val))) +#define be32_to_cpu(val) (__swab32((__be32)(val))) +#define be64_to_cpu(val) (__swab64((__be64)(val))) +#endif + #endif /* __KERNEL__ */ /* do we need conversion? 
*/ -- cgit v1.2.3 From 4ef19dddbaf2f24e492c18112fd8a04ce116daca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:27:18 +1100 Subject: [XFS] enable write barriers by default SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:201981a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_super.c | 3 +++ fs/xfs/xfs_vfsops.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 6116b5bf433e..d8ec7463d8f4 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -306,6 +306,7 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, not supported with external log device"); mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; } if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered == @@ -313,6 +314,7 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, not supported by the underlying device"); mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; } error = xfs_barrier_test(mp); @@ -320,6 +322,7 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, trial barrier write failed"); mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; } } diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 7bdbd991ab1c..3406399e25b3 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -312,6 +312,8 @@ xfs_start_flags( mp->m_flags |= XFS_MOUNT_NOUUID; if (ap->flags & XFSMNT_BARRIER) mp->m_flags |= XFS_MOUNT_BARRIER; + else + mp->m_flags &= ~XFS_MOUNT_BARRIER; return 0; } @@ -655,6 +657,11 @@ xfs_mntupdate( else mp->m_flags &= ~XFS_MOUNT_NOATIME; + if (args->flags & XFSMNT_BARRIER) + mp->m_flags |= XFS_MOUNT_BARRIER; + else + mp->m_flags &= ~XFS_MOUNT_BARRIER; + if ((vfsp->vfs_flag & VFS_RDONLY) && !(*flags & MS_RDONLY)) { vfsp->vfs_flag &= ~VFS_RDONLY; @@ -1634,6 +1641,7 @@ xfs_vget( #define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ #define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. 
disable */ #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ @@ -1681,6 +1689,7 @@ xfs_parseargs( args->flags2 |= XFSMNT2_COMPAT_IOSIZE; args->flags |= XFSMNT_COMPAT_ATTR; + args->flags |= XFSMNT_BARRIER; #if 0 /* XXX: off by default, until some remaining issues ironed out */ args->flags |= XFSMNT_IDELETE; /* default to on */ @@ -1806,6 +1815,8 @@ xfs_parseargs( args->flags |= XFSMNT_NOUUID; } else if (!strcmp(this_char, MNTOPT_BARRIER)) { args->flags |= XFSMNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + args->flags &= ~XFSMNT_BARRIER; } else if (!strcmp(this_char, MNTOPT_IKEEP)) { args->flags &= ~XFSMNT_IDELETE; } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { @@ -1892,7 +1903,6 @@ xfs_showargs( { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, - { XFS_MOUNT_BARRIER, "," MNTOPT_BARRIER }, { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP }, { 0, NULL } }; @@ -1941,6 +1951,9 @@ xfs_showargs( if (!(vfsp->vfs_flag & VFS_32BITINODES)) seq_printf(m, "," MNTOPT_64BITINODE); + if (!(vfsp->vfs_flag & XFS_MOUNT_BARRIER)) + seq_printf(m, "," MNTOPT_NOBARRIER); + if (vfsp->vfs_flag & VFS_GRPID) seq_printf(m, "," MNTOPT_GRPID); -- cgit v1.2.3 From 061f7209bdfb0193b306f88b4ff36b2574b001d3 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:27:50 +1100 Subject: [XFS] Do not inherit properties for the quota inodes from the root inode. SGI-PV: 945264 SGI-Modid: xfs-linux-melb:xfs-kern:24366a Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index bb6991a7a617..c04c34776c43 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1392,11 +1392,12 @@ xfs_qm_qino_alloc( { xfs_trans_t *tp; int error; - unsigned long s; + unsigned long s; cred_t zerocr; + xfs_inode_t zeroino; int committed; - tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE); + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); if ((error = xfs_trans_reserve(tp, XFS_QM_QINOCREATE_SPACE_RES(mp), XFS_CREATE_LOG_RES(mp), 0, @@ -1406,8 +1407,9 @@ xfs_qm_qino_alloc( return (error); } memset(&zerocr, 0, sizeof(zerocr)); + memset(&zeroino, 0, sizeof(zeroino)); - if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0, + if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0, &zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); -- cgit v1.2.3 From dd9f438e32900d67def49fa1b8961b3e19b6fefc Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:28:28 +1100 Subject: [XFS] Implement the di_extsize allocator hint for non-realtime files as well. Also provides a mechanism for inheriting this property from the parent directory for new files. 
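The heart of the new alignment helper is plain modulo arithmetic that grows the request outward to extent-size boundaries before any overlap fixups are applied. A standalone sketch with a worked example, using plain % where the kernel code uses do_mod():

	/*
	 * Align (off, len) outward to extsz boundaries, as
	 * xfs_bmap_extsize_align() does first. Worked example:
	 * extsz = 16, off = 35, len = 10 maps the request [35,45)
	 * to [32,48): the start is pulled back by 3 and the length
	 * grows to 13, then rounds up to one full 16-block unit.
	 */
	static void extsize_align(unsigned long extsz,
				  unsigned long *off, unsigned long *len)
	{
		unsigned long temp = *off % extsz;	/* start misalignment: 3 */

		if (temp) {
			*len += temp;			/* 10 -> 13 */
			*off -= temp;			/* 35 -> 32 */
		}
		if ((temp = *len % extsz))		/* 13 */
			*len += extsz - temp;		/* 13 -> 16 */
	}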
SGI-PV: 945264 SGI-Modid: xfs-linux-melb:xfs-kern:24367a Signed-off-by: Nathan Scott --- fs/xfs/xfs_bmap.c | 373 +++++++++++++++++++++++++++++------------------ fs/xfs/xfs_bmap.h | 7 +- fs/xfs/xfs_dinode.h | 11 +- fs/xfs/xfs_fs.h | 2 + fs/xfs/xfs_inode.c | 16 ++- fs/xfs/xfs_iomap.c | 390 +++++++++++++++++++++++++++----------------------- fs/xfs/xfs_vnodeops.c | 158 ++++++++++---------- 7 files changed, 547 insertions(+), 410 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e415a4698e9c..8a32d65211b0 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2146,13 +2146,176 @@ xfs_bmap_add_extent_hole_real( return 0; /* keep gcc quite */ } +/* + * Adjust the size of the new extent based on di_extsize and rt extsize. + */ +STATIC int +xfs_bmap_extsize_align( + xfs_mount_t *mp, + xfs_bmbt_irec_t *gotp, /* next extent pointer */ + xfs_bmbt_irec_t *prevp, /* previous extent pointer */ + xfs_extlen_t extsz, /* align to this extent size */ + int rt, /* is this a realtime inode? */ + int eof, /* is extent at end-of-file? */ + int delay, /* creating delalloc extent? */ + int convert, /* overwriting unwritten extent? */ + xfs_fileoff_t *offp, /* in/out: aligned offset */ + xfs_extlen_t *lenp) /* in/out: aligned length */ +{ + xfs_fileoff_t orig_off; /* original offset */ + xfs_extlen_t orig_alen; /* original length */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_fileoff_t align_off; /* temp for offset */ + xfs_extlen_t align_alen; /* temp for length */ + xfs_extlen_t temp; /* temp for calculations */ + + if (convert) + return 0; + + orig_off = align_off = *offp; + orig_alen = align_alen = *lenp; + orig_end = orig_off + orig_alen; + + /* + * If this request overlaps an existing extent, then don't + * attempt to perform any additional alignment. + */ + if (!delay && !eof && + (orig_off >= gotp->br_startoff) && + (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { + return 0; + } + + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment, or if a truncate shot us in the + * foot. + */ + temp = do_mod(orig_off, extsz); + if (temp) { + align_alen += temp; + align_off -= temp; + } + /* + * Same adjustment for the end of the requested area. + */ + if ((temp = (align_alen % extsz))) { + align_alen += extsz - temp; + } + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + if (prevp->br_startoff != NULLFILEOFF) { + if (prevp->br_startblock == HOLESTARTBLOCK) + prevo = prevp->br_startoff; + else + prevo = prevp->br_startoff + prevp->br_blockcount; + } else + prevo = 0; + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. 
+ */ + if (!eof && gotp->br_startoff != NULLFILEOFF) { + if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || + (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) + nexto = gotp->br_startoff + gotp->br_blockcount; + else + nexto = gotp->br_startoff; + } else + nexto = NULLFILEOFF; + if (!eof && + align_off + align_alen != orig_end && + align_off + align_alen > nexto) + align_off = nexto > align_alen ? nexto - align_alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + if (align_off + align_alen != orig_end && + align_off + align_alen > nexto && + nexto != NULLFILEOFF) { + ASSERT(nexto > prevo); + align_alen = nexto - align_off; + } + + /* + * If realtime, and the result isn't a multiple of the realtime + * extent size we need to remove blocks until it is. + */ + if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. + */ + if (orig_off < align_off || + orig_end > align_off + align_alen || + align_alen - temp < orig_alen) + return XFS_ERROR(EINVAL); + /* + * Try to fix it by moving the start up. + */ + if (align_off + temp <= orig_off) { + align_alen -= temp; + align_off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (align_off + align_alen - temp >= orig_end) + align_alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + align_alen -= orig_off - align_off; + align_off = orig_off; + align_alen -= align_alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < align_off || orig_end > align_off + align_alen) + return XFS_ERROR(EINVAL); + } else { + ASSERT(orig_off >= align_off); + ASSERT(orig_end <= align_off + align_alen); + } + +#ifdef DEBUG + if (!eof && gotp->br_startoff != NULLFILEOFF) + ASSERT(align_off + align_alen <= gotp->br_startoff); + if (prevp->br_startoff != NULLFILEOFF) + ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); +#endif + + *lenp = align_alen; + *offp = align_off; + return 0; +} + #define XFS_ALLOC_GAP_UNITS 4 /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ -STATIC int /* error */ +STATIC int xfs_bmap_alloc( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { @@ -2163,10 +2326,10 @@ xfs_bmap_alloc( xfs_mount_t *mp; /* mount point structure */ int nullfb; /* true if ap->firstblock isn't set */ int rt; /* true if inode is realtime */ -#ifdef __KERNEL__ - xfs_extlen_t prod=0; /* product factor for allocators */ - xfs_extlen_t ralen=0; /* realtime allocation length */ -#endif + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtx; #define ISVALID(x,y) \ (rt ? \ @@ -2182,125 +2345,25 @@ xfs_bmap_alloc( nullfb = ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; fb_agno = nullfb ? 
NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); -#ifdef __KERNEL__ if (rt) { - xfs_extlen_t extsz; /* file extent size for rt */ - xfs_fileoff_t nexto; /* next file offset */ - xfs_extlen_t orig_alen; /* original ap->alen */ - xfs_fileoff_t orig_end; /* original off+len */ - xfs_fileoff_t orig_off; /* original ap->off */ - xfs_extlen_t mod_off; /* modulus calculations */ - xfs_fileoff_t prevo; /* previous file offset */ - xfs_rtblock_t rtx; /* realtime extent number */ - xfs_extlen_t temp; /* temp for rt calculations */ - - /* - * Set prod to match the realtime extent size. - */ - if (!(extsz = ap->ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - prod = extsz / mp->m_sb.sb_rextsize; - orig_off = ap->off; - orig_alen = ap->alen; - orig_end = orig_off + orig_alen; - /* - * If the file offset is unaligned vs. the extent size - * we need to align it. This will be possible unless - * the file was previously written with a kernel that didn't - * perform this alignment. - */ - mod_off = do_mod(orig_off, extsz); - if (mod_off) { - ap->alen += mod_off; - ap->off -= mod_off; - } - /* - * Same adjustment for the end of the requested area. - */ - if ((temp = (ap->alen % extsz))) - ap->alen += extsz - temp; - /* - * If the previous block overlaps with this proposed allocation - * then move the start forward without adjusting the length. - */ - prevo = - ap->prevp->br_startoff == NULLFILEOFF ? - 0 : - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); - if (ap->off != orig_off && ap->off < prevo) - ap->off = prevo; - /* - * If the next block overlaps with this proposed allocation - * then move the start back without adjusting the length, - * but not before offset 0. - * This may of course make the start overlap previous block, - * and if we hit the offset 0 limit then the next block - * can still overlap too. - */ - nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ? - NULLFILEOFF : ap->gotp->br_startoff; - if (!ap->eof && - ap->off + ap->alen != orig_end && - ap->off + ap->alen > nexto) - ap->off = nexto > ap->alen ? nexto - ap->alen : 0; - /* - * If we're now overlapping the next or previous extent that - * means we can't fit an extsz piece in this hole. Just move - * the start forward to the first valid spot and set - * the length so we hit the end. - */ - if ((ap->off != orig_off && ap->off < prevo) || - (ap->off + ap->alen != orig_end && - ap->off + ap->alen > nexto)) { - ap->off = prevo; - ap->alen = nexto - prevo; - } - /* - * If the result isn't a multiple of rtextents we need to - * remove blocks until it is. - */ - if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) { - /* - * We're not covering the original request, or - * we won't be able to once we fix the length. - */ - if (orig_off < ap->off || - orig_end > ap->off + ap->alen || - ap->alen - temp < orig_alen) - return XFS_ERROR(EINVAL); - /* - * Try to fix it by moving the start up. - */ - if (ap->off + temp <= orig_off) { - ap->alen -= temp; - ap->off += temp; - } - /* - * Try to fix it by moving the end in. - */ - else if (ap->off + ap->alen - temp >= orig_end) - ap->alen -= temp; - /* - * Set the start to the minimum then trim the length. - */ - else { - ap->alen -= orig_off - ap->off; - ap->off = orig_off; - ap->alen -= ap->alen % mp->m_sb.sb_rextsize; - } - /* - * Result doesn't cover the request, fail it. - */ - if (orig_off < ap->off || orig_end > ap->off + ap->alen) - return XFS_ERROR(EINVAL); - } + align = ap->ip->i_d.di_extsize ? 
+ ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize; + /* Set prod to match the extent size */ + prod = align / mp->m_sb.sb_rextsize; + + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, rt, ap->eof, 0, + ap->conv, &ap->off, &ap->alen); + if (error) + return error; + ASSERT(ap->alen); ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, extsz) || ap->alen % extsz) + if (do_mod(ap->off, align) || ap->alen % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. @@ -2326,15 +2389,24 @@ xfs_bmap_alloc( ap->rval = rtx * mp->m_sb.sb_rextsize; } else ap->rval = 0; + } else { + align = (ap->userdata && ap->ip->i_d.di_extsize && + (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ? + ap->ip->i_d.di_extsize : 0; + if (unlikely(align)) { + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, rt, + ap->eof, 0, ap->conv, + &ap->off, &ap->alen); + ASSERT(!error); + ASSERT(ap->alen); + } + if (nullfb) + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + else + ap->rval = ap->firstblock; } -#else - if (rt) - ap->rval = 0; -#endif /* __KERNEL__ */ - else if (nullfb) - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - else - ap->rval = ap->firstblock; + /* * If allocating at eof, and there's a previous real block, * try to use it's last block as our starting point. @@ -2598,11 +2670,12 @@ xfs_bmap_alloc( args.total = ap->total; args.minlen = ap->minlen; } - if (ap->ip->i_d.di_extsize) { + if (unlikely(ap->userdata && ap->ip->i_d.di_extsize && + (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) { args.prod = ap->ip->i_d.di_extsize; if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); - } else if (mp->m_sb.sb_blocksize >= NBPP) { + } else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) { args.prod = 1; args.mod = 0; } else { @@ -4590,6 +4663,7 @@ xfs_bmapi( char contig; /* allocation must be one extent */ char delay; /* this request is for delayed alloc */ char exact; /* don't do all of wasdelayed extent */ + char convert; /* unwritten extent I/O completion */ xfs_bmbt_rec_t *ep; /* extent list entry pointer */ int error; /* error return */ xfs_bmbt_irec_t got; /* current extent list record */ @@ -4643,7 +4717,7 @@ xfs_bmapi( } if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = XFS_IS_REALTIME_INODE(ip); + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); @@ -4654,6 +4728,7 @@ xfs_bmapi( delay = (flags & XFS_BMAPI_DELAY) != 0; trim = (flags & XFS_BMAPI_ENTIRE) == 0; userdata = (flags & XFS_BMAPI_METADATA) == 0; + convert = (flags & XFS_BMAPI_CONVERT) != 0; exact = (flags & XFS_BMAPI_EXACT) != 0; rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; contig = (flags & XFS_BMAPI_CONTIG) != 0; @@ -4748,16 +4823,26 @@ xfs_bmapi( } minlen = contig ? 
alen : 1; if (delay) { - xfs_extlen_t extsz = 0; + xfs_extlen_t extsz; /* Figure out the extent size, adjust alen */ if (rt) { if (!(extsz = ip->i_d.di_extsize)) extsz = mp->m_sb.sb_rextsize; - alen = roundup(alen, extsz); - extsz = alen / mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; + } + if (extsz) { + error = xfs_bmap_extsize_align(mp, + &got, &prev, extsz, + rt, eof, delay, convert, + &aoff, &alen); + ASSERT(!error); } + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + /* * Make a transaction-less quota reservation for * delayed allocation blocks. This number gets @@ -4785,14 +4870,15 @@ xfs_bmapi( xfs_bmap_worst_indlen(ip, alen); ASSERT(indlen > 0); - if (rt) + if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, -(extsz), rsvd); - else + } else { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -(alen), rsvd); + } if (!error) { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, @@ -4811,6 +4897,7 @@ xfs_bmapi( if (error) { if (XFS_IS_QUOTA_ON(ip->i_mount)) /* unreserve the blocks now */ + (void) XFS_TRANS_UNRESERVE_QUOTA_NBLKS( mp, NULL, ip, (long)alen, 0, rt ? @@ -4849,6 +4936,7 @@ xfs_bmapi( bma.firstblock = *firstblock; bma.alen = alen; bma.off = aoff; + bma.conv = convert; bma.wasdel = wasdelay; bma.minlen = minlen; bma.low = flist->xbf_low; @@ -5270,8 +5358,7 @@ xfs_bunmapi( return 0; } XFS_STATS_INC(xs_blk_unmap); - isrt = (whichfork == XFS_DATA_FORK) && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, @@ -5443,7 +5530,7 @@ xfs_bunmapi( } if (wasdel) { ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); - /* Update realtim/data freespace, unreserve quota */ + /* Update realtime/data freespace, unreserve quota */ if (isrt) { xfs_filblks_t rtexts; @@ -5451,14 +5538,14 @@ xfs_bunmapi( do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, (int)rtexts, rsvd); - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, - -((long)del.br_blockcount), 0, + (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, + NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)del.br_blockcount, rsvd); - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, - -((long)del.br_blockcount), 0, + (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, + NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); } ip->i_delayed_blks -= del.br_blockcount; @@ -5652,7 +5739,9 @@ xfs_getbmap( ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) return XFS_ERROR(EINVAL); if (whichfork == XFS_DATA_FORK) { - if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) { + if ((ip->i_d.di_extsize && (ip->i_d.di_flags & + (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; fixlen = XFS_MAXIOFFSET(mp); } else { diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 2e0717a01309..12cc63dfc2c4 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,6 +62,10 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ /* combine contig. 
space */ #define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ +/* XFS_BMAPI_DIRECT_IO 0x800 */ +#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */ + /* need write cache flushing and no */ + /* additional allocation alignments */ #define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) static inline int xfs_bmapi_aflag(int w) @@ -101,7 +105,8 @@ typedef struct xfs_bmalloca { char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ char low; /* low on space, using seq'l ags */ - char aeof; /* allocated space at eof */ + char aeof; /* allocated space at eof */ + char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index c5a0e537ff1a..f697aab8a3d2 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -246,8 +246,10 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ #define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ #define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ -#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ -#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) @@ -259,11 +261,14 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) #define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) #define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ - XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS) + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT) #endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index ba096f80f48d..3280f49496ba 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -65,6 +65,8 @@ struct fsxattr { #define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ #define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ #define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index df0d4572d70a..e486c7d244c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -809,6 +809,10 @@ _xfs_dic2xflags( flags |= XFS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) flags |= XFS_XFLAG_NOSYMLINKS; + if (di_flags & XFS_DIFLAG_EXTSIZE) + flags |= XFS_XFLAG_EXTSIZE; + if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + flags |= XFS_XFLAG_EXTSZINHERIT; } return flags; @@ -1192,11 +1196,19 @@ xfs_ialloc( if ((mode & S_IFMT) == S_IFDIR) { if 
(pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - } else { + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } else if ((mode & S_IFMT) == S_IFREG) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { di_flags |= XFS_DIFLAG_REALTIME; ip->i_iocore.io_flags |= XFS_IOCORE_RT; } + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } } if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && xfs_inherit_noatime) @@ -1262,7 +1274,7 @@ xfs_isize_check( if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) return; - if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME ) + if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE)) return; nimaps = 2; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 45a77a3a6c07..5ecf3e3e86aa 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -263,7 +263,7 @@ phase2: case BMAPI_WRITE: /* If we found an extent, return it */ if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && + (imap.br_startblock != HOLESTARTBLOCK) && (imap.br_startblock != DELAYSTARTBLOCK)) { xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, offset, count, iomapp, &imap, flags); @@ -317,6 +317,58 @@ out: return XFS_ERROR(error); } +STATIC int +xfs_iomap_eof_align_last_fsb( + xfs_mount_t *mp, + xfs_iocore_t *io, + xfs_fsize_t isize, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_fileoff_t new_last_fsb = 0; + xfs_extlen_t align; + int eof, error; + + if (io->io_flags & XFS_IOCORE_RT) + ; + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. + */ + else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && + (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) + new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) + new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + + /* + * Always round up the allocation request to an extent boundary + * (when file on a real-time subvolume or has di_extsize hint). + */ + if (extsize) { + if (new_last_fsb) + align = roundup_64(new_last_fsb, extsize); + else + align = extsize; + new_last_fsb = roundup_64(*last_fsb, align); + } + + if (new_last_fsb) { + error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) + return error; + if (eof) + *last_fsb = new_last_fsb; + } + return 0; +} + STATIC int xfs_flush_space( xfs_inode_t *ip, @@ -363,19 +415,20 @@ xfs_iomap_write_direct( xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; - xfs_filblks_t count_fsb; + xfs_filblks_t count_fsb, resaligned; xfs_fsblock_t firstfsb; + xfs_extlen_t extsz, temp; + xfs_fsize_t isize; int nimaps; - int error; int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; - xfs_filblks_t qblocks, resblks; + uint qblocks, resblks, resrtextents; int committed; - int resrtextents; + int error; /* * Make sure that the dquots are there. 
This doesn't hold @@ -385,37 +438,52 @@ xfs_iomap_write_direct( if (error) return XFS_ERROR(error); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - count_fsb = last_fsb - offset_fsb; - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) { - xfs_fileoff_t map_last_fsb; - - map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff; - if (map_last_fsb < last_fsb) { - last_fsb = map_last_fsb; - count_fsb = last_fsb - offset_fsb; - } - ASSERT(count_fsb > 0); + rt = XFS_IS_REALTIME_INODE(ip); + if (unlikely(rt)) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; } - /* - * Determine if reserving space on the data or realtime partition. - */ - if ((rt = XFS_IS_REALTIME_INODE(ip))) { - xfs_extlen_t extsz; + isize = ip->i_d.di_size; + if (io->io_new_size > isize) + isize = io->io_new_size; - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - resrtextents = qblocks = (count_fsb + extsz - 1); - do_div(resrtextents, mp->m_sb.sb_rextsize); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > isize) { + error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, + &last_fsb); + if (error) + goto error_out; } else { - resrtextents = 0; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, count_fsb); - quota_flag = XFS_QMOPT_RES_REGBLKS; + if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + last_fsb = MIN(last_fsb, (xfs_fileoff_t) + ret_imap->br_blockcount + + ret_imap->br_startoff); } + count_fsb = last_fsb - offset_fsb; + ASSERT(count_fsb > 0); + + resaligned = count_fsb; + if (unlikely(extsz)) { + if ((temp = do_mod(offset_fsb, extsz))) + resaligned += temp; + if ((temp = do_mod(resaligned, extsz))) + resaligned += extsz - temp; + } + + if (unlikely(rt)) { + resrtextents = qblocks = resaligned; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } /* * Allocate and setup the transaction @@ -426,7 +494,6 @@ xfs_iomap_write_direct( XFS_WRITE_LOG_RES(mp), resrtextents, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); - /* * Check for running out of space, note: need lock to return */ @@ -436,20 +503,20 @@ xfs_iomap_write_direct( if (error) goto error_out; - if (XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag)) { - error = (EDQUOT); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, + qblocks, 0, quota_flag); + if (error) goto error1; - } - bmapi_flag = XFS_BMAPI_WRITE; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt)) + bmapi_flag = XFS_BMAPI_WRITE; + if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the bmapi() call to allocate the blocks + * Issue the xfs_bmapi() call to allocate the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); nimaps = 1; @@ -501,6 +568,62 @@ error_out: return XFS_ERROR(error); } +/* + * If the caller is doing a write at the end of the file, + * then extend the allocation out to the file system's write + * iosize. We clean up any extra space left over when the + * file is closed in xfs_inactive(). 
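Aside - not part of the patch: the reworked xfs_iomap_write_direct() above reserves space for the aligned allocation rather than the raw request, via the resaligned calculation. A self-contained sketch of that calculation; plain modulo stands in for do_mod() and the names are illustrative:

#include <assert.h>
#include <stdint.h>

/* Blocks to reserve for count_fsb blocks at offset_fsb when the
 * extent size hint extsz may widen the allocation at both ends. */
static uint64_t resaligned_blocks(uint64_t offset_fsb, uint64_t count_fsb,
				  uint32_t extsz)
{
	uint64_t resaligned = count_fsb, temp;

	if (!extsz)
		return resaligned;
	if ((temp = offset_fsb % extsz))	/* front padding */
		resaligned += temp;
	if ((temp = resaligned % extsz))	/* tail padding */
		resaligned += extsz - temp;
	return resaligned;
}

int main(void)
{
	assert(resaligned_blocks(10, 3, 8) == 8);	/* widened both ways */
	assert(resaligned_blocks(16, 8, 8) == 8);	/* already aligned */
	assert(resaligned_blocks(0, 9, 8) == 16);	/* tail rounded up */
	return 0;
}

Reserving against this widened figure is what lets the later xfs_bmapi() call allocate a whole extsz-aligned extent without the quota or space reservation coming up short.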
+ * + * For sync writes, we are flushing delayed allocate space to + * try to make additional space available for allocation near + * the filesystem full boundary - preallocation hurts in that + * situation, of course. + */ +STATIC int +xfs_iomap_eof_want_preallocate( + xfs_mount_t *mp, + xfs_iocore_t *io, + xfs_fsize_t isize, + xfs_off_t offset, + size_t count, + int ioflag, + xfs_bmbt_irec_t *imap, + int nimaps, + int *prealloc) +{ + xfs_fileoff_t start_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstblock; + int n, error, imaps; + + *prealloc = 0; + if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) + return 0; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation. + */ + start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + while (count_fsb > 0) { + imaps = nimaps; + error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, + 0, &firstblock, 0, imap, &imaps, NULL); + if (error) + return error; + for (n = 0; n < imaps; n++) { + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) + return 0; + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + } + } + *prealloc = 1; + return 0; +} + int xfs_iomap_write_delay( xfs_inode_t *ip, @@ -514,13 +637,15 @@ xfs_iomap_write_delay( xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; - xfs_fsize_t isize; + xfs_off_t aligned_offset; + xfs_fileoff_t ioalign; xfs_fsblock_t firstblock; + xfs_extlen_t extsz; + xfs_fsize_t isize; int nimaps; - int error; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int aeof; - int fsynced = 0; + int prealloc, fsynced = 0; + int error; ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); @@ -528,152 +653,57 @@ xfs_iomap_write_delay( * Make sure that the dquots are there. This doesn't hold * the ilock across a disk read. */ - error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); if (error) return XFS_ERROR(error); + if (XFS_IS_REALTIME_INODE(ip)) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; + } + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + retry: isize = ip->i_d.di_size; - if (io->io_new_size > isize) { + if (io->io_new_size > isize) isize = io->io_new_size; - } - aeof = 0; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - /* - * If the caller is doing a write at the end of the file, - * then extend the allocation (and the buffer used for the write) - * out to the file system's write iosize. We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * For sync writes, we are flushing delayed allocate space to - * try to make additional space available for allocation near - * the filesystem full boundary - preallocation hurts in that - * situation, of course. - */ - if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) { - xfs_off_t aligned_offset; - xfs_filblks_t count_fsb; - unsigned int iosize; - xfs_fileoff_t ioalign; - int n; - xfs_fileoff_t start_fsb; + error = xfs_iomap_eof_want_preallocate(mp, io, isize, offset, count, + ioflag, imap, XFS_WRITE_IMAPS, &prealloc); + if (error) + return error; - /* - * If there are any real blocks past eof, then don't - * do any speculative allocation. 
- */ - start_fsb = XFS_B_TO_FSBT(mp, - ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - while (count_fsb > 0) { - nimaps = XFS_WRITE_IMAPS; - error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, - 0, &firstblock, 0, imap, &nimaps, NULL); - if (error) { - return error; - } - for (n = 0; n < nimaps; n++) { - if ( !(io->io_flags & XFS_IOCORE_RT) && - !imap[n].br_startblock) { - cmn_err(CE_PANIC,"Access to block " - "zero: fs <%s> inode: %lld " - "start_block : %llx start_off " - ": %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - imap[n].br_startblock, - imap[n].br_startoff, - imap[n].br_blockcount, - imap[n].br_state); - } - if ((imap[n].br_startblock != HOLESTARTBLOCK) && - (imap[n].br_startblock != DELAYSTARTBLOCK)) { - goto write_map; - } - start_fsb += imap[n].br_blockcount; - count_fsb -= imap[n].br_blockcount; - } - } - iosize = mp->m_writeio_blocks; + if (prealloc) { aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + iosize; - aeof = 1; + last_fsb = ioalign + mp->m_writeio_blocks; + } else { + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } -write_map: - nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. - */ - if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth - && (mp->m_flags & XFS_MOUNT_SWALLOC) - && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - - new_last_fsb = roundup_64(last_fsb, mp->m_swidth); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { - return error; - } - if (eof) { - last_fsb = new_last_fsb; - } - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign && - (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - new_last_fsb = roundup_64(last_fsb, mp->m_dalign); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { - return error; - } - if (eof) { - last_fsb = new_last_fsb; - } - /* - * Round up the allocation request to a real-time extent boundary - * if the file is on the real-time subvolume. - */ - } else if (io->io_flags & XFS_IOCORE_RT && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - - new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize); - error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { + if (prealloc || extsz) { + error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, + &last_fsb); + if (error) return error; - } - if (eof) - last_fsb = new_last_fsb; } + + nimaps = XFS_WRITE_IMAPS; + firstblock = NULLFSBLOCK; error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(last_fsb - offset_fsb), XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, &nimaps, NULL); - /* - * This can be EDQUOT, if nimaps == 0 - */ - if (error && (error != ENOSPC)) { + if (error && (error != ENOSPC)) return XFS_ERROR(error); - } + /* * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space. 
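Aside - not part of the patch: when xfs_iomap_eof_want_preallocate() says yes, the code above sizes the delalloc reservation by aligning the last written byte down to a write-iosize boundary and reserving one full write iosize beyond it. A sketch in byte arithmetic; BLOCKSIZE and WRITEIO_BLOCKS are assumed stand-ins for mp->m_sb.sb_blocksize and mp->m_writeio_blocks:

#include <assert.h>
#include <stdint.h>

#define BLOCKSIZE	4096ULL		/* assumed fs block size */
#define WRITEIO_BLOCKS	16ULL		/* assumed 64k write iosize */

/* File-system block to reserve out to for a write of count bytes at
 * offset (mirrors the prealloc branch of xfs_iomap_write_delay()). */
static uint64_t prealloc_last_fsb(uint64_t offset, uint64_t count)
{
	uint64_t last_byte = offset + count - 1;
	uint64_t iosize_bytes = WRITEIO_BLOCKS * BLOCKSIZE;
	uint64_t aligned = last_byte - last_byte % iosize_bytes;

	return aligned / BLOCKSIZE + WRITEIO_BLOCKS;
}

int main(void)
{
	/* a 1-byte write at offset 0 still reserves a full 64k */
	assert(prealloc_last_fsb(0, 1) == 16);
	/* a write ending at 100k reserves out to 128k (block 32) */
	assert(prealloc_last_fsb(0, 100 * 1024) == 32);
	return 0;
}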
+ * then we must have run out of space - flush delalloc, and retry.. */ if (nimaps == 0) { xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, @@ -685,9 +715,7 @@ write_map: goto retry; } - *ret_imap = imap[0]; - *nmaps = 1; - if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { + if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " "start_block : %llx start_off : %llx blkcnt : %llx " "extent-state : %x \n", @@ -696,6 +724,10 @@ write_map: ret_imap->br_startblock, ret_imap->br_startoff, ret_imap->br_blockcount,ret_imap->br_state); } + + *ret_imap = imap[0]; + *nmaps = 1; + return 0; } @@ -868,17 +900,17 @@ xfs_iomap_write_unwritten( { xfs_mount_t *mp = ip->i_mount; xfs_iocore_t *io = &ip->i_iocore; - xfs_trans_t *tp; xfs_fileoff_t offset_fsb; xfs_filblks_t count_fsb; xfs_filblks_t numblks_fsb; - xfs_bmbt_irec_t imap; + xfs_fsblock_t firstfsb; + int nimaps; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + xfs_bmap_free_t free_list; + uint resblks; int committed; int error; - int nres; - int nimaps; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, &ip->i_iocore, offset, count); @@ -887,9 +919,9 @@ xfs_iomap_write_unwritten( count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); - do { - nres = XFS_DIOSTRAT_SPACE_RES(mp, 0); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + do { /* * set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until @@ -897,7 +929,7 @@ xfs_iomap_write_unwritten( */ tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - error = xfs_trans_reserve(tp, nres, + error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); @@ -916,7 +948,7 @@ xfs_iomap_write_unwritten( XFS_BMAP_INIT(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE, &firstfsb, + XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; @@ -930,7 +962,7 @@ xfs_iomap_write_unwritten( xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) goto error0; - + if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { cmn_err(CE_PANIC,"Access to block zero: fs <%s> " "inode: %lld start_block : %llx start_off : " diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 5f6dce3b4fd6..a2b422c984f2 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -540,24 +540,6 @@ xfs_setattr( goto error_return; } - /* - * Can't set extent size unless the file is marked, or - * about to be marked as a realtime file. - * - * This check will be removed when fixed size extents - * with buffered data writes is implemented. - * - */ - if ((mask & XFS_AT_EXTSIZE) && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != - vap->va_extsize) && - (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) || - ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME))))) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - /* * Can't change realtime flag if any extents are allocated. 
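Aside - not part of the patch: the xfs_setattr() hunk below and the _xfs_dic2xflags() hunk earlier in this series translate between the user-visible XFS_XFLAG_* bits and the on-disk XFS_DIFLAG_* bits. A cut-down, runnable mirror of the on-disk-to-user direction, restricted to three flags; the EXTSIZE and EXTSZINHERIT values match the xfs_dinode.h and xfs_fs.h hunks above, while the REALTIME values are assumed:

#include <assert.h>
#include <stdint.h>

#define DIFLAG_REALTIME		(1 << 0)	/* assumed bit 0 */
#define DIFLAG_EXTSIZE		(1 << 11)
#define DIFLAG_EXTSZINHERIT	(1 << 12)

#define XFLAG_REALTIME		0x00000001	/* assumed value */
#define XFLAG_EXTSIZE		0x00000800
#define XFLAG_EXTSZINHERIT	0x00001000

/* Map each on-disk inode flag to its user-visible counterpart. */
static uint32_t dic2xflags(uint16_t di_flags)
{
	uint32_t flags = 0;

	if (di_flags & DIFLAG_REALTIME)
		flags |= XFLAG_REALTIME;
	if (di_flags & DIFLAG_EXTSIZE)
		flags |= XFLAG_EXTSIZE;
	if (di_flags & DIFLAG_EXTSZINHERIT)
		flags |= XFLAG_EXTSZINHERIT;
	return flags;
}

int main(void)
{
	assert(dic2xflags(DIFLAG_EXTSIZE) == XFLAG_EXTSIZE);
	assert(dic2xflags(DIFLAG_REALTIME | DIFLAG_EXTSZINHERIT) ==
	       (XFLAG_REALTIME | XFLAG_EXTSZINHERIT));
	return 0;
}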
*/ @@ -820,13 +802,17 @@ xfs_setattr( di_flags |= XFS_DIFLAG_RTINHERIT; if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - } else { + if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { if (vap->va_xflags & XFS_XFLAG_REALTIME) { di_flags |= XFS_DIFLAG_REALTIME; ip->i_iocore.io_flags |= XFS_IOCORE_RT; } else { ip->i_iocore.io_flags &= ~XFS_IOCORE_RT; } + if (vap->va_xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; } ip->i_d.di_flags = di_flags; } @@ -1568,7 +1554,8 @@ xfs_release( if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) { + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { if ((error = xfs_inactive_free_eofblocks(mp, ip))) return (error); /* Update linux inode block count after free above */ @@ -1644,9 +1631,10 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) || - (ip->i_delayed_blks != 0))) { + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0)))) { if ((error = xfs_inactive_free_eofblocks(mp, ip))) return (VN_INACTIVE_CACHE); /* Update linux inode block count after free above */ @@ -3998,42 +3986,36 @@ xfs_alloc_file_space( int alloc_type, int attr_flags) { + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; xfs_filblks_t allocated_fsb; xfs_filblks_t allocatesize_fsb; - int committed; - xfs_off_t count; - xfs_filblks_t datablocks; - int error; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t *imapp; - xfs_bmbt_irec_t imaps[1]; - xfs_mount_t *mp; - int numrtextents; - int reccount; - uint resblks; + int nimaps; + int bmapi_flag; + int quota_flag; int rt; - int rtextsize; - xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; - int xfs_bmapi_flags; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); - mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - /* - * determine if this is a realtime file - */ - if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) { - if (ip->i_d.di_extsize) - rtextsize = ip->i_d.di_extsize; - else - rtextsize = mp->m_sb.sb_rextsize; - } else - rtextsize = 0; + rt = XFS_IS_REALTIME_INODE(ip); + if (unlikely(rt)) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; + } if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return error; @@ -4044,8 +4026,8 @@ xfs_alloc_file_space( count = len; error = 0; imapp = &imaps[0]; - reccount = 1; - xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); + nimaps = 1; + bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? 
XFS_BMAPI_PREALLOC : 0); startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -4066,43 +4048,51 @@ xfs_alloc_file_space( } /* - * allocate file space until done or until there is an error + * Allocate file space until done or until there is an error */ retry: while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + /* - * determine if reserving space on - * the data or realtime partition. + * Determine space reservations for data/realtime, */ - if (rt) { - xfs_fileoff_t s, e; - + if (unlikely(extsz)) { s = startoffset_fsb; - do_div(s, rtextsize); - s *= rtextsize; - e = roundup_64(startoffset_fsb + allocatesize_fsb, - rtextsize); - numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize; - datablocks = 0; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + if (unlikely(rt)) { + resrtextents = qblocks = (uint)(e - s); + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; } else { - datablocks = allocatesize_fsb; - numrtextents = 0; + resrtextents = 0; + resblks = qblocks = \ + XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); + quota_flag = XFS_QMOPT_RES_REGBLKS; } /* - * allocate and setup the transaction + * Allocate and setup the transaction. */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); - error = xfs_trans_reserve(tp, - resblks, - XFS_WRITE_LOG_RES(mp), - numrtextents, + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), resrtextents, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); - /* - * check for running out of space + * Check for running out of space */ if (error) { /* @@ -4113,8 +4103,8 @@ retry: break; } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, - ip->i_udquot, ip->i_gdquot, resblks, 0, 0); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, + qblocks, 0, quota_flag); if (error) goto error1; @@ -4122,19 +4112,19 @@ retry: xfs_trans_ihold(tp, ip); /* - * issue the bmapi() call to allocate the blocks + * Issue the xfs_bmapi() call to allocate the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, xfs_bmapi_flags, - &firstfsb, 0, imapp, &reccount, + allocatesize_fsb, bmapi_flag, + &firstfsb, 0, imapp, &nimaps, &free_list); if (error) { goto error0; } /* - * complete the transaction + * Complete the transaction */ error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); if (error) { @@ -4149,7 +4139,7 @@ retry: allocated_fsb = imapp->br_blockcount; - if (reccount == 0) { + if (nimaps == 0) { error = XFS_ERROR(ENOSPC); break; } @@ -4172,9 +4162,11 @@ dmapi_enospc_check: return error; - error0: +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_bmap_cancel(&free_list); - error1: + XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto dmapi_enospc_check; @@ -4419,8 +4411,8 @@ xfs_free_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); error = XFS_TRANS_RESERVE_QUOTA(mp, tp, - ip->i_udquot, ip->i_gdquot, resblks, 0, rt ? 
- XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ip->i_udquot, ip->i_gdquot, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto error1; -- cgit v1.2.3 From 70a061f1fdbfa4805233a113868d059e9614731a Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:28:45 +1100 Subject: [XFS] Fix typo from when enabling write barriers by default, flags botch in showargs. SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:24383a Signed-off-by: Nathan Scott --- fs/xfs/xfs_vfsops.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 3406399e25b3..e344906bdcbb 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -1924,36 +1924,30 @@ xfs_showargs( if (mp->m_logbufs > 0) seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); - if (mp->m_logbsize > 0) seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); - if (mp->m_rtname) seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); - if (mp->m_swidth > 0) seq_printf(m, "," MNTOPT_SWIDTH "=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) seq_printf(m, "," MNTOPT_ATTR2); - if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) seq_printf(m, "," MNTOPT_LARGEIO); + if (!(mp->m_flags & XFS_MOUNT_BARRIER)) + seq_printf(m, "," MNTOPT_NOBARRIER); if (!(vfsp->vfs_flag & VFS_32BITINODES)) seq_printf(m, "," MNTOPT_64BITINODE); - - if (!(vfsp->vfs_flag & XFS_MOUNT_BARRIER)) - seq_printf(m, "," MNTOPT_NOBARRIER); - if (vfsp->vfs_flag & VFS_GRPID) seq_printf(m, "," MNTOPT_GRPID); -- cgit v1.2.3 From c7d437da3dda0ac7199c320b6a48c04ec37a614d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:28:56 +1100 Subject: [XFS] do barrier checks earlier. quota initialization may write to the filesystem SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:202355a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/xfs_vfsops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index e344906bdcbb..bfdabde23db0 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -506,13 +506,14 @@ xfs_mount( if (error) goto error2; + if ((mp->m_flags & XFS_MOUNT_NOATIME) && + !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)) + xfs_mountfs_check_barriers(mp); + error = XFS_IOINIT(vfsp, args, flags); if (error) goto error2; - if ((args->flags & XFSMNT_BARRIER) && - !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)) - xfs_mountfs_check_barriers(mp); return 0; error2: -- cgit v1.2.3 From 1661dc8e7a2c6aeb8f2fb6a8197909c95c220a71 Mon Sep 17 00:00:00 2001 From: Yingping Lu Date: Wed, 11 Jan 2006 15:29:39 +1100 Subject: [XFS] Fixed an assertion failure in xfs_reclaim caused by delayed block. The assertion failure came from XFS QA41. The fix is done by enabling truncate for delayed block in xfs_inactive. SGI-PV: 945412 SGI-Modid: xfs-linux-melb:xfs-kern:202521a Signed-off-by: Yingping Lu Signed-off-by: Nathan Scott --- fs/xfs/xfs_vnodeops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a2b422c984f2..f6aed9ef6a61 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1612,7 +1612,8 @@ xfs_inactive( * only one with a reference to the inode. 
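Aside - not part of the patch: the one-line xfs_inactive() fix below adds ip->i_delayed_blks to the truncate decision, so an unlinked file whose only remaining space is delayed-allocation blocks still gets truncated. A standalone restatement of the resulting predicate, with a stub struct standing in for the XFS inode:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct inode_info {			/* stub for the fields read */
	uint32_t nlink;
	uint64_t size;
	uint32_t nextents;
	uint32_t delayed_blks;
	bool	 is_reg;
};

/* True when an inactive inode still needs its space torn down. */
static bool needs_truncate(const struct inode_info *ip)
{
	return ip->nlink == 0 &&
	       (ip->size != 0 || ip->nextents > 0 ||
		ip->delayed_blks > 0) &&
	       ip->is_reg;
}

int main(void)
{
	struct inode_info ip = { .nlink = 0, .delayed_blks = 4,
				 .is_reg = true };

	assert(needs_truncate(&ip));	/* delalloc-only inode: truncate */
	ip.delayed_blks = 0;
	assert(!needs_truncate(&ip));	/* nothing allocated: skip */
	return 0;
}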
*/ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) && + ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) || + (ip->i_delayed_blks > 0)) && ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); mp = ip->i_mount; -- cgit v1.2.3 From 1df84c930a5101223da016f256b46b192dbf2b30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:29:52 +1100 Subject: [XFS] Mark some lookup tables const. Thanks to Arjan van de Ven for spotting these. SGI-PV: 946028 SGI-Modid: xfs-linux-melb:xfs-kern:202617a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_stats.c | 2 +- fs/xfs/support/debug.c | 2 +- fs/xfs/xfs_mount.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index 6c40a74be7c8..8955720a2c6b 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -34,7 +34,7 @@ xfs_read_xfsstats( __uint64_t xs_write_bytes = 0; __uint64_t xs_read_bytes = 0; - static struct xstats_entry { + static const struct xstats_entry { char *desc; int endpoint; } xstats[] = { diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index bb6dc91ea261..e281c3a71ad0 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -27,7 +27,7 @@ static DEFINE_SPINLOCK(xfs_err_lock); /* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ #define XFS_MAX_ERR_LEVEL 7 #define XFS_ERR_MASK ((1 << 3) - 1) -static char *err_level[XFS_MAX_ERR_LEVEL+1] = +static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = {KERN_EMERG, KERN_ALERT, KERN_CRIT, KERN_ERR, KERN_WARNING, KERN_NOTICE, KERN_INFO, KERN_DEBUG}; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 303af86739bf..f9754896cd3b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -51,7 +51,7 @@ STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_uuid_unmount(xfs_mount_t *mp); STATIC void xfs_unmountfs_wait(xfs_mount_t *); -static struct { +static const struct { short offset; short type; /* 0 = integer * 1 = binary / string (no translation) -- cgit v1.2.3 From e13a73f02595c564e423bda5742fb8df4ebca455 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:30:08 +1100 Subject: [XFS] Write log dummy record when freezing filesystem SGI-PV: 945483 SGI-Modid: xfs-linux-melb:xfs-kern:202638a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/xfs_fsops.c | 26 ++++++++++++++++++++++++++ fs/xfs/xfs_fsops.h | 1 + fs/xfs/xfs_vfsops.c | 2 ++ 3 files changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d1236d6f4045..163031c1e394 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -540,6 +540,32 @@ xfs_reserve_blocks( return(0); } +void +xfs_fs_log_dummy(xfs_mount_t *mp) +{ + xfs_trans_t *tp; + xfs_inode_t *ip; + + + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + atomic_inc(&mp->m_active_trans); + if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + xfs_trans_cancel(tp, 0); + return; + } + + ip = mp->m_rootip; + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + xfs_trans_commit(tp, 0, NULL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + int xfs_fs_goingdown( xfs_mount_t *mp, diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f14f9a..300d0c9d61ad 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h 
@@ -25,5 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern void xfs_fs_log_dummy(xfs_mount_t *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index bfdabde23db0..dcdb0a8578e1 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -53,6 +53,7 @@ #include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_clnt.h" +#include "xfs_fsops.h" STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); @@ -1967,6 +1968,7 @@ xfs_freeze( /* Push the superblock and write an unmount record */ xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); + xfs_fs_log_dummy(mp); } -- cgit v1.2.3 From 13059ff04c5071c3f6f8a4bd9e51631849f63fa4 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:32:01 +1100 Subject: [XFS] Reverse the sense of COMPAT_ATTR and ATTR2, keeps it simple and consistent. SGI-PV: 941645 SGI-Modid: xfs-linux-melb:xfs-kern:202961a Signed-off-by: Nathan Scott --- fs/xfs/xfs_attr_leaf.c | 12 ++++++------ fs/xfs/xfs_bmap.c | 12 ++++++------ fs/xfs/xfs_clnt.h | 2 +- fs/xfs/xfs_dir.c | 2 +- fs/xfs/xfs_mount.h | 2 +- fs/xfs/xfs_vfsops.c | 19 ++++++++----------- 6 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 1c7421840c18..fe91eac4e2a7 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -128,7 +128,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return (offset >= minforkoff) ? minforkoff : 0; } - if (unlikely(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) { + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { if (bytes <= XFS_IFORK_ASIZE(dp)) return mp->m_attroffset >> 3; return 0; @@ -157,7 +157,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) { unsigned long s; - if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR) && + if ((mp->m_flags & XFS_MOUNT_ATTR2) && !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) { s = XFS_SB_LOCK(mp); if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { @@ -311,7 +311,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) */ totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && - !(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) { + (mp->m_flags & XFS_MOUNT_ATTR2)) { /* * Last attribute now removed, revert to original * inode format making all literal area available @@ -330,7 +330,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || - (mp->m_flags & XFS_MOUNT_COMPAT_ATTR)); + !(mp->m_flags & XFS_MOUNT_ATTR2)); dp->i_afp->if_ext_max = XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); dp->i_df.if_ext_max = @@ -739,7 +739,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) + name_loc->namelen + INT_GET(name_loc->valuelen, ARCH_CONVERT); } - if (!(dp->i_mount->m_flags & XFS_MOUNT_COMPAT_ATTR) && + if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return(-1); return(xfs_attr_shortform_bytesfit(dp, bytes)); @@ -778,7 +778,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) goto out; if (forkoff == -1) { - ASSERT(!(dp->i_mount->m_flags & XFS_MOUNT_COMPAT_ATTR)); + ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); /* * Last attribute was removed, revert to original diff --git a/fs/xfs/xfs_bmap.c 
b/fs/xfs/xfs_bmap.c index 8a32d65211b0..8ac170b828f5 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3948,7 +3948,7 @@ xfs_bmap_add_attrfork( ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_d.di_forkoff) ip->i_d.di_forkoff = mp->m_attroffset >> 3; - else if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) + else if (mp->m_flags & XFS_MOUNT_ATTR2) version = 2; break; default: @@ -4096,13 +4096,13 @@ xfs_bmap_compute_maxlevels( */ if (whichfork == XFS_DATA_FORK) { maxleafents = MAXEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) ? - mp->m_attroffset : XFS_BMDR_SPACE_CALC(MINDBTPTRS); + sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? + XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset; } else { maxleafents = MAXAEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) ? - mp->m_sb.sb_inodesize - mp->m_attroffset : - XFS_BMDR_SPACE_CALC(MINABTPTRS); + sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? + XFS_BMDR_SPACE_CALC(MINABTPTRS) : + mp->m_sb.sb_inodesize - mp->m_attroffset; } maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); minleafrecs = mp->m_bmap_dmnr[0]; diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 328a528b926d..f57cc9ac875e 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -57,7 +57,7 @@ struct xfs_mount_args { /* * XFS mount option flags -- args->flags1 */ -#define XFSMNT_COMPAT_ATTR 0x00000001 /* do not use ATTR2 format */ +#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */ #define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount * compatible */ #define XFSMNT_INO64 0x00000004 /* move inode numbers up diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c index 3dd30391f551..bb87d2a700a9 100644 --- a/fs/xfs/xfs_dir.c +++ b/fs/xfs/xfs_dir.c @@ -176,7 +176,7 @@ xfs_dir_mount(xfs_mount_t *mp) uint shortcount, leafcount, count; mp->m_dirversion = 1; - if (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) { + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) / (uint)sizeof(xfs_dir_sf_entry_t); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 3432fd5a3986..fbfa240bd139 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -393,7 +393,7 @@ typedef struct xfs_mount { user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ -#define XFS_MOUNT_COMPAT_ATTR (1ULL << 8) /* do not use attr2 format */ +#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ /* (1ULL << 9) -- currently unused */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index dcdb0a8578e1..aef4ec441e0c 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -291,8 +291,8 @@ xfs_start_flags( mp->m_flags |= XFS_MOUNT_IDELETE; if (ap->flags & XFSMNT_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (ap->flags & XFSMNT_COMPAT_ATTR) - mp->m_flags |= XFS_MOUNT_COMPAT_ATTR; + if (ap->flags & XFSMNT_ATTR2) + mp->m_flags |= XFS_MOUNT_ATTR2; if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; @@ -350,6 +350,10 @@ xfs_finish_flags( } } + if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } + /* * prohibit r/w mounts of read-only filesystems */ @@ -385,10 +389,6 @@ xfs_finish_flags( return XFS_ERROR(EINVAL); } - if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { - mp->m_flags &= ~XFS_MOUNT_COMPAT_ATTR; - } - return 0; } @@ -1690,7 +1690,6 @@ xfs_parseargs( int iosize; args->flags2 |= XFSMNT2_COMPAT_IOSIZE; - 
args->flags |= XFSMNT_COMPAT_ATTR;
 	args->flags |= XFSMNT_BARRIER;
 #if 0	/* XXX: off by default, until some remaining issues ironed out */
@@ -1828,9 +1827,9 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
 			args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
 		} else if (!strcmp(this_char, MNTOPT_ATTR2)) {
-			args->flags &= ~XFSMNT_COMPAT_ATTR;
+			args->flags |= XFSMNT_ATTR2;
 		} else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
-			args->flags |= XFSMNT_COMPAT_ATTR;
+			args->flags &= ~XFSMNT_ATTR2;
 		} else if (!strcmp(this_char, "osyncisdsync")) {
 			/* no-op, this is now the default */
 			printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
@@ -1941,8 +1940,6 @@ xfs_showargs(
 		seq_printf(m, "," MNTOPT_SWIDTH "=%d",
 				(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
-	if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR))
-		seq_printf(m, "," MNTOPT_ATTR2);
 	if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE))
 		seq_printf(m, "," MNTOPT_LARGEIO);
 	if (!(mp->m_flags & XFS_MOUNT_BARRIER))
-- cgit v1.2.3

From b04ed21a1fdbfe48ee0738519a4d1af09589dfea Mon Sep 17 00:00:00 2001
From: Nathan Scott
Date: Wed, 11 Jan 2006 15:32:17 +1100
Subject: [XFS] Disable write barriers for now until intermittent IO errors are
 understood.

SGI-PV: 912426
SGI-Modid: xfs-linux-melb:xfs-kern:202962a

Signed-off-by: Nathan Scott
---
 fs/xfs/xfs_vfsops.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index aef4ec441e0c..f22bebf8f513 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -507,8 +507,7 @@ xfs_mount(
 	if (error)
 		goto error2;

-	if ((mp->m_flags & XFS_MOUNT_NOATIME) &&
-	    !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY))
+	if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY))
 		xfs_mountfs_check_barriers(mp);

 	error = XFS_IOINIT(vfsp, args, flags);
@@ -1690,7 +1689,6 @@ xfs_parseargs(
 	int			iosize;

 	args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
-	args->flags |= XFSMNT_BARRIER;

 #if 0	/* XXX: off by default, until some remaining issues ironed out */
 	args->flags |= XFSMNT_IDELETE; /* default to on */
@@ -1942,8 +1940,8 @@ xfs_showargs(
 	if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE))
 		seq_printf(m, "," MNTOPT_LARGEIO);

-	if (!(mp->m_flags & XFS_MOUNT_BARRIER))
-		seq_printf(m, "," MNTOPT_NOBARRIER);
+	if (mp->m_flags & XFS_MOUNT_BARRIER)
+		seq_printf(m, "," MNTOPT_BARRIER);

 	if (!(vfsp->vfs_flag & VFS_32BITINODES))
 		seq_printf(m, "," MNTOPT_64BITINODE);
-- cgit v1.2.3

From a255a7456d4051e804fd6efff9a7c0f43483a7fc Mon Sep 17 00:00:00 2001
From: Nathan Scott
Date: Wed, 11 Jan 2006 15:32:30 +1100
Subject: [XFS] Make d_maxiosz report the real maximum (INT_MAX) so we don't
 incorrectly limit people using this interface to size IO buffers.
SGI-PV: 910890 SGI-Modid: xfs-linux-melb:xfs-kern:24657a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f98c5be3dfe7..71eabf4bf397 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -751,8 +751,7 @@ xfs_ioctl( mp->m_rtdev_targp : mp->m_ddev_targp; da.d_mem = da.d_miniosz = 1 << target->pbr_sshift; - /* The size dio will do in one go */ - da.d_maxiosz = 64 * PAGE_CACHE_SIZE; + da.d_maxiosz = INT_MAX; if (copy_to_user(arg, &da, sizeof(da))) return -XFS_ERROR(EFAULT); -- cgit v1.2.3 From 3ddb8fa98ccce6c3b2afd2f4b95a10b3bb60d1f0 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:33:02 +1100 Subject: [XFS] Sort out cosmetic differences between user and kernel copies of some sources. SGI-PV: 907752 SGI-Modid: xfs-linux-melb:xfs-kern:24659a Signed-off-by: Nathan Scott --- fs/xfs/xfs_arch.h | 6 +++--- fs/xfs/xfs_bmap.c | 23 ++++++++++++----------- fs/xfs/xfs_dinode.h | 11 +++++++++-- fs/xfs/xfs_dir.h | 2 ++ fs/xfs/xfs_dir2.h | 3 --- fs/xfs/xfs_fs.h | 8 ++++---- fs/xfs/xfs_iomap.c | 1 + fs/xfs/xfs_sb.h | 17 ----------------- fs/xfs/xfs_trans.h | 1 - fs/xfs/xfs_vnodeops.c | 2 +- 10 files changed, 32 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index f1aeda599eb1..71de20cedb5c 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -202,7 +202,7 @@ static inline void be64_add(__be64 *a, __s64 b) */ #define XFS_GET_DIR_INO4(di) \ - (((u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) #define XFS_PUT_DIR_INO4(from, di) \ do { \ @@ -213,9 +213,9 @@ do { \ } while (0) #define XFS_DI_HI(di) \ - (((u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) #define XFS_DI_LO(di) \ - (((u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) + (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) #define XFS_GET_DIR_INO8(di) \ (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8ac170b828f5..34bfb8ad0a25 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3653,14 +3653,16 @@ xfs_bmap_search_extents( ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp, lastxp, gotp, prevp); - rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME; - if(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM)) { + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) { cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " "start_block : %llx start_off : %llx blkcnt : %llx " "extent-state : %x \n", - (ip->i_mount)->m_fsname,(long long)ip->i_ino, - gotp->br_startblock, gotp->br_startoff, - gotp->br_blockcount,gotp->br_state); + (ip->i_mount)->m_fsname, (long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state); } return ep; } @@ -4883,19 +4885,18 @@ xfs_bmapi( error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -(indlen), rsvd); - if (error && rt) { - xfs_mod_incore_sb(ip->i_mount, + if (error && rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, rsvd); - } else if (error) { - 
xfs_mod_incore_sb(ip->i_mount, + else if (error) + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, alen, rsvd); - } } if (error) { - if (XFS_IS_QUOTA_ON(ip->i_mount)) + if (XFS_IS_QUOTA_ON(mp)) /* unreserve the blocks now */ (void) XFS_TRANS_UNRESERVE_QUOTA_NBLKS( diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index f697aab8a3d2..79d0d9e1fbab 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -199,10 +199,16 @@ typedef enum xfs_dinode_fmt #define XFS_DFORK_DSIZE(dip,mp) \ XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp) +#define XFS_DFORK_DSIZE_HOST(dip,mp) \ + XFS_CFORK_DSIZE(&(dip)->di_core, mp) #define XFS_DFORK_ASIZE(dip,mp) \ XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp) +#define XFS_DFORK_ASIZE_HOST(dip,mp) \ + XFS_CFORK_ASIZE(&(dip)->di_core, mp) #define XFS_DFORK_SIZE(dip,mp,w) \ XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w) +#define XFS_DFORK_SIZE_HOST(dip,mp,w) \ + XFS_CFORK_SIZE(&(dip)->di_core, mp, w) #define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core) #define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core) @@ -216,6 +222,7 @@ typedef enum xfs_dinode_fmt #define XFS_CFORK_FMT_SET(dcp,w,n) \ ((w) == XFS_DATA_FORK ? \ ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n))) +#define XFS_DFORK_FORMAT(dip,w) XFS_CFORK_FORMAT(&(dip)->di_core, w) #define XFS_CFORK_NEXTENTS_DISK(dcp,w) \ ((w) == XFS_DATA_FORK ? \ @@ -223,13 +230,13 @@ typedef enum xfs_dinode_fmt INT_GET((dcp)->di_anextents, ARCH_CONVERT)) #define XFS_CFORK_NEXTENTS(dcp,w) \ ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents) +#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) +#define XFS_DFORK_NEXTENTS_HOST(dip,w) XFS_CFORK_NEXTENTS(&(dip)->di_core, w) #define XFS_CFORK_NEXT_SET(dcp,w,n) \ ((w) == XFS_DATA_FORK ? \ ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n))) -#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) - #define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) /* diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h index 488defe86ba6..8cc8afb9f6c0 100644 --- a/fs/xfs/xfs_dir.h +++ b/fs/xfs/xfs_dir.h @@ -135,6 +135,8 @@ void xfs_dir_startup(void); /* called exactly once */ ((mp)->m_dirops.xd_shortform_to_single(args)) #define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1) +#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2) extern xfs_dirops_t xfsv1_dirops; +extern xfs_dirops_t xfsv2_dirops; #endif /* __XFS_DIR_H__ */ diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 7e24ffeda9e1..3158f5dc431f 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -72,9 +72,6 @@ typedef struct xfs_dir2_put_args { struct uio *uio; /* uio control structure */ } xfs_dir2_put_args_t; -#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2) -extern xfs_dirops_t xfsv2_dirops; - /* * Other interfaces used by the rest of the dir v2 code. */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 3280f49496ba..14010f1fa82f 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -3,15 +3,15 @@ * All Rights Reserved. * * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5ecf3e3e86aa..d72c83d22ee0 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -608,6 +608,7 @@ xfs_iomap_eof_want_preallocate( count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); while (count_fsb > 0) { imaps = nimaps; + firstblock = NULLFSBLOCK; error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, 0, &firstblock, 0, imap, &imaps, NULL); if (error) diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 4a17d335f897..bf168a91ddb8 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -68,18 +68,6 @@ struct xfs_mount; (XFS_SB_VERSION_NUMBITS | \ XFS_SB_VERSION_OKREALFBITS | \ XFS_SB_VERSION_OKSASHFBITS) -#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na,sflag,morebits) \ - (((ia) || (dia) || (extflag) || (dirv2) || (na) || (sflag) || \ - (morebits)) ? \ - (XFS_SB_VERSION_4 | \ - ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \ - ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \ - ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \ - ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \ - ((na) ? XFS_SB_VERSION_LOGV2BIT : 0) | \ - ((sflag) ? XFS_SB_VERSION_SECTORBIT : 0) | \ - ((morebits) ? XFS_SB_VERSION_MOREBITSBIT : 0)) : \ - XFS_SB_VERSION_1) /* * There are two words to hold XFS "feature" bits: the original @@ -105,11 +93,6 @@ struct xfs_mount; (XFS_SB_VERSION2_OKREALFBITS | \ XFS_SB_VERSION2_OKSASHFBITS ) -/* - * mkfs macro to set up sb_features2 word - */ -#define XFS_SB_VERSION2_MKFS(resvd1, sbcntr) 0 - typedef struct xfs_sb { __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a889963fdd14..d77901c07f63 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -973,7 +973,6 @@ void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f6aed9ef6a61..15734c54952e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -4056,7 +4056,7 @@ retry: xfs_fileoff_t s, e; /* - * Determine space reservations for data/realtime, + * Determine space reservations for data/realtime. */ if (unlikely(extsz)) { s = startoffset_fsb; -- cgit v1.2.3 From ee2a4f7caa4a72cdf2329081a1f7eb9939df3aab Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:33:36 +1100 Subject: [XFS] Fix an intermittent pquota panic caused by dodgy quota flags to an umount dquot flush call.
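The purge mask passed at umount previously named the user and group quota types explicitly and missed project quota; the fix below switches to the catch-all mask. Assuming the usual definition in fs/xfs/xfs_quota.h, that mask is simply the union of all three accounting types:

	/* Assumed definition, shown for reference: */
	#define XFS_QMOPT_QUOTALL \
		(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)

so a project-quota dquot can no longer be left behind with stale flags when the filesystem is flushed and unmounted.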
SGI-PV: 946444 SGI-Modid: xfs-linux-melb:xfs-kern:24680a Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm.c | 10 ++++++---- fs/xfs/xfs_mount.c | 3 +-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index c04c34776c43..7dcdd0640c32 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1920,9 +1920,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, - XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA| - XFS_QMOPT_PQUOTA|XFS_QMOPT_QUOTAOFF); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); goto error_return; } /* @@ -2745,6 +2743,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( xfs_dqunlock(udqp); ASSERT(ip->i_udquot == NULL); ip->i_udquot = udqp; + ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } @@ -2754,7 +2753,10 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( xfs_dqunlock(gdqp); ASSERT(ip->i_gdquot == NULL); ip->i_gdquot = gdqp; - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp)); + ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ? + ip->i_d.di_gid : ip->i_d.di_projid) == + be32_to_cpu(gdqp->q_core.d_id)); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f9754896cd3b..6088e14f84e3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1077,8 +1077,7 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) xfs_iflush_all(mp); - XFS_QM_DQPURGEALL(mp, - XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING); + XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); /* * Flush out the log synchronously so that we know for sure -- cgit v1.2.3 From 0d14824c0774d050cb4232e1e470e5fc9e32e587 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:33:51 +1100 Subject: [XFS] Ensure max diosize reported is aligned with minimum diosize. 
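The change below rounds the advertised maximum down to a multiple of the minimum I/O size with a bit mask, which is valid because d_miniosz is derived from a sector shift and is therefore a power of two. A worked example of the arithmetic (illustrative only):

	/* e.g. with 512-byte sectors, d_miniosz = 1 << 9 = 512: */
	int maxiosz = INT_MAX & ~(512 - 1);
	/* 0x7fffffff & 0xfffffe00 = 0x7ffffe00 = 2147483136, */
	/* the largest multiple of 512 representable in an int */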
SGI-PV: 910890 SGI-Modid: xfs-linux-melb:xfs-kern:24689a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 71eabf4bf397..2b385394f28c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -751,7 +751,7 @@ xfs_ioctl( mp->m_rtdev_targp : mp->m_ddev_targp; da.d_mem = da.d_miniosz = 1 << target->pbr_sshift; - da.d_maxiosz = INT_MAX; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) return -XFS_ERROR(EFAULT); -- cgit v1.2.3 From 65be60541909c7bf8c384cde3502c18cc362939e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 11 Jan 2006 15:34:19 +1100 Subject: [XFS] remove unused "readonly" arg from xlog_find_tail and xlog_recover SGI-PV: 946611 SGI-Modid: xfs-linux-melb:xfs-kern:203307a Signed-off-by: Eric Sandeen Signed-off-by: Nathan Scott --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_priv.h | 5 ++--- fs/xfs/xfs_log_recover.c | 8 +++----- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 29af51275ca9..ecb1067494f0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -428,7 +428,7 @@ xfs_log_mount(xfs_mount_t *mp, if (readonly) vfsp->vfs_flag &= ~VFS_RDONLY; - error = xlog_recover(mp->m_log, readonly); + error = xlog_recover(mp->m_log); if (readonly) vfsp->vfs_flag |= VFS_RDONLY; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 4518b188ade6..4f93988d7ac6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -556,9 +556,8 @@ typedef struct log { extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_find_tail(xlog_t *log, xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk, - int readonly); -extern int xlog_recover(xlog_t *log, int readonly); + xfs_daddr_t *tail_blk); +extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern void xlog_recover_process_iunlinks(xlog_t *log); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8ab7df768063..d0c9fc4fed60 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -783,8 +783,7 @@ int xlog_find_tail( xlog_t *log, xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk, - int readonly) + xfs_daddr_t *tail_blk) { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; @@ -3890,14 +3889,13 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log, - int readonly) + xlog_t *log) { xfs_daddr_t head_blk, tail_blk; int error; /* find the tail of the log */ - if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly))) + if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) return error; if (tail_blk != head_blk) { -- cgit v1.2.3 From 24ee80882deb5777aeee6f75a05f178ab8fbfa69 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 11 Jan 2006 15:34:32 +1100 Subject: [XFS] remove unused vars, args, & unneeded intermediate vars from zeroing code SGI-PV: 946641 SGI-Modid: xfs-linux-melb:xfs-kern:203328a Signed-off-by: Eric Sandeen Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_lrw.c | 36 ++++++++++-------------------------- fs/xfs/xfs_inode.c | 7 ++----- 2 files changed, 12 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 885dfafeabee..c73d3c18882c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ 
b/fs/xfs/linux-2.6/xfs_lrw.c @@ -362,7 +362,6 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct inode *ip, xfs_iocore_t *io, - xfs_off_t offset, xfs_fsize_t isize, xfs_fsize_t end_size) { @@ -371,19 +370,16 @@ xfs_zero_last_block( int nimaps; int zero_offset; int zero_len; - int isize_fsb_offset; int error = 0; xfs_bmbt_irec_t imap; loff_t loff; - size_t lsize; ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); - ASSERT(offset > isize); mp = io->io_mount; - isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); - if (isize_fsb_offset == 0) { + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { /* * There are no extra bytes in the last block on disk to * zero, so return. @@ -413,10 +409,8 @@ xfs_zero_last_block( */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); loff = XFS_FSB_TO_B(mp, last_fsb); - lsize = XFS_FSB_TO_B(mp, 1); - zero_offset = isize_fsb_offset; - zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset; + zero_len = mp->m_sb.sb_blocksize - zero_offset; error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); @@ -447,20 +441,17 @@ xfs_zero_eof( struct inode *ip = LINVFS_GET_IP(vp); xfs_fileoff_t start_zero_fsb; xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t prev_zero_fsb; xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; xfs_extlen_t buf_len_fsb; - xfs_extlen_t prev_zero_count; xfs_mount_t *mp; int nimaps; int error = 0; xfs_bmbt_irec_t imap; - loff_t loff; - size_t lsize; ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + ASSERT(offset > isize); mp = io->io_mount; @@ -468,7 +459,7 @@ xfs_zero_eof( * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, io, offset, isize, end_size); + error = xfs_zero_last_block(ip, io, isize, end_size); if (error) { ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); @@ -496,8 +487,6 @@ xfs_zero_eof( } ASSERT(start_zero_fsb <= end_zero_fsb); - prev_zero_fsb = NULLFILEOFF; - prev_zero_count = 0; while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; @@ -519,10 +508,7 @@ xfs_zero_eof( * that sits on a hole and sets the page as P_HOLE * and calls remapf if it is a mapped file. 
*/ - prev_zero_fsb = NULLFILEOFF; - prev_zero_count = 0; - start_zero_fsb = imap.br_startoff + - imap.br_blockcount; + start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); continue; } @@ -543,17 +529,15 @@ xfs_zero_eof( */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - loff = XFS_FSB_TO_B(mp, start_zero_fsb); - lsize = XFS_FSB_TO_B(mp, buf_len_fsb); - - error = xfs_iozero(ip, loff, lsize, end_size); + error = xfs_iozero(ip, + XFS_FSB_TO_B(mp, start_zero_fsb), + XFS_FSB_TO_B(mp, buf_len_fsb), + end_size); if (error) { goto out_lock; } - prev_zero_fsb = start_zero_fsb; - prev_zero_count = buf_len_fsb; start_zero_fsb = imap.br_startoff + buf_len_fsb; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e486c7d244c2..0063437f291a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1777,22 +1777,19 @@ xfs_igrow_start( xfs_fsize_t new_size, cred_t *credp) { - xfs_fsize_t isize; int error; ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); ASSERT(new_size > ip->i_d.di_size); - error = 0; - isize = ip->i_d.di_size; /* * Zero any pages that may have been created by * xfs_write_file() beyond the end of the file * and any blocks between the old and new file sizes. */ - error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, - new_size); + error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, + ip->i_d.di_size, new_size); return error; } -- cgit v1.2.3 From dd954c69d189cd91571b42d3f926e70351395dc3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:34:50 +1100 Subject: [XFS] turn xlog helper macros into real functions SGI-PV: 946205 SGI-Modid: xfs-linux-melb:xfs-kern:203360a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/xfs_log.c | 113 +++++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log_priv.h | 57 ------------------------- 2 files changed, 93 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ecb1067494f0..f6bac20af7aa 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -178,6 +178,83 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) #define xlog_trace_iclog(iclog,state) #endif /* XFS_LOG_TRACE */ + +static void +xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +{ + if (*qp) { + tic->t_next = (*qp); + tic->t_prev = (*qp)->t_prev; + (*qp)->t_prev->t_next = tic; + (*qp)->t_prev = tic; + } else { + tic->t_prev = tic->t_next = tic; + *qp = tic; + } + + tic->t_flags |= XLOG_TIC_IN_Q; +} + +static void +xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +{ + if (tic == tic->t_next) { + *qp = NULL; + } else { + *qp = tic->t_next; + tic->t_next->t_prev = tic->t_prev; + tic->t_prev->t_next = tic->t_next; + } + + tic->t_next = tic->t_prev = NULL; + tic->t_flags &= ~XLOG_TIC_IN_Q; +} + +static void +xlog_grant_sub_space(struct log *log, int bytes) +{ + log->l_grant_write_bytes -= bytes; + if (log->l_grant_write_bytes < 0) { + log->l_grant_write_bytes += log->l_logsize; + log->l_grant_write_cycle--; + } + + log->l_grant_reserve_bytes -= bytes; + if ((log)->l_grant_reserve_bytes < 0) { + log->l_grant_reserve_bytes += log->l_logsize; + log->l_grant_reserve_cycle--; + } + +} + +static void +xlog_grant_add_space_write(struct log *log, int bytes) +{ + log->l_grant_write_bytes += bytes; + if (log->l_grant_write_bytes > log->l_logsize) { + log->l_grant_write_bytes -= 
log->l_logsize; + log->l_grant_write_cycle++; + } +} + +static void +xlog_grant_add_space_reserve(struct log *log, int bytes) +{ + log->l_grant_reserve_bytes += bytes; + if (log->l_grant_reserve_bytes > log->l_logsize) { + log->l_grant_reserve_bytes -= log->l_logsize; + log->l_grant_reserve_cycle++; + } +} + +static inline void +xlog_grant_add_space(struct log *log, int bytes) +{ + xlog_grant_add_space_write(log, bytes); + xlog_grant_add_space_reserve(log, bytes); +} + + /* * NOTES: * @@ -1320,8 +1397,7 @@ xlog_sync(xlog_t *log, /* move grant heads by roundoff in sync */ s = GRANT_LOCK(log); - XLOG_GRANT_ADD_SPACE(log, roundoff, 'w'); - XLOG_GRANT_ADD_SPACE(log, roundoff, 'r'); + xlog_grant_add_space(log, roundoff); GRANT_UNLOCK(log, s); /* put cycle number in every block */ @@ -2389,7 +2465,7 @@ xlog_grant_log_space(xlog_t *log, /* something is already sleeping; insert new transaction at end */ if (log->l_reserve_headq) { - XLOG_INS_TICKETQ(log->l_reserve_headq, tic); + xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 1"); /* @@ -2422,7 +2498,7 @@ redo: log->l_grant_reserve_bytes); if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - XLOG_INS_TICKETQ(log->l_reserve_headq, tic); + xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); XFS_STATS_INC(xs_sleep_logspace); @@ -2439,11 +2515,10 @@ redo: s = GRANT_LOCK(log); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_del_ticketq(&log->l_reserve_headq, tic); /* we've got enough space */ - XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); - XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r'); + xlog_grant_add_space(log, need_bytes); #ifdef DEBUG tail_lsn = log->l_tail_lsn; /* @@ -2464,7 +2539,7 @@ redo: error_return: if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_del_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); /* * If we are failing, make sure the ticket doesn't have any @@ -2533,7 +2608,7 @@ xlog_regrant_write_log_space(xlog_t *log, if (ntic != log->l_write_headq) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - XLOG_INS_TICKETQ(log->l_write_headq, tic); + xlog_ins_ticketq(&log->l_write_headq, tic); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); @@ -2565,7 +2640,7 @@ redo: log->l_grant_write_bytes); if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - XLOG_INS_TICKETQ(log->l_write_headq, tic); + xlog_ins_ticketq(&log->l_write_headq, tic); XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); @@ -2581,9 +2656,10 @@ redo: s = GRANT_LOCK(log); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_write_headq, tic); + xlog_del_ticketq(&log->l_write_headq, tic); - XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */ + /* we've got enough space */ + xlog_grant_add_space_write(log, need_bytes); #ifdef DEBUG tail_lsn = log->l_tail_lsn; if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { @@ -2600,7 +2676,7 @@ redo: error_return: if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_del_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); /* * If we are failing, make sure the ticket doesn't have any @@ -2633,8 +2709,7 @@ xlog_regrant_reserve_log_space(xlog_t *log, 
ticket->t_cnt--; s = GRANT_LOCK(log); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r'); + xlog_grant_sub_space(log, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; XLOG_TIC_RESET_RES(ticket); xlog_trace_loggrant(log, ticket, @@ -2647,7 +2722,7 @@ xlog_regrant_reserve_log_space(xlog_t *log, return; } - XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r'); + xlog_grant_add_space_reserve(log, ticket->t_unit_res); xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: exit"); xlog_verify_grant_head(log, 0); @@ -2683,8 +2758,7 @@ xlog_ungrant_log_space(xlog_t *log, s = GRANT_LOCK(log); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r'); + xlog_grant_sub_space(log, ticket->t_curr_res); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); @@ -2693,8 +2767,7 @@ xlog_ungrant_log_space(xlog_t *log, */ if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w'); - XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r'); + xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); } xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 4f93988d7ac6..5862286552ce 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -494,63 +494,6 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) -#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \ - { \ - if (type == 'w') { \ - (log)->l_grant_write_bytes -= (bytes); \ - if ((log)->l_grant_write_bytes < 0) { \ - (log)->l_grant_write_bytes += (log)->l_logsize; \ - (log)->l_grant_write_cycle--; \ - } \ - } else { \ - (log)->l_grant_reserve_bytes -= (bytes); \ - if ((log)->l_grant_reserve_bytes < 0) { \ - (log)->l_grant_reserve_bytes += (log)->l_logsize;\ - (log)->l_grant_reserve_cycle--; \ - } \ - } \ - } -#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \ - { \ - if (type == 'w') { \ - (log)->l_grant_write_bytes += (bytes); \ - if ((log)->l_grant_write_bytes > (log)->l_logsize) { \ - (log)->l_grant_write_bytes -= (log)->l_logsize; \ - (log)->l_grant_write_cycle++; \ - } \ - } else { \ - (log)->l_grant_reserve_bytes += (bytes); \ - if ((log)->l_grant_reserve_bytes > (log)->l_logsize) { \ - (log)->l_grant_reserve_bytes -= (log)->l_logsize;\ - (log)->l_grant_reserve_cycle++; \ - } \ - } \ - } -#define XLOG_INS_TICKETQ(q, tic) \ - { \ - if (q) { \ - (tic)->t_next = (q); \ - (tic)->t_prev = (q)->t_prev; \ - (q)->t_prev->t_next = (tic); \ - (q)->t_prev = (tic); \ - } else { \ - (tic)->t_prev = (tic)->t_next = (tic); \ - (q) = (tic); \ - } \ - (tic)->t_flags |= XLOG_TIC_IN_Q; \ - } -#define XLOG_DEL_TICKETQ(q, tic) \ - { \ - if ((tic) == (tic)->t_next) { \ - (q) = NULL; \ - } else { \ - (q) = (tic)->t_next; \ - (tic)->t_next->t_prev = (tic)->t_prev; \ - (tic)->t_prev->t_next = (tic)->t_next; \ - } \ - (tic)->t_next = (tic)->t_prev = NULL; \ - (tic)->t_flags &= ~XLOG_TIC_IN_Q; \ - } /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); -- cgit v1.2.3 From 42fe2b1f7fe788ed5304a7bfa0a0b0db81bc03a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:35:17 +1100 Subject: [XFS] fix, speed up and simplify atime handling: let the VFS handle atime updates and only sync back to the xfs inode when necessary
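In short: the Linux inode now owns atime, and a small helper copies it back into the XFS incore inode only at the points that need the on-disk value - inode flush, inode log item formatting, and reclaim. A condensed sketch of that helper (the full version appears in the diff below):

	/* Pull i_atime back into the XFS inode before it is logged or flushed. */
	void xfs_synchronize_atime(xfs_inode_t *ip)
	{
		vnode_t *vp = XFS_ITOV_NULL(ip);	/* NULL once the Linux inode is gone */

		if (vp) {
			ip->i_d.di_atime.t_sec = (__int32_t)vp->v_inode.i_atime.tv_sec;
			ip->i_d.di_atime.t_nsec = (__int32_t)vp->v_inode.i_atime.tv_nsec;
		}
	}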
SGI-PV: 946679 SGI-Modid: xfs-linux-melb:xfs-kern:203362a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 58 ++++++++++++++++++-------------------------- fs/xfs/linux-2.6/xfs_lrw.c | 6 ----- fs/xfs/linux-2.6/xfs_vnode.c | 1 - fs/xfs/xfs_inode.c | 5 ++++ fs/xfs/xfs_inode.h | 2 ++ fs/xfs/xfs_inode_item.c | 5 ++++ fs/xfs/xfs_itable.c | 7 ++++-- fs/xfs/xfs_vnodeops.c | 19 ++++++--------- 8 files changed, 49 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 97fb1470cf28..8fd274fc26d5 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,6 +57,24 @@ #define IS_NOATIME(inode) ((inode->i_sb->s_flags & MS_NOATIME) || \ (S_ISDIR(inode->i_mode) && inode->i_sb->s_flags & MS_NODIRATIME)) +/* + * Bring the atime in the XFS inode uptodate. + * Used before logging the inode to disk or when the Linux inode goes away. + */ +void +xfs_synchronize_atime( + xfs_inode_t *ip) +{ + vnode_t *vp; + + vp = XFS_ITOV_NULL(ip); + if (vp) { + struct inode *inode = &vp->v_inode; + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + } +} + /* * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but @@ -76,23 +94,6 @@ xfs_ichgtime( struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); timespec_t tv; - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (unlikely(IS_RDONLY(inode))) - return; - - /* - * Don't update access timestamps on reads if mounted "noatime". - * Throw it away if anyone asks us. - */ - if (unlikely( - (ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) && - (flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) == - XFS_ICHGTIME_ACC)) - return; - nanotime(&tv); if (flags & XFS_ICHGTIME_MOD) { inode->i_mtime = tv; @@ -129,8 +130,6 @@ xfs_ichgtime( * Variant on the above which avoids querying the system clock * in situations where we know the Linux inode timestamps have * just been updated (and so we can update our inode cheaply). - * We also skip the readonly and noatime checks here, they are - * also catered for already. */ void xfs_ichgtime_fast( @@ -141,20 +140,16 @@ xfs_ichgtime_fast( timespec_t *tvp; /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. + * Atime updates for read() & friends are handled lazily now, and + * explicit updates must go through xfs_ichgtime() */ - if (unlikely(IS_RDONLY(inode))) - return; + ASSERT((flags & XFS_ICHGTIME_ACC) == 0); /* - * Don't update access timestamps on reads if mounted "noatime". - * Throw it away if anyone asks us. + * We're not supposed to change timestamps in readonly-mounted + * filesystems. Throw it away if anyone asks us. 
*/ - if (unlikely( - (ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) && - ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) == - XFS_ICHGTIME_ACC))) + if (unlikely(IS_RDONLY(inode))) return; if (flags & XFS_ICHGTIME_MOD) { @@ -162,11 +157,6 @@ xfs_ichgtime_fast( ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; } - if (flags & XFS_ICHGTIME_ACC) { - tvp = &inode->i_atime; - ip->i_d.di_atime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)tvp->tv_nsec; - } if (flags & XFS_ICHGTIME_CHG) { tvp = &inode->i_ctime; ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index c73d3c18882c..563cb9e975d2 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -281,9 +281,6 @@ xfs_read( xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime_fast(ip, inode, XFS_ICHGTIME_ACC); - unlock_isem: if (unlikely(ioflags & IO_ISDIRECT)) mutex_unlock(&inode->i_mutex); @@ -346,9 +343,6 @@ xfs_sendfile( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime_fast(ip, LINVFS_GET_IP(vp), XFS_ICHGTIME_ACC); - return ret; } diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index e9bbcb4d6243..260dd8415dd7 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -106,7 +106,6 @@ vn_revalidate_core( inode->i_blocks = vap->va_nblocks; inode->i_mtime = vap->va_mtime; inode->i_ctime = vap->va_ctime; - inode->i_atime = vap->va_atime; inode->i_blksize = vap->va_blocksize; if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0063437f291a..6b7ac8bdcac2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3364,6 +3364,11 @@ xfs_iflush_int( ip->i_update_core = 0; SYNCHRONIZE(); + /* + * Make sure to get the latest atime from the Linux inode. + */ + xfs_synchronize_atime(ip); + if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 124d30e6143b..c4c1d9bce82e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -436,6 +436,8 @@ void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, int, uint); +void xfs_synchronize_atime(xfs_inode_t *); + #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) #ifdef DEBUG diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7f3363c621e1..9369010f3f16 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -271,6 +271,11 @@ xfs_inode_item_format( if (ip->i_update_size) ip->i_update_size = 0; + /* + * Make sure to get the latest atime from the Linux inode. 
+ */ + xfs_synchronize_atime(ip); + vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(xfs_dinode_core_t); XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f63646ead816..41f50e7d1c32 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -56,6 +56,7 @@ xfs_bulkstat_one_iget( { xfs_dinode_core_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + vnode_t *vp; int error; error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( goto out_iput; } + vp = XFS_ITOV(ip); dic = &ip->i_d; /* xfs_iget returns the following without needing @@ -84,8 +86,9 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - buf->bs_atime.tv_sec = dic->di_atime.t_sec; - buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + /* atime is only kept uptodate in the Linux inode */ + buf->bs_atime.tv_sec = vp->v_inode.i_atime.tv_sec; + buf->bs_atime.tv_nsec = vp->v_inode.i_atime.tv_nsec; buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 15734c54952e..688fc2cb4b8d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -182,8 +182,8 @@ xfs_getattr( break; } - vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec; - vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + /* atime is only kept uptodate in the Linux inode */ + vap->va_atime = vp->v_inode.i_atime; vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; @@ -982,10 +982,6 @@ xfs_readlink( goto error_return; } - if (!(ioflags & IO_INVIS)) { - xfs_ichgtime(ip, XFS_ICHGTIME_ACC); - } - /* * See if the symlink is stored inline. */ @@ -3226,7 +3222,6 @@ xfs_readdir( xfs_trans_t *tp = NULL; int error = 0; uint lock_mode; - xfs_off_t start_offset; vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__, (inst_t *)__return_address); @@ -3237,11 +3232,7 @@ xfs_readdir( } lock_mode = xfs_ilock_map_shared(dp); - start_offset = uiop->uio_offset; error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp); - if (start_offset != uiop->uio_offset) { - xfs_ichgtime(dp, XFS_ICHGTIME_ACC); - } xfs_iunlock_map_shared(dp, lock_mode); return error; } @@ -3819,6 +3810,12 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + /* + * Make sure the atime in the XFS inode is correct before freeing the + * Linux inode. + */ + xfs_synchronize_atime(ip); + /* If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode * and the linux inode and clean up the xfs inode later. This -- cgit v1.2.3 From c2cd2550603d847b709035c4c6b666adf560d7b8 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:35:32 +1100 Subject: [XFS] Fix v2 log bufsize reporting in /proc/mounts. 
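The previous check treated only -1 as "logbufsize not specified"; the corrected logic (restated from the diff below) takes any non-positive value as unset and only rejects an explicit size that is smaller than the log stripe unit:

	if (ap->logbufsize <= 0 &&
	    mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)
		mp->m_logbsize = mp->m_sb.sb_logsunit;	/* default to the stripe unit */
	else if (ap->logbufsize > 0 &&
		 ap->logbufsize < mp->m_sb.sb_logsunit)
		return XFS_ERROR(EINVAL);	/* explicit size below stripe unit */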
SGI-PV: 946760 SGI-Modid: xfs-linux-melb:xfs-kern:24765a Signed-off-by: Nathan Scott --- fs/xfs/xfs_vfsops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index f22bebf8f513..b6ad370fab3d 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -333,10 +333,11 @@ xfs_finish_flags( /* Fail a mount where the logbuf is smaller then the log stripe */ if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { - if ((ap->logbufsize == -1) && + if ((ap->logbufsize <= 0) && (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { mp->m_logbsize = mp->m_sb.sb_logsunit; - } else if (ap->logbufsize < mp->m_sb.sb_logsunit) { + } else if (ap->logbufsize > 0 && + ap->logbufsize < mp->m_sb.sb_logsunit) { cmn_err(CE_WARN, "XFS: logbuf size must be greater than or equal to log stripe size"); return XFS_ERROR(EINVAL); -- cgit v1.2.3 From 446ada4a03808f128e8f28daa0f103dc69d22d5b Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:35:44 +1100 Subject: [XFS] Add an XFS callout to security_inode_init_security; SE Linux is not functional with XFS without this change. SGI-PV: 946762 SGI-Modid: xfs-linux-melb:xfs-kern:24766a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 8fd274fc26d5..6bf770cfb006 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -53,6 +53,7 @@ #include #include +#include #define IS_NOATIME(inode) ((inode->i_sb->s_flags & MS_NOATIME) || \ (S_ISDIR(inode->i_mode) && inode->i_sb->s_flags & MS_NODIRATIME)) @@ -202,6 +203,39 @@ validate_fields( } } +/* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ +STATIC int +linvfs_init_security( + struct vnode *vp, + struct inode *dir) +{ + struct inode *ip = LINVFS_GET_IP(vp); + size_t length; + void *value; + char *name; + int error; + + error = security_inode_init_security(ip, dir, &name, &value, &length); + if (error) { + if (error == -EOPNOTSUPP) + return 0; + return -error; + } + + VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error); + if (!error) + VMODIFY(vp); + + kfree(name); + kfree(value); + return error; +} + /* * Determine whether a process has a valid fs_struct (kernel daemons * like knfsd don't have an fs_struct). @@ -267,6 +301,9 @@ linvfs_mknod( break; } + if (!error) + error = linvfs_init_security(vp, dir); + if (default_acl) { if (!error) { error = _ACL_INHERIT(vp, &va, default_acl); -- cgit v1.2.3 From 77a7cce42509461067f49c484aee626f62162fbc Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:35:57 +1100 Subject: [XFS] Fix quotaoff logitem for project quota, affects log recovery only. 
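During recovery, the quotaoff log item's accounting flags are translated into l_quotaoffs_flag, which later suppresses replay of dquot updates for quota types that were being turned off; project quota was missing from that translation. The complete mapping after the fix (restated from the diff below):

	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
		log->l_quotaoffs_flag |= XFS_DQ_USER;
	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)	/* the previously missing case */
		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
		log->l_quotaoffs_flag |= XFS_DQ_GROUP;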
SGI-PV: 946444 SGI-Modid: xfs-linux-melb:xfs-kern:24768a Signed-off-by: Nathan Scott --- fs/xfs/xfs_log_recover.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d0c9fc4fed60..7d46cbd6a07a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2562,10 +2562,12 @@ xlog_recover_do_quotaoff_trans( /* * The logitem format's flag tells us if this was user quotaoff, - * group quotaoff or both. + * group/project quotaoff or both. */ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_GROUP; -- cgit v1.2.3 From 0733af213f2859f7228229f3ac053c025f57d0d5 Mon Sep 17 00:00:00 2001 From: Ryan Hankins Date: Wed, 11 Jan 2006 15:36:44 +1100 Subject: [XFS] Add a stack trace in the case of xfs_forced_shutdown. SGI-PV: 929558 SGI-Modid: xfs-linux-melb:xfs-kern:203701a Signed-off-by: Ryan Hankins Signed-off-by: Nathan Scott --- fs/xfs/xfs_trans.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 279e043d7323..2eb6027762f2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1014,6 +1014,7 @@ xfs_trans_cancel( xfs_log_item_t *lip; int i; #endif + xfs_mount_t *mp = tp->t_mountp; /* * See if the caller is being too lazy to figure out if @@ -1027,8 +1028,9 @@ xfs_trans_cancel( * corruption and decide to give up. */ if ((tp->t_flags & XFS_TRANS_DIRTY) && - !XFS_FORCED_SHUTDOWN(tp->t_mountp)) - xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE); + !XFS_FORCED_SHUTDOWN(mp)) + XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); + xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); #ifdef DEBUG if (!(flags & XFS_TRANS_ABORT)) { licp = &(tp->t_items); @@ -1040,7 +1042,7 @@ xfs_trans_cancel( } lip = lidp->lid_item; - if (!XFS_FORCED_SHUTDOWN(tp->t_mountp)) + if (!XFS_FORCED_SHUTDOWN(mp)) ASSERT(!(lip->li_type == XFS_LI_EFD)); } licp = licp->lic_next; @@ -1048,7 +1050,7 @@ xfs_trans_cancel( } #endif xfs_trans_unreserve_and_mod_sb(tp); - XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp); + XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp); if (tp->t_ticket) { if (flags & XFS_TRANS_RELEASE_LOG_RES) { @@ -1057,7 +1059,7 @@ xfs_trans_cancel( } else { log_flags = 0; } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); } /* mark this thread as no longer being in a transaction */ -- cgit v1.2.3 From 60a204f096dd67683f3993798e14905ee9828ba5 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:37:00 +1100 Subject: [XFS] Fix a thinko when generating a forced shutdown stack trace. SGI-PV: 929558 SGI-Modid: xfs-linux-melb:xfs-kern:203817a Signed-off-by: Nathan Scott --- fs/xfs/xfs_trans.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2eb6027762f2..d3d714e6b32a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1027,10 +1027,10 @@ xfs_trans_cancel( * filesystem. This happens in paths where we detect * corruption and decide to give up. 
*/ - if ((tp->t_flags & XFS_TRANS_DIRTY) && - !XFS_FORCED_SHUTDOWN(mp)) + if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); + } #ifdef DEBUG if (!(flags & XFS_TRANS_ABORT)) { licp = &(tp->t_items); -- cgit v1.2.3 From 794fe2dc65f1ec4acacc388053bb52d32d3a932e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:37:17 +1100 Subject: [XFS] endianness annotations and tidying for the uuid code SGI-PV: 943272 SGI-Modid: xfs-linux-melb:xfs-kern:203709a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/support/uuid.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index 69ec4f540c3a..a3d565a67734 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c @@ -27,6 +27,16 @@ uuid_init(void) mutex_init(&uuid_monitor); } + +/* IRIX interpretation of an uuid_t */ +typedef struct { + __be32 uu_timelow; + __be16 uu_timemid; + __be16 uu_timehi; + __be16 uu_clockseq; + __be16 uu_node[3]; +} xfs_uu_t; + /* * uuid_getnodeuniq - obtain the node unique fields of a UUID. * @@ -36,16 +46,11 @@ uuid_init(void) void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) { - char *uu = (char *)uuid; - - /* on IRIX, this function assumes big-endian fields within - * the uuid, so we use INT_GET to get the same result on - * little-endian systems - */ + xfs_uu_t *uup = (xfs_uu_t *)uuid; - fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) + - INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT); - fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT); + fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | + be16_to_cpu(uup->uu_timemid); + fsid[1] = be16_to_cpu(uup->uu_timelow); } void -- cgit v1.2.3 From 216d3b2acba469a9bee98a09bb957e012ba7bc25 Mon Sep 17 00:00:00 2001 From: Tim Shimmin Date: Wed, 11 Jan 2006 15:37:38 +1100 Subject: [XFS] take out the call to vn_mark_bad() used when acl inherit fails and it needs to back out the inode creation. Tested by xfs_tests/077. SGI-PV: 930841 SGI-Modid: xfs-linux-melb:xfs-kern:24842a Signed-off-by: Tim Shimmin Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 6bf770cfb006..fe5e9894fdee 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -320,8 +320,6 @@ linvfs_mknod( teardown.d_inode = ip = LINVFS_GET_IP(vp); teardown.d_name = dentry->d_name; - vn_mark_bad(vp); - if (S_ISDIR(mode)) VOP_RMDIR(dvp, &teardown, NULL, err2); else -- cgit v1.2.3 From a6867a6815fa0241848d4620f2dbd2954f4405d7 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Wed, 11 Jan 2006 15:37:58 +1100 Subject: [XFS] Introduce per-filesystem delwri pagebuf flushing to reduce contention between filesystems and prevent deadlocks between filesystems when a flush dependency exists between them.
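The mechanism: instead of one global delayed-write queue and a single xfsbufd task, every buffer target gets its own queue, lock, and flush daemon, so a flush dependency between two filesystems can no longer stall both. The new per-target state (sketched; the full hunk is in the xfs_buf.h diff below):

	typedef struct xfs_buftarg {
		/* ...existing fields... */
		struct task_struct	*bt_task;		/* this target's xfsbufd */
		struct list_head	bt_list;		/* link on the global target list */
		struct list_head	bt_delwrite_queue;	/* delayed-write buffers */
		spinlock_t		bt_delwrite_lock;
		uint			bt_flags;		/* BT_FORCE_SLEEP / BT_FORCE_FLUSH */
	} xfs_buftarg_t;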
SGI-PV: 947098 SGI-Modid: xfs-linux-melb:xfs-kern:24844a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.c | 136 ++++++++++++++++++++++++++++++--------------- fs/xfs/linux-2.6/xfs_buf.h | 9 +++ 2 files changed, 101 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 6fe21d2b8847..2a8acd38fa1e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -33,6 +33,7 @@ STATIC kmem_cache_t *pagebuf_zone; STATIC kmem_shaker_t pagebuf_shake; +STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); @@ -1492,6 +1493,30 @@ xfs_free_bufhash( btp->bt_hash = NULL; } +/* + * buftarg list for delwrite queue processing + */ +STATIC LIST_HEAD(xfs_buftarg_list); +STATIC DEFINE_SPINLOCK(xfs_buftarg_lock); + +STATIC void +xfs_register_buftarg( + xfs_buftarg_t *btp) +{ + spin_lock(&xfs_buftarg_lock); + list_add(&btp->bt_list, &xfs_buftarg_list); + spin_unlock(&xfs_buftarg_lock); +} + +STATIC void +xfs_unregister_buftarg( + xfs_buftarg_t *btp) +{ + spin_lock(&xfs_buftarg_lock); + list_del(&btp->bt_list); + spin_unlock(&xfs_buftarg_lock); +} + void xfs_free_buftarg( xfs_buftarg_t *btp, @@ -1502,6 +1527,12 @@ xfs_free_buftarg( xfs_blkdev_put(btp->pbr_bdev); xfs_free_bufhash(btp); iput(btp->pbr_mapping->host); + + /* unregister the buftarg first so that we don't get a + * wakeup finding a non-existent task */ + xfs_unregister_buftarg(btp); + kthread_stop(btp->bt_task); + kmem_free(btp, sizeof(*btp)); } @@ -1591,6 +1622,26 @@ xfs_mapping_buftarg( return 0; } +STATIC int +xfs_alloc_delwrite_queue( + xfs_buftarg_t *btp) +{ + int error = 0; + + INIT_LIST_HEAD(&btp->bt_list); + INIT_LIST_HEAD(&btp->bt_delwrite_queue); + spinlock_init(&btp->bt_delwrite_lock, "delwri_lock"); + btp->bt_flags = 0; + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); + if (IS_ERR(btp->bt_task)) { + error = PTR_ERR(btp->bt_task); + goto out_error; + } + xfs_register_buftarg(btp); +out_error: + return error; +} + xfs_buftarg_t * xfs_alloc_buftarg( struct block_device *bdev, @@ -1606,6 +1657,8 @@ xfs_alloc_buftarg( goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; + if (xfs_alloc_delwrite_queue(btp)) + goto error; xfs_alloc_bufhash(btp, external); return btp; @@ -1618,20 +1671,19 @@ error: /* * Pagebuf delayed write buffer handling */ - -STATIC LIST_HEAD(pbd_delwrite_queue); -STATIC DEFINE_SPINLOCK(pbd_delwrite_lock); - STATIC void pagebuf_delwri_queue( xfs_buf_t *pb, int unlock) { + struct list_head *dwq = &pb->pb_target->bt_delwrite_queue; + spinlock_t *dwlk = &pb->pb_target->bt_delwrite_lock; + PB_TRACE(pb, "delwri_q", (long)unlock); ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) == (PBF_DELWRI|PBF_ASYNC)); - spin_lock(&pbd_delwrite_lock); + spin_lock(dwlk); /* If already in the queue, dequeue and place at tail */ if (!list_empty(&pb->pb_list)) { ASSERT(pb->pb_flags & _PBF_DELWRI_Q); @@ -1642,9 +1694,9 @@ pagebuf_delwri_queue( } pb->pb_flags |= _PBF_DELWRI_Q; - list_add_tail(&pb->pb_list, &pbd_delwrite_queue); + list_add_tail(&pb->pb_list, dwq); pb->pb_queuetime = jiffies; - spin_unlock(&pbd_delwrite_lock); + spin_unlock(dwlk); if (unlock) pagebuf_unlock(pb); @@ -1654,16 +1706,17 @@ void pagebuf_delwri_dequeue( xfs_buf_t *pb) { + spinlock_t *dwlk = &pb->pb_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(&pbd_delwrite_lock); + spin_lock(dwlk); if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { ASSERT(pb->pb_flags & 
_PBF_DELWRI_Q); list_del_init(&pb->pb_list); dequeued = 1; } pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); - spin_unlock(&pbd_delwrite_lock); + spin_unlock(dwlk); if (dequeued) pagebuf_rele(pb); @@ -1678,21 +1731,22 @@ pagebuf_runall_queues( flush_workqueue(queue); } -/* Defines for pagebuf daemon */ -STATIC struct task_struct *xfsbufd_task; -STATIC int xfsbufd_force_flush; -STATIC int xfsbufd_force_sleep; - STATIC int xfsbufd_wakeup( int priority, gfp_t mask) { - if (xfsbufd_force_sleep) - return 0; - xfsbufd_force_flush = 1; - barrier(); - wake_up_process(xfsbufd_task); + xfs_buftarg_t *btp, *n; + + spin_lock(&xfs_buftarg_lock); + list_for_each_entry_safe(btp, n, &xfs_buftarg_list, bt_list) { + if (test_bit(BT_FORCE_SLEEP, &btp->bt_flags)) + continue; + set_bit(BT_FORCE_FLUSH, &btp->bt_flags); + barrier(); + wake_up_process(btp->bt_task); + } + spin_unlock(&xfs_buftarg_lock); return 0; } @@ -1702,31 +1756,34 @@ xfsbufd( { struct list_head tmp; unsigned long age; - xfs_buftarg_t *target; + xfs_buftarg_t *target = (xfs_buftarg_t *)data; xfs_buf_t *pb, *n; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; current->flags |= PF_MEMALLOC; INIT_LIST_HEAD(&tmp); do { if (unlikely(freezing(current))) { - xfsbufd_force_sleep = 1; + set_bit(BT_FORCE_SLEEP, &target->bt_flags); refrigerator(); } else { - xfsbufd_force_sleep = 0; + clear_bit(BT_FORCE_SLEEP, &target->bt_flags); } schedule_timeout_interruptible( xfs_buf_timer_centisecs * msecs_to_jiffies(10)); age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - spin_lock(&pbd_delwrite_lock); - list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { + spin_lock(dwlk); + list_for_each_entry_safe(pb, n, dwq, pb_list) { PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); ASSERT(pb->pb_flags & PBF_DELWRI); if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { - if (!xfsbufd_force_flush && + if (!test_bit(BT_FORCE_FLUSH, + &target->bt_flags) && time_before(jiffies, pb->pb_queuetime + age)) { pagebuf_unlock(pb); @@ -1738,11 +1795,11 @@ xfsbufd( list_move(&pb->pb_list, &tmp); } } - spin_unlock(&pbd_delwrite_lock); + spin_unlock(dwlk); while (!list_empty(&tmp)) { pb = list_entry(tmp.next, xfs_buf_t, pb_list); - target = pb->pb_target; + ASSERT(target == pb->pb_target); list_del_init(&pb->pb_list); pagebuf_iostrategy(pb); @@ -1753,7 +1810,7 @@ xfsbufd( if (as_list_len > 0) purge_addresses(); - xfsbufd_force_flush = 0; + clear_bit(BT_FORCE_FLUSH, &target->bt_flags); } while (!kthread_should_stop()); return 0; @@ -1772,17 +1829,17 @@ xfs_flush_buftarg( struct list_head tmp; xfs_buf_t *pb, *n; int pincount = 0; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; pagebuf_runall_queues(xfsdatad_workqueue); pagebuf_runall_queues(xfslogd_workqueue); INIT_LIST_HEAD(&tmp); - spin_lock(&pbd_delwrite_lock); - list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { - - if (pb->pb_target != target) - continue; + spin_lock(dwlk); + list_for_each_entry_safe(pb, n, dwq, pb_list) { + ASSERT(pb->pb_target == target); ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)); PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); if (pagebuf_ispin(pb)) { @@ -1792,7 +1849,7 @@ xfs_flush_buftarg( list_move(&pb->pb_list, &tmp); } - spin_unlock(&pbd_delwrite_lock); + spin_unlock(dwlk); /* * Dropped the delayed write list lock, now walk the temporary list @@ -1847,20 +1904,12 @@ pagebuf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfsbufd_task = 
kthread_run(xfsbufd, NULL, "xfsbufd"); - if (IS_ERR(xfsbufd_task)) { - error = PTR_ERR(xfsbufd_task); - goto out_destroy_xfsdatad_workqueue; - } - pagebuf_shake = kmem_shake_register(xfsbufd_wakeup); if (!pagebuf_shake) - goto out_stop_xfsbufd; + goto out_destroy_xfsdatad_workqueue; return 0; - out_stop_xfsbufd: - kthread_stop(xfsbufd_task); out_destroy_xfsdatad_workqueue: destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: @@ -1878,7 +1927,6 @@ void pagebuf_terminate(void) { kmem_shake_deregister(pagebuf_shake); - kthread_stop(xfsbufd_task); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(pagebuf_zone); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 237a35b915d1..f721d47ad4cc 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -88,6 +88,15 @@ typedef struct xfs_buftarg { uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; + + /* per device delwri queue */ + struct task_struct *bt_task; + struct list_head bt_list; + struct list_head bt_delwrite_queue; + spinlock_t bt_delwrite_lock; + uint bt_flags; +#define BT_FORCE_SLEEP 1 +#define BT_FORCE_FLUSH 2 } xfs_buftarg_t; /* -- cgit v1.2.3 From 68bdb6eabcd2869caa795019961a5445a11b5bc1 Mon Sep 17 00:00:00 2001 From: Yingping Lu Date: Wed, 11 Jan 2006 15:38:31 +1100 Subject: [XFS] Fixed delayed_blks assert failure during umount. The delayed_blks were caused by ENOSPC but not reclaimed by xfs_release or xfs_inactive. The fix changed the condition in xfs_release and xfs_inactive to invoke xfs_inactive_free_eofblocks for this special case, and changed xfs_inactive_free_eofblocks to clean the delayed blks after eof. It also changed xfs_write to set the correct eof when ENOSPC occurs. SGI-PV: 946267 SGI-Modid: xfs-linux-melb:xfs-kern:203788a Signed-off-by: Yingping Lu Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_lrw.c | 4 ++++ fs/xfs/xfs_vnodeops.c | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 563cb9e975d2..147a28861f6b 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -809,6 +809,10 @@ retry: goto retry; } + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) + *offset = isize; + if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 688fc2cb4b8d..3e8f1cbb7049 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1213,7 +1213,8 @@ xfs_inactive_free_eofblocks( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && - (imap.br_startblock != HOLESTARTBLOCK)) { + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { /* * Attach the dquots to the inode up front.
*/ @@ -1548,7 +1549,8 @@ xfs_release( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && + ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { @@ -1627,7 +1629,8 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && + ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || -- cgit v1.2.3 From ce8e922c0e79c8093452ba9a124981332b75706b Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 15:39:08 +1100 Subject: [XFS] Complete the pagebuf -> xfs_buf naming convention transition, finally. SGI-PV: 947038 SGI-Modid: xfs-linux-melb:xfs-kern:24866a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 6 +- fs/xfs/linux-2.6/xfs_buf.c | 1258 ++++++++++++++++++++---------------------- fs/xfs/linux-2.6/xfs_buf.h | 693 ++++++++++------------- fs/xfs/linux-2.6/xfs_ioctl.c | 2 +- fs/xfs/linux-2.6/xfs_linux.h | 2 +- fs/xfs/linux-2.6/xfs_lrw.c | 10 +- fs/xfs/linux-2.6/xfs_stats.h | 18 +- fs/xfs/linux-2.6/xfs_super.c | 14 +- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_rw.c | 9 +- 10 files changed, 895 insertions(+), 1118 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 94d3cdfbf9b8..3f6b9e29850c 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -273,7 +273,7 @@ xfs_map_at_offset( lock_buffer(bh); bh->b_blocknr = bn; - bh->b_bdev = iomapp->iomap_target->pbr_bdev; + bh->b_bdev = iomapp->iomap_target->bt_bdev; set_buffer_mapped(bh); clear_buffer_delay(bh); } @@ -982,7 +982,7 @@ __linvfs_get_block( } /* If this is a realtime file, data might be on a new device */ - bh_result->b_bdev = iomap.iomap_target->pbr_bdev; + bh_result->b_bdev = iomap.iomap_target->bt_bdev; /* If we previously allocated a block out beyond eof and * we are now coming back to use it then we will need to @@ -1097,7 +1097,7 @@ linvfs_direct_IO( iocb->private = xfs_alloc_ioend(inode); ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - iomap.iomap_target->pbr_bdev, + iomap.iomap_target->bt_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, linvfs_end_io_direct); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 2a8acd38fa1e..cb77f99cbef1 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -31,77 +31,77 @@ #include #include "xfs_linux.h" -STATIC kmem_cache_t *pagebuf_zone; -STATIC kmem_shaker_t pagebuf_shake; +STATIC kmem_zone_t *xfs_buf_zone; +STATIC kmem_shaker_t xfs_buf_shake; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); -STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); +STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); STATIC struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; -#ifdef PAGEBUF_TRACE +#ifdef XFS_BUF_TRACE void -pagebuf_trace( - xfs_buf_t *pb, +xfs_buf_trace( + xfs_buf_t *bp, char *id, void *data, void *ra) { - ktrace_enter(pagebuf_trace_buf, - pb, id, - (void *)(unsigned long)pb->pb_flags, - (void *)(unsigned long)pb->pb_hold.counter, - (void *)(unsigned long)pb->pb_sema.count.counter, + ktrace_enter(xfs_buf_trace_buf, + bp, id, + (void *)(unsigned long)bp->b_flags, + (void 
*)(unsigned long)bp->b_hold.counter, + (void *)(unsigned long)bp->b_sema.count.counter, (void *)current, data, ra, - (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), - (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), - (void *)(unsigned long)pb->pb_buffer_length, + (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), + (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), + (void *)(unsigned long)bp->b_buffer_length, NULL, NULL, NULL, NULL, NULL); } -ktrace_t *pagebuf_trace_buf; -#define PAGEBUF_TRACE_SIZE 4096 -#define PB_TRACE(pb, id, data) \ - pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) +ktrace_t *xfs_buf_trace_buf; +#define XFS_BUF_TRACE_SIZE 4096 +#define XB_TRACE(bp, id, data) \ + xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) #else -#define PB_TRACE(pb, id, data) do { } while (0) +#define XB_TRACE(bp, id, data) do { } while (0) #endif -#ifdef PAGEBUF_LOCK_TRACKING -# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) -# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) -# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) #else -# define PB_SET_OWNER(pb) do { } while (0) -# define PB_CLEAR_OWNER(pb) do { } while (0) -# define PB_GET_OWNER(pb) do { } while (0) +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) #endif -#define pb_to_gfp(flags) \ - ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) -#define pb_to_km(flags) \ - (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) +#define xb_to_km(flags) \ + (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) -#define pagebuf_allocate(flags) \ - kmem_zone_alloc(pagebuf_zone, pb_to_km(flags)) -#define pagebuf_deallocate(pb) \ - kmem_zone_free(pagebuf_zone, (pb)); +#define xfs_buf_allocate(flags) \ + kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) +#define xfs_buf_deallocate(bp) \ + kmem_zone_free(xfs_buf_zone, (bp)); /* - * Page Region interfaces. + * Page Region interfaces. * - * For pages in filesystems where the blocksize is smaller than the - * pagesize, we use the page->private field (long) to hold a bitmap - * of uptodate regions within the page. + * For pages in filesystems where the blocksize is smaller than the + * pagesize, we use the page->private field (long) to hold a bitmap + * of uptodate regions within the page. * - * Each such region is "bytes per page / bits per long" bytes long. + * Each such region is "bytes per page / bits per long" bytes long. 
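
The region arithmetic above is easier to see with numbers: with 4096-byte pages and 64-bit longs, each page splits into 4096/64 = 64-byte regions, so the uptodate state of a whole page fits in the single long kept in page->private. Below is a minimal, self-contained userspace model of such a bitmap; the names are illustrative rather than the kernel's, and it assumes region-aligned ranges where the real code rounds with BTOPR/BTOPRT.

        #include <assert.h>
        #include <limits.h>
        #include <stddef.h>
        #include <stdio.h>

        #define PAGE_BYTES      4096UL
        #define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)
        #define REGION_SIZE     (PAGE_BYTES / BITS_PER_LONG)  /* 64 bytes on 64-bit */

        /* Mark the regions covering [offset, offset+len) uptodate;
         * this model assumes region-aligned ranges.
         */
        static void set_region(unsigned long *map, size_t offset, size_t len)
        {
                size_t bit = offset / REGION_SIZE;
                size_t nr = len / REGION_SIZE;

                while (nr--)
                        *map |= 1UL << bit++;
        }

        /* True only if every region covering the range is marked uptodate. */
        static int test_region(unsigned long map, size_t offset, size_t len)
        {
                size_t bit = offset / REGION_SIZE;
                size_t nr = len / REGION_SIZE;

                while (nr--)
                        if (!(map & (1UL << bit++)))
                                return 0;
                return 1;
        }

        int main(void)
        {
                unsigned long uptodate = 0;     /* models page->private */

                set_region(&uptodate, 0, 512);  /* one 512-byte block read in */
                assert(test_region(uptodate, 0, 512));
                assert(!test_region(uptodate, 0, 1024));
                printf("region size: %zu bytes\n", (size_t)REGION_SIZE);
                return 0;
        }
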
* - * NBPPR == number-of-bytes-per-page-region - * BTOPR == bytes-to-page-region (rounded up) - * BTOPRT == bytes-to-page-region-truncated (rounded down) + * NBPPR == number-of-bytes-per-page-region + * BTOPR == bytes-to-page-region (rounded up) + * BTOPRT == bytes-to-page-region-truncated (rounded down) */ #if (BITS_PER_LONG == 32) #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ @@ -160,7 +160,7 @@ test_page_region( } /* - * Mapping of multi-page buffers into contiguous virtual space + * Mapping of multi-page buffers into contiguous virtual space */ typedef struct a_list { @@ -173,7 +173,7 @@ STATIC int as_list_len; STATIC DEFINE_SPINLOCK(as_lock); /* - * Try to batch vunmaps because they are costly. + * Try to batch vunmaps because they are costly. */ STATIC void free_address( @@ -216,83 +216,83 @@ purge_addresses(void) } /* - * Internal pagebuf object manipulation + * Internal xfs_buf_t object manipulation */ STATIC void -_pagebuf_initialize( - xfs_buf_t *pb, +_xfs_buf_initialize( + xfs_buf_t *bp, xfs_buftarg_t *target, loff_t range_base, size_t range_length, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { /* - * We don't want certain flags to appear in pb->pb_flags. + * We don't want certain flags to appear in b_flags. */ - flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); - - memset(pb, 0, sizeof(xfs_buf_t)); - atomic_set(&pb->pb_hold, 1); - init_MUTEX_LOCKED(&pb->pb_iodonesema); - INIT_LIST_HEAD(&pb->pb_list); - INIT_LIST_HEAD(&pb->pb_hash_list); - init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ - PB_SET_OWNER(pb); - pb->pb_target = target; - pb->pb_file_offset = range_base; + flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + + memset(bp, 0, sizeof(xfs_buf_t)); + atomic_set(&bp->b_hold, 1); + init_MUTEX_LOCKED(&bp->b_iodonesema); + INIT_LIST_HEAD(&bp->b_list); + INIT_LIST_HEAD(&bp->b_hash_list); + init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_file_offset = range_base; /* * Set buffer_length and count_desired to the same value initially. * I/O routines should use count_desired, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - pb->pb_buffer_length = pb->pb_count_desired = range_length; - pb->pb_flags = flags; - pb->pb_bn = XFS_BUF_DADDR_NULL; - atomic_set(&pb->pb_pin_count, 0); - init_waitqueue_head(&pb->pb_waiters); - - XFS_STATS_INC(pb_create); - PB_TRACE(pb, "initialize", target); + bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_flags = flags; + bp->b_bn = XFS_BUF_DADDR_NULL; + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + XB_TRACE(bp, "initialize", target); } /* - * Allocate a page array capable of holding a specified number - * of pages, and point the page buf at it. + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. 
*/ STATIC int -_pagebuf_get_pages( - xfs_buf_t *pb, +_xfs_buf_get_pages( + xfs_buf_t *bp, int page_count, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { /* Make sure that we have a page list */ - if (pb->pb_pages == NULL) { - pb->pb_offset = page_buf_poff(pb->pb_file_offset); - pb->pb_page_count = page_count; - if (page_count <= PB_PAGES) { - pb->pb_pages = pb->pb_page_array; + if (bp->b_pages == NULL) { + bp->b_offset = xfs_buf_poff(bp->b_file_offset); + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; } else { - pb->pb_pages = kmem_alloc(sizeof(struct page *) * - page_count, pb_to_km(flags)); - if (pb->pb_pages == NULL) + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, xb_to_km(flags)); + if (bp->b_pages == NULL) return -ENOMEM; } - memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); } return 0; } /* - * Frees pb_pages if it was malloced. + * Frees b_pages if it was allocated. */ STATIC void -_pagebuf_free_pages( +_xfs_buf_free_pages( xfs_buf_t *bp) { - if (bp->pb_pages != bp->pb_page_array) { - kmem_free(bp->pb_pages, - bp->pb_page_count * sizeof(struct page *)); + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages, + bp->b_page_count * sizeof(struct page *)); } } @@ -300,79 +300,79 @@ _pagebuf_free_pages( * Releases the specified buffer. * * The modification state of any associated pages is left unchanged. - * The buffer most not be on any hash - use pagebuf_rele instead for + * The buffer most not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ void -pagebuf_free( +xfs_buf_free( xfs_buf_t *bp) { - PB_TRACE(bp, "free", 0); + XB_TRACE(bp, "free", 0); - ASSERT(list_empty(&bp->pb_hash_list)); + ASSERT(list_empty(&bp->b_hash_list)); - if (bp->pb_flags & _PBF_PAGE_CACHE) { + if (bp->b_flags & _XBF_PAGE_CACHE) { uint i; - if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1)) - free_address(bp->pb_addr - bp->pb_offset); + if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) + free_address(bp->b_addr - bp->b_offset); - for (i = 0; i < bp->pb_page_count; i++) - page_cache_release(bp->pb_pages[i]); - _pagebuf_free_pages(bp); - } else if (bp->pb_flags & _PBF_KMEM_ALLOC) { + for (i = 0; i < bp->b_page_count; i++) + page_cache_release(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + } else if (bp->b_flags & _XBF_KMEM_ALLOC) { /* - * XXX(hch): bp->pb_count_desired might be incorrect (see - * pagebuf_associate_memory for details), but fortunately + * XXX(hch): bp->b_count_desired might be incorrect (see + * xfs_buf_associate_memory for details), but fortunately * the Linux version of kmem_free ignores the len argument.. */ - kmem_free(bp->pb_addr, bp->pb_count_desired); - _pagebuf_free_pages(bp); + kmem_free(bp->b_addr, bp->b_count_desired); + _xfs_buf_free_pages(bp); } - pagebuf_deallocate(bp); + xfs_buf_deallocate(bp); } /* * Finds all pages for buffer in question and builds it's page list. 
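
The _xfs_buf_get_pages()/_xfs_buf_free_pages() pair above is a standard small-object optimization: the page-pointer list lives in a fixed array embedded in the buffer whenever it fits, and only spills to a heap allocation for unusually large buffers, so the common case costs no extra allocation. A minimal userspace sketch of the pattern (the INLINE_PAGES threshold stands in for XB_PAGES; names are illustrative):

        #include <stdlib.h>
        #include <string.h>

        #define INLINE_PAGES 4                    /* models XB_PAGES */

        struct buf {
                void **pages;                     /* models b_pages */
                void  *page_array[INLINE_PAGES];  /* models b_page_array */
                int    page_count;
        };

        static int buf_get_pages(struct buf *bp, int count)
        {
                if (count <= INLINE_PAGES) {
                        bp->pages = bp->page_array;   /* common case: free */
                } else {
                        bp->pages = malloc(count * sizeof(void *));
                        if (!bp->pages)
                                return -1;
                }
                memset(bp->pages, 0, count * sizeof(void *));
                bp->page_count = count;
                return 0;
        }

        static void buf_free_pages(struct buf *bp)
        {
                /* Only free the list if it was heap-allocated. */
                if (bp->pages != bp->page_array)
                        free(bp->pages);
                bp->pages = NULL;
        }
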
*/ STATIC int -_pagebuf_lookup_pages( +_xfs_buf_lookup_pages( xfs_buf_t *bp, uint flags) { - struct address_space *mapping = bp->pb_target->pbr_mapping; - size_t blocksize = bp->pb_target->pbr_bsize; - size_t size = bp->pb_count_desired; + struct address_space *mapping = bp->b_target->bt_mapping; + size_t blocksize = bp->b_target->bt_bsize; + size_t size = bp->b_count_desired; size_t nbytes, offset; - gfp_t gfp_mask = pb_to_gfp(flags); + gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; pgoff_t first; loff_t end; int error; - end = bp->pb_file_offset + bp->pb_buffer_length; - page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset); + end = bp->b_file_offset + bp->b_buffer_length; + page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); - error = _pagebuf_get_pages(bp, page_count, flags); + error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; - bp->pb_flags |= _PBF_PAGE_CACHE; + bp->b_flags |= _XBF_PAGE_CACHE; - offset = bp->pb_offset; - first = bp->pb_file_offset >> PAGE_CACHE_SHIFT; + offset = bp->b_offset; + first = bp->b_file_offset >> PAGE_CACHE_SHIFT; - for (i = 0; i < bp->pb_page_count; i++) { + for (i = 0; i < bp->b_page_count; i++) { struct page *page; uint retries = 0; retry: page = find_or_create_page(mapping, first + i, gfp_mask); if (unlikely(page == NULL)) { - if (flags & PBF_READ_AHEAD) { - bp->pb_page_count = i; - for (i = 0; i < bp->pb_page_count; i++) - unlock_page(bp->pb_pages[i]); + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); return -ENOMEM; } @@ -388,13 +388,13 @@ _pagebuf_lookup_pages( "deadlock in %s (mode:0x%x)\n", __FUNCTION__, gfp_mask); - XFS_STATS_INC(pb_page_retries); + XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); blk_congestion_wait(WRITE, HZ/50); goto retry; } - XFS_STATS_INC(pb_page_found); + XFS_STATS_INC(xb_page_found); nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); size -= nbytes; @@ -402,27 +402,27 @@ _pagebuf_lookup_pages( if (!PageUptodate(page)) { page_count--; if (blocksize >= PAGE_CACHE_SIZE) { - if (flags & PBF_READ) - bp->pb_locked = 1; + if (flags & XBF_READ) + bp->b_locked = 1; } else if (!PagePrivate(page)) { if (test_page_region(page, offset, nbytes)) page_count++; } } - bp->pb_pages[i] = page; + bp->b_pages[i] = page; offset = 0; } - if (!bp->pb_locked) { - for (i = 0; i < bp->pb_page_count; i++) - unlock_page(bp->pb_pages[i]); + if (!bp->b_locked) { + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); } - if (page_count == bp->pb_page_count) - bp->pb_flags |= PBF_DONE; + if (page_count == bp->b_page_count) + bp->b_flags |= XBF_DONE; - PB_TRACE(bp, "lookup_pages", (long)page_count); + XB_TRACE(bp, "lookup_pages", (long)page_count); return error; } @@ -430,23 +430,23 @@ _pagebuf_lookup_pages( * Map buffer into kernel address-space if nessecary. 
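
The retry loop in _xfs_buf_lookup_pages() is worth calling out: when page allocation fails for a buffer that must succeed, it wakes the flush daemon to push dirty buffers out, waits for write congestion to ease, and tries again, printing a warning after 100 attempts; only readahead gives up immediately, since that work is optional. A simplified userspace model of the loop, with the daemon reduced to a stub:

        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        static void writeback_daemon_wakeup(void)
        {
                /* stands in for xfsbufd_wakeup(): push dirty buffers out */
        }

        /* Retry forever with a brief backoff, warning once past a
         * threshold -- a mandatory metadata read cannot simply fail.
         */
        static void *alloc_page_with_backoff(void)
        {
                unsigned int retries = 0;
                void *page;

                while ((page = malloc(4096)) == NULL) {
                        if (++retries == 100)
                                fprintf(stderr,
                                        "possible memory allocation deadlock\n");
                        writeback_daemon_wakeup();
                        usleep(20 * 1000);  /* ~HZ/50, like blk_congestion_wait() */
                }
                return page;
        }
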
*/ STATIC int -_pagebuf_map_pages( +_xfs_buf_map_pages( xfs_buf_t *bp, uint flags) { /* A single page buffer is always mappable */ - if (bp->pb_page_count == 1) { - bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset; - bp->pb_flags |= PBF_MAPPED; - } else if (flags & PBF_MAPPED) { + if (bp->b_page_count == 1) { + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + bp->b_flags |= XBF_MAPPED; + } else if (flags & XBF_MAPPED) { if (as_list_len > 64) purge_addresses(); - bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count, - VM_MAP, PAGE_KERNEL); - if (unlikely(bp->pb_addr == NULL)) + bp->b_addr = vmap(bp->b_pages, bp->b_page_count, + VM_MAP, PAGE_KERNEL); + if (unlikely(bp->b_addr == NULL)) return -ENOMEM; - bp->pb_addr += bp->pb_offset; - bp->pb_flags |= PBF_MAPPED; + bp->b_addr += bp->b_offset; + bp->b_flags |= XBF_MAPPED; } return 0; @@ -457,9 +457,7 @@ _pagebuf_map_pages( */ /* - * _pagebuf_find - * - * Looks up, and creates if absent, a lockable buffer for + * Look up, and creates if absent, a lockable buffer for * a given range of an inode. The buffer is returned * locked. If other overlapping buffers exist, they are * released before the new buffer is created and locked, @@ -467,55 +465,55 @@ _pagebuf_map_pages( * are unlocked. No I/O is implied by this call. */ xfs_buf_t * -_pagebuf_find( +_xfs_buf_find( xfs_buftarg_t *btp, /* block device target */ loff_t ioff, /* starting offset of range */ size_t isize, /* length of range */ - page_buf_flags_t flags, /* PBF_TRYLOCK */ - xfs_buf_t *new_pb)/* newly allocated buffer */ + xfs_buf_flags_t flags, + xfs_buf_t *new_bp) { loff_t range_base; size_t range_length; xfs_bufhash_t *hash; - xfs_buf_t *pb, *n; + xfs_buf_t *bp, *n; range_base = (ioff << BBSHIFT); range_length = (isize << BBSHIFT); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->pbr_sshift))); - ASSERT(!(range_base & (loff_t)btp->pbr_smask)); + ASSERT(!(range_length < (1 << btp->bt_sshift))); + ASSERT(!(range_base & (loff_t)btp->bt_smask)); hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; spin_lock(&hash->bh_lock); - list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) { - ASSERT(btp == pb->pb_target); - if (pb->pb_file_offset == range_base && - pb->pb_buffer_length == range_length) { + list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { + ASSERT(btp == bp->b_target); + if (bp->b_file_offset == range_base && + bp->b_buffer_length == range_length) { /* - * If we look at something bring it to the + * If we look at something, bring it to the * front of the list for next time. */ - atomic_inc(&pb->pb_hold); - list_move(&pb->pb_hash_list, &hash->bh_list); + atomic_inc(&bp->b_hold); + list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } /* No match found */ - if (new_pb) { - _pagebuf_initialize(new_pb, btp, range_base, + if (new_bp) { + _xfs_buf_initialize(new_bp, btp, range_base, range_length, flags); - new_pb->pb_hash = hash; - list_add(&new_pb->pb_hash_list, &hash->bh_list); + new_bp->b_hash = hash; + list_add(&new_bp->b_hash_list, &hash->bh_list); } else { - XFS_STATS_INC(pb_miss_locked); + XFS_STATS_INC(xb_miss_locked); } spin_unlock(&hash->bh_lock); - return new_pb; + return new_bp; found: spin_unlock(&hash->bh_lock); @@ -524,74 +522,72 @@ found: * if this does not work then we need to drop the * spinlock and do a hard attempt on the semaphore. 
*/ - if (down_trylock(&pb->pb_sema)) { - if (!(flags & PBF_TRYLOCK)) { + if (down_trylock(&bp->b_sema)) { + if (!(flags & XBF_TRYLOCK)) { /* wait for buffer ownership */ - PB_TRACE(pb, "get_lock", 0); - pagebuf_lock(pb); - XFS_STATS_INC(pb_get_locked_waited); + XB_TRACE(bp, "get_lock", 0); + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); } else { /* We asked for a trylock and failed, no need * to look at file offset and length here, we - * know that this pagebuf at least overlaps our - * pagebuf and is locked, therefore our buffer - * either does not exist, or is this buffer + * know that this buffer at least overlaps our + * buffer and is locked, therefore our buffer + * either does not exist, or is this buffer. */ - - pagebuf_rele(pb); - XFS_STATS_INC(pb_busy_locked); - return (NULL); + xfs_buf_rele(bp); + XFS_STATS_INC(xb_busy_locked); + return NULL; } } else { /* trylock worked */ - PB_SET_OWNER(pb); + XB_SET_OWNER(bp); } - if (pb->pb_flags & PBF_STALE) { - ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0); - pb->pb_flags &= PBF_MAPPED; + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + bp->b_flags &= XBF_MAPPED; } - PB_TRACE(pb, "got_lock", 0); - XFS_STATS_INC(pb_get_locked); - return (pb); + XB_TRACE(bp, "got_lock", 0); + XFS_STATS_INC(xb_get_locked); + return bp; } /* - * xfs_buf_get_flags assembles a buffer covering the specified range. - * + * Assembles a buffer covering the specified range. * Storage in memory for all portions of the buffer will be allocated, * although backing storage may not be. */ xfs_buf_t * -xfs_buf_get_flags( /* allocate a buffer */ +xfs_buf_get_flags( xfs_buftarg_t *target,/* target for buffer */ loff_t ioff, /* starting offset of range */ size_t isize, /* length of range */ - page_buf_flags_t flags) /* PBF_TRYLOCK */ + xfs_buf_flags_t flags) { - xfs_buf_t *pb, *new_pb; + xfs_buf_t *bp, *new_bp; int error = 0, i; - new_pb = pagebuf_allocate(flags); - if (unlikely(!new_pb)) + new_bp = xfs_buf_allocate(flags); + if (unlikely(!new_bp)) return NULL; - pb = _pagebuf_find(target, ioff, isize, flags, new_pb); - if (pb == new_pb) { - error = _pagebuf_lookup_pages(pb, flags); + bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (bp == new_bp) { + error = _xfs_buf_lookup_pages(bp, flags); if (error) goto no_buffer; } else { - pagebuf_deallocate(new_pb); - if (unlikely(pb == NULL)) + xfs_buf_deallocate(new_bp); + if (unlikely(bp == NULL)) return NULL; } - for (i = 0; i < pb->pb_page_count; i++) - mark_page_accessed(pb->pb_pages[i]); + for (i = 0; i < bp->b_page_count; i++) + mark_page_accessed(bp->b_pages[i]); - if (!(pb->pb_flags & PBF_MAPPED)) { - error = _pagebuf_map_pages(pb, flags); + if (!(bp->b_flags & XBF_MAPPED)) { + error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { printk(KERN_WARNING "%s: failed to map pages\n", __FUNCTION__); @@ -599,22 +595,22 @@ xfs_buf_get_flags( /* allocate a buffer */ } } - XFS_STATS_INC(pb_get); + XFS_STATS_INC(xb_get); /* * Always fill in the block number now, the mapped cases can do * their own overlay of this later. 
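
xfs_buf_get_flags() shows the allocate-then-look-up idiom: the new buffer is allocated before the bucket lock is taken, _xfs_buf_find() either adopts it or returns an existing match, and the spare is freed on a hit. That keeps allocation, which may sleep, out of the locked region. A condensed userspace model, with the locked lookup reduced to a stub:

        #include <stdlib.h>

        struct buf { unsigned long off, len; };

        static struct buf *buf_allocate(void)
        {
                return calloc(1, sizeof(struct buf));
        }

        static void buf_deallocate(struct buf *bp)
        {
                free(bp);
        }

        /* Stub standing in for _xfs_buf_find(): under the bucket lock it
         * either returns an existing match or inserts and returns new_bp.
         */
        static struct buf *find_or_insert(unsigned long off, unsigned long len,
                                          struct buf *new_bp)
        {
                new_bp->off = off;
                new_bp->len = len;
                return new_bp;  /* pretend there was no existing buffer */
        }

        static struct buf *buf_get(unsigned long off, unsigned long len)
        {
                struct buf *new_bp, *bp;

                new_bp = buf_allocate();        /* may sleep; no lock held */
                if (!new_bp)
                        return NULL;

                bp = find_or_insert(off, len, new_bp);
                if (bp != new_bp)
                        buf_deallocate(new_bp); /* already cached: drop spare */
                return bp;
        }
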
*/ - pb->pb_bn = ioff; - pb->pb_count_desired = pb->pb_buffer_length; + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; - PB_TRACE(pb, "get", (unsigned long)flags); - return pb; + XB_TRACE(bp, "get", (unsigned long)flags); + return bp; no_buffer: - if (flags & (PBF_LOCK | PBF_TRYLOCK)) - pagebuf_unlock(pb); - pagebuf_rele(pb); + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); return NULL; } @@ -623,73 +619,73 @@ xfs_buf_read_flags( xfs_buftarg_t *target, loff_t ioff, size_t isize, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { - xfs_buf_t *pb; - - flags |= PBF_READ; - - pb = xfs_buf_get_flags(target, ioff, isize, flags); - if (pb) { - if (!XFS_BUF_ISDONE(pb)) { - PB_TRACE(pb, "read", (unsigned long)flags); - XFS_STATS_INC(pb_get_read); - pagebuf_iostart(pb, flags); - } else if (flags & PBF_ASYNC) { - PB_TRACE(pb, "read_async", (unsigned long)flags); + xfs_buf_t *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get_flags(target, ioff, isize, flags); + if (bp) { + if (!XFS_BUF_ISDONE(bp)) { + XB_TRACE(bp, "read", (unsigned long)flags); + XFS_STATS_INC(xb_get_read); + xfs_buf_iostart(bp, flags); + } else if (flags & XBF_ASYNC) { + XB_TRACE(bp, "read_async", (unsigned long)flags); /* * Read ahead call which is already satisfied, * drop the buffer */ goto no_buffer; } else { - PB_TRACE(pb, "read_done", (unsigned long)flags); + XB_TRACE(bp, "read_done", (unsigned long)flags); /* We do not want read in the flags */ - pb->pb_flags &= ~PBF_READ; + bp->b_flags &= ~XBF_READ; } } - return pb; + return bp; no_buffer: - if (flags & (PBF_LOCK | PBF_TRYLOCK)) - pagebuf_unlock(pb); - pagebuf_rele(pb); + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); return NULL; } /* - * If we are not low on memory then do the readahead in a deadlock - * safe manner. + * If we are not low on memory then do the readahead in a deadlock + * safe manner. 
*/ void -pagebuf_readahead( +xfs_buf_readahead( xfs_buftarg_t *target, loff_t ioff, size_t isize, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { struct backing_dev_info *bdi; - bdi = target->pbr_mapping->backing_dev_info; + bdi = target->bt_mapping->backing_dev_info; if (bdi_read_congested(bdi)) return; - flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD); + flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); xfs_buf_read_flags(target, ioff, isize, flags); } xfs_buf_t * -pagebuf_get_empty( +xfs_buf_get_empty( size_t len, xfs_buftarg_t *target) { - xfs_buf_t *pb; + xfs_buf_t *bp; - pb = pagebuf_allocate(0); - if (pb) - _pagebuf_initialize(pb, target, 0, len, 0); - return pb; + bp = xfs_buf_allocate(0); + if (bp) + _xfs_buf_initialize(bp, target, 0, len, 0); + return bp; } static inline struct page * @@ -705,8 +701,8 @@ mem_to_page( } int -pagebuf_associate_memory( - xfs_buf_t *pb, +xfs_buf_associate_memory( + xfs_buf_t *bp, void *mem, size_t len) { @@ -723,40 +719,40 @@ pagebuf_associate_memory( page_count++; /* Free any previous set of page pointers */ - if (pb->pb_pages) - _pagebuf_free_pages(pb); + if (bp->b_pages) + _xfs_buf_free_pages(bp); - pb->pb_pages = NULL; - pb->pb_addr = mem; + bp->b_pages = NULL; + bp->b_addr = mem; - rval = _pagebuf_get_pages(pb, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; - pb->pb_offset = offset; + bp->b_offset = offset; ptr = (size_t) mem & PAGE_CACHE_MASK; end = PAGE_CACHE_ALIGN((size_t) mem + len); end_cur = end; /* set up first page */ - pb->pb_pages[0] = mem_to_page(mem); + bp->b_pages[0] = mem_to_page(mem); ptr += PAGE_CACHE_SIZE; - pb->pb_page_count = ++i; + bp->b_page_count = ++i; while (ptr < end) { - pb->pb_pages[i] = mem_to_page((void *)ptr); - pb->pb_page_count = ++i; + bp->b_pages[i] = mem_to_page((void *)ptr); + bp->b_page_count = ++i; ptr += PAGE_CACHE_SIZE; } - pb->pb_locked = 0; + bp->b_locked = 0; - pb->pb_count_desired = pb->pb_buffer_length = len; - pb->pb_flags |= PBF_MAPPED; + bp->b_count_desired = bp->b_buffer_length = len; + bp->b_flags |= XBF_MAPPED; return 0; } xfs_buf_t * -pagebuf_get_no_daddr( +xfs_buf_get_noaddr( size_t len, xfs_buftarg_t *target) { @@ -765,10 +761,10 @@ pagebuf_get_no_daddr( void *data; int error; - bp = pagebuf_allocate(0); + bp = xfs_buf_allocate(0); if (unlikely(bp == NULL)) goto fail; - _pagebuf_initialize(bp, target, 0, len, 0); + _xfs_buf_initialize(bp, target, 0, len, 0); try_again: data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); @@ -777,78 +773,73 @@ pagebuf_get_no_daddr( /* check whether alignment matches.. */ if ((__psunsigned_t)data != - ((__psunsigned_t)data & ~target->pbr_smask)) { + ((__psunsigned_t)data & ~target->bt_smask)) { /* .. else double the size and try again */ kmem_free(data, malloc_len); malloc_len <<= 1; goto try_again; } - error = pagebuf_associate_memory(bp, data, len); + error = xfs_buf_associate_memory(bp, data, len); if (error) goto fail_free_mem; - bp->pb_flags |= _PBF_KMEM_ALLOC; + bp->b_flags |= _XBF_KMEM_ALLOC; - pagebuf_unlock(bp); + xfs_buf_unlock(bp); - PB_TRACE(bp, "no_daddr", data); + XB_TRACE(bp, "no_daddr", data); return bp; fail_free_mem: kmem_free(data, malloc_len); fail_free_buf: - pagebuf_free(bp); + xfs_buf_free(bp); fail: return NULL; } /* - * pagebuf_hold - * * Increment reference count on buffer, to hold the buffer concurrently * with another thread which may release (free) the buffer asynchronously. - * * Must hold the buffer already to call this function. 
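
xfs_buf_get_noaddr() wants sector-aligned memory from a general-purpose allocator, so it checks the returned address against bt_smask and, on a miss, frees the block, doubles the request and retries, relying on larger allocations being more strongly aligned. The userspace sketch below caps the retries, because plain malloc() may never return sector-aligned memory; in real userspace code posix_memalign() would be the honest tool:

        #include <stdint.h>
        #include <stdlib.h>

        /* smask is sectorsize - 1, as in bt_smask. */
        static void *alloc_sector_aligned(size_t len, uintptr_t smask)
        {
                size_t try_len = len;
                int tries;

                for (tries = 0; tries < 8; tries++) {
                        void *data = malloc(try_len);

                        if (!data)
                                return NULL;
                        if (((uintptr_t)data & smask) == 0)
                                return data;    /* start is aligned: done */
                        free(data);             /* misaligned: double and retry */
                        try_len <<= 1;
                }
                return NULL;    /* userspace: fall back to posix_memalign() */
        }
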
*/ void -pagebuf_hold( - xfs_buf_t *pb) +xfs_buf_hold( + xfs_buf_t *bp) { - atomic_inc(&pb->pb_hold); - PB_TRACE(pb, "hold", 0); + atomic_inc(&bp->b_hold); + XB_TRACE(bp, "hold", 0); } /* - * pagebuf_rele - * - * pagebuf_rele releases a hold on the specified buffer. If the - * the hold count is 1, pagebuf_rele calls pagebuf_free. + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. */ void -pagebuf_rele( - xfs_buf_t *pb) +xfs_buf_rele( + xfs_buf_t *bp) { - xfs_bufhash_t *hash = pb->pb_hash; + xfs_bufhash_t *hash = bp->b_hash; - PB_TRACE(pb, "rele", pb->pb_relse); + XB_TRACE(bp, "rele", bp->b_relse); - if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { - if (pb->pb_relse) { - atomic_inc(&pb->pb_hold); + if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { + if (bp->b_relse) { + atomic_inc(&bp->b_hold); spin_unlock(&hash->bh_lock); - (*(pb->pb_relse)) (pb); - } else if (pb->pb_flags & PBF_FS_MANAGED) { + (*(bp->b_relse)) (bp); + } else if (bp->b_flags & XBF_FS_MANAGED) { spin_unlock(&hash->bh_lock); } else { - ASSERT(!(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q))); - list_del_init(&pb->pb_hash_list); + ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + list_del_init(&bp->b_hash_list); spin_unlock(&hash->bh_lock); - pagebuf_free(pb); + xfs_buf_free(bp); } } else { /* * Catch reference count leaks */ - ASSERT(atomic_read(&pb->pb_hold) >= 0); + ASSERT(atomic_read(&bp->b_hold) >= 0); } } @@ -864,168 +855,122 @@ pagebuf_rele( */ /* - * pagebuf_cond_lock - * - * pagebuf_cond_lock locks a buffer object, if it is not already locked. - * Note that this in no way - * locks the underlying pages, so it is only useful for synchronizing - * concurrent use of page buffer objects, not for synchronizing independent - * access to the underlying pages. + * Locks a buffer object, if it is not already locked. + * Note that this in no way locks the underlying pages, so it is only + * useful for synchronizing concurrent use of buffer objects, not for + * synchronizing independent access to the underlying pages. */ int -pagebuf_cond_lock( /* lock buffer, if not locked */ - /* returns -EBUSY if locked) */ - xfs_buf_t *pb) +xfs_buf_cond_lock( + xfs_buf_t *bp) { int locked; - locked = down_trylock(&pb->pb_sema) == 0; + locked = down_trylock(&bp->b_sema) == 0; if (locked) { - PB_SET_OWNER(pb); + XB_SET_OWNER(bp); } - PB_TRACE(pb, "cond_lock", (long)locked); - return(locked ? 0 : -EBUSY); + XB_TRACE(bp, "cond_lock", (long)locked); + return locked ? 0 : -EBUSY; } #if defined(DEBUG) || defined(XFS_BLI_TRACE) -/* - * pagebuf_lock_value - * - * Return lock value for a pagebuf - */ int -pagebuf_lock_value( - xfs_buf_t *pb) +xfs_buf_lock_value( + xfs_buf_t *bp) { - return(atomic_read(&pb->pb_sema.count)); + return atomic_read(&bp->b_sema.count); } #endif /* - * pagebuf_lock - * - * pagebuf_lock locks a buffer object. Note that this in no way - * locks the underlying pages, so it is only useful for synchronizing - * concurrent use of page buffer objects, not for synchronizing independent - * access to the underlying pages. + * Locks a buffer object. + * Note that this in no way locks the underlying pages, so it is only + * useful for synchronizing concurrent use of buffer objects, not for + * synchronizing independent access to the underlying pages. 
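
The hold/release pair above leans on atomic_dec_and_lock(): dropping the last reference and removing the buffer from its hash bucket must be one atomic step with respect to concurrent lookups, so any decrement that can reach zero happens with the bucket lock held. A simplified pthread model of the same shape; unlike the kernel primitive, which skips the lock on the fast path when the count cannot reach zero, this version always locks:

        #include <pthread.h>
        #include <stdlib.h>

        struct obj {
                int           refcount;     /* guarded by the bucket lock here */
                struct obj  **hash_slot;    /* where we are linked, or NULL */
        };

        static void obj_release(struct obj *o, pthread_mutex_t *bucket_lock)
        {
                pthread_mutex_lock(bucket_lock);
                if (--o->refcount == 0) {
                        if (o->hash_slot)
                                *o->hash_slot = NULL;  /* unhash before freeing */
                        pthread_mutex_unlock(bucket_lock);
                        free(o);
                        return;
                }
                pthread_mutex_unlock(bucket_lock);
        }
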
*/ -int -pagebuf_lock( - xfs_buf_t *pb) +void +xfs_buf_lock( + xfs_buf_t *bp) { - PB_TRACE(pb, "lock", 0); - if (atomic_read(&pb->pb_io_remaining)) - blk_run_address_space(pb->pb_target->pbr_mapping); - down(&pb->pb_sema); - PB_SET_OWNER(pb); - PB_TRACE(pb, "locked", 0); - return 0; + XB_TRACE(bp, "lock", 0); + if (atomic_read(&bp->b_io_remaining)) + blk_run_address_space(bp->b_target->bt_mapping); + down(&bp->b_sema); + XB_SET_OWNER(bp); + XB_TRACE(bp, "locked", 0); } /* - * pagebuf_unlock - * - * pagebuf_unlock releases the lock on the buffer object created by - * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages - * created by pagebuf_pin). - * + * Releases the lock on the buffer object. * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to + * unlock the buffer as we need to set flags correctly. We also need to * take a reference for the delwri queue because the unlocker is going to * drop their's and they don't know we just queued it. */ void -pagebuf_unlock( /* unlock buffer */ - xfs_buf_t *pb) /* buffer to unlock */ +xfs_buf_unlock( + xfs_buf_t *bp) { - if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) { - atomic_inc(&pb->pb_hold); - pb->pb_flags |= PBF_ASYNC; - pagebuf_delwri_queue(pb, 0); + if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_ASYNC; + xfs_buf_delwri_queue(bp, 0); } - PB_CLEAR_OWNER(pb); - up(&pb->pb_sema); - PB_TRACE(pb, "unlock", 0); + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + XB_TRACE(bp, "unlock", 0); } /* * Pinning Buffer Storage in Memory - */ - -/* - * pagebuf_pin - * - * pagebuf_pin locks all of the memory represented by a buffer in - * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for - * the same or different buffers affecting a given page, will - * properly count the number of outstanding "pin" requests. The - * buffer may be released after the pagebuf_pin and a different - * buffer used when calling pagebuf_unpin, if desired. - * pagebuf_pin should be used by the file system when it wants be - * assured that no attempt will be made to force the affected - * memory to disk. It does not assure that a given logical page - * will not be moved to a different physical page. + * Ensure that no attempt to force a buffer to disk will succeed. */ void -pagebuf_pin( - xfs_buf_t *pb) +xfs_buf_pin( + xfs_buf_t *bp) { - atomic_inc(&pb->pb_pin_count); - PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter); + atomic_inc(&bp->b_pin_count); + XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); } -/* - * pagebuf_unpin - * - * pagebuf_unpin reverses the locking of memory performed by - * pagebuf_pin. Note that both functions affected the logical - * pages associated with the buffer, not the buffer itself. - */ void -pagebuf_unpin( - xfs_buf_t *pb) +xfs_buf_unpin( + xfs_buf_t *bp) { - if (atomic_dec_and_test(&pb->pb_pin_count)) { - wake_up_all(&pb->pb_waiters); - } - PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter); + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); } int -pagebuf_ispin( - xfs_buf_t *pb) +xfs_buf_ispin( + xfs_buf_t *bp) { - return atomic_read(&pb->pb_pin_count); + return atomic_read(&bp->b_pin_count); } -/* - * pagebuf_wait_unpin - * - * pagebuf_wait_unpin waits until all of the memory associated - * with the buffer is no longer locked in memory.
It returns - * immediately if none of the affected pages are locked. - */ -static inline void -_pagebuf_wait_unpin( - xfs_buf_t *pb) +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) { DECLARE_WAITQUEUE (wait, current); - if (atomic_read(&pb->pb_pin_count) == 0) + if (atomic_read(&bp->b_pin_count) == 0) return; - add_wait_queue(&pb->pb_waiters, &wait); + add_wait_queue(&bp->b_waiters, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&pb->pb_pin_count) == 0) + if (atomic_read(&bp->b_pin_count) == 0) break; - if (atomic_read(&pb->pb_io_remaining)) - blk_run_address_space(pb->pb_target->pbr_mapping); + if (atomic_read(&bp->b_io_remaining)) + blk_run_address_space(bp->b_target->bt_mapping); schedule(); } - remove_wait_queue(&pb->pb_waiters, &wait); + remove_wait_queue(&bp->b_waiters, &wait); set_current_state(TASK_RUNNING); } @@ -1033,241 +978,216 @@ _pagebuf_wait_unpin( * Buffer Utility Routines */ -/* - * pagebuf_iodone - * - * pagebuf_iodone marks a buffer for which I/O is in progress - * done with respect to that I/O. The pb_iodone routine, if - * present, will be called as a side-effect. - */ STATIC void -pagebuf_iodone_work( +xfs_buf_iodone_work( void *v) { xfs_buf_t *bp = (xfs_buf_t *)v; - if (bp->pb_iodone) - (*(bp->pb_iodone))(bp); - else if (bp->pb_flags & PBF_ASYNC) + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); } void -pagebuf_iodone( - xfs_buf_t *pb, +xfs_buf_ioend( + xfs_buf_t *bp, int schedule) { - pb->pb_flags &= ~(PBF_READ | PBF_WRITE); - if (pb->pb_error == 0) - pb->pb_flags |= PBF_DONE; + bp->b_flags &= ~(XBF_READ | XBF_WRITE); + if (bp->b_error == 0) + bp->b_flags |= XBF_DONE; - PB_TRACE(pb, "iodone", pb->pb_iodone); + XB_TRACE(bp, "iodone", bp->b_iodone); - if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { + if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { - INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); - queue_work(xfslogd_workqueue, &pb->pb_iodone_work); + INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); } else { - pagebuf_iodone_work(pb); + xfs_buf_iodone_work(bp); } } else { - up(&pb->pb_iodonesema); + up(&bp->b_iodonesema); } } -/* - * pagebuf_ioerror - * - * pagebuf_ioerror sets the error code for a buffer. - */ void -pagebuf_ioerror( /* mark/clear buffer error flag */ - xfs_buf_t *pb, /* buffer to mark */ - int error) /* error to store (0 if none) */ +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) { ASSERT(error >= 0 && error <= 0xffff); - pb->pb_error = (unsigned short)error; - PB_TRACE(pb, "ioerror", (unsigned long)error); + bp->b_error = (unsigned short)error; + XB_TRACE(bp, "ioerror", (unsigned long)error); } /* - * pagebuf_iostart - * - * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied. - * If necessary, it will arrange for any disk space allocation required, - * and it will break up the request if the block mappings require it. - * The pb_iodone routine in the buffer supplied will only be called + * Initiate I/O on a buffer, based on the flags supplied. + * The b_iodone routine in the buffer supplied will only be called * when all of the subsidiary I/O requests, if any, have been completed. - * pagebuf_iostart calls the pagebuf_ioinitiate routine or - * pagebuf_iorequest, if the former routine is not defined, to start - * the I/O on a given low-level request. 
*/ int -pagebuf_iostart( /* start I/O on a buffer */ - xfs_buf_t *pb, /* buffer to start */ - page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ - /* PBF_WRITE, PBF_DELWRI, */ - /* PBF_DONT_BLOCK */ +xfs_buf_iostart( + xfs_buf_t *bp, + xfs_buf_flags_t flags) { int status = 0; - PB_TRACE(pb, "iostart", (unsigned long)flags); + XB_TRACE(bp, "iostart", (unsigned long)flags); - if (flags & PBF_DELWRI) { - pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); - pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC); - pagebuf_delwri_queue(pb, 1); + if (flags & XBF_DELWRI) { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); + bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); + xfs_buf_delwri_queue(bp, 1); return status; } - pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \ - PBF_READ_AHEAD | _PBF_RUN_QUEUES); - pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ - PBF_READ_AHEAD | _PBF_RUN_QUEUES); + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); - BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL); + BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); /* For writes allow an alternate strategy routine to precede * the actual I/O request (which may not be issued at all in * a shutdown situation, for example). */ - status = (flags & PBF_WRITE) ? - pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); + status = (flags & XBF_WRITE) ? + xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp); /* Wait for I/O if we are not an async request. * Note: async I/O request completion will release the buffer, * and that can already be done by this point. So using the * buffer pointer from here on, after async I/O, is invalid. 
*/ - if (!status && !(flags & PBF_ASYNC)) - status = pagebuf_iowait(pb); + if (!status && !(flags & XBF_ASYNC)) + status = xfs_buf_iowait(bp); return status; } -/* - * Helper routine for pagebuf_iorequest - */ - STATIC __inline__ int -_pagebuf_iolocked( - xfs_buf_t *pb) +_xfs_buf_iolocked( + xfs_buf_t *bp) { - ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); - if (pb->pb_flags & PBF_READ) - return pb->pb_locked; + ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); + if (bp->b_flags & XBF_READ) + return bp->b_locked; return 0; } STATIC __inline__ void -_pagebuf_iodone( - xfs_buf_t *pb, +_xfs_buf_ioend( + xfs_buf_t *bp, int schedule) { - if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { - pb->pb_locked = 0; - pagebuf_iodone(pb, schedule); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + bp->b_locked = 0; + xfs_buf_ioend(bp, schedule); } } STATIC int -bio_end_io_pagebuf( +xfs_buf_bio_end_io( struct bio *bio, unsigned int bytes_done, int error) { - xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private; - unsigned int blocksize = pb->pb_target->pbr_bsize; + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + unsigned int blocksize = bp->b_target->bt_bsize; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; if (bio->bi_size) return 1; if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - pb->pb_error = EIO; + bp->b_error = EIO; do { struct page *page = bvec->bv_page; - if (unlikely(pb->pb_error)) { - if (pb->pb_flags & PBF_READ) + if (unlikely(bp->b_error)) { + if (bp->b_flags & XBF_READ) ClearPageUptodate(page); SetPageError(page); - } else if (blocksize == PAGE_CACHE_SIZE) { + } else if (blocksize >= PAGE_CACHE_SIZE) { SetPageUptodate(page); } else if (!PagePrivate(page) && - (pb->pb_flags & _PBF_PAGE_CACHE)) { + (bp->b_flags & _XBF_PAGE_CACHE)) { set_page_region(page, bvec->bv_offset, bvec->bv_len); } if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (_pagebuf_iolocked(pb)) { + if (_xfs_buf_iolocked(bp)) { unlock_page(page); } } while (bvec >= bio->bi_io_vec); - _pagebuf_iodone(pb, 1); + _xfs_buf_ioend(bp, 1); bio_put(bio); return 0; } STATIC void -_pagebuf_ioapply( - xfs_buf_t *pb) +_xfs_buf_ioapply( + xfs_buf_t *bp) { int i, rw, map_i, total_nr_pages, nr_pages; struct bio *bio; - int offset = pb->pb_offset; - int size = pb->pb_count_desired; - sector_t sector = pb->pb_bn; - unsigned int blocksize = pb->pb_target->pbr_bsize; - int locking = _pagebuf_iolocked(pb); + int offset = bp->b_offset; + int size = bp->b_count_desired; + sector_t sector = bp->b_bn; + unsigned int blocksize = bp->b_target->bt_bsize; + int locking = _xfs_buf_iolocked(bp); - total_nr_pages = pb->pb_page_count; + total_nr_pages = bp->b_page_count; map_i = 0; - if (pb->pb_flags & _PBF_RUN_QUEUES) { - pb->pb_flags &= ~_PBF_RUN_QUEUES; - rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC; + if (bp->b_flags & _XBF_RUN_QUEUES) { + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC; } else { - rw = (pb->pb_flags & PBF_READ) ? READ : WRITE; + rw = (bp->b_flags & XBF_READ) ? READ : WRITE; } - if (pb->pb_flags & PBF_ORDERED) { - ASSERT(!(pb->pb_flags & PBF_READ)); + if (bp->b_flags & XBF_ORDERED) { + ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; } - /* Special code path for reading a sub page size pagebuf in -- + /* Special code path for reading a sub page size buffer in -- * we populate up the whole page, and hence the other metadata * in the same page. This optimization is only valid when the - * filesystem block size and the page size are equal. 
+ * filesystem block size is not smaller than the page size. */ - if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && - (pb->pb_flags & PBF_READ) && locking && - (blocksize == PAGE_CACHE_SIZE)) { + if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && + (bp->b_flags & XBF_READ) && locking && + (blocksize >= PAGE_CACHE_SIZE)) { bio = bio_alloc(GFP_NOIO, 1); - bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_sector = sector - (offset >> BBSHIFT); - bio->bi_end_io = bio_end_io_pagebuf; - bio->bi_private = pb; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; - bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); + bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); size = 0; - atomic_inc(&pb->pb_io_remaining); + atomic_inc(&bp->b_io_remaining); goto submit_io; } /* Lock down the pages which we need to for the request */ - if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { + if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { for (i = 0; size; i++) { int nbytes = PAGE_CACHE_SIZE - offset; - struct page *page = pb->pb_pages[i]; + struct page *page = bp->b_pages[i]; if (nbytes > size) nbytes = size; @@ -1277,30 +1197,30 @@ _pagebuf_ioapply( size -= nbytes; offset = 0; } - offset = pb->pb_offset; - size = pb->pb_count_desired; + offset = bp->b_offset; + size = bp->b_count_desired; } next_chunk: - atomic_inc(&pb->pb_io_remaining); + atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); if (nr_pages > total_nr_pages) nr_pages = total_nr_pages; bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_sector = sector; - bio->bi_end_io = bio_end_io_pagebuf; - bio->bi_private = pb; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; for (; size && nr_pages; nr_pages--, map_i++) { - int nbytes = PAGE_CACHE_SIZE - offset; + int rbytes, nbytes = PAGE_CACHE_SIZE - offset; if (nbytes > size) nbytes = size; - if (bio_add_page(bio, pb->pb_pages[map_i], - nbytes, offset) < nbytes) + rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + if (rbytes < nbytes) break; offset = 0; @@ -1316,107 +1236,102 @@ submit_io: goto next_chunk; } else { bio_put(bio); - pagebuf_ioerror(pb, EIO); + xfs_buf_ioerror(bp, EIO); } } -/* - * pagebuf_iorequest -- the core I/O request routine. - */ int -pagebuf_iorequest( /* start real I/O */ - xfs_buf_t *pb) /* buffer to convey to device */ +xfs_buf_iorequest( + xfs_buf_t *bp) { - PB_TRACE(pb, "iorequest", 0); + XB_TRACE(bp, "iorequest", 0); - if (pb->pb_flags & PBF_DELWRI) { - pagebuf_delwri_queue(pb, 1); + if (bp->b_flags & XBF_DELWRI) { + xfs_buf_delwri_queue(bp, 1); return 0; } - if (pb->pb_flags & PBF_WRITE) { - _pagebuf_wait_unpin(pb); + if (bp->b_flags & XBF_WRITE) { + xfs_buf_wait_unpin(bp); } - pagebuf_hold(pb); + xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started - * all the I/O from calling pagebuf_iodone too early. + * all the I/O from calling xfs_buf_ioend too early. */ - atomic_set(&pb->pb_io_remaining, 1); - _pagebuf_ioapply(pb); - _pagebuf_iodone(pb, 0); + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + _xfs_buf_ioend(bp, 0); - pagebuf_rele(pb); + xfs_buf_rele(bp); return 0; } /* - * pagebuf_iowait - * - * pagebuf_iowait waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. 
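
The atomic_set(&b_io_remaining, 1) in xfs_buf_iorequest() is a bias trick: starting the counter at one means completions racing with submission can never take it to zero before all the bios have been built, and the submitter drops its own reference last. A small C11-atomics model of the idea, with the actual queuing left as a comment:

        #include <stdatomic.h>

        struct io {
                atomic_int remaining;
        };

        static void io_done(struct io *io)
        {
                (void)io;       /* run the completion handler here */
        }

        /* One sub-I/O finished; whoever takes the count to zero runs
         * io_done(), whether that is a completion or the submitter.
         */
        static void io_piece_complete(struct io *io)
        {
                if (atomic_fetch_sub(&io->remaining, 1) == 1)
                        io_done(io);
        }

        static void io_submit_all(struct io *io, int npieces)
        {
                atomic_store(&io->remaining, 1);  /* the submitter's bias */
                for (int i = 0; i < npieces; i++) {
                        atomic_fetch_add(&io->remaining, 1);
                        /* queue piece i; completion calls io_piece_complete() */
                }
                io_piece_complete(io);  /* drop the bias; may run io_done() */
        }
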
In any case, it returns - * the error code, if any, or 0 if there is no error. + * Waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. + * It returns the I/O error code, if any, or 0 if there was no error. */ int -pagebuf_iowait( - xfs_buf_t *pb) +xfs_buf_iowait( + xfs_buf_t *bp) { - PB_TRACE(pb, "iowait", 0); - if (atomic_read(&pb->pb_io_remaining)) - blk_run_address_space(pb->pb_target->pbr_mapping); - down(&pb->pb_iodonesema); - PB_TRACE(pb, "iowaited", (long)pb->pb_error); - return pb->pb_error; + XB_TRACE(bp, "iowait", 0); + if (atomic_read(&bp->b_io_remaining)) + blk_run_address_space(bp->b_target->bt_mapping); + down(&bp->b_iodonesema); + XB_TRACE(bp, "iowaited", (long)bp->b_error); + return bp->b_error; } -caddr_t -pagebuf_offset( - xfs_buf_t *pb, +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, size_t offset) { struct page *page; - offset += pb->pb_offset; + if (bp->b_flags & XBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; - page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; - return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); } /* - * pagebuf_iomove - * * Move data into or out of a buffer. */ void -pagebuf_iomove( - xfs_buf_t *pb, /* buffer to process */ +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ size_t boff, /* starting buffer offset */ size_t bsize, /* length to copy */ caddr_t data, /* data address */ - page_buf_rw_t mode) /* read/write flag */ + xfs_buf_rw_t mode) /* read/write/zero flag */ { size_t bend, cpoff, csize; struct page *page; bend = boff + bsize; while (boff < bend) { - page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; - cpoff = page_buf_poff(boff + pb->pb_offset); + page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; + cpoff = xfs_buf_poff(boff + bp->b_offset); csize = min_t(size_t, - PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); + PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); switch (mode) { - case PBRW_ZERO: + case XBRW_ZERO: memset(page_address(page) + cpoff, 0, csize); break; - case PBRW_READ: + case XBRW_READ: memcpy(data, page_address(page) + cpoff, csize); break; - case PBRW_WRITE: + case XBRW_WRITE: memcpy(page_address(page) + cpoff, data, csize); } @@ -1426,12 +1341,12 @@ pagebuf_iomove( } /* - * Handling of buftargs. + * Handling of buffer targets (buftargs). */ /* - * Wait for any bufs with callbacks that have been submitted but - * have not yet returned... walk the hash list for the target. + * Wait for any bufs with callbacks that have been submitted but + * have not yet returned... walk the hash list for the target. */ void xfs_wait_buftarg( @@ -1445,15 +1360,15 @@ xfs_wait_buftarg( hash = &btp->bt_hash[i]; again: spin_lock(&hash->bh_lock); - list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { - ASSERT(btp == bp->pb_target); - if (!(bp->pb_flags & PBF_FS_MANAGED)) { + list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { + ASSERT(btp == bp->b_target); + if (!(bp->b_flags & XBF_FS_MANAGED)) { spin_unlock(&hash->bh_lock); /* * Catch superblock reference count leaks * immediately */ - BUG_ON(bp->pb_bn == 0); + BUG_ON(bp->b_bn == 0); delay(100); goto again; } @@ -1463,9 +1378,9 @@ again: } /* - * Allocate buffer hash table for a given target. - * For devices containing metadata (i.e. 
not the log/realtime devices) - * we need to allocate a much larger hash table. + * Allocate buffer hash table for a given target. + * For devices containing metadata (i.e. not the log/realtime devices) + * we need to allocate a much larger hash table. */ STATIC void xfs_alloc_bufhash( @@ -1488,13 +1403,12 @@ STATIC void xfs_free_bufhash( xfs_buftarg_t *btp) { - kmem_free(btp->bt_hash, - (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + kmem_free(btp->bt_hash, (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); btp->bt_hash = NULL; } /* - * buftarg list for delwrite queue processing + * buftarg list for delwrite queue processing */ STATIC LIST_HEAD(xfs_buftarg_list); STATIC DEFINE_SPINLOCK(xfs_buftarg_lock); @@ -1524,12 +1438,13 @@ xfs_free_buftarg( { xfs_flush_buftarg(btp, 1); if (external) - xfs_blkdev_put(btp->pbr_bdev); + xfs_blkdev_put(btp->bt_bdev); xfs_free_bufhash(btp); - iput(btp->pbr_mapping->host); + iput(btp->bt_mapping->host); - /* unregister the buftarg first so that we don't get a - * wakeup finding a non-existent task */ + /* Unregister the buftarg first so that we don't get a + * wakeup finding a non-existent task + */ xfs_unregister_buftarg(btp); kthread_stop(btp->bt_task); @@ -1543,11 +1458,11 @@ xfs_setsize_buftarg_flags( unsigned int sectorsize, int verbose) { - btp->pbr_bsize = blocksize; - btp->pbr_sshift = ffs(sectorsize) - 1; - btp->pbr_smask = sectorsize - 1; + btp->bt_bsize = blocksize; + btp->bt_sshift = ffs(sectorsize) - 1; + btp->bt_smask = sectorsize - 1; - if (set_blocksize(btp->pbr_bdev, sectorsize)) { + if (set_blocksize(btp->bt_bdev, sectorsize)) { printk(KERN_WARNING "XFS: Cannot set_blocksize to %u on device %s\n", sectorsize, XFS_BUFTARG_NAME(btp)); @@ -1567,10 +1482,10 @@ xfs_setsize_buftarg_flags( } /* -* When allocating the initial buffer target we have not yet -* read in the superblock, so don't know what sized sectors -* are being used is at this early stage. Play safe. -*/ + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used is at this early stage. Play safe.
+ */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, @@ -1618,7 +1533,7 @@ xfs_mapping_buftarg( mapping->a_ops = &mapping_aops; mapping->backing_dev_info = bdi; mapping_set_gfp_mask(mapping, GFP_NOFS); - btp->pbr_mapping = mapping; + btp->bt_mapping = mapping; return 0; } @@ -1651,8 +1566,8 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); - btp->pbr_dev = bdev->bd_dev; - btp->pbr_bdev = bdev; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) @@ -1669,63 +1584,61 @@ error: /* - * Pagebuf delayed write buffer handling + * Delayed write buffer handling */ STATIC void -pagebuf_delwri_queue( - xfs_buf_t *pb, +xfs_buf_delwri_queue( + xfs_buf_t *bp, int unlock) { - struct list_head *dwq = &pb->pb_target->bt_delwrite_queue; - spinlock_t *dwlk = &pb->pb_target->bt_delwrite_lock; + struct list_head *dwq = &bp->b_target->bt_delwrite_queue; + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; - PB_TRACE(pb, "delwri_q", (long)unlock); - ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) == - (PBF_DELWRI|PBF_ASYNC)); + XB_TRACE(bp, "delwri_q", (long)unlock); + ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); spin_lock(dwlk); /* If already in the queue, dequeue and place at tail */ - if (!list_empty(&pb->pb_list)) { - ASSERT(pb->pb_flags & _PBF_DELWRI_Q); - if (unlock) { - atomic_dec(&pb->pb_hold); - } - list_del(&pb->pb_list); + if (!list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + if (unlock) + atomic_dec(&bp->b_hold); + list_del(&bp->b_list); } - pb->pb_flags |= _PBF_DELWRI_Q; - list_add_tail(&pb->pb_list, dwq); - pb->pb_queuetime = jiffies; + bp->b_flags |= _XBF_DELWRI_Q; + list_add_tail(&bp->b_list, dwq); + bp->b_queuetime = jiffies; spin_unlock(dwlk); if (unlock) - pagebuf_unlock(pb); + xfs_buf_unlock(bp); } void -pagebuf_delwri_dequeue( - xfs_buf_t *pb) +xfs_buf_delwri_dequeue( + xfs_buf_t *bp) { - spinlock_t *dwlk = &pb->pb_target->bt_delwrite_lock; + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; spin_lock(dwlk); - if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { - ASSERT(pb->pb_flags & _PBF_DELWRI_Q); - list_del_init(&pb->pb_list); + if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + list_del_init(&bp->b_list); dequeued = 1; } - pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); spin_unlock(dwlk); if (dequeued) - pagebuf_rele(pb); + xfs_buf_rele(bp); - PB_TRACE(pb, "delwri_dq", (long)dequeued); + XB_TRACE(bp, "delwri_dq", (long)dequeued); } STATIC void -pagebuf_runall_queues( +xfs_buf_runall_queues( struct workqueue_struct *queue) { flush_workqueue(queue); @@ -1740,9 +1653,9 @@ xfsbufd_wakeup( spin_lock(&xfs_buftarg_lock); list_for_each_entry_safe(btp, n, &xfs_buftarg_list, bt_list) { - if (test_bit(BT_FORCE_SLEEP, &btp->bt_flags)) + if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) continue; - set_bit(BT_FORCE_FLUSH, &btp->bt_flags); + set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); barrier(); wake_up_process(btp->bt_task); } @@ -1757,7 +1670,7 @@ xfsbufd( struct list_head tmp; unsigned long age; xfs_buftarg_t *target = (xfs_buftarg_t *)data; - xfs_buf_t *pb, *n; + xfs_buf_t *bp, *n; struct list_head *dwq = &target->bt_delwrite_queue; spinlock_t *dwlk = &target->bt_delwrite_lock; @@ -1766,10 +1679,10 @@ xfsbufd( INIT_LIST_HEAD(&tmp); do { if (unlikely(freezing(current))) { - set_bit(BT_FORCE_SLEEP, &target->bt_flags); + 
set_bit(XBT_FORCE_SLEEP, &target->bt_flags); refrigerator(); } else { - clear_bit(BT_FORCE_SLEEP, &target->bt_flags); + clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); } schedule_timeout_interruptible( @@ -1777,49 +1690,49 @@ xfsbufd( age = xfs_buf_age_centisecs * msecs_to_jiffies(10); spin_lock(dwlk); - list_for_each_entry_safe(pb, n, dwq, pb_list) { - PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); - ASSERT(pb->pb_flags & PBF_DELWRI); + list_for_each_entry_safe(bp, n, dwq, b_list) { + XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); + ASSERT(bp->b_flags & XBF_DELWRI); - if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { - if (!test_bit(BT_FORCE_FLUSH, + if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { + if (!test_bit(XBT_FORCE_FLUSH, &target->bt_flags) && time_before(jiffies, - pb->pb_queuetime + age)) { - pagebuf_unlock(pb); + bp->b_queuetime + age)) { + xfs_buf_unlock(bp); break; } - pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); - pb->pb_flags |= PBF_WRITE; - list_move(&pb->pb_list, &tmp); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + bp->b_flags |= XBF_WRITE; + list_move(&bp->b_list, &tmp); } } spin_unlock(dwlk); while (!list_empty(&tmp)) { - pb = list_entry(tmp.next, xfs_buf_t, pb_list); - ASSERT(target == pb->pb_target); + bp = list_entry(tmp.next, xfs_buf_t, b_list); + ASSERT(target == bp->b_target); - list_del_init(&pb->pb_list); - pagebuf_iostrategy(pb); + list_del_init(&bp->b_list); + xfs_buf_iostrategy(bp); - blk_run_address_space(target->pbr_mapping); + blk_run_address_space(target->bt_mapping); } if (as_list_len > 0) purge_addresses(); - clear_bit(BT_FORCE_FLUSH, &target->bt_flags); + clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); } while (!kthread_should_stop()); return 0; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Go through all incore buffers, and release buffers if they belong to + * the given device. This is used in filesystem error handling to + * preserve the consistency of its metadata. 
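
The xfsbufd loop above only writes a delayed-write buffer once it has aged past xfs_buf_age_centisecs, or when a forced flush is pending; because buffers are queued with list_add_tail and stamped with b_queuetime, the list stays in arrival order and the walk can stop at the first buffer that is still too young. A one-function sketch of the age test, wraparound-safe in the same way as time_before():

        #include <stdbool.h>

        /* now, queuetime and age are jiffies-like tick counts.  The signed
         * subtraction keeps the comparison correct across wraparound.
         */
        static bool flush_due(unsigned long now, unsigned long queuetime,
                              unsigned long age, bool force_flush)
        {
                return force_flush || (long)(now - (queuetime + age)) >= 0;
        }
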
*/ int xfs_flush_buftarg( @@ -1827,73 +1740,72 @@ xfs_flush_buftarg( int wait) { struct list_head tmp; - xfs_buf_t *pb, *n; + xfs_buf_t *bp, *n; int pincount = 0; struct list_head *dwq = &target->bt_delwrite_queue; spinlock_t *dwlk = &target->bt_delwrite_lock; - pagebuf_runall_queues(xfsdatad_workqueue); - pagebuf_runall_queues(xfslogd_workqueue); + xfs_buf_runall_queues(xfsdatad_workqueue); + xfs_buf_runall_queues(xfslogd_workqueue); INIT_LIST_HEAD(&tmp); spin_lock(dwlk); - list_for_each_entry_safe(pb, n, dwq, pb_list) { - - ASSERT(pb->pb_target == target); - ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)); - PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); - if (pagebuf_ispin(pb)) { + list_for_each_entry_safe(bp, n, dwq, b_list) { + ASSERT(bp->b_target == target); + ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q)); + XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp)); + if (xfs_buf_ispin(bp)) { pincount++; continue; } - list_move(&pb->pb_list, &tmp); + list_move(&bp->b_list, &tmp); } spin_unlock(dwlk); /* * Dropped the delayed write list lock, now walk the temporary list */ - list_for_each_entry_safe(pb, n, &tmp, pb_list) { - pagebuf_lock(pb); - pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); - pb->pb_flags |= PBF_WRITE; + list_for_each_entry_safe(bp, n, &tmp, b_list) { + xfs_buf_lock(bp); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + bp->b_flags |= XBF_WRITE; if (wait) - pb->pb_flags &= ~PBF_ASYNC; + bp->b_flags &= ~XBF_ASYNC; else - list_del_init(&pb->pb_list); + list_del_init(&bp->b_list); - pagebuf_iostrategy(pb); + xfs_buf_iostrategy(bp); } /* * Remaining list items must be flushed before returning */ while (!list_empty(&tmp)) { - pb = list_entry(tmp.next, xfs_buf_t, pb_list); + bp = list_entry(tmp.next, xfs_buf_t, b_list); - list_del_init(&pb->pb_list); - xfs_iowait(pb); - xfs_buf_relse(pb); + list_del_init(&bp->b_list); + xfs_iowait(bp); + xfs_buf_relse(bp); } if (wait) - blk_run_address_space(target->pbr_mapping); + blk_run_address_space(target->bt_mapping); return pincount; } int __init -pagebuf_init(void) +xfs_buf_init(void) { int error = -ENOMEM; -#ifdef PAGEBUF_TRACE - pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); +#ifdef XFS_BUF_TRACE + xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); #endif - pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf"); - if (!pagebuf_zone) + xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf"); + if (!xfs_buf_zone) goto out_free_trace_buf; xfslogd_workqueue = create_workqueue("xfslogd"); @@ -1904,8 +1816,8 @@ pagebuf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - pagebuf_shake = kmem_shake_register(xfsbufd_wakeup); - if (!pagebuf_shake) + xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup); + if (!xfs_buf_shake) goto out_destroy_xfsdatad_workqueue; return 0; @@ -1915,22 +1827,22 @@ pagebuf_init(void) out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: - kmem_zone_destroy(pagebuf_zone); + kmem_zone_destroy(xfs_buf_zone); out_free_trace_buf: -#ifdef PAGEBUF_TRACE - ktrace_free(pagebuf_trace_buf); +#ifdef XFS_BUF_TRACE + ktrace_free(xfs_buf_trace_buf); #endif return error; } void -pagebuf_terminate(void) +xfs_buf_terminate(void) { - kmem_shake_deregister(pagebuf_shake); + kmem_shake_deregister(xfs_buf_shake); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); - kmem_zone_destroy(pagebuf_zone); -#ifdef PAGEBUF_TRACE - ktrace_free(pagebuf_trace_buf); + kmem_zone_destroy(xfs_buf_zone); +#ifdef XFS_BUF_TRACE + 
ktrace_free(xfs_buf_trace_buf); #endif } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index f721d47ad4cc..4dd6592d5a4c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -32,44 +32,47 @@ * Base types */ -#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) - -#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - -typedef enum page_buf_rw_e { - PBRW_READ = 1, /* transfer into target memory */ - PBRW_WRITE = 2, /* transfer from target memory */ - PBRW_ZERO = 3 /* Zero target memory */ -} page_buf_rw_t; - - -typedef enum page_buf_flags_e { /* pb_flags values */ - PBF_READ = (1 << 0), /* buffer intended for reading from device */ - PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ - PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ - PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ - PBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ - PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ - PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ - PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ - PBF_ORDERED = (1 << 11), /* use ordered writes */ - PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) +#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +typedef enum { + XBF_READ = (1 << 0), /* buffer intended for reading from device */ + XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ + XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ + XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ + XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ + XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ + XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ + XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ + XBF_ORDERED = (1 << 11), /* use ordered writes */ + XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ /* flags used only as arguments to access routines */ - PBF_LOCK = (1 << 14), /* lock requested */ - PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ - PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ + XBF_LOCK = (1 << 14), /* lock requested */ + XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ + XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ /* flags used only internally */ - _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ - _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ - _PBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ -} page_buf_flags_t; + _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ + _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ + _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ + _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ +} xfs_buf_flags_t; +typedef enum { + XBT_FORCE_SLEEP = (0 << 1), + 
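/* bit numbers used with set_bit()/test_bit() on bt_flags, not bit masks */ +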
XBT_FORCE_FLUSH = (1 << 1), +} xfs_buftarg_flags_t; typedef struct xfs_bufhash { struct list_head bh_list; @@ -77,14 +80,14 @@ typedef struct xfs_bufhash { } xfs_bufhash_t; typedef struct xfs_buftarg { - dev_t pbr_dev; - struct block_device *pbr_bdev; - struct address_space *pbr_mapping; - unsigned int pbr_bsize; - unsigned int pbr_sshift; - size_t pbr_smask; - - /* per-device buffer hash table */ + dev_t bt_dev; + struct block_device *bt_bdev; + struct address_space *bt_mapping; + unsigned int bt_bsize; + unsigned int bt_sshift; + size_t bt_smask; + + /* per device buffer hash table */ uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; @@ -94,469 +97,333 @@ typedef struct xfs_buftarg { struct list_head bt_list; struct list_head bt_delwrite_queue; spinlock_t bt_delwrite_lock; - uint bt_flags; -#define BT_FORCE_SLEEP 1 -#define BT_FORCE_FLUSH 2 + unsigned long bt_flags; } xfs_buftarg_t; /* - * xfs_buf_t: Buffer structure for page cache-based buffers + * xfs_buf_t: Buffer structure for pagecache-based buffers * - * This buffer structure is used by the page cache buffer management routines - * to refer to an assembly of pages forming a logical buffer. The actual I/O - * is performed with buffer_head structures, as required by drivers. - * - * The buffer structure is used on temporary basis only, and discarded when - * released. The real data storage is recorded in the page cache. Metadata is + * This buffer structure is used by the pagecache buffer management routines + * to refer to an assembly of pages forming a logical buffer. + * + * The buffer structure is used on a temporary basis only, and discarded when + * released. The real data storage is recorded in the pagecache. Buffers are * hashed to the block device on which the file system resides. 
*/ struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); +typedef void (*xfs_buf_relse_t)(struct xfs_buf *); +typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); -/* call-back function on I/O completion */ -typedef void (*page_buf_iodone_t)(struct xfs_buf *); -/* call-back function on I/O completion */ -typedef void (*page_buf_relse_t)(struct xfs_buf *); -/* pre-write function */ -typedef int (*page_buf_bdstrat_t)(struct xfs_buf *); - -#define PB_PAGES 2 +#define XB_PAGES 2 typedef struct xfs_buf { - struct semaphore pb_sema; /* semaphore for lockables */ - unsigned long pb_queuetime; /* time buffer was queued */ - atomic_t pb_pin_count; /* pin count */ - wait_queue_head_t pb_waiters; /* unpin waiters */ - struct list_head pb_list; - page_buf_flags_t pb_flags; /* status flags */ - struct list_head pb_hash_list; /* hash table list */ - xfs_bufhash_t *pb_hash; /* hash table list start */ - xfs_buftarg_t *pb_target; /* buffer target (device) */ - atomic_t pb_hold; /* reference count */ - xfs_daddr_t pb_bn; /* block number for I/O */ - loff_t pb_file_offset; /* offset in file */ - size_t pb_buffer_length; /* size of buffer in bytes */ - size_t pb_count_desired; /* desired transfer size */ - void *pb_addr; /* virtual address of buffer */ - struct work_struct pb_iodone_work; - atomic_t pb_io_remaining;/* #outstanding I/O requests */ - page_buf_iodone_t pb_iodone; /* I/O completion function */ - page_buf_relse_t pb_relse; /* releasing function */ - page_buf_bdstrat_t pb_strat; /* pre-write function */ - struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ - void *pb_fspriv; - void *pb_fspriv2; - void *pb_fspriv3; - unsigned short pb_error; /* error code on I/O */ - unsigned short pb_locked; /* page array is locked */ - unsigned int pb_page_count; /* size of page array */ - unsigned int pb_offset; /* page offset in first page */ - struct page **pb_pages; /* array of page pointers */ - struct page *pb_page_array[PB_PAGES]; /* inline pages */ -#ifdef PAGEBUF_LOCK_TRACKING - int pb_last_holder; + struct semaphore b_sema; /* semaphore for lockables */ + unsigned long b_queuetime; /* time buffer was queued */ + atomic_t b_pin_count; /* pin count */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + xfs_buf_flags_t b_flags; /* status flags */ + struct list_head b_hash_list; /* hash table list */ + xfs_bufhash_t *b_hash; /* hash table list start */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + atomic_t b_hold; /* reference count */ + xfs_daddr_t b_bn; /* block number for I/O */ + xfs_off_t b_file_offset; /* offset in file */ + size_t b_buffer_length;/* size of buffer in bytes */ + size_t b_count_desired;/* desired transfer size */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_iodone_work; + atomic_t b_io_remaining; /* #outstanding I/O requests */ + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + xfs_buf_relse_t b_relse; /* releasing function */ + xfs_buf_bdstrat_t b_strat; /* pre-write function */ + struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ + void *b_fspriv; + void *b_fspriv2; + void *b_fspriv3; + unsigned short b_error; /* error code on I/O */ + unsigned short b_locked; /* page array is locked */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; #endif } 
xfs_buf_t; /* Finding and Reading Buffers */ - -extern xfs_buf_t *_pagebuf_find( /* find buffer for block if */ - /* the block is in memory */ - xfs_buftarg_t *, /* inode for block */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t, /* PBF_LOCK */ - xfs_buf_t *); /* newly allocated buffer */ - +extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t, xfs_buf_t *); #define xfs_incore(buftarg,blkno,len,lockit) \ - _pagebuf_find(buftarg, blkno ,len, lockit, NULL) - -extern xfs_buf_t *xfs_buf_get_flags( /* allocate a buffer */ - xfs_buftarg_t *, /* inode for buffer */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_LOCK, PBF_READ, */ - /* PBF_ASYNC */ + _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) +extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); #define xfs_buf_get(target, blkno, len, flags) \ - xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) - -extern xfs_buf_t *xfs_buf_read_flags( /* allocate and read a buffer */ - xfs_buftarg_t *, /* inode for buffer */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC */ + xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) +extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); #define xfs_buf_read(target, blkno, len, flags) \ - xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) + xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) -extern xfs_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ - /* no memory or disk address */ - size_t len, - xfs_buftarg_t *); /* mount point "fake" inode */ - -extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ - /* without disk address */ - size_t len, - xfs_buftarg_t *); /* mount point "fake" inode */ - -extern int pagebuf_associate_memory( - xfs_buf_t *, - void *, - size_t); - -extern void pagebuf_hold( /* increment reference count */ - xfs_buf_t *); /* buffer to hold */ - -extern void pagebuf_readahead( /* read ahead into cache */ - xfs_buftarg_t *, /* target for buffer (or NULL) */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* additional read flags */ +extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); +extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); +extern void xfs_buf_hold(xfs_buf_t *); +extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); /* Releasing Buffers */ - -extern void pagebuf_free( /* deallocate a buffer */ - xfs_buf_t *); /* buffer to deallocate */ - -extern void pagebuf_rele( /* release hold on a buffer */ - xfs_buf_t *); /* buffer to release */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); /* Locking and Unlocking Buffers */ - -extern int pagebuf_cond_lock( /* lock buffer, if not locked */ - /* (returns -EBUSY if locked) */ - xfs_buf_t *); /* buffer to lock */ - -extern int pagebuf_lock_value( /* return count on lock */ - xfs_buf_t *); /* buffer to check */ - -extern int pagebuf_lock( /* lock buffer */ - xfs_buf_t *); /* buffer to lock */ - -extern void pagebuf_unlock( /* unlock buffer */ - xfs_buf_t *); /* buffer to unlock */ +extern int xfs_buf_cond_lock(xfs_buf_t *); +extern int xfs_buf_lock_value(xfs_buf_t *); +extern void 
xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ - -extern void pagebuf_iodone( /* mark buffer I/O complete */ - xfs_buf_t *, /* buffer to mark */ - int); /* run completion locally, or in - * a helper thread. */ - -extern void pagebuf_ioerror( /* mark buffer in error (or not) */ - xfs_buf_t *, /* buffer to mark */ - int); /* error to store (0 if none) */ - -extern int pagebuf_iostart( /* start I/O on a buffer */ - xfs_buf_t *, /* buffer to start */ - page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */ - /* PBF_READ, PBF_WRITE, */ - /* PBF_DELWRI */ - -extern int pagebuf_iorequest( /* start real I/O */ - xfs_buf_t *); /* buffer to convey to device */ - -extern int pagebuf_iowait( /* wait for buffer I/O done */ - xfs_buf_t *); /* buffer to wait on */ - -extern void pagebuf_iomove( /* move data in/out of pagebuf */ - xfs_buf_t *, /* buffer to manipulate */ - size_t, /* starting buffer offset */ - size_t, /* length in buffer */ - caddr_t, /* data pointer */ - page_buf_rw_t); /* direction */ - -static inline int pagebuf_iostrategy(xfs_buf_t *pb) +extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t); +extern int xfs_buf_iorequest(xfs_buf_t *); +extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, + xfs_buf_rw_t); + +static inline int xfs_buf_iostrategy(xfs_buf_t *bp) { - return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb); + return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); } -static inline int pagebuf_geterror(xfs_buf_t *pb) +static inline int xfs_buf_geterror(xfs_buf_t *bp) { - return pb ? pb->pb_error : ENOMEM; + return bp ? bp->b_error : ENOMEM; } /* Buffer Utility Routines */ - -extern caddr_t pagebuf_offset( /* pointer at offset in buffer */ - xfs_buf_t *, /* buffer to offset into */ - size_t); /* offset */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Pinning Buffer Storage in Memory */ - -extern void pagebuf_pin( /* pin buffer in memory */ - xfs_buf_t *); /* buffer to pin */ - -extern void pagebuf_unpin( /* unpin buffered data */ - xfs_buf_t *); /* buffer to unpin */ - -extern int pagebuf_ispin( /* check if buffer is pinned */ - xfs_buf_t *); /* buffer to check */ +extern void xfs_buf_pin(xfs_buf_t *); +extern void xfs_buf_unpin(xfs_buf_t *); +extern int xfs_buf_ispin(xfs_buf_t *); /* Delayed Write Buffer Routines */ - -extern void pagebuf_delwri_dequeue(xfs_buf_t *); +extern void xfs_buf_delwri_dequeue(xfs_buf_t *); /* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); -extern int pagebuf_init(void); -extern void pagebuf_terminate(void); - - -#ifdef PAGEBUF_TRACE -extern ktrace_t *pagebuf_trace_buf; -extern void pagebuf_trace( - xfs_buf_t *, /* buffer being traced */ - char *, /* description of operation */ - void *, /* arbitrary diagnostic value */ - void *); /* return address */ +#ifdef XFS_BUF_TRACE +extern ktrace_t *xfs_buf_trace_buf; +extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #else -# define pagebuf_trace(pb, id, ptr, ra) do { } while (0) +#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0) #endif -#define pagebuf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; }) +#define xfs_buf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) +#define XFS_B_ASYNC XBF_ASYNC +#define XFS_B_DELWRI 
XBF_DELWRI +#define XFS_B_READ XBF_READ +#define XFS_B_WRITE XBF_WRITE +#define XFS_B_STALE XBF_STALE -/* These are just for xfs_syncsub... it sets an internal variable - * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t - */ -#define XFS_B_ASYNC PBF_ASYNC -#define XFS_B_DELWRI PBF_DELWRI -#define XFS_B_READ PBF_READ -#define XFS_B_WRITE PBF_WRITE -#define XFS_B_STALE PBF_STALE - -#define XFS_BUF_TRYLOCK PBF_TRYLOCK -#define XFS_INCORE_TRYLOCK PBF_TRYLOCK -#define XFS_BUF_LOCK PBF_LOCK -#define XFS_BUF_MAPPED PBF_MAPPED - -#define BUF_BUSY PBF_DONT_BLOCK - -#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) -#define XFS_BUF_ZEROFLAGS(x) \ - ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI)) - -#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) -#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) -#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) -#define XFS_BUF_SUPER_STALE(x) do { \ - XFS_BUF_STALE(x); \ - pagebuf_delwri_dequeue(x); \ - XFS_BUF_DONE(x); \ - } while (0) +#define XFS_BUF_TRYLOCK XBF_TRYLOCK +#define XFS_INCORE_TRYLOCK XBF_TRYLOCK +#define XFS_BUF_LOCK XBF_LOCK +#define XFS_BUF_MAPPED XBF_MAPPED -#define XFS_BUF_MANAGE PBF_FS_MANAGED -#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED) - -#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(x) pagebuf_delwri_dequeue(x) -#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) - -#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) -#define XFS_BUF_GETERROR(x) pagebuf_geterror(x) -#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) - -#define XFS_BUF_DONE(x) ((x)->pb_flags |= PBF_DONE) -#define XFS_BUF_UNDONE(x) ((x)->pb_flags &= ~PBF_DONE) -#define XFS_BUF_ISDONE(x) ((x)->pb_flags & PBF_DONE) - -#define XFS_BUF_BUSY(x) do { } while (0) -#define XFS_BUF_UNBUSY(x) do { } while (0) -#define XFS_BUF_ISBUSY(x) (1) - -#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) -#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) -#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) - -#define XFS_BUF_ORDERED(x) ((x)->pb_flags |= PBF_ORDERED) -#define XFS_BUF_UNORDERED(x) ((x)->pb_flags &= ~PBF_ORDERED) -#define XFS_BUF_ISORDERED(x) ((x)->pb_flags & PBF_ORDERED) - -#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") -#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") -#define XFS_BUF_ISSHUT(x) (0) - -#define XFS_BUF_HOLD(x) pagebuf_hold(x) -#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) -#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ) -#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) - -#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) -#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) -#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) - -#define XFS_BUF_ISUNINITIAL(x) (0) -#define XFS_BUF_UNUNINITIAL(x) (0) - -#define XFS_BUF_BP_ISMAPPED(bp) 1 - -#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone -#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ - (buf)->pb_iodone = (func) -#define XFS_BUF_CLR_IODONE_FUNC(buf) \ - (buf)->pb_iodone = NULL -#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ - (buf)->pb_strat = (func) -#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ - (buf)->pb_strat = NULL - -#define XFS_BUF_FSPRIVATE(buf, type) \ - ((type)(buf)->pb_fspriv) -#define XFS_BUF_SET_FSPRIVATE(buf, value) \ - (buf)->pb_fspriv = (void *)(value) -#define XFS_BUF_FSPRIVATE2(buf, type) \ - ((type)(buf)->pb_fspriv2) -#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ - (buf)->pb_fspriv2 = (void 
*)(value) -#define XFS_BUF_FSPRIVATE3(buf, type) \ - ((type)(buf)->pb_fspriv3) -#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ - (buf)->pb_fspriv3 = (void *)(value) -#define XFS_BUF_SET_START(buf) - -#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ - (buf)->pb_relse = (value) - -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) - -static inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset) -{ - if (bp->pb_flags & PBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; - return (xfs_caddr_t) pagebuf_offset(bp, offset); -} +#define BUF_BUSY XBF_DONT_BLOCK + +#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) +#define XFS_BUF_ZEROFLAGS(bp) \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) + +#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) +#define XFS_BUF_SUPER_STALE(bp) do { \ + XFS_BUF_STALE(bp); \ + xfs_buf_delwri_dequeue(bp); \ + XFS_BUF_DONE(bp); \ + } while (0) -#define XFS_BUF_SET_PTR(bp, val, count) \ - pagebuf_associate_memory(bp, val, count) -#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) -#define XFS_BUF_SET_ADDR(bp, blk) \ - ((bp)->pb_bn = (xfs_daddr_t)(blk)) -#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) \ - ((bp)->pb_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) \ - ((bp)->pb_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) \ - ((bp)->pb_buffer_length = (cnt)) -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) -#define XFS_BUF_SET_VTYPE(bp, type) -#define XFS_BUF_SET_REF(bp, ref) - -#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) - -#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) -#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) -#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) -#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) -#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); - -/* setup the buffer target from a buftarg structure */ -#define XFS_BUF_SET_TARGET(bp, target) \ - (bp)->pb_target = (target) -#define XFS_BUF_TARGET(bp) ((bp)->pb_target) -#define XFS_BUFTARG_NAME(target) \ - pagebuf_target_name(target) - -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) -#define XFS_BUF_SET_VTYPE(bp, type) -#define XFS_BUF_SET_REF(bp, ref) - -static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) +#define XFS_BUF_MANAGE XBF_FS_MANAGED +#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) + +#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) +#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) + +#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) +#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) +#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 
1 : 0) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_BUSY(bp) do { } while (0) +#define XFS_BUF_UNBUSY(bp) do { } while (0) +#define XFS_BUF_ISBUSY(bp) (1) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) +#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) +#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) + +#define XFS_BUF_SHUT(bp) do { } while (0) +#define XFS_BUF_UNSHUT(bp) do { } while (0) +#define XFS_BUF_ISSHUT(bp) (0) + +#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +#define XFS_BUF_ISUNINITIAL(bp) (0) +#define XFS_BUF_UNUNINITIAL(bp) (0) + +#define XFS_BUF_BP_ISMAPPED(bp) (1) + +#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) +#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) +#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) +#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) +#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) + +#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) +#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) +#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) +#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3) +#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val)) +#define XFS_BUF_SET_START(bp) do { } while (0) +#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) +#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) +#define XFS_BUF_ADDR(bp) ((bp)->b_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) + +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) +#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) +#define XFS_BUF_SET_REF(bp, ref) do { } while (0) + +#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) + +#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) +#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); + +#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) +#define XFS_BUF_TARGET(bp) ((bp)->b_target) +#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) + +static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) { - bp->pb_fspriv3 = mp; - bp->pb_strat = xfs_bdstrat_cb; - pagebuf_delwri_dequeue(bp); - return pagebuf_iostart(bp, PBF_WRITE | 
PBF_ASYNC | _PBF_RUN_QUEUES); + bp->b_fspriv3 = mp; + bp->b_strat = xfs_bdstrat_cb; + xfs_buf_delwri_dequeue(bp); + return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); } -static inline void xfs_buf_relse(xfs_buf_t *bp) +static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->pb_relse) - pagebuf_unlock(bp); - pagebuf_rele(bp); + if (!bp->b_relse) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); } -#define xfs_bpin(bp) pagebuf_pin(bp) -#define xfs_bunpin(bp) pagebuf_unpin(bp) +#define xfs_bpin(bp) xfs_buf_pin(bp) +#define xfs_bunpin(bp) xfs_buf_unpin(bp) #define xfs_buftrace(id, bp) \ - pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) + xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) -#define xfs_biodone(pb) \ - pagebuf_iodone(pb, 0) +#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) -#define xfs_biomove(pb, off, len, data, rw) \ - pagebuf_iomove((pb), (off), (len), (data), \ - ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ) +#define xfs_biomove(bp, off, len, data, rw) \ + xfs_buf_iomove((bp), (off), (len), (data), \ + ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) -#define xfs_biozero(pb, off, len) \ - pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) +#define xfs_biozero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -static inline int XFS_bwrite(xfs_buf_t *pb) +static inline int XFS_bwrite(xfs_buf_t *bp) { - int iowait = (pb->pb_flags & PBF_ASYNC) == 0; + int iowait = (bp->b_flags & XBF_ASYNC) == 0; int error = 0; if (!iowait) - pb->pb_flags |= _PBF_RUN_QUEUES; + bp->b_flags |= _XBF_RUN_QUEUES; - pagebuf_delwri_dequeue(pb); - pagebuf_iostrategy(pb); + xfs_buf_delwri_dequeue(bp); + xfs_buf_iostrategy(bp); if (iowait) { - error = pagebuf_iowait(pb); - xfs_buf_relse(pb); + error = xfs_buf_iowait(bp); + xfs_buf_relse(bp); } return error; } -#define XFS_bdwrite(pb) \ - pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) +#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC) static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) { - bp->pb_strat = xfs_bdstrat_cb; - bp->pb_fspriv3 = mp; - - return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); + bp->b_strat = xfs_bdstrat_cb; + bp->b_fspriv3 = mp; + return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); } -#define XFS_bdstrat(bp) pagebuf_iorequest(bp) +#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) -#define xfs_iowait(pb) pagebuf_iowait(pb) +#define xfs_iowait(bp) xfs_buf_iowait(bp) #define xfs_baread(target, rablkno, ralen) \ - pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK) - -#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target)) -#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target)) -#define xfs_buf_free(bp) pagebuf_free(bp) + xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) /* * Handling of buftargs. 
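 * A buftarg describes one underlying block device: its bdev, the address_space used for I/O, and the sector size limits that direct I/O must honour.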
*/ - extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); extern void xfs_free_buftarg(xfs_buftarg_t *, int); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#define xfs_getsize_buftarg(buftarg) \ - block_size((buftarg)->pbr_bdev) -#define xfs_readonly_buftarg(buftarg) \ - bdev_read_only((buftarg)->pbr_bdev) -#define xfs_binval(buftarg) \ - xfs_flush_buftarg(buftarg, 1) -#define XFS_bflush(buftarg) \ - xfs_flush_buftarg(buftarg, 1) +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) +#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2b385394f28c..7ad7b680e996 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -750,7 +750,7 @@ xfs_ioctl( (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->pbr_sshift; + da.d_mem = da.d_miniosz = 1 << target->bt_sshift; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index d8e21ba0cccc..95efe948a095 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -232,7 +232,7 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh) #define xfs_itruncate_data(ip, off) \ (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off))) #define xfs_statvfs_fsid(statp, mp) \ - ({ u64 id = huge_encode_dev((mp)->m_dev); \ + ({ u64 id = huge_encode_dev((mp)->m_ddev_targp->bt_dev); \ __kernel_fsid_t *fsid = &(statp)->f_fsid; \ (fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); }) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 147a28861f6b..e0ab45fbfebd 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -233,8 +233,8 @@ xfs_read( xfs_buftarg_t *target = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->pbr_smask) || - (size & target->pbr_smask)) { + if ((*offset & target->bt_smask) || + (size & target->bt_smask)) { if (*offset == ip->i_d.di_size) { return (0); } @@ -618,7 +618,7 @@ xfs_write( (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->pbr_smask) || (count & target->pbr_smask)) + if ((pos & target->bt_smask) || (count & target->bt_smask)) return XFS_ERROR(-EINVAL); if (!VN_CACHED(vp) && pos < i_size_read(inode)) @@ -938,7 +938,7 @@ xfs_bdstrat_cb(struct xfs_buf *bp) mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); if (!XFS_FORCED_SHUTDOWN(mp)) { - pagebuf_iorequest(bp); + xfs_buf_iorequest(bp); return 0; } else { xfs_buftrace("XFS__BDSTRAT IOERROR", bp); @@ -991,7 +991,7 @@ xfsbdstrat( * if (XFS_BUF_IS_GRIO(bp)) { */ - pagebuf_iorequest(bp); + xfs_buf_iorequest(bp); return 0; } diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index 50027c4a5618..8ba7a2fa6c1d 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -109,15 +109,15 @@ struct xfsstats { __uint32_t vn_remove; /* # times vn_remove called */ __uint32_t vn_free; /* # times vn_free called */ #define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) - __uint32_t pb_get; - __uint32_t pb_create; - __uint32_t pb_get_locked; - __uint32_t pb_get_locked_waited; - __uint32_t pb_busy_locked; - __uint32_t pb_miss_locked; - __uint32_t pb_page_retries; - __uint32_t pb_page_found; - __uint32_t pb_get_read; + __uint32_t xb_get; + __uint32_t xb_create; + __uint32_t xb_get_locked; + __uint32_t xb_get_locked_waited; + __uint32_t xb_busy_locked; + __uint32_t xb_miss_locked; + __uint32_t xb_page_retries; + __uint32_t xb_page_found; + __uint32_t xb_get_read; /* Extra precision counters */ __uint64_t xs_xstrat_bytes; __uint64_t xs_write_bytes; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index d8ec7463d8f4..556c1437b17d 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -309,7 +309,7 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } - if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered == + if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered == QUEUE_ORDERED_NONE) { xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, not supported by the underlying device"); @@ -330,7 +330,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->pbr_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, NULL); } STATIC struct inode * @@ -969,9 +969,9 @@ init_xfs_fs( void ) if (error < 0) goto undo_zones; - error = pagebuf_init(); + error = xfs_buf_init(); if (error < 0) - goto undo_pagebuf; + goto undo_buffers; vn_init(); xfs_init(); @@ -985,9 +985,9 @@ init_xfs_fs( void ) return 0; undo_register: - pagebuf_terminate(); + xfs_buf_terminate(); -undo_pagebuf: +undo_buffers: linvfs_destroy_zones(); undo_zones: @@ -1001,7 +1001,7 @@ exit_xfs_fs( void ) XFS_DM_EXIT(&xfs_fs_type); unregister_filesystem(&xfs_fs_type); xfs_cleanup(); - pagebuf_terminate(); + xfs_buf_terminate(); linvfs_destroy_zones(); ktrace_uninit(); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fbfa240bd139..cd3cf9613a00 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -308,7 +308,6 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ -#define m_dev m_ddev_targp->pbr_dev __uint8_t m_dircook_elog; /* log d-cookie entry bits */ __uint8_t m_blkbit_log; /* blocklog + NBBY */ __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index c4b20872f07d..a59c102cf214 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -238,6 +238,7 @@ 
xfs_bioerror_relse( } return (EIO); } + /* * Prints out an ALERT message about I/O error. */ @@ -252,11 +253,9 @@ xfs_ioerror_alert( "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" " (\"%s\") error %d buf count %zd", (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, - XFS_BUFTARG_NAME(bp->pb_target), - (__uint64_t)blkno, - func, - XFS_BUF_GETERROR(bp), - XFS_BUF_COUNT(bp)); + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)blkno, func, + XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); } /* -- cgit v1.2.3 From f6d6d4fcd180f8e47bf6b13fc6cce1e6c156d0ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 15:40:13 +1100 Subject: [XFS] Initial pass at going directly-to-bio on the buffered IO path. This allows us to submit much larger I/Os instead of sending down lots of small buffer_heads. To do this we need to have a rather complicated I/O submission and completion tracking infrastructure. Part of the latter has been merged already a long time ago for direct I/O support. Part of the problem is that we need to track sub-pagesize regions and for that we still need buffer_heads for the time being. Long-term I hope we can move to better data structures and/or maybe move this to fs/mpage.c instead of having it in XFS. Original patch from Nathan Scott with various updates from David Chinner and Christoph Hellwig. SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203822a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 776 +++++++++++++++++++++++-------------------- fs/xfs/linux-2.6/xfs_aops.h | 10 + fs/xfs/linux-2.6/xfs_iops.h | 5 - fs/xfs/linux-2.6/xfs_linux.h | 4 - 4 files changed, 431 insertions(+), 364 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 3f6b9e29850c..e99d04d3fe82 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -43,8 +43,6 @@ #include STATIC void xfs_count_page_state(struct page *, int *, int *, int *); -STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *, - struct writeback_control *wbc, void *, int, int); #if defined(XFS_RW_TRACE) void @@ -58,7 +56,7 @@ xfs_page_trace( bhv_desc_t *bdp; vnode_t *vp = LINVFS_GET_VP(inode); loff_t isize = i_size_read(inode); - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t offset = page_offset(page); int delalloc = -1, unmapped = -1, unwritten = -1; if (page_has_buffers(page)) @@ -103,15 +101,56 @@ xfs_finish_ioend( queue_work(xfsdatad_workqueue, &ioend->io_work); } +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ STATIC void xfs_destroy_ioend( xfs_ioend_t *ioend) { + struct buffer_head *bh, *next; + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, ioend->io_uptodate); + } + vn_iowake(ioend->io_vnode); mempool_free(ioend, xfs_ioend_pool); } /* + * Buffered IO write completion for delayed allocate extents. + * TODO: Update ondisk isize now that we know the file data + * has been flushed (i.e. the notorious "NULL file" problem). + */ +STATIC void +xfs_end_bio_delalloc( + void *data) +{ + xfs_ioend_t *ioend = data; + + xfs_destroy_ioend(ioend); +} + +/* + * Buffered IO write completion for regular, written extents. 
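+ * Nothing needs converting or updating for written extents; the ioend is simply torn down.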
+ */ +STATIC void +xfs_end_bio_written( + void *data) +{ + xfs_ioend_t *ioend = data; + + xfs_destroy_ioend(ioend); +} + +/* + * IO write completion for unwritten extents. + * * Issue transactions to convert a buffer range from unwritten * to written extents. */ @@ -123,21 +162,10 @@ xfs_end_bio_unwritten( vnode_t *vp = ioend->io_vnode; xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - struct buffer_head *bh, *next; int error; if (ioend->io_uptodate) VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); - - /* ioend->io_buffer_head is only non-NULL for buffered I/O */ - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - - bh->b_end_io = NULL; - clear_buffer_unwritten(bh); - end_buffer_async_write(bh, ioend->io_uptodate); - } - xfs_destroy_ioend(ioend); } @@ -149,7 +177,8 @@ xfs_end_bio_unwritten( */ STATIC xfs_ioend_t * xfs_alloc_ioend( - struct inode *inode) + struct inode *inode, + unsigned int type) { xfs_ioend_t *ioend; @@ -162,45 +191,25 @@ xfs_alloc_ioend( */ atomic_set(&ioend->io_remaining, 1); ioend->io_uptodate = 1; /* cleared if any I/O fails */ + ioend->io_list = NULL; + ioend->io_type = type; ioend->io_vnode = LINVFS_GET_VP(inode); ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; atomic_inc(&ioend->io_vnode->v_iocount); ioend->io_offset = 0; ioend->io_size = 0; - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); + if (type == IOMAP_UNWRITTEN) + INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); + else if (type == IOMAP_DELAY) + INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend); + else + INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend); return ioend; } -void -linvfs_unwritten_done( - struct buffer_head *bh, - int uptodate) -{ - xfs_ioend_t *ioend = bh->b_private; - static spinlock_t unwritten_done_lock = SPIN_LOCK_UNLOCKED; - unsigned long flags; - - ASSERT(buffer_unwritten(bh)); - bh->b_end_io = NULL; - - if (!uptodate) - ioend->io_uptodate = 0; - - /* - * Deep magic here. We reuse b_private in the buffer_heads to build - * a chain for completing the I/O from user context after we've issued - * a transaction to convert the unwritten extent. - */ - spin_lock_irqsave(&unwritten_done_lock, flags); - bh->b_private = ioend->io_buffer_head; - ioend->io_buffer_head = bh; - spin_unlock_irqrestore(&unwritten_done_lock, flags); - - xfs_finish_ioend(ioend); -} - STATIC int xfs_map_blocks( struct inode *inode, @@ -228,7 +237,7 @@ xfs_offset_to_map( xfs_iomap_t *iomapp, unsigned long offset) { - loff_t full_offset; /* offset from start of file */ + xfs_off_t full_offset; /* offset from start of file */ ASSERT(offset < PAGE_CACHE_SIZE); @@ -243,16 +252,223 @@ xfs_offset_to_map( return NULL; } +/* + * BIO completion handler for buffered IO. 
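+ * Records any I/O error in the ioend, drops the bio, and releases the io_remaining reference taken at submission via xfs_finish_ioend().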
+ */ +STATIC int +xfs_end_bio( + struct bio *bio, + unsigned int bytes_done, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + if (bio->bi_size) + return 1; + + ASSERT(ioend); + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + + /* Toss bio and pass work off to an xfsdatad thread */ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + ioend->io_uptodate = 0; + bio->bi_private = NULL; + bio->bi_end_io = NULL; + + bio_put(bio); + xfs_finish_ioend(ioend); + return 0; +} + +STATIC void +xfs_submit_ioend_bio( + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + + submit_bio(WRITE, bio); + ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); + bio_put(bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + struct bio *bio; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (!bio); + + ASSERT(bio->bi_private == NULL); + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio_get(bio); + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( + struct page *page, + struct writeback_control *wbc, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + set_page_writeback(page); + if (clear_dirty) + clear_page_dirty(page); + unlock_page(page); + if (!buffers) { + end_page_writeback(page); + wbc->pages_skipped++; /* We didn't write this page */ + } +} + +static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, + * covering the initial writepage page and also any probed pages. + */ +STATIC void +xfs_submit_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + do { + next = ioend->io_list; + bio = NULL; + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + xfs_start_buffer_writeback(bh); + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(ioend, bio); + goto retry; + } + + if (bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this endio. + * Toss the endio too. Only ever called for the initial page + * in a writepage request, so only ever one page. 
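+ * Each buffer in the b_private chain is unlocked and has its async-write state cleared before the ioend itself is freed.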
+ */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + vn_iowake(ioend->io_vnode); + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * Return true if we've finished the given ioend. + */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + unsigned int p_offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) +{ + xfs_ioend_t *ioend = *result; + + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; + xfs_off_t offset; + + offset = (xfs_off_t)bh->b_page->index << PAGE_CACHE_SHIFT; + offset += p_offset; + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } + + bh->b_private = NULL; + ioend->io_size += bh->b_size; +} + STATIC void xfs_map_at_offset( struct page *page, struct buffer_head *bh, unsigned long offset, int block_bits, - xfs_iomap_t *iomapp) + xfs_iomap_t *iomapp, + xfs_ioend_t *ioend) { xfs_daddr_t bn; - loff_t delta; + xfs_off_t delta; int sector_shift; ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); @@ -276,60 +492,7 @@ xfs_map_at_offset( bh->b_bdev = iomapp->iomap_target->bt_bdev; set_buffer_mapped(bh); clear_buffer_delay(bh); -} - -/* - * Look for a page at index which is unlocked and contains our - * unwritten extent flagged buffers at its head. Returns page - * locked and with an extra reference count, and length of the - * unwritten extent component on this page that we can write, - * in units of filesystem blocks. 
- */ -STATIC struct page * -xfs_probe_unwritten_page( - struct address_space *mapping, - pgoff_t index, - xfs_iomap_t *iomapp, - xfs_ioend_t *ioend, - unsigned long max_offset, - unsigned long *fsbs, - unsigned int bbits) -{ - struct page *page; - - page = find_trylock_page(mapping, index); - if (!page) - return NULL; - if (PageWriteback(page)) - goto out; - - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - unsigned long p_offset = 0; - - *fsbs = 0; - bh = head = page_buffers(page); - do { - if (!buffer_unwritten(bh) || !buffer_uptodate(bh)) - break; - if (!xfs_offset_to_map(page, iomapp, p_offset)) - break; - if (p_offset >= max_offset) - break; - xfs_map_at_offset(page, bh, p_offset, bbits, iomapp); - set_buffer_unwritten_io(bh); - bh->b_private = ioend; - p_offset += bh->b_size; - (*fsbs)++; - } while ((bh = bh->b_this_page) != head); - - if (p_offset) - return page; - } - -out: - unlock_page(page); - return NULL; + clear_buffer_unwritten(bh); } /* @@ -372,15 +535,16 @@ out: return ret; } -STATIC unsigned int +STATIC size_t xfs_probe_unmapped_cluster( struct inode *inode, struct page *startpage, struct buffer_head *bh, struct buffer_head *head) { + size_t len, total = 0; pgoff_t tindex, tlast, tloff; - unsigned int pg_offset, len, total = 0; + unsigned int pg_offset; struct address_space *mapping = inode->i_mapping; /* First sum forwards in this page */ @@ -414,14 +578,15 @@ xfs_probe_unmapped_cluster( } /* - * Probe for a given page (index) in the inode and test if it is delayed - * and without unwritten buffers. Returns page locked and with an extra - * reference count. + * Probe for a given page (index) in the inode and test if it is suitable + * for writing as part of an unwritten or delayed allocate extent. + * Returns page locked and with an extra reference count if so, else NULL. 
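+ * A page only qualifies while its buffers are in the same state (unwritten or delayed allocate) as the extent being written back.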
*/ STATIC struct page * -xfs_probe_delalloc_page( +xfs_probe_delayed_page( struct inode *inode, - pgoff_t index) + pgoff_t index, + unsigned int type) { struct page *page; @@ -437,12 +602,12 @@ xfs_probe_delalloc_page( bh = head = page_buffers(page); do { - if (buffer_unwritten(bh)) { - acceptable = 0; + if (buffer_unwritten(bh)) + acceptable = (type == IOMAP_UNWRITTEN); + else if (buffer_delay(bh)) + acceptable = (type == IOMAP_DELAY); + else break; - } else if (buffer_delay(bh)) { - acceptable = 1; - } } while ((bh = bh->b_this_page) != head); if (acceptable) @@ -454,161 +619,30 @@ out: return NULL; } -STATIC int -xfs_map_unwritten( - struct inode *inode, - struct page *start_page, - struct buffer_head *head, - struct buffer_head *curr, - unsigned long p_offset, - int block_bits, - xfs_iomap_t *iomapp, - struct writeback_control *wbc, - int startio, - int all_bh) -{ - struct buffer_head *bh = curr; - xfs_iomap_t *tmp; - xfs_ioend_t *ioend; - loff_t offset; - unsigned long nblocks = 0; - - offset = start_page->index; - offset <<= PAGE_CACHE_SHIFT; - offset += p_offset; - - ioend = xfs_alloc_ioend(inode); - - /* First map forwards in the page consecutive buffers - * covering this unwritten extent - */ - do { - if (!buffer_unwritten(bh)) - break; - tmp = xfs_offset_to_map(start_page, iomapp, p_offset); - if (!tmp) - break; - xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp); - set_buffer_unwritten_io(bh); - bh->b_private = ioend; - p_offset += bh->b_size; - nblocks++; - } while ((bh = bh->b_this_page) != head); - - atomic_add(nblocks, &ioend->io_remaining); - - /* If we reached the end of the page, map forwards in any - * following pages which are also covered by this extent. - */ - if (bh == head) { - struct address_space *mapping = inode->i_mapping; - pgoff_t tindex, tloff, tlast; - unsigned long bs; - unsigned int pg_offset, bbits = inode->i_blkbits; - struct page *page; - - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT; - tloff = min(tlast, tloff); - for (tindex = start_page->index + 1; tindex < tloff; tindex++) { - page = xfs_probe_unwritten_page(mapping, - tindex, iomapp, ioend, - PAGE_CACHE_SIZE, &bs, bbits); - if (!page) - break; - nblocks += bs; - atomic_add(bs, &ioend->io_remaining); - xfs_convert_page(inode, page, iomapp, wbc, ioend, - startio, all_bh); - /* stop if converting the next page might add - * enough blocks that the corresponding byte - * count won't fit in our ulong page buf length */ - if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) - goto enough; - } - - if (tindex == tlast && - (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) { - page = xfs_probe_unwritten_page(mapping, - tindex, iomapp, ioend, - pg_offset, &bs, bbits); - if (page) { - nblocks += bs; - atomic_add(bs, &ioend->io_remaining); - xfs_convert_page(inode, page, iomapp, wbc, ioend, - startio, all_bh); - if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) - goto enough; - } - } - } - -enough: - ioend->io_size = (xfs_off_t)nblocks << block_bits; - ioend->io_offset = offset; - xfs_finish_ioend(ioend); - return 0; -} - -STATIC void -xfs_submit_page( - struct page *page, - struct writeback_control *wbc, - struct buffer_head *bh_arr[], - int bh_count, - int probed_page, - int clear_dirty) -{ - struct buffer_head *bh; - int i; - - BUG_ON(PageWriteback(page)); - if (bh_count) - set_page_writeback(page); - if (clear_dirty) - clear_page_dirty(page); - unlock_page(page); - - if (bh_count) { - for (i = 0; 
i < bh_count; i++) { - bh = bh_arr[i]; - mark_buffer_async_write(bh); - if (buffer_unwritten(bh)) - set_buffer_unwritten_io(bh); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - } - - for (i = 0; i < bh_count; i++) - submit_bh(WRITE, bh_arr[i]); - - if (probed_page && clear_dirty) - wbc->nr_to_write--; /* Wrote an "extra" page */ - } -} - /* * Allocate & map buffers for page given the extent map. Write it out. * except for the original page of a writepage, this is called on * delalloc/unwritten pages only, for the original page it is possible * that the page has no mapping at all. */ -STATIC void +STATIC int xfs_convert_page( struct inode *inode, struct page *page, xfs_iomap_t *iomapp, + xfs_ioend_t **ioendp, struct writeback_control *wbc, void *private, int startio, int all_bh) { - struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; + struct buffer_head *bh, *head; xfs_iomap_t *mp = iomapp, *tmp; - unsigned long offset, end_offset; - int index = 0; + unsigned long p_offset, end_offset; + unsigned int type; int bbits = inode->i_blkbits; int len, page_dirty; + int count = 0, done = 0, uptodate = 1; end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); @@ -621,59 +655,66 @@ xfs_convert_page( end_offset = roundup(end_offset, len); page_dirty = end_offset / len; - offset = 0; + p_offset = 0; bh = head = page_buffers(page); do { - if (offset >= end_offset) + if (p_offset >= end_offset) break; - if (!(PageUptodate(page) || buffer_uptodate(bh))) + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh))) { + done = 1; continue; - if (buffer_mapped(bh) && all_bh && - !(buffer_unwritten(bh) || buffer_delay(bh))) { - if (startio) { + } + + if (buffer_unwritten(bh)) + type = IOMAP_UNWRITTEN; + else if (buffer_delay(bh)) + type = IOMAP_DELAY; + else { + type = 0; + if (!(buffer_mapped(bh) && all_bh && startio)) { + done = 1; + } else if (startio) { lock_buffer(bh); - bh_arr[index++] = bh; + xfs_add_to_ioend(inode, bh, p_offset, + type, ioendp, done); + count++; page_dirty--; } continue; } - tmp = xfs_offset_to_map(page, mp, offset); - if (!tmp) + tmp = xfs_offset_to_map(page, mp, p_offset); + if (!tmp) { + done = 1; continue; + } ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); - /* If this is a new unwritten extent buffer (i.e. one - * that we haven't passed in private data for, we must - * now map this buffer too. - */ - if (buffer_unwritten(bh) && !bh->b_end_io) { - ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN); - xfs_map_unwritten(inode, page, head, bh, offset, - bbits, tmp, wbc, startio, all_bh); - } else if (! 
(buffer_unwritten(bh) && buffer_locked(bh))) { - xfs_map_at_offset(page, bh, offset, bbits, tmp); - if (buffer_unwritten(bh)) { - set_buffer_unwritten_io(bh); - bh->b_private = private; - ASSERT(private); - } - } + xfs_map_at_offset(page, bh, p_offset, bbits, tmp, *ioendp); if (startio) { - bh_arr[index++] = bh; + xfs_add_to_ioend(inode, bh, p_offset, + type, ioendp, done); + count++; } else { set_buffer_dirty(bh); unlock_buffer(bh); mark_buffer_dirty(bh); } page_dirty--; - } while (offset += len, (bh = bh->b_this_page) != head); + } while (p_offset += len, (bh = bh->b_this_page) != head); - if (startio && index) { - xfs_submit_page(page, wbc, bh_arr, index, 1, !page_dirty); - } else { - unlock_page(page); + if (uptodate && bh == head) + SetPageUptodate(page); + + if (startio) { + if (count) + wbc->nr_to_write--; + xfs_start_page_writeback(page, wbc, !page_dirty, count); } + + return done; } /* @@ -685,19 +726,22 @@ xfs_cluster_write( struct inode *inode, pgoff_t tindex, xfs_iomap_t *iomapp, + xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, int all_bh, pgoff_t tlast) { struct page *page; + unsigned int type = (*ioendp)->io_type; + int done; - for (; tindex <= tlast; tindex++) { - page = xfs_probe_delalloc_page(inode, tindex); + for (done = 0; tindex <= tlast && !done; tindex++) { + page = xfs_probe_delayed_page(inode, tindex, type); if (!page) break; - xfs_convert_page(inode, page, iomapp, wbc, NULL, - startio, all_bh); + done = xfs_convert_page(inode, page, iomapp, ioendp, + wbc, NULL, startio, all_bh); } } @@ -728,18 +772,21 @@ xfs_page_state_convert( int startio, int unmapped) /* also implies page uptodate */ { - struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; + struct buffer_head *bh, *head; xfs_iomap_t *iomp, iomap; + xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; + unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index, tlast; - int len, err, i, cnt = 0, uptodate = 1; - int flags; - int page_dirty; + int flags, len, err, done = 1; + int uptodate = 1; + int page_dirty, count = 0, trylock_flag = 0; /* wait for other IO threads? */ - flags = (startio && wbc->sync_mode != WB_SYNC_NONE) ? 0 : BMAPI_TRYLOCK; + if (startio && wbc->sync_mode != WB_SYNC_NONE) + trylock_flag |= BMAPI_TRYLOCK; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -754,98 +801,98 @@ xfs_page_state_convert( } } - end_offset = min_t(unsigned long long, - (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - /* * page_dirty is initially a count of buffers on the page before * EOF and is decrememted as we move each into a cleanable state. - */ + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. + */ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); len = 1 << inode->i_blkbits; - p_offset = max(p_offset, PAGE_CACHE_SIZE); - p_offset = roundup(p_offset, len); + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? 
roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; iomp = NULL; - p_offset = 0; bh = head = page_buffers(page); + offset = page_offset(page); + + /* TODO: fix up "done" variable and iomap pointer (boolean) */ + /* TODO: cleanup count and page_dirty */ do { if (offset >= end_offset) break; if (!buffer_uptodate(bh)) uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) + if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { + done = 1; continue; + } if (iomp) { iomp = xfs_offset_to_map(page, &iomap, p_offset); + done = (iomp == NULL); } /* * First case, map an unwritten extent and prepare for * extent state conversion transaction on completion. - */ - if (buffer_unwritten(bh)) { - if (!startio) - continue; - if (!iomp) { - err = xfs_map_blocks(inode, offset, len, &iomap, - BMAPI_WRITE|BMAPI_IGNSTATE); - if (err) { - goto error; - } - iomp = xfs_offset_to_map(page, &iomap, - p_offset); - } - if (iomp) { - if (!bh->b_end_io) { - err = xfs_map_unwritten(inode, page, - head, bh, p_offset, - inode->i_blkbits, iomp, - wbc, startio, unmapped); - if (err) { - goto error; - } - } else { - set_bit(BH_Lock, &bh->b_state); - } - BUG_ON(!buffer_locked(bh)); - bh_arr[cnt++] = bh; - page_dirty--; - } - /* + * * Second case, allocate space for a delalloc buffer. * We can return EAGAIN here in the release page case. */ - } else if (buffer_delay(bh)) { + if (buffer_unwritten(bh) || buffer_delay(bh)) { + if (buffer_unwritten(bh)) { + type = IOMAP_UNWRITTEN; + flags = BMAPI_WRITE|BMAPI_IGNSTATE; + } else { + type = IOMAP_DELAY; + flags = BMAPI_ALLOCATE; + if (!startio) + flags |= trylock_flag; + } + if (!iomp) { + done = 1; err = xfs_map_blocks(inode, offset, len, &iomap, - BMAPI_ALLOCATE | flags); - if (err) { + flags); + if (err) goto error; - } iomp = xfs_offset_to_map(page, &iomap, p_offset); + done = (iomp == NULL); } if (iomp) { xfs_map_at_offset(page, bh, p_offset, - inode->i_blkbits, iomp); + inode->i_blkbits, iomp, ioend); if (startio) { - bh_arr[cnt++] = bh; + xfs_add_to_ioend(inode, bh, p_offset, + type, &ioend, done); } else { set_buffer_dirty(bh); unlock_buffer(bh); mark_buffer_dirty(bh); } page_dirty--; + count++; + } else { + done = 1; } } else if ((buffer_uptodate(bh) || PageUptodate(page)) && (unmapped || startio)) { + type = 0; if (!buffer_mapped(bh)) { - int size; /* * Getting here implies an unmapped buffer @@ -853,6 +900,8 @@ xfs_page_state_convert( * need to write the whole page out. 
*/ if (!iomp) { + int size; + size = xfs_probe_unmapped_cluster( inode, page, bh, head); err = xfs_map_blocks(inode, offset, @@ -863,52 +912,70 @@ xfs_page_state_convert( } iomp = xfs_offset_to_map(page, &iomap, p_offset); + done = (iomp == NULL); } if (iomp) { - xfs_map_at_offset(page, - bh, p_offset, - inode->i_blkbits, iomp); + xfs_map_at_offset(page, bh, p_offset, + inode->i_blkbits, iomp, + ioend); if (startio) { - bh_arr[cnt++] = bh; + xfs_add_to_ioend(inode, + bh, p_offset, type, + &ioend, done); } else { set_buffer_dirty(bh); unlock_buffer(bh); mark_buffer_dirty(bh); } page_dirty--; + count++; + } else { + done = 1; } } else if (startio) { if (buffer_uptodate(bh) && !test_and_set_bit(BH_Lock, &bh->b_state)) { - bh_arr[cnt++] = bh; + ASSERT(buffer_mapped(bh)); + xfs_add_to_ioend(inode, + bh, p_offset, type, + &ioend, done); page_dirty--; + count++; + } else { + done = 1; } + } else { + done = 1; } } - } while (offset += len, p_offset += len, - ((bh = bh->b_this_page) != head)); + + if (!iohead) + iohead = ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); if (uptodate && bh == head) SetPageUptodate(page); - if (startio) { - xfs_submit_page(page, wbc, bh_arr, cnt, 0, !page_dirty); - } + if (startio) + xfs_start_page_writeback(page, wbc, 1, count); - if (iomp) { + if (ioend && iomp && !done) { offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> PAGE_CACHE_SHIFT; tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, iomp, wbc, - startio, unmapped, tlast); + xfs_cluster_write(inode, page->index + 1, iomp, &ioend, + wbc, startio, unmapped, tlast); } + if (iohead) + xfs_submit_ioend(iohead); + return page_dirty; error: - for (i = 0; i < cnt; i++) { - unlock_buffer(bh_arr[i]); - } + if (iohead) + xfs_cancel_ioend(iohead); /* * If it's delalloc and we have nowhere to put it, @@ -916,9 +983,8 @@ error: * us to try again. */ if (err != -EAGAIN) { - if (!unmapped) { + if (!unmapped) block_invalidatepage(page, 0); - } ClearPageUptodate(page); } return err; @@ -1094,7 +1160,7 @@ linvfs_direct_IO( if (error) return -error; - iocb->private = xfs_alloc_ioend(inode); + iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); ret = blockdev_direct_IO_own_locking(rw, iocb, inode, iomap.iomap_target->bt_bdev, diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4720758a9ade..55339dd5a30d 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -23,14 +23,24 @@ extern mempool_t *xfs_ioend_pool; typedef void (*xfs_ioend_func_t)(void *); +/* + * xfs_ioend struct manages large extent writes for XFS. + * It can manage several multi-page bio's at once. 
+ */ typedef struct xfs_ioend { + struct xfs_ioend *io_list; /* next ioend in chain */ + unsigned int io_type; /* delalloc / unwritten */ unsigned int io_uptodate; /* I/O status register */ atomic_t io_remaining; /* hold count */ struct vnode *io_vnode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ } xfs_ioend_t; +extern struct address_space_operations linvfs_aops; +extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index ee784b63acbf..6899a6b4a50a 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -26,11 +26,6 @@ extern struct file_operations linvfs_file_operations; extern struct file_operations linvfs_invis_file_operations; extern struct file_operations linvfs_dir_operations; -extern struct address_space_operations linvfs_aops; - -extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern void linvfs_unwritten_done(struct buffer_head *, int); - extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, int, unsigned int, void __user *); diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 95efe948a095..67389b745526 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -110,10 +110,6 @@ * delalloc and these ondisk-uninitialised buffers. */ BUFFER_FNS(PrivateStart, unwritten); -static inline void set_buffer_unwritten_io(struct buffer_head *bh) -{ - bh->b_end_io = linvfs_unwritten_done; -} #define restricted_chown xfs_params.restrict_chown.val #define irix_sgid_inherit xfs_params.sgid_inherit.val -- cgit v1.2.3 From 10ce444428c2329eb2aaf84850b5c7c09cecc58c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:48:14 +1100 Subject: [XFS] use pagevec lookups This reduces the time spent in the radix tree lookups and avoids unnecessary lock roundtrips.
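For illustration, the batched lookup pattern the patch below converts to is roughly the following (a minimal sketch against the 2.6-era pagevec API; walk_range() and the empty per-page body are hypothetical stand-ins, not code from the patch):

#include <linux/pagevec.h>
#include <linux/sched.h>

static void
walk_range(
	struct address_space	*mapping,
	pgoff_t			tindex,
	pgoff_t			tlast)
{
	struct pagevec		pvec;
	int			i;

	pagevec_init(&pvec, 0);
	while (tindex <= tlast) {
		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);

		/* one radix tree walk covers up to PAGEVEC_SIZE pages */
		if (!pagevec_lookup(&pvec, mapping, tindex, len))
			break;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			/* per-page work goes here; pages come back
			 * referenced but not locked */
		}

		/* the lookup skips holes, so resume after the last hit */
		tindex = pvec.pages[pagevec_count(&pvec) - 1]->index + 1;
		pagevec_release(&pvec);
		cond_resched();
	}
}

Each pagevec_lookup() takes the mapping's tree_lock once per batch, where the old find_trylock_page() loop took and dropped it once per page.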
SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203823a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 144 +++++++++++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index e99d04d3fe82..e998009c0f52 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -40,6 +40,7 @@ #include "xfs_rw.h" #include "xfs_iomap.h" #include +#include #include STATIC void xfs_count_page_state(struct page *, int *, int *, int *); @@ -501,18 +502,13 @@ xfs_map_at_offset( */ STATIC unsigned int xfs_probe_unmapped_page( - struct address_space *mapping, - pgoff_t index, + struct page *page, unsigned int pg_offset) { - struct page *page; int ret = 0; - page = find_trylock_page(mapping, index); - if (!page) - return 0; if (PageWriteback(page)) - goto out; + return 0; if (page->mapping && PageDirty(page)) { if (page_has_buffers(page)) { @@ -530,8 +526,6 @@ xfs_probe_unmapped_page( ret = PAGE_CACHE_SIZE; } -out: - unlock_page(page); return ret; } @@ -542,59 +536,75 @@ xfs_probe_unmapped_cluster( struct buffer_head *bh, struct buffer_head *head) { - size_t len, total = 0; + struct pagevec pvec; pgoff_t tindex, tlast, tloff; - unsigned int pg_offset; - struct address_space *mapping = inode->i_mapping; + size_t total = 0; + int done = 0, i; /* First sum forwards in this page */ do { if (buffer_mapped(bh)) - break; + return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); - /* If we reached the end of the page, sum forwards in - * following pages. - */ - if (bh == head) { - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - /* Prune this back to avoid pathological behavior */ - tloff = min(tlast, startpage->index + 64); - for (tindex = startpage->index + 1; tindex < tloff; tindex++) { - len = xfs_probe_unmapped_page(mapping, tindex, - PAGE_CACHE_SIZE); - if (!len) - return total; + /* if we reached the end of the page, sum forwards in following pages */ + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + tindex = startpage->index + 1; + + /* Prune this back to avoid pathological behavior */ + tloff = min(tlast, startpage->index + 64); + + pagevec_init(&pvec, 0); + while (!done && tindex <= tloff) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + size_t pg_offset, len = 0; + + if (tindex == tlast) { + pg_offset = + i_size_read(inode) & (PAGE_CACHE_SIZE - 1); + if (!pg_offset) + break; + } else + pg_offset = PAGE_CACHE_SIZE; + + if (page->index == tindex && !TestSetPageLocked(page)) { + len = xfs_probe_unmapped_page(page, pg_offset); + unlock_page(page); + } + + if (!len) { + done = 1; + break; + } + total += len; } - if (tindex == tlast && - (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { - total += xfs_probe_unmapped_page(mapping, - tindex, pg_offset); - } + + pagevec_release(&pvec); + cond_resched(); } + return total; } /* - * Probe for a given page (index) in the inode and test if it is suitable - * for writing as part of an unwritten or delayed allocate extent. - * Returns page locked and with an extra reference count if so, else NULL. + * Test if a given page is suitable for writing as part of an unwritten + * or delayed allocate extent. 
*/ -STATIC struct page * -xfs_probe_delayed_page( - struct inode *inode, - pgoff_t index, +STATIC int +xfs_is_delayed_page( + struct page *page, unsigned int type) { - struct page *page; - - page = find_trylock_page(inode->i_mapping, index); - if (!page) - return NULL; if (PageWriteback(page)) - goto out; + return 0; if (page->mapping && page_has_buffers(page)) { struct buffer_head *bh, *head; @@ -611,12 +621,10 @@ xfs_probe_delayed_page( } while ((bh = bh->b_this_page) != head); if (acceptable) - return page; + return 1; } -out: - unlock_page(page); - return NULL; + return 0; } /* @@ -629,10 +637,10 @@ STATIC int xfs_convert_page( struct inode *inode, struct page *page, + loff_t tindex, xfs_iomap_t *iomapp, xfs_ioend_t **ioendp, struct writeback_control *wbc, - void *private, int startio, int all_bh) { @@ -644,6 +652,17 @@ xfs_convert_page( int len, page_dirty; int count = 0, done = 0, uptodate = 1; + if (page->index != tindex) + goto fail; + if (TestSetPageLocked(page)) + goto fail; + if (PageWriteback(page)) + goto fail_unlock_page; + if (page->mapping != inode->i_mapping) + goto fail_unlock_page; + if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + goto fail_unlock_page; + end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); /* @@ -715,6 +734,10 @@ xfs_convert_page( } return done; + fail_unlock_page: + unlock_page(page); + fail: + return 1; } /* @@ -732,16 +755,25 @@ xfs_cluster_write( int all_bh, pgoff_t tlast) { - struct page *page; - unsigned int type = (*ioendp)->io_type; - int done; + struct pagevec pvec; + int done = 0, i; - for (done = 0; tindex <= tlast && !done; tindex++) { - page = xfs_probe_delayed_page(inode, tindex, type); - if (!page) + pagevec_init(&pvec, 0); + while (!done && tindex <= tlast) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) break; - done = xfs_convert_page(inode, page, iomapp, ioendp, - wbc, NULL, startio, all_bh); + + for (i = 0; i < pagevec_count(&pvec); i++) { + done = xfs_convert_page(inode, pvec.pages[i], tindex++, + iomapp, ioendp, wbc, startio, all_bh); + if (done) + break; + } + + pagevec_release(&pvec); + cond_resched(); } } -- cgit v1.2.3 From 1defeac9d4fffa3eabc4fba887e8ff5b1da7f361 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:48:33 +1100 Subject: [XFS] clean up the xfs_offset_to_map interface Currently we pass a struct page and a relative offset into that page around, and returns the current xfs_iomap_t if the block at the specified offset fits into it, or a NULL pointer otherwise. This patch passed the full 64bit offset into the inode that all callers have anyway, and changes the return value to a simple boolean. Also the function gets a more descriptive name: xfs_iomap_valid. SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203825a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 136 +++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index e998009c0f52..b306e25f0f07 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -228,29 +228,13 @@ xfs_map_blocks( return -error; } -/* - * Finds the corresponding mapping in block @map array of the - * given @offset within a @page. 
- */ -STATIC xfs_iomap_t * -xfs_offset_to_map( - struct page *page, +STATIC inline int +xfs_iomap_valid( xfs_iomap_t *iomapp, - unsigned long offset) + loff_t offset) { - xfs_off_t full_offset; /* offset from start of file */ - - ASSERT(offset < PAGE_CACHE_SIZE); - - full_offset = page->index; /* NB: using 64bit number */ - full_offset <<= PAGE_CACHE_SHIFT; /* offset from file start */ - full_offset += offset; /* offset from page start */ - - if (full_offset < iomapp->iomap_offset) - return NULL; - if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset) - return iomapp; - return NULL; + return offset >= iomapp->iomap_offset && + offset < iomapp->iomap_offset + iomapp->iomap_bsize; } /* @@ -461,31 +445,23 @@ xfs_add_to_ioend( STATIC void xfs_map_at_offset( - struct page *page, struct buffer_head *bh, - unsigned long offset, + loff_t offset, int block_bits, - xfs_iomap_t *iomapp, - xfs_ioend_t *ioend) + xfs_iomap_t *iomapp) { xfs_daddr_t bn; - xfs_off_t delta; int sector_shift; ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL); - delta = page->index; - delta <<= PAGE_CACHE_SHIFT; - delta += offset; - delta -= iomapp->iomap_offset; - delta >>= block_bits; - sector_shift = block_bits - BBSHIFT; - bn = iomapp->iomap_bn >> sector_shift; - bn += delta; - BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME)); + bn = (iomapp->iomap_bn >> sector_shift) + + ((offset - iomapp->iomap_offset) >> block_bits); + + ASSERT(bn || (iomapp->iomap_flags & IOMAP_REALTIME)); ASSERT((bn << sector_shift) >= iomapp->iomap_bn); lock_buffer(bh); @@ -569,8 +545,10 @@ xfs_probe_unmapped_cluster( if (tindex == tlast) { pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1); - if (!pg_offset) + if (!pg_offset) { + done = 1; break; + } } else pg_offset = PAGE_CACHE_SIZE; @@ -585,6 +563,7 @@ xfs_probe_unmapped_cluster( } total += len; + tindex++; } pagevec_release(&pvec); @@ -638,19 +617,19 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - xfs_iomap_t *iomapp, + xfs_iomap_t *mp, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, int all_bh) { struct buffer_head *bh, *head; - xfs_iomap_t *mp = iomapp, *tmp; unsigned long p_offset, end_offset; unsigned int type; int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; + xfs_off_t f_offset = page_offset(page); if (page->index != tindex) goto fail; @@ -703,15 +682,15 @@ xfs_convert_page( } continue; } - tmp = xfs_offset_to_map(page, mp, p_offset); - if (!tmp) { + + if (!xfs_iomap_valid(mp, f_offset + p_offset)) { done = 1; continue; } - ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); + ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); - xfs_map_at_offset(page, bh, p_offset, bbits, tmp, *ioendp); + xfs_map_at_offset(bh, f_offset + p_offset, bbits, mp); if (startio) { xfs_add_to_ioend(inode, bh, p_offset, type, ioendp, done); @@ -805,15 +784,14 @@ xfs_page_state_convert( int unmapped) /* also implies page uptodate */ { struct buffer_head *bh, *head; - xfs_iomap_t *iomp, iomap; + xfs_iomap_t iomap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index, tlast; - int flags, len, err, done = 1; - int uptodate = 1; + int flags, len, err, iomap_valid = 0, uptodate = 1; int page_dirty, count = 0, trylock_flag = 0; /* wait for other IO 
threads? */ @@ -854,11 +832,9 @@ xfs_page_state_convert( p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; - iomp = NULL; bh = head = page_buffers(page); offset = page_offset(page); - /* TODO: fix up "done" variable and iomap pointer (boolean) */ /* TODO: cleanup count and page_dirty */ do { @@ -867,14 +843,16 @@ xfs_page_state_convert( if (!buffer_uptodate(bh)) uptodate = 0; if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { - done = 1; + /* + * the iomap is actually still valid, but the ioend + * isn't. shouldn't happen too often. + */ + iomap_valid = 0; continue; } - if (iomp) { - iomp = xfs_offset_to_map(page, &iomap, p_offset); - done = (iomp == NULL); - } + if (iomap_valid) + iomap_valid = xfs_iomap_valid(&iomap, offset); /* * First case, map an unwritten extent and prepare for @@ -894,22 +872,20 @@ xfs_page_state_convert( flags |= trylock_flag; } - if (!iomp) { - done = 1; + if (!iomap_valid) { err = xfs_map_blocks(inode, offset, len, &iomap, flags); if (err) goto error; - iomp = xfs_offset_to_map(page, &iomap, - p_offset); - done = (iomp == NULL); + iomap_valid = xfs_iomap_valid(&iomap, offset); } - if (iomp) { - xfs_map_at_offset(page, bh, p_offset, - inode->i_blkbits, iomp, ioend); + if (iomap_valid) { + xfs_map_at_offset(bh, offset, + inode->i_blkbits, &iomap); if (startio) { xfs_add_to_ioend(inode, bh, p_offset, - type, &ioend, done); + type, &ioend, + !iomap_valid); } else { set_buffer_dirty(bh); unlock_buffer(bh); @@ -917,8 +893,6 @@ xfs_page_state_convert( } page_dirty--; count++; - } else { - done = 1; } } else if ((buffer_uptodate(bh) || PageUptodate(page)) && (unmapped || startio)) { @@ -931,7 +905,7 @@ xfs_page_state_convert( * was found, and we are in a path where we * need to write the whole page out. 
*/ - if (!iomp) { + if (!iomap_valid) { int size; size = xfs_probe_unmapped_cluster( @@ -939,21 +913,19 @@ xfs_page_state_convert( err = xfs_map_blocks(inode, offset, size, &iomap, BMAPI_WRITE|BMAPI_MMAP); - if (err) { + if (err) goto error; - } - iomp = xfs_offset_to_map(page, &iomap, - p_offset); - done = (iomp == NULL); + iomap_valid = xfs_iomap_valid(&iomap, + offset); } - if (iomp) { - xfs_map_at_offset(page, bh, p_offset, - inode->i_blkbits, iomp, - ioend); + if (iomap_valid) { + xfs_map_at_offset(bh, offset, + inode->i_blkbits, + &iomap); if (startio) { xfs_add_to_ioend(inode, bh, p_offset, type, - &ioend, done); + &ioend, !iomap_valid); } else { set_buffer_dirty(bh); unlock_buffer(bh); @@ -961,8 +933,6 @@ xfs_page_state_convert( } page_dirty--; count++; - } else { - done = 1; } } else if (startio) { if (buffer_uptodate(bh) && @@ -970,14 +940,14 @@ xfs_page_state_convert( ASSERT(buffer_mapped(bh)); xfs_add_to_ioend(inode, bh, p_offset, type, - &ioend, done); + &ioend, !iomap_valid); page_dirty--; count++; } else { - done = 1; + iomap_valid = 0; } } else { - done = 1; + iomap_valid = 0; } } @@ -992,11 +962,11 @@ xfs_page_state_convert( if (startio) xfs_start_page_writeback(page, wbc, 1, count); - if (ioend && iomp && !done) { - offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> + if (ioend && iomap_valid) { + offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> PAGE_CACHE_SHIFT; tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, iomp, &ioend, + xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, wbc, startio, unmapped, tlast); } -- cgit v1.2.3 From 9260dc6b2ee011f728bae50edce11022567be096 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:48:47 +1100 Subject: [XFS] various fixes for xfs_convert_page fix various bogusities in handling offets From David Chinner and Christoph Hellwig SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203826a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 89 ++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b306e25f0f07..64c909e5c1e5 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -624,12 +624,13 @@ xfs_convert_page( int all_bh) { struct buffer_head *bh, *head; - unsigned long p_offset, end_offset; + xfs_off_t end_offset; + unsigned long p_offset; unsigned int type; int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; - xfs_off_t f_offset = page_offset(page); + xfs_off_t offset = page_offset(page); if (page->index != tindex) goto fail; @@ -642,21 +643,33 @@ xfs_convert_page( if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) goto fail_unlock_page; - end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); - /* * page_dirty is initially a count of buffers on the page before * EOF and is decrememted as we move each into a cleanable state. + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. 
*/ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + i_size_read(inode)); + len = 1 << inode->i_blkbits; - end_offset = max(end_offset, PAGE_CACHE_SIZE); - end_offset = roundup(end_offset, len); - page_dirty = end_offset / len; + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; + page_dirty = p_offset / len; p_offset = 0; bh = head = page_buffers(page); do { - if (p_offset >= end_offset) + if (offset >= end_offset) break; if (!buffer_uptodate(bh)) uptodate = 0; @@ -665,43 +678,45 @@ xfs_convert_page( continue; } - if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; - else if (buffer_delay(bh)) - type = IOMAP_DELAY; - else { - type = 0; - if (!(buffer_mapped(bh) && all_bh && startio)) { + if (buffer_unwritten(bh) || buffer_delay(bh)) { + if (buffer_unwritten(bh)) + type = IOMAP_UNWRITTEN; + else + type = IOMAP_DELAY; + + if (!xfs_iomap_valid(mp, offset)) { done = 1; - } else if (startio) { + continue; + } + + ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + + xfs_map_at_offset(bh, offset, bbits, mp); + if (startio) { + xfs_add_to_ioend(inode, bh, p_offset, + type, ioendp, done); + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + page_dirty--; + count++; + } else { + type = 0; + if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, p_offset, type, ioendp, done); count++; page_dirty--; + } else { + done = 1; } - continue; - } - - if (!xfs_iomap_valid(mp, f_offset + p_offset)) { - done = 1; - continue; - } - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); - - xfs_map_at_offset(bh, f_offset + p_offset, bbits, mp); - if (startio) { - xfs_add_to_ioend(inode, bh, p_offset, - type, ioendp, done); - count++; - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); } - page_dirty--; - } while (p_offset += len, (bh = bh->b_this_page) != head); + } while (offset += len, p_offset += len, + (bh = bh->b_this_page) != head); if (uptodate && bh == head) SetPageUptodate(page); -- cgit v1.2.3 From d5cb48aaac5bba1721bce3232e6fb022ade2c0b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:49:02 +1100 Subject: [XFS] consolidate some code in xfs_page_state_convert The unmapped buffer case is very similar to delayed and unwritten extends. Reorganize the code to share some code for these cases. SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203827a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 91 +++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 64c909e5c1e5..98d26c8e0565 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -806,7 +806,8 @@ xfs_page_state_convert( unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index, tlast; - int flags, len, err, iomap_valid = 0, uptodate = 1; + ssize_t size, len; + int flags, err, iomap_valid = 0, uptodate = 1; int page_dirty, count = 0, trylock_flag = 0; /* wait for other IO threads? */ @@ -875,21 +876,36 @@ xfs_page_state_convert( * * Second case, allocate space for a delalloc buffer. * We can return EAGAIN here in the release page case. 
- */ - if (buffer_unwritten(bh) || buffer_delay(bh)) { + * + * Third case, an unmapped buffer was found, and we are + * in a path where we need to write the whole page out. + */ + if (buffer_unwritten(bh) || buffer_delay(bh) || + ((buffer_uptodate(bh) || PageUptodate(page)) && + !buffer_mapped(bh) && (unmapped || startio))) { if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; flags = BMAPI_WRITE|BMAPI_IGNSTATE; - } else { + } else if (buffer_delay(bh)) { type = IOMAP_DELAY; flags = BMAPI_ALLOCATE; if (!startio) flags |= trylock_flag; + } else { + type = 0; + flags = BMAPI_WRITE|BMAPI_MMAP; } if (!iomap_valid) { - err = xfs_map_blocks(inode, offset, len, &iomap, - flags); + if (type == 0) { + size = xfs_probe_unmapped_cluster(inode, + page, bh, head); + } else { + size = len; + } + + err = xfs_map_blocks(inode, offset, size, + &iomap, flags); if (err) goto error; iomap_valid = xfs_iomap_valid(&iomap, offset); @@ -909,61 +925,22 @@ xfs_page_state_convert( page_dirty--; count++; } - } else if ((buffer_uptodate(bh) || PageUptodate(page)) && - (unmapped || startio)) { - + } else if (buffer_uptodate(bh) && startio) { type = 0; - if (!buffer_mapped(bh)) { - - /* - * Getting here implies an unmapped buffer - * was found, and we are in a path where we - * need to write the whole page out. - */ - if (!iomap_valid) { - int size; - - size = xfs_probe_unmapped_cluster( - inode, page, bh, head); - err = xfs_map_blocks(inode, offset, - size, &iomap, - BMAPI_WRITE|BMAPI_MMAP); - if (err) - goto error; - iomap_valid = xfs_iomap_valid(&iomap, - offset); - } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, - &iomap); - if (startio) { - xfs_add_to_ioend(inode, - bh, p_offset, type, - &ioend, !iomap_valid); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; - count++; - } - } else if (startio) { - if (buffer_uptodate(bh) && - !test_and_set_bit(BH_Lock, &bh->b_state)) { - ASSERT(buffer_mapped(bh)); - xfs_add_to_ioend(inode, - bh, p_offset, type, - &ioend, !iomap_valid); - page_dirty--; - count++; - } else { - iomap_valid = 0; - } + + if (!test_and_set_bit(BH_Lock, &bh->b_state)) { + ASSERT(buffer_mapped(bh)); + xfs_add_to_ioend(inode, + bh, p_offset, type, + &ioend, !iomap_valid); + page_dirty--; + count++; } else { iomap_valid = 0; } + } else if ((buffer_uptodate(bh) || PageUptodate(page)) && + (unmapped || startio)) { + iomap_valid = 0; } if (!iohead) -- cgit v1.2.3 From 7336cea8c2737bbaf0296d67782f760828301d56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:49:16 +1100 Subject: [XFS] pass full 64bit offsets to xfs_add_to_ioend SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203828a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 98d26c8e0565..562867316639 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -414,7 +414,7 @@ STATIC void xfs_add_to_ioend( struct inode *inode, struct buffer_head *bh, - unsigned int p_offset, + xfs_off_t offset, unsigned int type, xfs_ioend_t **result, int need_ioend) @@ -423,10 +423,7 @@ xfs_add_to_ioend( if (!ioend || need_ioend || type != ioend->io_type) { xfs_ioend_t *previous = *result; - xfs_off_t offset; - offset = (xfs_off_t)bh->b_page->index << PAGE_CACHE_SHIFT; - offset += p_offset; ioend = xfs_alloc_ioend(inode, type); 
ioend->io_offset = offset; ioend->io_buffer_head = bh; @@ -666,7 +663,6 @@ xfs_convert_page( p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; - p_offset = 0; bh = head = page_buffers(page); do { if (offset >= end_offset) @@ -694,7 +690,7 @@ xfs_map_at_offset(bh, offset, bbits, mp); if (startio) { - xfs_add_to_ioend(inode, bh, p_offset, + xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); } else { set_buffer_dirty(bh); @@ -707,7 +703,7 @@ type = 0; if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); - xfs_add_to_ioend(inode, bh, p_offset, + xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); count++; page_dirty--; @@ -715,8 +711,7 @@ done = 1; } } - } while (offset += len, p_offset += len, - (bh = bh->b_this_page) != head); + } while (offset += len, (bh = bh->b_this_page) != head); if (uptodate && bh == head) SetPageUptodate(page); @@ -914,7 +909,7 @@ xfs_page_state_convert( xfs_map_at_offset(bh, offset, inode->i_blkbits, &iomap); if (startio) { - xfs_add_to_ioend(inode, bh, p_offset, + xfs_add_to_ioend(inode, bh, offset, type, &ioend, !iomap_valid); } else { @@ -930,8 +925,7 @@ if (!test_and_set_bit(BH_Lock, &bh->b_state)) { ASSERT(buffer_mapped(bh)); - xfs_add_to_ioend(inode, - bh, p_offset, type, + xfs_add_to_ioend(inode, bh, offset, type, &ioend, !iomap_valid); page_dirty--; count++; -- cgit v1.2.3 From 6c4fe19f66a839bce68fcb7b99cdcb0f31c7a59e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:49:28 +1100 Subject: [XFS] cluster rewrites We can cluster mapped pages as well; this improves performance on rewrites since we can reduce the number of allocator calls. SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203829a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 63 +++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 562867316639..9ea33ea6a225 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -470,13 +470,13 @@ xfs_map_at_offset( } /* - * Look for a page at index which is unlocked and not mapped - * yet - clustering for mmap write case. + * Look for a page at index that is suitable for clustering. */ STATIC unsigned int -xfs_probe_unmapped_page( +xfs_probe_page( struct page *page, - unsigned int pg_offset) + unsigned int pg_offset, + int mapped) { int ret = 0; @@ -489,25 +489,28 @@ bh = head = page_buffers(page); do { - if (buffer_mapped(bh) || !buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) + break; + if (mapped != buffer_mapped(bh)) break; ret += bh->b_size; if (ret >= pg_offset) break; } while ((bh = bh->b_this_page) != head); } else - ret = PAGE_CACHE_SIZE; + ret = mapped ?
0 : PAGE_CACHE_SIZE; } return ret; } STATIC size_t -xfs_probe_unmapped_cluster( +xfs_probe_cluster( struct inode *inode, struct page *startpage, struct buffer_head *bh, - struct buffer_head *head) + struct buffer_head *head, + int mapped) { struct pagevec pvec; pgoff_t tindex, tlast, tloff; @@ -516,7 +519,7 @@ xfs_probe_unmapped_cluster( /* First sum forwards in this page */ do { - if (buffer_mapped(bh)) + if (mapped != buffer_mapped(bh)) return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); @@ -550,7 +553,7 @@ xfs_probe_unmapped_cluster( pg_offset = PAGE_CACHE_SIZE; if (page->index == tindex && !TestSetPageLocked(page)) { - len = xfs_probe_unmapped_page(page, pg_offset); + len = xfs_probe_page(page, pg_offset, mapped); unlock_page(page); } @@ -592,6 +595,8 @@ xfs_is_delayed_page( acceptable = (type == IOMAP_UNWRITTEN); else if (buffer_delay(bh)) acceptable = (type == IOMAP_DELAY); + else if (buffer_mapped(bh)) + acceptable = (type == 0); else break; } while ((bh = bh->b_this_page) != head); @@ -804,6 +809,7 @@ xfs_page_state_convert( ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; int page_dirty, count = 0, trylock_flag = 0; + int all_bh = unmapped; /* wait for other IO threads? */ if (startio && wbc->sync_mode != WB_SYNC_NONE) @@ -845,6 +851,8 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); + flags = -1; + type = 0; /* TODO: cleanup count and page_dirty */ @@ -878,6 +886,12 @@ xfs_page_state_convert( if (buffer_unwritten(bh) || buffer_delay(bh) || ((buffer_uptodate(bh) || PageUptodate(page)) && !buffer_mapped(bh) && (unmapped || startio))) { + /* + * Make sure we don't use a read-only iomap + */ + if (flags == BMAPI_READ) + iomap_valid = 0; + if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; flags = BMAPI_WRITE|BMAPI_IGNSTATE; @@ -887,14 +901,14 @@ xfs_page_state_convert( if (!startio) flags |= trylock_flag; } else { - type = 0; + type = IOMAP_NEW; flags = BMAPI_WRITE|BMAPI_MMAP; } if (!iomap_valid) { - if (type == 0) { - size = xfs_probe_unmapped_cluster(inode, - page, bh, head); + if (type == IOMAP_NEW) { + size = xfs_probe_cluster(inode, + page, bh, head, 0); } else { size = len; } @@ -921,10 +935,27 @@ xfs_page_state_convert( count++; } } else if (buffer_uptodate(bh) && startio) { - type = 0; + /* + * we got here because the buffer is already mapped. + * That means it must already have extents allocated + * underneath it. Map the extent by reading it. + */ + if (!iomap_valid || type != 0) { + flags = BMAPI_READ; + size = xfs_probe_cluster(inode, page, bh, + head, 1); + err = xfs_map_blocks(inode, offset, size, + &iomap, flags); + if (err) + goto error; + iomap_valid = xfs_iomap_valid(&iomap, offset); + } + type = 0; if (!test_and_set_bit(BH_Lock, &bh->b_state)) { ASSERT(buffer_mapped(bh)); + if (iomap_valid) + all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, &ioend, !iomap_valid); page_dirty--; @@ -953,7 +984,7 @@ xfs_page_state_convert( PAGE_CACHE_SHIFT; tlast = min_t(pgoff_t, offset, last_index); xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, unmapped, tlast); + wbc, startio, all_bh, tlast); } if (iohead) -- cgit v1.2.3 From f5e596bbef3b0fa583c66c5a83dc9737c0fe2610 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:49:42 +1100 Subject: [XFS] fix writeback control handling fix a reversed condition on where to trylock and deal with block layer congestion properly. Patch from David Chinner and Christoph Hellwig. 
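In isolation, the congestion backoff added below looks roughly like this (a sketch only; account_page() is a hypothetical wrapper around the logic that actually lands inline in xfs_convert_page()):

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>

/* returns nonzero when the cluster loop should stop queueing pages */
static int
account_page(
	struct inode			*inode,
	struct writeback_control	*wbc)
{
	struct backing_dev_info	*bdi = inode->i_mapping->backing_dev_info;

	if (bdi_write_congested(bdi)) {
		/* report congestion instead of blocking on the device */
		wbc->encountered_congestion = 1;
		return 1;
	}
	if (--wbc->nr_to_write <= 0)
		return 1;	/* this writeback pass has used its quota */
	return 0;
}

A nonzero return maps onto done = 1 in the caller, so clustering stops early and the flusher gets to retry once the device queue drains.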
SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203830a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 9ea33ea6a225..52707b5ddcb8 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -722,8 +722,17 @@ xfs_convert_page( SetPageUptodate(page); if (startio) { - if (count) - wbc->nr_to_write--; + if (count) { + struct backing_dev_info *bdi; + + bdi = inode->i_mapping->backing_dev_info; + if (bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } else if (--wbc->nr_to_write <= 0) { + done = 1; + } + } xfs_start_page_writeback(page, wbc, !page_dirty, count); } @@ -812,7 +821,7 @@ xfs_page_state_convert( int all_bh = unmapped; /* wait for other IO threads? */ - if (startio && wbc->sync_mode != WB_SYNC_NONE) + if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) trylock_flag |= BMAPI_TRYLOCK; /* Is this page beyond the end of the file? */ -- cgit v1.2.3 From da7f93e9eec0885317351eb8a20cc550ed48f470 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:49:57 +1100 Subject: [XFS] fix up per-device xfsbufd SGI-PV: 947098 SGI-Modid: xfs-linux-melb:xfs-kern:203831a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index cb77f99cbef1..6282f034b269 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1649,14 +1649,13 @@ xfsbufd_wakeup( int priority, gfp_t mask) { - xfs_buftarg_t *btp, *n; + xfs_buftarg_t *btp; spin_lock(&xfs_buftarg_lock); - list_for_each_entry_safe(btp, n, &xfs_buftarg_list, bt_list) { + list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) continue; set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); - barrier(); wake_up_process(btp->bt_task); } spin_unlock(&xfs_buftarg_lock); -- cgit v1.2.3 From 204ab25f36fbd44a24458c0227cf2629c8caf00d Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 20:50:22 +1100 Subject: [XFS] Fix up offset type inconsistencies and gcc warnings from earlier changes. 
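The type being standardised on below is XFS's own 64-bit file offset; reduced to essentials the change is (a userspace-compilable sketch; check_range() is hypothetical and assert() stands in for ASSERT()):

#include <assert.h>
#include <stddef.h>

typedef long long xfs_off_t;	/* signed 64-bit file offset, as in xfs_types.h */

#define BBSHIFT	9		/* 512-byte basic blocks */

static void
check_range(
	xfs_off_t	ioff,	/* start of range, in basic blocks */
	size_t		isize,	/* length of range, in basic blocks */
	int		sshift,	/* log2 of the sector size */
	unsigned int	smask)	/* sector size minus one */
{
	xfs_off_t	range_base = ioff << BBSHIFT;
	size_t		range_length = isize << BBSHIFT;

	/* no I/Os smaller than, or misaligned to, the sector size */
	assert(!(range_length < (1u << sshift)));
	assert(!(range_base & (xfs_off_t)smask));
}

Both loff_t and xfs_off_t are 64-bit signed types, so the patch changes no behaviour; it only keeps one offset type (and one set of printk formats) throughout xfs_buf.c.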
SGI-PV: 947038 SGI-Modid: xfs-linux-melb:xfs-kern:24875a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 6282f034b269..e44b7c1a3a36 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -223,7 +223,7 @@ STATIC void _xfs_buf_initialize( xfs_buf_t *bp, xfs_buftarg_t *target, - loff_t range_base, + xfs_off_t range_base, size_t range_length, xfs_buf_flags_t flags) { @@ -348,7 +348,7 @@ _xfs_buf_lookup_pages( gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; pgoff_t first; - loff_t end; + xfs_off_t end; int error; end = bp->b_file_offset + bp->b_buffer_length; @@ -467,12 +467,12 @@ _xfs_buf_map_pages( xfs_buf_t * _xfs_buf_find( xfs_buftarg_t *btp, /* block device target */ - loff_t ioff, /* starting offset of range */ + xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - loff_t range_base; + xfs_off_t range_base; size_t range_length; xfs_bufhash_t *hash; xfs_buf_t *bp, *n; @@ -482,7 +482,7 @@ _xfs_buf_find( /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (loff_t)btp->bt_smask)); + ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; @@ -561,7 +561,7 @@ found: xfs_buf_t * xfs_buf_get_flags( xfs_buftarg_t *target,/* target for buffer */ - loff_t ioff, /* starting offset of range */ + xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ xfs_buf_flags_t flags) { @@ -617,7 +617,7 @@ xfs_buf_get_flags( xfs_buf_t * xfs_buf_read_flags( xfs_buftarg_t *target, - loff_t ioff, + xfs_off_t ioff, size_t isize, xfs_buf_flags_t flags) { @@ -661,7 +661,7 @@ xfs_buf_read_flags( void xfs_buf_readahead( xfs_buftarg_t *target, - loff_t ioff, + xfs_off_t ioff, size_t isize, xfs_buf_flags_t flags) { -- cgit v1.2.3 From 75e17b3caf29b262000dc7348f1be9a7d5403463 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jan 2006 20:58:44 +1100 Subject: [XFS] add helper to get xfs_inode from vnode SGI-PV: 947206 SGI-Modid: xfs-linux-melb:xfs-kern:203960a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 4 +--- fs/xfs/linux-2.6/xfs_file.c | 6 ++---- fs/xfs/linux-2.6/xfs_ioctl.c | 5 +---- fs/xfs/linux-2.6/xfs_iops.c | 16 ++++++++++++++++ fs/xfs/xfs_dfrag.c | 16 ++++------------ fs/xfs/xfs_iget.c | 5 +---- fs/xfs/xfs_inode.h | 2 ++ fs/xfs/xfs_rename.c | 7 ++----- fs/xfs/xfs_utils.c | 9 +++------ fs/xfs/xfs_vnodeops.c | 4 +--- 10 files changed, 33 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 52707b5ddcb8..d1db8c17a74e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -54,7 +54,6 @@ xfs_page_trace( int mask) { xfs_inode_t *ip; - bhv_desc_t *bdp; vnode_t *vp = LINVFS_GET_VP(inode); loff_t isize = i_size_read(inode); loff_t offset = page_offset(page); @@ -63,8 +62,7 @@ xfs_page_trace( if (page_has_buffers(page)) xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); - ip = XFS_BHVTOI(bdp); + ip = xfs_vtoi(vp); if (!ip->i_rwtrace) return; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 
06111d0bbae4..ced4404339c7 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -509,16 +509,14 @@ linvfs_open_exec( vnode_t *vp = LINVFS_GET_VP(inode); xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); int error = 0; - bhv_desc_t *bdp; xfs_inode_t *ip; if (vp->v_vfsp->vfs_flag & VFS_DMI) { - bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); - if (!bdp) { + ip = xfs_vtoi(vp); + if (!ip) { error = -EINVAL; goto open_exec_out; } - ip = XFS_BHVTOI(bdp); if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) { error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 7ad7b680e996..b3b2cfda273c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -145,13 +145,10 @@ xfs_find_handle( if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { xfs_inode_t *ip; - bhv_desc_t *bhv; int lock_mode; /* need to get access to the xfs_inode to read the generation */ - bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); - ASSERT(bhv); - ip = XFS_BHVTOI(bhv); + ip = xfs_vtoi(vp); ASSERT(ip); lock_mode = xfs_ilock_map_shared(ip); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index fe5e9894fdee..d388d14efe3e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -58,6 +58,22 @@ #define IS_NOATIME(inode) ((inode->i_sb->s_flags & MS_NOATIME) || \ (S_ISDIR(inode->i_mode) && inode->i_sb->s_flags & MS_NODIRATIME)) +/* + * Get a XFS inode from a given vnode. + */ +xfs_inode_t * +xfs_vtoi( + struct vnode *vp) +{ + bhv_desc_t *bdp; + + bdp = bhv_lookup_range(VN_BHV_HEAD(vp), + VNODE_POSITION_XFS, VNODE_POSITION_XFS); + if (unlikely(bdp == NULL)) + return NULL; + return XFS_BHVTOI(bdp); +} + /* * Bring the atime in the XFS inode uptodate. * Used before logging the inode to disk or when the Linux inode goes away. diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 070259a4254c..c6191d00ad27 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -60,8 +60,6 @@ xfs_swapext( xfs_bstat_t *sbp; struct file *fp = NULL, *tfp = NULL; vnode_t *vp, *tvp; - bhv_desc_t *bdp, *tbdp; - vn_bhv_head_t *bhp, *tbhp; static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; int ilf_fields, tilf_fields; int error = 0; @@ -90,13 +88,10 @@ xfs_swapext( goto error0; } - bhp = VN_BHV_HEAD(vp); - bdp = vn_bhv_lookup(bhp, &xfs_vnodeops); - if (bdp == NULL) { + ip = xfs_vtoi(vp); + if (ip == NULL) { error = XFS_ERROR(EBADF); goto error0; - } else { - ip = XFS_BHVTOI(bdp); } if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) || @@ -105,13 +100,10 @@ xfs_swapext( goto error0; } - tbhp = VN_BHV_HEAD(tvp); - tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops); - if (tbdp == NULL) { + tip = xfs_vtoi(tvp); + if (tip == NULL) { error = XFS_ERROR(EBADF); goto error0; - } else { - tip = XFS_BHVTOI(tbdp); } if (ip->i_mount != tip->i_mount) { diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index fc19eedbd11b..8e380a1fb79b 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -493,7 +493,6 @@ xfs_iget( retry: if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) { - bhv_desc_t *bdp; xfs_inode_t *ip; vp = LINVFS_GET_VP(inode); @@ -517,14 +516,12 @@ retry: * to wait for the inode to go away. 
*/ if (is_bad_inode(inode) || - ((bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), - &xfs_vnodeops)) == NULL)) { + ((ip = xfs_vtoi(vp)) == NULL)) { iput(inode); delay(1); goto retry; } - ip = XFS_BHVTOI(bdp); if (lock_flags != 0) xfs_ilock(ip, lock_flags); XFS_STATS_INC(xs_ig_found); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c4c1d9bce82e..1cfbcf18ce86 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -436,6 +436,8 @@ void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, int, uint); +xfs_inode_t *xfs_vtoi(struct vnode *vp); + void xfs_synchronize_atime(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 4d4e8f4e768e..81a05cfd77d2 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -243,7 +243,6 @@ xfs_rename( xfs_inode_t *inodes[4]; int target_ip_dropped = 0; /* dropped target_ip link? */ vnode_t *src_dir_vp; - bhv_desc_t *target_dir_bdp; int spaceres; int target_link_zero = 0; int num_inodes; @@ -260,14 +259,12 @@ xfs_rename( * Find the XFS behavior descriptor for the target directory * vnode since it was not handed to us. */ - target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp), - &xfs_vnodeops); - if (target_dir_bdp == NULL) { + target_dp = xfs_vtoi(target_dir_vp); + if (target_dp == NULL) { return XFS_ERROR(EXDEV); } src_dp = XFS_BHVTOI(src_dir_bdp); - target_dp = XFS_BHVTOI(target_dir_bdp); mp = src_dp->i_mount; if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) || diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index fefe1d60377f..34654ec6ae10 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -55,16 +55,13 @@ xfs_get_dir_entry( xfs_inode_t **ipp) { vnode_t *vp; - bhv_desc_t *bdp; vp = VNAME_TO_VNODE(dentry); - bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); - if (!bdp) { - *ipp = NULL; + + *ipp = xfs_vtoi(vp); + if (!*ipp) return XFS_ERROR(ENOENT); - } VN_HOLD(vp); - *ipp = XFS_BHVTOI(bdp); return 0; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 3e8f1cbb7049..b12fcfcb196d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2578,7 +2578,6 @@ xfs_link( int cancel_flags; int committed; vnode_t *target_dir_vp; - bhv_desc_t *src_bdp; int resblks; char *target_name = VNAME(dentry); int target_namelen; @@ -2591,8 +2590,7 @@ xfs_link( if (VN_ISDIR(src_vp)) return XFS_ERROR(EPERM); - src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops); - sip = XFS_BHVTOI(src_bdp); + sip = xfs_vtoi(src_vp); tdp = XFS_BHVTOI(target_dir_bdp); mp = tdp->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) -- cgit v1.2.3 From 71df099dc3f9cd17e8564eb647d7c1fb2ee83e2d Mon Sep 17 00:00:00 2001 From: Yingping Lu Date: Wed, 11 Jan 2006 21:02:29 +1100 Subject: [XFS] xfssyncd is responsible for flushing inode or device data by extracting work items from its queue. In addition, this processing also decrements the inode's i_count. If any work items remain queued when the process terminates, the increments and decrements of i_count are unbalanced, which can trigger a vn_count assertion failure. The fix allows xfssyncd to process any remaining work before it shuts down.
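The shape of the fix, lifted out of context (a sketch; struct vfs_state and process_sync_work() are hypothetical stand-ins for the real xfssyncd state and work handler):

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/sched.h>

struct vfs_state {
	struct list_head	vfs_sync_list;	/* queued sync work items */
};

static void process_sync_work(struct vfs_state *vfsp);	/* drains the list */

static int
syncd_loop(
	void		*arg)
{
	struct vfs_state	*vfsp = arg;

	for (;;) {
		schedule_timeout_interruptible(HZ);

		/*
		 * Exiting on kthread_should_stop() alone would abandon
		 * queued items and leak the inode reference (i_count)
		 * each one holds; drain the queue first so every hold
		 * is balanced by a release.
		 */
		if (kthread_should_stop() &&
		    list_empty(&vfsp->vfs_sync_list))
			break;

		process_sync_work(vfsp);
	}
	return 0;
}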
SGI-PV: 945935 SGI-Modid: xfs-linux-melb:xfs-kern:203970a Signed-off-by: Yingping Lu Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 556c1437b17d..f22e426d9e42 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -579,7 +579,7 @@ xfssyncd( timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); - if (kthread_should_stop()) + if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list)) break; spin_lock(&vfsp->vfs_sync_lock); -- cgit v1.2.3 From 1259845d3f3e1d1cf96b2a78f3aec824b9d1e109 Mon Sep 17 00:00:00 2001 From: Tim Shimmin Date: Wed, 11 Jan 2006 21:02:47 +1100 Subject: [XFS] remove XFS_LOG_RES_DEBUG and turn on the res history all the time to get more useful error info on space for trans items SGI-PV: 947110 SGI-Modid: xfs-linux-melb:xfs-kern:24886a Signed-off-by: Tim Shimmin Signed-off-by: Nathan Scott --- fs/xfs/xfs_log.c | 8 ++------ fs/xfs/xfs_log.h | 11 +---------- fs/xfs/xfs_log_priv.h | 15 +++------------ 3 files changed, 6 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6bac20af7aa..3d9a36e77363 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1591,7 +1591,6 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -#if defined(XFS_LOG_RES_DEBUG) STATIC void xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) { @@ -1681,11 +1680,11 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) ticket->t_res_arr_sum, ticket->t_res_o_flow, ticket->t_res_num_ophdrs, ophdr_spc, ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_o_flow + ophdr_spc, ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; + uint r_type = ticket->t_res_arr[i].r_type; cmn_err(CE_WARN, "region[%u]: %s - %u bytes\n", i, @@ -1694,9 +1693,6 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) ticket->t_res_arr[i].r_len); } } -#else -#define xlog_print_tic_res(mp, ticket) -#endif /* * Write some region out to in-core log diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index f40d4391fcfc..4b2ac88dbb83 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -96,7 +96,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) /* Region types for iovec's i_type */ -#if defined(XFS_LOG_RES_DEBUG) #define XLOG_REG_TYPE_BFORMAT 1 #define XLOG_REG_TYPE_BCHUNK 2 #define XLOG_REG_TYPE_EFI_FORMAT 3 @@ -117,21 +116,13 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_MAX 19 -#endif -#if defined(XFS_LOG_RES_DEBUG) #define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t)) -#else -#define XLOG_VEC_SET_TYPE(vecp, t) -#endif - typedef struct xfs_log_iovec { xfs_caddr_t i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ -#if defined(XFS_LOG_RES_DEBUG) - uint i_type; /* type of region */ -#endif + uint i_type; /* type of region */ } xfs_log_iovec_t; typedef void* xfs_log_ticket_t; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 5862286552ce..34bcbf50789c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -253,7 +253,6 @@ typedef __uint32_t xlog_tid_t; /* Ticket reservation region accounting */ -#if 
defined(XFS_LOG_RES_DEBUG) #define XLOG_TIC_LEN_MAX 15 #define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \ (t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0) @@ -278,15 +277,9 @@ typedef __uint32_t xlog_tid_t; * we don't care about. */ typedef struct xlog_res { - uint r_len; - uint r_type; + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ } xlog_res_t; -#else -#define XLOG_TIC_RESET_RES(t) -#define XLOG_TIC_ADD_OPHDR(t) -#define XLOG_TIC_ADD_REGION(t, len, type) -#endif - typedef struct xlog_ticket { sv_t t_sema; /* sleep on this semaphore : 20 */ @@ -301,14 +294,12 @@ typedef struct xlog_ticket { char t_flags; /* properties of reservation : 1 */ uint t_trans_type; /* transaction type : 4 */ -#if defined (XFS_LOG_RES_DEBUG) /* reservation array fields */ uint t_res_num; /* num in array : 4 */ - xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : X */ uint t_res_num_ophdrs; /* num op hdrs : 4 */ uint t_res_arr_sum; /* array sum : 4 */ uint t_res_o_flow; /* sum overflow : 4 */ -#endif + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; #endif -- cgit v1.2.3 From ca5ccbf98d792d8727e893765cc2df479ba399f2 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 21:03:04 +1100 Subject: [XFS] Fix some build fallout from atime changes. SGI-PV: 946679 SGI-Modid: xfs-linux-melb:xfs-kern:24899a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_vnode.h | 19 +++++++++++++++++++ fs/xfs/xfs_itable.c | 4 +--- fs/xfs/xfs_vnodeops.c | 3 +-- 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index f2bbb327c081..0fe2419461d6 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -565,6 +565,25 @@ static inline int VN_BAD(struct vnode *vp) return is_bad_inode(LINVFS_GET_IP(vp)); } +/* + * Extracting atime values in various formats + */ +static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime) +{ + bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec; + bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec; +} + +static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts) +{ + *ts = vp->v_inode.i_atime; +} + +static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt) +{ + *tt = vp->v_inode.i_atime.tv_sec; +} + /* * Some useful predicates. 
*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 41f50e7d1c32..c59450e1be40 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -86,9 +86,7 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - /* atime is only kept uptodate in the Linux inode */ - buf->bs_atime.tv_sec = vp->v_inode.i_atime.tv_sec; - buf->bs_atime.tv_nsec = vp->v_inode.i_atime.tv_nsec; + vn_atime_to_bstime(vp, &buf->bs_atime); buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b12fcfcb196d..f8916349a9c6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -182,8 +182,7 @@ xfs_getattr( break; } - /* atime is only kept uptodate in the Linux inode */ - vap->va_atime = vp->v_inode.i_atime; + vn_atime_to_timespec(vp, &vap->va_atime); vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; -- cgit v1.2.3 From 6ab65429b4871c42bfd0013f7f5e49d40c0642cd Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 11 Jan 2006 21:03:28 +1100 Subject: [XFS] Fix compiler warnings from older gcc versions wrt printfalike arguments. SGI-PV: 907752 SGI-Modid: xfs-linux-melb:xfs-kern:24901a Signed-off-by: Nathan Scott --- fs/xfs/xfs_iomap.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d72c83d22ee0..5c6d873e292c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -551,8 +551,10 @@ xfs_iomap_write_direct( "extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - ret_imap->br_startblock, ret_imap->br_startoff, - ret_imap->br_blockcount,ret_imap->br_state); + (unsigned long long)ret_imap->br_startblock, + (unsigned long long)ret_imap->br_startoff, + (unsigned long long)ret_imap->br_blockcount, + ret_imap->br_state); } return 0; @@ -722,8 +724,10 @@ retry: "extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - ret_imap->br_startblock, ret_imap->br_startoff, - ret_imap->br_blockcount,ret_imap->br_state); + (unsigned long long)ret_imap->br_startblock, + (unsigned long long)ret_imap->br_startoff, + (unsigned long long)ret_imap->br_blockcount, + ret_imap->br_state); } *ret_imap = imap[0]; @@ -854,17 +858,21 @@ xfs_iomap_write_allocate( */ for (i = 0; i < nimaps; i++) { - if ( !(io->io_flags & XFS_IOCORE_RT) && - !imap[i].br_startblock) { + if (!(io->io_flags & XFS_IOCORE_RT) && + !imap[i].br_startblock) { cmn_err(CE_PANIC,"Access to block zero: " "fs <%s> inode: %lld " - "start_block : %llx start_off : %llx " + "start_block : %llx start_off : %llx " "blkcnt : %llx extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - imap[i].br_startblock, - imap[i].br_startoff, - imap[i].br_blockcount,imap[i].br_state); + (unsigned long long) + imap[i].br_startblock, + (unsigned long long) + imap[i].br_startoff, + (unsigned long long) + imap[i].br_blockcount, + imap[i].br_state); } if ((offset_fsb >= imap[i].br_startoff) && (offset_fsb < (imap[i].br_startoff + @@ -970,8 +978,10 @@ xfs_iomap_write_unwritten( "%llx blkcnt : %llx extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - imap.br_startblock,imap.br_startoff, - imap.br_blockcount,imap.br_state); + (unsigned long long)imap.br_startblock, + (unsigned long long)imap.br_startoff, + (unsigned long 
long)imap.br_blockcount, + imap.br_state); } if ((numblks_fsb = imap.br_blockcount) == 0) { -- cgit v1.2.3 From 3762ec6bf76cdd32653c409dbad09f7b85807c68 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Thu, 12 Jan 2006 10:29:53 +1100 Subject: [XFS] Merge in trivial changes, sync up headers with userspace equivalents. SGI-PV: 907752 SGI-Modid: xfs-linux-melb:xfs-kern:24961a Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_dquot_item.c | 4 +-- fs/xfs/support/debug.c | 58 +++++++++++++------------------ fs/xfs/support/debug.h | 25 +++++++------- fs/xfs/xfs_arch.h | 2 +- fs/xfs/xfs_attr_leaf.h | 79 +++++++++++++++++++++++-------------------- fs/xfs/xfs_bmap.c | 4 +-- fs/xfs/xfs_dir_leaf.h | 64 +++++++++++++++++++---------------- fs/xfs/xfs_error.c | 1 - fs/xfs/xfs_error.h | 8 ++--- fs/xfs/xfs_inode.c | 33 +++++++++--------- fs/xfs/xfs_inode_item.c | 4 +-- 11 files changed, 141 insertions(+), 141 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 2f69822344e5..2ec6b441849c 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -239,7 +239,7 @@ xfs_qm_dquot_logitem_pushbuf( * trying to duplicate our effort. */ ASSERT(qip->qli_pushbuf_flag != 0); - ASSERT(qip->qli_push_owner == get_thread_id()); + ASSERT(qip->qli_push_owner == current_pid()); /* * If flushlock isn't locked anymore, chances are that the @@ -333,7 +333,7 @@ xfs_qm_dquot_logitem_trylock( qip->qli_pushbuf_flag = 1; ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); #ifdef DEBUG - qip->qli_push_owner = get_thread_id(); + qip->qli_push_owner = current_pid(); #endif /* * The dquot is left locked. diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index e281c3a71ad0..b08b3d9345b7 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -32,39 +32,6 @@ static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = KERN_ERR, KERN_WARNING, KERN_NOTICE, KERN_INFO, KERN_DEBUG}; -void -assfail(char *a, char *f, int l) -{ - printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l); - BUG(); -} - -#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) - -unsigned long -random(void) -{ - static unsigned long RandomValue = 1; - /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ - register long rv = RandomValue; - register long lo; - register long hi; - - hi = rv / 127773; - lo = rv % 127773; - rv = 16807 * lo - 2836 * hi; - if( rv <= 0 ) rv += 2147483647; - return( RandomValue = rv ); -} - -int -get_thread_id(void) -{ - return current->pid; -} - -#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ - void cmn_err(register int level, char *fmt, ...) { @@ -90,7 +57,6 @@ cmn_err(register int level, char *fmt, ...) 
BUG(); } - void icmn_err(register int level, char *fmt, va_list ap) { @@ -109,3 +75,27 @@ icmn_err(register int level, char *fmt, va_list ap) if (level == CE_PANIC) BUG(); } + +void +assfail(char *expr, char *file, int line) +{ + printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + BUG(); +} + +#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) +unsigned long random(void) +{ + static unsigned long RandomValue = 1; + /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ + register long rv = RandomValue; + register long lo; + register long hi; + + hi = rv / 127773; + lo = rv % 127773; + rv = 16807 * lo - 2836 * hi; + if (rv <= 0) rv += 2147483647; + return RandomValue = rv; +} +#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index aff558664c32..e3bf58112e7e 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -31,24 +31,23 @@ extern void icmn_err(int, char *, va_list) __attribute__ ((format (printf, 2, 0))); extern void cmn_err(int, char *, ...) __attribute__ ((format (printf, 2, 3))); +extern void assfail(char *expr, char *f, int l); -#ifndef STATIC -# define STATIC static -#endif +#define prdev(fmt,targ,args...) \ + printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args) -#ifdef DEBUG -# define ASSERT(EX) ((EX) ? ((void)0) : assfail(#EX, __FILE__, __LINE__)) -#else -# define ASSERT(x) ((void)0) -#endif +#define ASSERT_ALWAYS(expr) \ + (unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -extern void assfail(char *, char *, int); -#ifdef DEBUG +#ifndef DEBUG +# define ASSERT(expr) ((void)0) +#else +# define ASSERT(expr) ASSERT_ALWAYS(expr) extern unsigned long random(void); -extern int get_thread_id(void); #endif -#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__)) -#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */ +#ifndef STATIC +# define STATIC static +#endif #endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index 71de20cedb5c..c4836890b726 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -44,7 +44,7 @@ #define cpu_to_be16(val) ((__be16)(val)) #define cpu_to_be32(val) ((__be32)(val)) #define cpu_to_be64(val) ((__be64)(val)) -#define be16_to_cpu(val) ((__uint16_t)(val) +#define be16_to_cpu(val) ((__uint16_t)(val)) #define be32_to_cpu(val) ((__uint32_t)(val)) #define be64_to_cpu(val) ((__uint64_t)(val)) #else diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index f6143ff251a0..541e34109bb9 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -63,7 +63,7 @@ struct xfs_trans; * the leaf_entry. The namespaces are independent only because we also look * at the namespace bit when we are looking for a matching attribute name. * - * We also store a "incomplete" bit in the leaf_entry. It shows that an + * We also store an "incomplete" bit in the leaf_entry. It shows that an * attribute is in the middle of being created and should not be shown to * the user if we crash during the time that the bit is set. We clear the * bit when we have finished setting up the attribute. 
We do this because @@ -72,42 +72,48 @@ struct xfs_trans; */ #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __uint16_t base; /* base of free region */ + __uint16_t size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __uint16_t count; /* count of active leaf_entry's */ + __uint16_t usedbytes; /* num bytes of names/values stored */ + __uint16_t firstused; /* first used byte in name area */ + __uint8_t holes; /* != 0 if blk needs compaction */ + __uint8_t pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + xfs_dahash_t hashval; /* hash value of name */ + __uint16_t nameidx; /* index into buffer of name/value */ + __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __uint8_t pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __uint16_t valuelen; /* number of bytes in value */ + __uint8_t namelen; /* length of name bytes */ + __uint8_t nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + xfs_dablk_t valueblk; /* block number of value bytes */ + __uint32_t valuelen; /* number of bytes in value */ + __uint8_t namelen; /* length of name bytes */ + __uint8_t name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + typedef struct xfs_attr_leafblock { - struct xfs_attr_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __uint16_t count; /* count of active leaf_entry's */ - __uint16_t usedbytes; /* num bytes of names/values stored */ - __uint16_t firstused; /* first used byte in name area */ - __uint8_t holes; /* != 0 if blk needs compaction */ - __uint8_t pad1; - struct xfs_attr_leaf_map { /* RLE map of free bytes */ - __uint16_t base; /* base of free region */ - __uint16_t size; /* length of free region */ - } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */ - } hdr; - struct xfs_attr_leaf_entry { /* sorted on key, not name */ - xfs_dahash_t hashval; /* hash value of name */ - __uint16_t nameidx; /* index into buffer of name/value */ - __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ - __uint8_t pad2; /* unused pad byte */ - } entries[1]; /* variable sized array */ - struct xfs_attr_leaf_name_local { - __uint16_t valuelen; /* number of bytes in value */ - __uint8_t namelen; /* length of name bytes */ - __uint8_t nameval[1]; /* name/value bytes */ - } namelist; /* grows from bottom of buf */ - struct xfs_attr_leaf_name_remote { - xfs_dablk_t valueblk; /* block number of value bytes */ - __uint32_t valuelen; /* number of bytes in value */ - __uint8_t namelen; /* length of name bytes */ - __uint8_t name[1]; /* name bytes */ - } valuelist; /* grows from bottom of buf */ + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ } xfs_attr_leafblock_t; -typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t; -typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t; -typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t; -typedef struct 
xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t; -typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t; /* * Flags used in the leaf_entry[i].flags field. @@ -150,7 +156,8 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)]; } -#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx) +#define XFS_ATTR_LEAF_NAME(leafp,idx) \ + xfs_attr_leaf_name(leafp,idx) static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) { return (&((char *) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 34bfb8ad0a25..70625e577c70 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4493,8 +4493,8 @@ xfs_bmap_read_extents( num_recs = be16_to_cpu(block->bb_numrecs); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, (btree extents). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", XFS_ERRLEVEL_LOW, diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h index ab6b09eef9ab..eb8cd9a4667f 100644 --- a/fs/xfs/xfs_dir_leaf.h +++ b/fs/xfs/xfs_dir_leaf.h @@ -67,34 +67,38 @@ struct xfs_trans; */ #define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */ +typedef struct xfs_dir_leaf_map { /* RLE map of free bytes */ + __uint16_t base; /* base of free region */ + __uint16_t size; /* run length of free region */ +} xfs_dir_leaf_map_t; + +typedef struct xfs_dir_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __uint16_t count; /* count of active leaf_entry's */ + __uint16_t namebytes; /* num bytes of name strings stored */ + __uint16_t firstused; /* first used byte in name area */ + __uint8_t holes; /* != 0 if blk needs compaction */ + __uint8_t pad1; + xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE]; +} xfs_dir_leaf_hdr_t; + +typedef struct xfs_dir_leaf_entry { /* sorted on key, not name */ + xfs_dahash_t hashval; /* hash value of name */ + __uint16_t nameidx; /* index into buffer of name */ + __uint8_t namelen; /* length of name string */ + __uint8_t pad2; +} xfs_dir_leaf_entry_t; + +typedef struct xfs_dir_leaf_name { + xfs_dir_ino_t inumber; /* inode number for this key */ + __uint8_t name[1]; /* name string itself */ +} xfs_dir_leaf_name_t; + typedef struct xfs_dir_leafblock { - struct xfs_dir_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. 
*/ - __uint16_t count; /* count of active leaf_entry's */ - __uint16_t namebytes; /* num bytes of name strings stored */ - __uint16_t firstused; /* first used byte in name area */ - __uint8_t holes; /* != 0 if blk needs compaction */ - __uint8_t pad1; - struct xfs_dir_leaf_map {/* RLE map of free bytes */ - __uint16_t base; /* base of free region */ - __uint16_t size; /* run length of free region */ - } freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */ - } hdr; - struct xfs_dir_leaf_entry { /* sorted on key, not name */ - xfs_dahash_t hashval; /* hash value of name */ - __uint16_t nameidx; /* index into buffer of name */ - __uint8_t namelen; /* length of name string */ - __uint8_t pad2; - } entries[1]; /* var sized array */ - struct xfs_dir_leaf_name { - xfs_dir_ino_t inumber; /* inode number for this key */ - __uint8_t name[1]; /* name string itself */ - } namelist[1]; /* grows from bottom of buf */ + xfs_dir_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_dir_leaf_entry_t entries[1]; /* var sized array */ + xfs_dir_leaf_name_t namelist[1]; /* grows from bottom of buf */ } xfs_dir_leafblock_t; -typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t; -typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t; -typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t; -typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t; /* * Length of name for which a 512-byte block filesystem @@ -126,11 +130,10 @@ typedef union { #define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \ ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash)) -typedef struct xfs_dir_put_args -{ +typedef struct xfs_dir_put_args { xfs_dircook_t cook; /* cookie of (next) entry */ xfs_intino_t ino; /* inode number */ - struct xfs_dirent *dbp; /* buffer pointer */ + struct xfs_dirent *dbp; /* buffer pointer */ char *name; /* directory entry name */ int namelen; /* length of name */ int done; /* output: set if value was stored */ @@ -138,7 +141,8 @@ typedef struct xfs_dir_put_args struct uio *uio; /* uio control structure */ } xfs_dir_put_args_t; -#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) xfs_dir_leaf_entsize_byname(len) +#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) \ + xfs_dir_leaf_entsize_byname(len) static inline int xfs_dir_leaf_entsize_byname(int len) { return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index d7b6b5d16704..2a21c5024017 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -54,7 +54,6 @@ xfs_error_trap(int e) if (e != xfs_etrap[i]) continue; cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); - debug_stop_all_cpus((void *)-1LL); BUG(); break; } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 06d8a8426c16..26b8e709a569 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -18,9 +18,6 @@ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ -#define prdev(fmt,targ,args...) \ - printk("XFS: device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args) - #define XFS_ERECOVER 1 /* Failure to recover log */ #define XFS_ELOGSTAT 2 /* Failure to stat log in user space */ #define XFS_ENOLOGSPACE 3 /* Reservation too large */ @@ -182,8 +179,11 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); struct xfs_mount; /* PRINTFLIKE4 */ extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...); + char *fmt, ...); /* PRINTFLIKE3 */ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); +#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) 
\ + xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6b7ac8bdcac2..1d7f5a7e063e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -404,9 +404,8 @@ xfs_iformat( INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu." - " Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", (unsigned long long)ip->i_ino, (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), @@ -418,9 +417,8 @@ xfs_iformat( } if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, forkoff = 0x%x." - " Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, forkoff = 0x%x.", (unsigned long long)ip->i_ino, (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, @@ -451,8 +449,9 @@ xfs_iformat( * no local regular files yet */ if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(local format for regular file).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat(4)", XFS_ERRLEVEL_LOW, @@ -462,8 +461,9 @@ xfs_iformat( di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(bad size %Ld for local inode).", (unsigned long long) ip->i_ino, (long long) di_size); XFS_CORRUPTION_ERROR("xfs_iformat(5)", @@ -551,8 +551,9 @@ xfs_iformat_local( * kmem_alloc() or memcpy() below. */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(bad size %d for local fork, size = %d).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, @@ -610,8 +611,8 @@ xfs_iformat_extents( * kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu ((a)extents = %d). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); @@ -692,8 +693,8 @@ xfs_iformat_btree( || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (btree). 
Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, ip->i_mount); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9369010f3f16..36aa1fcb90a5 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -608,7 +608,7 @@ xfs_inode_item_trylock( if (iip->ili_pushbuf_flag == 0) { iip->ili_pushbuf_flag = 1; #ifdef DEBUG - iip->ili_push_owner = get_thread_id(); + iip->ili_push_owner = current_pid(); #endif /* * Inode is left locked in shared mode. @@ -787,7 +787,7 @@ xfs_inode_item_pushbuf( * trying to duplicate our effort. */ ASSERT(iip->ili_pushbuf_flag != 0); - ASSERT(iip->ili_push_owner == get_thread_id()); + ASSERT(iip->ili_push_owner == current_pid()); /* * If flushlock isn't locked anymore, chances are that the -- cgit v1.2.3 From 0d1335b3106687d87fcfa0e4d90f2a961bd7e1db Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Thu, 12 Jan 2006 10:32:51 +1100 Subject: [XFS] Fix follow_link when dealing with symlinks larger than 256 bytes. Thanks to Yamamoto Takashi. SGI-PV: 947953 SGI-Modid: xfs-linux-melb:xfs-kern:24962a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index d388d14efe3e..129403958044 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -546,7 +546,7 @@ linvfs_follow_link( ASSERT(dentry); ASSERT(nd); - link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL); + link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) { nd_set_link(nd, ERR_PTR(-ENOMEM)); return NULL; @@ -562,12 +562,12 @@ linvfs_follow_link( vp = LINVFS_GET_VP(dentry->d_inode); iov.iov_base = link; - iov.iov_len = MAXNAMELEN; + iov.iov_len = MAXPATHLEN; uio->uio_iov = &iov; uio->uio_offset = 0; uio->uio_segflg = UIO_SYSSPACE; - uio->uio_resid = MAXNAMELEN; + uio->uio_resid = MAXPATHLEN; uio->uio_iovcnt = 1; VOP_READLINK(vp, uio, 0, NULL, error); @@ -575,7 +575,7 @@ linvfs_follow_link( kfree(link); link = ERR_PTR(-error); } else { - link[MAXNAMELEN - uio->uio_resid] = '\0'; + link[MAXPATHLEN - uio->uio_resid] = '\0'; } kfree(uio); -- cgit v1.2.3 From 9e9e3941d0a3497c1e7fb9ce62059705825bb775 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 11 Jan 2006 12:17:37 -0800 Subject: [PATCH] kdump: vmcore compilation warning fix o fs/proc/vmcore.c compilation gives warnings on ppc64. The reason being that u64 is defined as unsigned long hence u64* is not same as loff_t* and compiler cribs. o Changed the parameter type to u64* instead of loff_t* to resolve the conflict. Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 124e35442ac8..4063fb32f78c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -41,7 +41,7 @@ struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. 
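The warning being fixed can be reproduced in isolation. A stand-alone sketch (the demo_* names are made up; they mimic ppc64, where u64 is defined as unsigned long while loff_t is long long):

	typedef long long demo_loff_t;	/* loff_t on Linux */
	typedef unsigned long demo_u64;	/* u64 on ppc64 */

	static void wants_loff(demo_loff_t *pos) { (void)pos; }

	static void demo(void)
	{
		demo_u64 pos = 0;

		/* gcc warns: incompatible pointer type. Both types are
		 * 64 bits wide on ppc64, but the pointee types remain
		 * distinct -- hence changing the parameter to u64 * to
		 * match what the callers actually pass. */
		wants_loff(&pos);
	}
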
*/ static ssize_t read_from_oldmem(char *buf, size_t count, - loff_t *ppos, int userbuf) + u64 *ppos, int userbuf) { unsigned long pfn, offset; size_t nr_bytes; -- cgit v1.2.3 From ef43bc4fc32bec8fda7bae8948b774616dc9e496 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Jan 2006 12:17:40 -0800 Subject: [PATCH] reiserfs: fix assertion failure in reiserfs+journaled quotas Sometimes we call do_journal_end() with t_refcount == 0. If quota is turned on and we happen to have some inode with preallocation bad things happen as we try to use the current handle for quota operations. Checks for t_refcount in journal_begin() fail and we Oops. We raise t_refcount to make those checks happy. We should not cause any bad as all the needed quota blocks should be already attached to the transaction (they were attached to the transaction when we allocated those preallocation blocks). Signed-off-by: Jan Kara Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 3f17ef844fb6..4491fcf2a0e6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -3925,10 +3925,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush = 1; } #ifdef REISERFS_PREALLOCATE - /* quota ops might need to nest, setup the journal_info pointer for them */ + /* quota ops might need to nest, setup the journal_info pointer for them + * and raise the refcount so that it is > 0. */ current->journal_info = th; + th->t_refcount++; reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into * the transaction */ + th->t_refcount--; current->journal_info = th->t_handle_save; #endif -- cgit v1.2.3 From c59ede7b78db329949d9cdcd7064e22d357560ef Mon Sep 17 00:00:00 2001 From: "Randy.Dunlap" Date: Wed, 11 Jan 2006 12:17:46 -0800 Subject: [PATCH] move capable() to capability.h - Move capable() from sched.h to capability.h; - Use where capable() is used (in include/, block/, ipc/, kernel/, a few drivers/, mm/, security/, & sound/; many more drivers/ to go) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/ioctl.c | 2 +- block/scsi_ioctl.c | 1 + drivers/acorn/char/i2c.c | 1 + drivers/base/firmware_class.c | 1 + drivers/base/memory.c | 2 +- drivers/firmware/efivars.c | 2 +- drivers/oprofile/event_buffer.c | 1 + drivers/parisc/led.c | 1 + drivers/parisc/pdc_stable.c | 2 +- fs/xfs/linux-2.6/xfs_cred.h | 4 +++- include/linux/capability.h | 3 +++ include/linux/mm.h | 1 + include/linux/sched.h | 4 ---- ipc/mqueue.c | 1 + ipc/msg.c | 1 + ipc/sem.c | 1 + ipc/shm.c | 1 + ipc/util.c | 1 + kernel/acct.c | 1 + kernel/capability.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/kexec.c | 1 + kernel/module.c | 1 + kernel/ptrace.c | 1 + kernel/sched.c | 1 + kernel/signal.c | 1 + kernel/sys.c | 1 + kernel/sysctl.c | 1 + kernel/time.c | 1 + kernel/uid16.c | 1 + mm/filemap.c | 1 + mm/mlock.c | 1 + mm/mmap.c | 1 + mm/mremap.c | 1 + mm/swapfile.c | 1 + security/commoncap.c | 1 + security/dummy.c | 1 + security/keys/keyctl.c | 1 + security/security.c | 1 + sound/pci/emu10k1/emufx.c | 1 + 41 files changed, 44 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/block/ioctl.c b/block/ioctl.c index 82030e1dfd63..e1109491c234 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,4 +1,4 @@ -#include /* for capable() */ +#include #include #include #include diff --git 
a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 18de84c8ccd8..cc72210687eb 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/acorn/char/i2c.c b/drivers/acorn/char/i2c.c index c22bb9dca1ec..c26c08b36829 100644 --- a/drivers/acorn/char/i2c.c +++ b/drivers/acorn/char/i2c.c @@ -12,6 +12,7 @@ * On Acorn machines, the following i2c devices are on the bus: * - PCF8583 real time clock & static RAM */ +#include #include #include #include diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 5b3d5e9ddcb6..3d384e3d34de 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -7,6 +7,7 @@ * */ +#include #include #include #include diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 58801d718cc2..d1a05224627e 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -13,8 +13,8 @@ #include #include #include -#include /* capable() */ #include +#include #include #include #include diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index bda5bce681b6..343379f23a53 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -65,11 +65,11 @@ * v0.01 release to linux-ia64@linuxia64.org */ +#include #include #include #include #include -#include /* for capable() */ #include #include #include diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 166bca790133..b80318f03420 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index f357d3f60360..3627a2d7f79f 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index 38bdca2fac6b..42a3c54e8e6c 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -42,9 +42,9 @@ #include #include -#include /* for capable() */ #include #include +#include #include #include #include diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index 4af491024727..e7f3da61c6c3 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -18,6 +18,8 @@ #ifndef __XFS_CRED_H__ #define __XFS_CRED_H__ +#include + /* * Credentials */ @@ -27,7 +29,7 @@ typedef struct cred { extern struct cred *sys_cred; -/* this is a hack.. (assums sys_cred is the only cred_t in the system) */ +/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ static __inline int capable_cred(cred_t *cr, int cid) { return (cr == sys_cred) ? 
1 : capable(cid); diff --git a/include/linux/capability.h b/include/linux/capability.h index 6b4618902d3d..5a23ce752629 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -43,6 +43,7 @@ typedef struct __user_cap_data_struct { #ifdef __KERNEL__ #include +#include /* #define STRICT_CAP_T_TYPECHECKS */ @@ -356,6 +357,8 @@ static inline kernel_cap_t cap_invert(kernel_cap_t c) #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) +extern int capable(int cap); + #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index e53d2c6fd5f4..c643016499a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3,6 +3,7 @@ #include #include +#include #ifdef __KERNEL__ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2ae8711bfba1..3b74c4bf2934 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1112,10 +1112,6 @@ static inline int sas_ss_flags(unsigned long sp) : on_sig_stack(sp) ? SS_ONSTACK : 0); } - -/* code is in security.c or kernel/sys.c if !SECURITY */ -extern int capable(int cap); - /* * Routines for handling mm_structs */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a8aa6152eea6..4e776f9c80e7 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -11,6 +11,7 @@ * This file is released under the GPL. */ +#include #include #include #include diff --git a/ipc/msg.c b/ipc/msg.c index d035bd2aba96..a91b64763b86 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -15,6 +15,7 @@ * (c) 1999 Manfred Spraul */ +#include #include #include #include diff --git a/ipc/sem.c b/ipc/sem.c index cb5bb2a5df96..46bb8a678dec 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include "util.h" diff --git a/ipc/shm.c b/ipc/shm.c index 0b92e874fc06..4c28d2d8e305 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/ipc/util.c b/ipc/util.c index 23f1cec150c1..38b9a0af3bd8 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/acct.c b/kernel/acct.c index 38d57fa6b78f..065d8b4e51ef 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/capability.c b/kernel/capability.c index 8986a37a67ea..bfa3c92e16f2 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,7 @@ * 30 May 2002: Cleanup, Robert M. Love */ +#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 802722814925..f8e609ff1893 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 3bdcab49998d..16a776ec2c0b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/kexec.c b/kernel/kexec.c index de1441656efd..bf39d28e4c0e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,7 @@ * Version 2. See the file COPYING for more details. 
*/ +#include #include #include #include diff --git a/kernel/module.c b/kernel/module.c index e4276046a1b6..618ed6e23ecc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index cceaf09ac413..5f33cdb6fff5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -7,6 +7,7 @@ * to continually duplicate across every architecture. */ +#include #include #include #include diff --git a/kernel/sched.c b/kernel/sched.c index 34a945bcc022..d129e560cc0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 08aa5b263f36..1da2e74beb97 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 9ccf713491f9..d09cac23fdfd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 03b0598f2369..62d4d9566876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time.c b/kernel/time.c index 169e8329e0b6..7477b1d2079e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/kernel/uid16.c b/kernel/uid16.c index f669941e8b26..aa25605027c8 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/filemap.c b/mm/filemap.c index 96de772be487..a965b6b35f26 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mlock.c b/mm/mlock.c index 4ae3a46ff768..b90c59573abf 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -5,6 +5,7 @@ * (C) Copyright 2002 Christoph Hellwig */ +#include #include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 64ba4dbcb7de..47556d2b3e90 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index ddaeee9a0b69..1903bdf65e42 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index d8a5afc8b2a3..957fef43fa60 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/security/commoncap.c b/security/commoncap.c index 04c12f58d656..8a6e097f99ea 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -7,6 +7,7 @@ * */ +#include #include #include #include diff --git a/security/dummy.c b/security/dummy.c index a15c54709fde..f1a5bd98bf10 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -14,6 +14,7 @@ #undef DEBUG +#include #include #include #include diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 3d2ebae029c1..90db5c76cf6e 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "internal.h" diff --git a/security/security.c b/security/security.c index ed5fb80769c3..f693e1f66b98 100644 --- a/security/security.c +++ b/security/security.c @@ -11,6 +11,7 @@ 
* (at your option) any later version. */ +#include #include #include #include diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c index 1a903390ad6d..509837252735 100644 --- a/sound/pci/emu10k1/emufx.c +++ b/sound/pci/emu10k1/emufx.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 16f7e0fe2ecc30f30652e8185e1772cdebe39109 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Jan 2006 12:17:46 -0800 Subject: [PATCH] capable/capability.h (fs/) fs: Use where capable() is used. Signed-off-by: Randy Dunlap Acked-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/attr.c | 1 + fs/autofs/root.c | 1 + fs/autofs4/root.c | 1 + fs/buffer.c | 1 + fs/compat_ioctl.c | 1 + fs/dcookies.c | 1 + fs/dquot.c | 1 + fs/ext2/acl.c | 1 + fs/ext2/balloc.c | 1 + fs/ext2/ioctl.c | 1 + fs/ext2/xattr_trusted.c | 1 + fs/ext3/acl.c | 1 + fs/ext3/balloc.c | 1 + fs/ext3/ioctl.c | 1 + fs/ext3/xattr_trusted.c | 1 + fs/fat/file.c | 1 + fs/fcntl.c | 1 + fs/file_table.c | 1 + fs/hfsplus/ioctl.c | 1 + fs/hugetlbfs/inode.c | 1 + fs/ioctl.c | 1 + fs/ioprio.c | 1 + fs/jffs2/fs.c | 1 + fs/jfs/xattr.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/ncpfs/ioctl.c | 1 + fs/ocfs2/file.c | 1 + fs/open.c | 1 + fs/proc/base.c | 1 + fs/proc/kcore.c | 1 + fs/quota.c | 1 + fs/reiserfs/ioctl.c | 1 + fs/reiserfs/xattr.c | 1 + fs/reiserfs/xattr_acl.c | 1 + fs/reiserfs/xattr_trusted.c | 1 + fs/smbfs/proc.c | 1 + fs/sysfs/inode.c | 1 + fs/udf/file.c | 1 + fs/ufs/balloc.c | 1 + fs/xfs/linux-2.6/xfs_ioctl.c | 1 + fs/xfs/linux-2.6/xfs_iops.c | 1 + fs/xfs/quota/xfs_qm_syscalls.c | 3 +++ fs/xfs/xfs_acl.c | 1 + fs/xfs/xfs_attr.c | 3 +++ fs/xfs/xfs_vnodeops.c | 3 +++ 46 files changed, 52 insertions(+) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index d63e5096f2f2..97de94670878 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 808134a5a2fa..870e2cf33016 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,6 +10,7 @@ * * ------------------------------------------------------------------------- */ +#include #include #include #include diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 14aa70282e8c..e93a7ae467c9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -12,6 +12,7 @@ * * ------------------------------------------------------------------------- */ +#include #include #include #include diff --git a/fs/buffer.c b/fs/buffer.c index 6466bc8a3dc7..b9bb7ad6897b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 890bc30fbe20..f51696358a21 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/dcookies.c b/fs/dcookies.c index 02aa0ddc582a..f8274a8f83bd 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/dquot.c b/fs/dquot.c index cb6d5bfbdfd5..1966c890b48d 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 239133d01d91..35acc43b897f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -4,6 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, */ +#include #include #include #include diff 
--git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index bb6908066494..2c00953d4b0b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * balloc.c contains the blocks allocation and deallocation routines diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 709d8676b962..3ca9afdf713d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -8,6 +8,7 @@ */ #include "ext2.h" +#include #include #include #include diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 2c072bfea23b..f28a6a499c96 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 9ed132c96034..47a9da2dfb4f 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index c6393fb4c35a..6250fcdf14a1 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 706d68608381..556cd5510078 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 7c693c94f14d..86d91f1186dc 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/fs/fat/file.c b/fs/fat/file.c index d30876cf35f5..e99c5a73b39e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -6,6 +6,7 @@ * regular file handling primitives for fat-based filesystems */ +#include #include #include #include diff --git a/fs/fcntl.c b/fs/fcntl.c index 9903bde475f2..d0767fe58362 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/file_table.c b/fs/file_table.c index 6142250104a6..768b58167543 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index e07aa096e07c..13cf848ac833 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -12,6 +12,7 @@ * hfsplus ioctls */ +#include #include #include #include diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ff1b7d108bd0..ab4c3a9d51b8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ioctl.c b/fs/ioctl.c index 569209181425..f8aeec3ca10c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ioprio.c b/fs/ioprio.c index 4bf1c6365a19..ca77008146c0 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -22,6 +22,7 @@ #include #include #include +#include #include static int set_task_ioprio(struct task_struct *task, int ioprio) diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d0fcc5f3497e..09e5d10b8840 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -11,6 +11,7 @@ * */ +#include #include #include #include diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 952da5f917cd..f23048f9471f 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include diff --git a/fs/namei.c b/fs/namei.c index 
0a8f073435af..1e5746eb1380 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/namespace.c b/fs/namespace.c index 2ca6145f43d6..8bc15b362d23 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index fd3efdca5ae3..d6e0c089e1b1 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ca5f9f90d794..eaf33caa0a1f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -23,6 +23,7 @@ * Boston, MA 021110-1307, USA. */ +#include #include #include #include diff --git a/fs/open.c b/fs/open.c index a3b3a9b5c2ff..8e20c1f32563 100644 --- a/fs/open.c +++ b/fs/open.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/base.c b/fs/proc/base.c index 634355e16986..20feb7568deb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1c7da988fcc3..adc2cd95169a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/quota.c b/fs/quota.c index d14d872646d4..ba9e0bf32f67 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* Check validity of generic quotactl commands */ diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index ba8bf8df6dc7..745c88100895 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -2,6 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ +#include #include #include #include diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6f99e01f94ab..cc061bfd437b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -30,6 +30,7 @@ */ #include +#include #include #include #include diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 2dc953504cc0..43de3ba83332 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 2501f7e66ab9..024a938ca60f 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index d6baec0f24ad..b1b878b81730 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index c3133219941c..689f7bcfaf30 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "sysfs.h" extern struct super_block * sysfs_sb; diff --git a/fs/udf/file.c b/fs/udf/file.c index 8a388289040d..a6f2acc1f15c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -31,6 +31,7 @@ #include #include #include /* memset */ +#include #include #include #include diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index faf1512173eb..a9f4421ddb6f 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f98c5be3dfe7..21667ba6dcd5 100644 
--- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -52,6 +52,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" +#include #include #include #include diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 97fb1470cf28..9b8ee3470ecc 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -51,6 +51,7 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" +#include #include #include diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 86a1d09f48d5..676884394aae 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -15,6 +15,9 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include + #include "xfs.h" #include "xfs_fs.h" #include "xfs_bit.h" diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cc9c91b9e771..4ff0f4e41c61 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -36,6 +36,7 @@ #include "xfs_mac.h" #include "xfs_attr.h" +#include #include STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1a11c2b51701..e5e91e9c7e89 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,6 +15,9 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include + #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e03fa2a3d5ed..e92cacde02f5 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -15,6 +15,9 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include + #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -- cgit v1.2.3 From 2966387b481a11a90a7bed6600fc17b4253f6980 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Jan 2006 22:44:03 +0100 Subject: [PATCH] x86_64: Implement compat code for sg driver SG_GET_REQUEST_TABLE ioctl Apparently helps with some non SANE scanner drivers. 
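The translation is needed because sg_req_info_t embeds a user pointer, so its size and field offsets differ between a 32-bit process and a 64-bit kernel. A rough sketch of the two layouts (field names follow the patch below; the padding noted is for a typical x86_64 ABI and is an assumption, not taken from the patch):

	struct sg_req_info_native {	/* filled in by a 64-bit kernel */
		char req_state, orphan, sg_io_owned, problem;
		int pack_id;
		void *usr_ptr;		/* 8 bytes, 8-byte aligned */
		unsigned int duration;
		int unused;
	};

	struct sg_req_info_compat {	/* expected by a 32-bit caller */
		char req_state, orphan, sg_io_owned, problem;
		int pack_id;
		unsigned int usr_ptr;	/* compat_uptr_t: 4 bytes */
		unsigned int duration;
		int unused;
	};

Because the sizes and offsets differ, sg_grt_trans() below copies the table entry by entry and narrows usr_ptr explicitly instead of doing one flat copy.
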
Cc: axboe@suse.de Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f51696358a21..f0b7256b2f87 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -916,6 +916,40 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) return err; } +struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ + char req_state; + char orphan; + char sg_io_owned; + char problem; + int pack_id; + compat_uptr_t usr_ptr; + unsigned int duration; + int unused; +}; + +static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + int err, i; + sg_req_info_t *r; + struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg; + r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); + err = sys_ioctl(fd,cmd,(unsigned long)r); + if (err < 0) + return err; + for (i = 0; i < SG_MAX_QUEUE; i++) { + void __user *ptr; + int d; + + if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || + get_user(ptr, &r[i].usr_ptr) || + get_user(d, &r[i].duration) || + put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || + put_user(d, &o[i].duration)) + return -EFAULT; + } + return err; +} + struct sock_fprog32 { unsigned short len; compat_caddr_t filter; @@ -2794,6 +2828,7 @@ HANDLE_IOCTL(FDPOLLDRVSTAT32, fd_ioctl_trans) HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans) HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans) HANDLE_IOCTL(SG_IO,sg_ioctl_trans) +HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans) HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans) HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans) HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans) -- cgit v1.2.3 From c6b44d10f25e5a93eca5135b686a35775c63546e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2006 01:05:26 -0800 Subject: [PATCH] Implement ioctl emulation for the parport character device Fixes bugzilla.kernel.org bug 2903. Cc: Cc: Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f0b7256b2f87..5dd0207ffd46 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -122,6 +122,7 @@ #include #include #include +#include /* Aiee. 
Someone does not find a difference between int and long */ #define EXT2_IOC32_GETFLAGS _IOR('f', 1, int) @@ -2735,6 +2736,20 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon } #endif +static int +lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + struct compat_timeval *tc = (struct compat_timeval *)arg; + struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval)); + struct timeval ts; + if (get_user(ts.tv_sec, &tc->tv_sec) || + get_user(ts.tv_usec, &tc->tv_usec) || + put_user(ts.tv_sec, &tn->tv_sec) || + put_user(ts.tv_usec, &tn->tv_usec)) + return -EFAULT; + return sys_ioctl(fd, cmd, (unsigned long)tn); +} + #define HANDLE_IOCTL(cmd,handler) \ { (cmd), (ioctl_trans_handler_t)(handler) }, @@ -2962,6 +2977,20 @@ HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event) HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) + +/* parport */ +COMPATIBLE_IOCTL(LPTIME) +COMPATIBLE_IOCTL(LPCHAR) +COMPATIBLE_IOCTL(LPABORTOPEN) +COMPATIBLE_IOCTL(LPCAREFUL) +COMPATIBLE_IOCTL(LPWAIT) +COMPATIBLE_IOCTL(LPSETIRQ) +COMPATIBLE_IOCTL(LPGETSTATUS) +COMPATIBLE_IOCTL(LPGETSTATUS) +COMPATIBLE_IOCTL(LPRESET) +/*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/ +COMPATIBLE_IOCTL(LPGETFLAGS) +HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) }; int ioctl_table_size = ARRAY_SIZE(ioctl_start); -- cgit v1.2.3
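For context on lp_timeout_trans() above: LPSETTIMEOUT takes a struct timeval, whose two fields are 32-bit when passed by a 32-bit process but 64-bit in a 64-bit kernel, so the handler rebuilds a native struct timeval in compat user space before re-issuing the ioctl. A minimal 32-bit caller it would service might look like this (illustrative; the device path is an assumption):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/time.h>
	#include <unistd.h>
	#include <linux/lp.h>	/* LPSETTIMEOUT */

	int main(void)
	{
		/* 10-second printer timeout */
		struct timeval tv = { .tv_sec = 10, .tv_usec = 0 };
		int fd = open("/dev/lp0", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, LPSETTIMEOUT, &tv) < 0)
			perror("LPSETTIMEOUT");
		close(fd);
		return 0;
	}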